<a href="https://colab.research.google.com/github/kazuma2002/OpenScienceDataChallenge/blob/main/CNN_ricecroppred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Feature Engineering
from sklearn.model_selection import train_test_split, KFold

# Machine Learning
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac
import pystac_client
import odc
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
from odc.stac import stac_load
import planetary_computer as pc

# Planetary Computer subscription key.
# SECURITY: the key must not be hard-coded in a notebook that lives in a public
# repository (any key that was committed should be rotated). It is read from
# the PC_SDK_SUBSCRIPTION_KEY environment variable instead; when the variable
# is unset no key is configured here.
import os

_pc_subscription_key = os.environ.get('PC_SDK_SUBSCRIPTION_KEY')
if _pc_subscription_key:
    pc.settings.set_subscription_key(_pc_subscription_key)

# Others
import requests
import rich.table
from itertools import cycle
from tqdm import tqdm
tqdm.pandas()
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

#Additionals
!pip install mlxtend
import multiprocessing
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.base import BaseEstimator, RegressorMixin
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the crop-yield CSV into a DataFrame and preview its first five rows.
crop_yield_data = pd.read_csv(filepath_or_buffer="Crop_Yield_Data_challenge_2.csv")
crop_yield_data.head(n=5)

In [None]:
# Read the precomputed feature CSV into a DataFrame and preview its first five rows.
features_data = pd.read_csv(filepath_or_buffer="Features1_data.csv")
features_data.head(n=5)

In [None]:
def combine_two_datasets(dataset1, dataset2):
    """Place two tables side by side.

    The columns of ``dataset2`` are appended to the right of those of
    ``dataset1``; rows are matched on the index by ``pd.concat``.

    Args:
        dataset1: Left-hand DataFrame.
        dataset2: Right-hand DataFrame.

    Returns:
        A new DataFrame holding the columns of both inputs.
    """
    return pd.concat((dataset1, dataset2), axis="columns")

In [None]:
# Join the yield table and the feature table column-wise, then preview the result.
crop_data = combine_two_datasets(dataset1=crop_yield_data, dataset2=features_data)
crop_data.head(n=5)

In [None]:
# Keep only the listed feature columns, together with 'Field size (ha)' and the
# target column 'Rice Yield (kg/ha)'; every other column is dropped.
crop_data = crop_data.loc[:, [
    # columns suffixed _vv
    'min_vv', 'max_vv', 'range_vv', 'mean_vv', 'correlation_vv', 'permutation_entropy_vv',
    # columns suffixed _vh
    'min_vh', 'max_vh', 'range_vh', 'mean_vh', 'correlation_vh', 'permutation_entropy_vh',
    # columns suffixed _vv_by_vh
    'min_vv_by_vh', 'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh',
    'permutation_entropy_vv_by_vh',
    'rvi', 'backscatter_coefficient', 'polarization',
    # band means and indices
    'r_mean', 'g_mean', 'b_mean', 'nir_mean', 'swir_mean', 'ndvi', 'ndwi', 'ndmi',
    'red_mean', 'blue_mean', 'green_mean',
    'brightness', 'contrast', 'correlation', 'energy', 'homogeneity',
    # field size and target
    'Field size (ha)', 'Rice Yield (kg/ha)',
]]
crop_data.head(n=5)

In [None]:
# Pairwise correlation of every remaining column, drawn as a heatmap whose
# colour scale is capped at 0.8.
f, ax = plt.subplots(figsize=(12, 9))
corrmat = crop_data.corr()
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

In [None]:
# Narrow the table to the columns chosen after inspecting the correlation
# heatmap, keeping the target 'Rice Yield (kg/ha)' as the last column.
crop_data = crop_data.loc[:, [
    'permutation_entropy_vv', 'permutation_entropy_vh',
    'correlation_vv', 'correlation_vh',
    'permutation_entropy_vv_by_vh', 'correlation_vv_by_vh',
    'rvi', 'backscatter_coefficient', 'polarization',
    'ndvi', 'ndwi', 'ndmi',
    'brightness', 'contrast', 'correlation', 'energy', 'homogeneity',
    'Field size (ha)',
    'Rice Yield (kg/ha)',
]]
crop_data.head(n=5)

In [None]:
# Visualize the distribution of values for a specific column
def plot_column_distribution(data, column):
    """Show a box plot and a histogram (with KDE) of one column side by side.

    Args:
        data: Table (indexable by ``column``) holding the values to plot.
        column: Name of the column to visualise; also used in both titles.
    """
    values = data[column]
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
    box_ax, hist_ax = axes

    # Left panel: box plot.
    sns.boxplot(data=values, ax=box_ax)
    box_ax.set_title(f'Box plot of {column}')

    # Right panel: histogram with a kernel-density overlay.
    sns.histplot(data=values, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram and KDE of {column}')

    plt.show()

# Columns to plot: every selected feature plus the target column.
feature_columns = [
    'permutation_entropy_vv',
    'permutation_entropy_vh',
    'correlation_vv',
    'correlation_vh',
    'permutation_entropy_vv_by_vh',
    'correlation_vv_by_vh',
    'rvi',
    'backscatter_coefficient',
    'polarization',
    'ndvi',
    'ndwi',
    'ndmi',
    'brightness',
    'contrast',
    'correlation',
    'energy',
    'homogeneity',
    'Field size (ha)',
    'Rice Yield (kg/ha)',
]

# One figure (box plot + histogram) per listed column.
for column in feature_columns:
    plot_column_distribution(data=crop_data, column=column)

In [None]:
# Drop every row that has at least one missing value
# (dropna defaults: axis=0, how='any').
crop_data = crop_data.dropna()

# Sanity check: list the columns that still contain missing values
# (an empty Series is printed when none remain).
missing_val_count_by_column = crop_data.isna().sum()
still_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[still_missing])

In [None]:
from scipy.stats import mstats
def winsorize_data(data, columns, limits=(0.05, 0.05)):
    """Return a copy of ``data`` with the given columns winsorized.

    Args:
        data: Source DataFrame; it is not modified.
        columns: Iterable of column names to winsorize.
        limits: ``(lower, upper)`` fractions forwarded to
            ``scipy.stats.mstats.winsorize``.

    Returns:
        A new DataFrame in which each listed column has had its extreme
        values replaced according to ``limits``.
    """
    result = data.copy()
    for name in columns:
        clipped = mstats.winsorize(result[name], limits=limits)
        result[name] = clipped
    return result

# Winsorize the columns below (default 5% limits on each tail).
# NOTE(review): winsorized_data is not read by the log-transform step that
# follows (it starts again from crop_data) - confirm whether that is intended.
columns_to_winsorize = [
    'homogeneity',
    'correlation',
    'brightness',
    'contrast',
    'Field size (ha)',
]
winsorized_data = winsorize_data(data=crop_data, columns=columns_to_winsorize)

def log_transform_data(data, columns):
    """Return a copy of ``data`` with ``log(1 + x)`` applied to the given columns.

    Args:
        data: Source DataFrame; it is not modified.
        columns: Iterable of column names to transform with ``np.log1p``.

    Returns:
        A new DataFrame; columns not listed are copied unchanged.
    """
    transformed = data.copy()
    for name in columns:
        original_values = transformed[name]
        transformed[name] = np.log1p(original_values)
    return transformed

# Apply log(1 + x) to the columns below, starting from crop_data.
columns_to_log_transform = [
    'homogeneity',
    'correlation',
    'brightness',
    'contrast',
    'Field size (ha)',
]
log_transformed_data = log_transform_data(data=crop_data, columns=columns_to_log_transform)

In [None]:
# Summary statistics of the log-transformed table, one row per column.
log_transformed_data.describe().T

In [None]:
# Continue the pipeline with the log-transformed table. This only rebinds the
# name (no copy is made), so crop_data and log_transformed_data refer to the
# same DataFrame object from here on.
crop_data = log_transformed_data

In [None]:
# Split data into features and target
X = crop_data.drop('Rice Yield (kg/ha)', axis=1)
y = crop_data['Rice Yield (kg/ha)']

# Convert pandas DataFrames to NumPy arrays (X is already 2-D: rows x features)
X = X.values
y = y.values

# Split data into train and test sets BEFORE scaling, so that the held-out
# rows never contribute to the scaler's mean/variance (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep the full matrix scaled as well (using the training statistics) so that
# any cell that reads X still receives standardized features.
X = scaler.transform(X)

# NOTE: .size is the total number of elements (rows * columns), not the row count.
print(X_train.size)
print(X_test.size)
print(X.shape)

In [None]:
# Original imports kept so that any other cell relying on them still runs.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# The target 'Rice Yield (kg/ha)' is a continuous quantity, so this is a
# regression problem: a classifier would treat every distinct yield value as
# its own class and exact-match accuracy would be meaningless.
# Train the Random Forest model
check_model = RandomForestRegressor(n_estimators=100, random_state=42)
check_model.fit(X_train, y_train)

# Make predictions and evaluate the model with the coefficient of determination
y_pred = check_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'Model R^2: {r2:.2f}')

# Analyze feature importances
importances = check_model.feature_importances_
# Take the names from the same columns X was built from, so the names always
# line up with the importance vector instead of relying on a hand-copied list.
feature_names = np.array(crop_data.drop('Rice Yield (kg/ha)', axis=1).columns.tolist())
feature_importances = np.column_stack((feature_names, importances))
# Most important feature first.
sorted_idx = np.argsort(importances)[::-1]
sorted_feature_importances = feature_importances[sorted_idx]

print(sorted_feature_importances)

In [None]:
# Create CNN model
def create_cnn(input_shape):
    """Build an uncompiled 1-D convolutional regression network.

    Four Conv1D + MaxPooling1D stages are followed by three dropout-regularised
    dense layers and a single linear output unit.

    Args:
        input_shape: Either a ``(steps, channels)`` tuple, or a plain integer
            ``n_features`` which is interpreted as ``(n_features, 1)``.
            Conv1D expects 3-D input, so the training array must be shaped
            ``(samples, steps, channels)`` to match.

    Returns:
        A new ``Sequential`` model (not compiled).
    """
    # Accept a bare feature count for symmetry with create_fcn.
    if np.ndim(input_shape) == 0:
        input_shape = (int(input_shape), 1)

    # A fresh model per call; previously no model was created here, so the
    # function either raised NameError or mutated whatever global `model` existed.
    model = Sequential()

    # padding='same' keeps the sequence from shrinking below the kernel size:
    # with the default 'valid' padding this stack needs at least 46 steps.
    model.add(Conv1D(64, kernel_size=3, activation='relu', padding='same', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Conv1D(128, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Conv1D(256, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Conv1D(512, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Single linear unit: unbounded regression output.
    model.add(Dense(1, activation='linear'))

    return model

# Create FCN model
def create_fcn(input_shape):
    """Build an uncompiled fully connected regression network.

    Args:
        input_shape: Number of input features (an integer); the first Dense
            layer is declared with ``input_shape=(input_shape,)``.

    Returns:
        A new ``Sequential`` model ending in a single linear unit.
    """
    layer_stack = [
        # Input block: widest layer, normalised.
        Dense(256, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        # First funnel 128 -> 64, then dropout.
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # Second funnel 256 -> 128 -> 64, then dropout.
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # Narrow tail, normalised before the output.
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        BatchNormalization(),
        # Regression head.
        Dense(1, activation='linear'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    return network

In [None]:
# Define a function that creates the model with the specified learning rate
def create_model(learning_rate=0.001):
    """Build and compile the network used by the hyperparameter search.

    Reads the module-level ``input_shape`` at call time.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A model compiled with mean-squared-error loss.
    """
    # The original called the undefined name `create_cnnn`. The fully connected
    # builder is used because it accepts the integer `input_shape` and the 2-D
    # feature matrix as they are; swap in create_cnn only together with a
    # (samples, steps, 1) reshape of the inputs.
    model = create_fcn(input_shape)
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

# Number of input features, taken from the training matrix. The previous
# hard-coded 14 did not match the 18 feature columns and would make the first
# Dense layer reject the data at fit time.
input_shape = X_train.shape[1]

# Create the KerasRegressor wrapper; create_model reads `input_shape` when called
model = KerasRegressor(build_fn=create_model, epochs=100, verbose=0)

# Define the hyperparameter distribution: 30 log-spaced learning rates
# between 1e-5 and 1e-2, and a handful of batch sizes
param_dist = {
    'learning_rate': np.logspace(-5, -2, 30),
    'batch_size': [8, 16, 32, 64, 128]
}

# Create the random search object (20 sampled combinations, 3-fold CV)
random_search = RandomizedSearchCV(
    estimator=model, param_distributions=param_dist, n_iter=20, cv=3, verbose=2, n_jobs=-1
)

# Run the random search
random_search.fit(X_train, y_train)

In [None]:
# Print the best hyperparameters
# (best_params_ holds the sampled learning_rate / batch_size combination that
# the random search ranked first on its cross-validation score)
print("Best parameters found: ", random_search.best_params_)

In [None]:
# Generate a model
# Derive the input width from the training matrix rather than hard-coding it,
# so the network stays in sync if the selected feature columns change.
input_shape = X_train.shape[1]
model = create_fcn(input_shape) # cnn or fcn

# Compile the model
learning_rate = 0.0025  # Adjust this value
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='mean_squared_error', optimizer=optimizer)
# Print the layer-by-layer summary of the compiled network.
model.summary()