<a href="https://colab.research.google.com/github/kazuma2002/OpenScienceDataChallenge/blob/main/Model_ricecropped.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Feature Engineering
from sklearn.model_selection import train_test_split, KFold

# Machine Learning
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac
import pystac_client
import odc
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
from odc.stac import stac_load
import planetary_computer as pc

# Read the Planetary Computer subscription key from the environment instead of
# hard-coding it in the notebook: a key committed to a shared/public notebook is
# visible to everyone and should be treated as compromised (rotate it).
# Set PC_SDK_SUBSCRIPTION_KEY before starting the kernel to authenticate.
import os

_pc_subscription_key = os.environ.get('PC_SDK_SUBSCRIPTION_KEY')
if _pc_subscription_key:
    pc.settings.set_subscription_key(_pc_subscription_key)
else:
    print('PC_SDK_SUBSCRIPTION_KEY is not set; continuing without a Planetary Computer subscription key.')

# Others
import requests
import rich.table
from itertools import cycle
from tqdm import tqdm
tqdm.pandas()
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

#Additionals
!pip install mlxtend
import multiprocessing
from sklearn.model_selection import cross_val_score
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
!pip install xgboost
import xgboost as xgb
!pip install lightgbm
import lightgbm as lgb

In [None]:
# Load the crop-yield table from a CSV in the working directory and preview
# its first rows as the cell output.
crop_yield_data = pd.read_csv("Crop_Yield_Data_challenge_2.csv")
crop_yield_data.head()

In [None]:
# Load the feature table from a CSV in the working directory and preview its
# first rows as the cell output.
features_data = pd.read_csv("Features1_data.csv")
features_data.head()

In [None]:
def combine_two_datasets(dataset1, dataset2):
    """Return the two datasets joined side by side.

    The inputs are concatenated along the column axis, so rows are matched on
    their index labels and the columns of ``dataset2`` are appended after
    those of ``dataset1``.
    """
    frames = [dataset1, dataset2]
    return pd.concat(frames, axis="columns")

In [None]:
# Join the yield table and the feature table column-wise and preview the result.
# The join aligns rows on the index -- assumes both CSVs list the same samples
# in the same order (TODO confirm).
crop_data = combine_two_datasets(crop_yield_data,features_data)
crop_data.head()

In [None]:
# Keep only the columns used for modelling: the listed feature columns plus the
# field size and the rice-yield target. Any other column of the merged frame is
# dropped here.
_all_model_columns = [
    'min_vv', 'max_vv', 'range_vv', 'mean_vv', 'correlation_vv', 'permutation_entropy_vv',
    'min_vh', 'max_vh', 'range_vh', 'mean_vh', 'correlation_vh', 'permutation_entropy_vh',
    'min_vv_by_vh', 'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh',
    'permutation_entropy_vv_by_vh', 'rvi', 'backscatter_coefficient', 'polarization',
    'r_mean', 'g_mean', 'b_mean', 'nir_mean', 'swir_mean', 'ndvi', 'ndwi', 'ndmi',
    'red_mean', 'blue_mean', 'green_mean', 'brightness', 'contrast', 'correlation',
    'energy', 'homogeneity', 'Field size (ha)', 'Rice Yield (kg/ha)',
]
crop_data = crop_data[_all_model_columns]
crop_data.head()

In [None]:
# Correlation matrix: pairwise correlation between every pair of columns in
# crop_data (features and target alike), drawn as a heatmap.
corrmat = crop_data.corr()
# A new 12x9 figure becomes the current axes, which the heatmap then draws on.
f, ax = plt.subplots(figsize=(12, 9))
# vmax=.8 caps the colour scale at 0.8; square=True draws square cells.
sns.heatmap(corrmat, vmax=.8, square=True)

In [None]:
# Narrow the frame to the feature subset chosen after inspecting the heatmap
# (the columns judged to be correlated with rice yield), plus the target itself.
# 'Field size (ha)' is not kept from this point on.
_yield_related_columns = [
    'ndvi', 'ndmi', 'brightness', 'homogeneity', 'correlation',
    'min_vv_by_vh', 'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh',
    'permutation_entropy_vv_by_vh', 'rvi', 'backscatter_coefficient', 'polarization',
    'Rice Yield (kg/ha)',
]
crop_data = crop_data[_yield_related_columns]
crop_data.head()

In [None]:
from scipy.stats import mstats
def log_transform_data(data, columns):
    """Return a copy of ``data`` with ``log(1 + x)`` applied to ``columns``.

    The input frame is not modified; columns that are not listed are carried
    over unchanged. Columns are processed one by one in the order given.
    """
    transformed = data.copy()
    for name in columns:
        # log1p(x) == log(1 + x), so zero values stay finite.
        transformed[name] = np.log1p(transformed[name])
    return transformed

# Columns intended for a log1p transform. The transform call below is disabled,
# so crop_data is left unchanged and this cell only shows summary statistics.
# NOTE(review): 'Field size (ha)' is not among the columns kept by the preceding
# column-selection cell, so re-enabling the call as written would raise a KeyError.
columns_to_log_transform = ['homogeneity', 'brightness', 'Field size (ha)']
#crop_data = log_transform_data(crop_data, columns_to_log_transform)
# One row per column: count, mean, std, min, quartiles, max.
crop_data.describe().transpose()

In [None]:
# Discard every row that contains at least one missing value
# (row-wise, how='any' -- these are the pandas defaults for dropna).
crop_data = crop_data.dropna()

# Sanity check: list the columns that still contain missing values
# (an empty Series is printed when none remain).
missing_val_count_by_column = crop_data.isna().sum()
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

In [None]:
# Separate the regression target from the predictor columns.
y = crop_data['Rice Yield (kg/ha)']
X = crop_data.drop(columns=['Rice Yield (kg/ha)'])

# Hold out 20% of the rows for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# .size is the total number of elements (rows x columns) in each scaled array.
print(X_train_scaled.size, X_test_scaled.size, sep='\n')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Quick random-forest fit used only to rank the features.
# The target ('Rice Yield (kg/ha)') is a continuous quantity, so a regressor
# scored with R2 is used; a classifier would treat every distinct yield value as
# its own class and "accuracy" would be meaningless for this task.
check_model = RandomForestRegressor(n_estimators=100, random_state=42)
check_model.fit(X_train, y_train)

# Score the check model on the held-out rows.
y_pred = check_model.predict(X_test)
check_r2 = r2_score(y_test, y_pred)
print(f'Model R2: {check_r2:.2f}')

# Impurity-based feature importances, most important first.
importances = check_model.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
print(feature_importances)

In [None]:
##GridSearch to find hyperparameter
from sklearn.model_selection import GridSearchCV

# Hyperparameter grids, one dict per estimator. Each key is a constructor
# argument of the matching model and each list holds the candidate values that
# the grid search will try in every combination.

# DecisionTreeRegressor
dt_params = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10, 15],
    'max_features': ['sqrt', 'log2', None]
}
# RandomForestRegressor
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['log2', 'sqrt'],
    'bootstrap' : [True, False]
}
# GradientBoostingRegressor
gbr_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}
# SVR
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'epsilon': [0.1, 0.2],
    'gamma': ['scale', 'auto']
}
# GaussianNB
nb_params = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
}
# KNeighborsRegressor
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
# XGBRegressor
xgb_params = {
    'max_depth': [5, 10, 15, 20],
    'min_child_weight': [1, 2, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1],
    'n_estimators': [50, 100, 200]
}
# LGBMRegressor
lgb_params = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'num_iterations': [50, 100, 200],
    'max_depth': [-1, 5, 10],
    'min_data_in_leaf': [20, 50, 100]
}
# ExtraTreesRegressor
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where
# it makes the search fail. For tree regressors 'auto' meant "use all features",
# which is spelled 1.0 and is accepted by old and new releases alike.
et_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'max_features': [1.0, 'sqrt', 'log2']
}

# Create one untuned instance of each of the nine estimators to be searched.
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()
gbr_model = GradientBoostingRegressor()
svm_model = SVR()
# NOTE(review): GaussianNB is a classifier, unlike the other estimators here --
# confirm it is really intended for the continuous yield target.
nb_model = GaussianNB()
knn_model = KNeighborsRegressor()
xgb_model = xgb.XGBRegressor()
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse')
et_model = ExtraTreesRegressor()

# Wrap every estimator and its grid in a GridSearchCV. All searches share the
# same settings: 5-fold cross-validation, parallelised over all processors.
_search_options = {'cv': 5, 'n_jobs': -1}
dt_grid = GridSearchCV(dt_model, dt_params, **_search_options)
rf_grid = GridSearchCV(rf_model, rf_params, **_search_options)
gbr_grid = GridSearchCV(gbr_model, gbr_params, **_search_options)
svm_grid = GridSearchCV(svm_model, svm_params, **_search_options)
nb_grid = GridSearchCV(nb_model, nb_params, **_search_options)
knn_grid = GridSearchCV(knn_model, knn_params, **_search_options)
xgb_grid = GridSearchCV(xgb_model, xgb_params, **_search_options)
lgb_grid = GridSearchCV(lgb_model, lgb_params, **_search_options)
et_grid = GridSearchCV(et_model, et_params, **_search_options)

# Run the searches on the (unscaled) training split, with a progress bar.
# svm_grid is constructed above but is not part of the list that gets fitted.
_grids_to_fit = [dt_grid, rf_grid, gbr_grid, nb_grid, knn_grid, xgb_grid, et_grid, lgb_grid]
for grid in tqdm(_grids_to_fit):
    grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination of every fitted grid search,
# one line per model.
_best_param_report = (
    ("Decision Tree best params:", dt_grid),
    ("Random Forest best params:", rf_grid),
    ("Gradient Boosting Regressor best params:", gbr_grid),
    ("Naive Bayes best params:", nb_grid),
    ("K-nearest Regressor best params:", knn_grid),
    ("XGBoost best params:", xgb_grid),
    ("Light GBM best params:", lgb_grid),
    ("Extra Tree best params:", et_grid),
)
for _caption, _search in _best_param_report:
    print(_caption, _search.best_params_)

In [None]:
# Train the nine models on the first train/test split and report their 5-fold
# cross-validation RMSE. The hyperparameters are hard-coded here (presumably
# copied from an earlier grid-search run) rather than read from the
# *_grid.best_params_ attributes.


def _cv_rmse(model, features, target):
    """Return ``(fold_scores, mean_rmse)`` from 5-fold cross-validation.

    scikit-learn reports RMSE as a negated score, so the absolute value of
    each fold score is taken before averaging.
    """
    fold_scores = cross_val_score(model, features, target, cv=5,
                                  scoring='neg_root_mean_squared_error')
    return fold_scores, np.mean(np.abs(fold_scores))


# --- Models fitted on the raw (unscaled) features ---------------------------
dt_model = DecisionTreeRegressor(max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=2)
dt_cv_scores, dt_cv_rmse = _cv_rmse(dt_model, X_train, y_train)
dt_model.fit(X_train, y_train)

rf_model = RandomForestRegressor(bootstrap=True, max_depth=10, max_features='log2',
                                 min_samples_leaf=1, min_samples_split=5, n_estimators=50)
rf_cv_scores, rf_cv_rmse = _cv_rmse(rf_model, X_train, y_train)
rf_model.fit(X_train, y_train)

gbr_model = GradientBoostingRegressor(learning_rate=0.01, max_depth=5, max_features=None,
                                      min_samples_leaf=2, min_samples_split=5, n_estimators=200)
gbr_cv_scores, gbr_cv_rmse = _cv_rmse(gbr_model, X_train, y_train)
gbr_model.fit(X_train, y_train)

# --- Models fitted on the standardised features -----------------------------
nb_model = GaussianNB(var_smoothing=1e-07)
nb_cv_scores, nb_cv_rmse = _cv_rmse(nb_model, X_train_scaled, y_train)
nb_model.fit(X_train_scaled, y_train)

knn_model = KNeighborsRegressor(n_neighbors=9, p=1, weights='uniform')
knn_cv_scores, knn_cv_rmse = _cv_rmse(knn_model, X_train_scaled, y_train)
knn_model.fit(X_train_scaled, y_train)

# --- More models on the raw features ----------------------------------------
xgb_model = xgb.XGBRegressor(colsample_bytree=0.5, learning_rate=0.05, max_depth=5,
                             min_child_weight=5, n_estimators=100, subsample=1)
xgb_cv_scores, xgb_cv_rmse = _cv_rmse(xgb_model, X_train, y_train)
xgb_model.fit(X_train, y_train)

# max_depth was written as -5; LightGBM treats any value <= 0 as "no limit", so
# -1 (the value that was actually in the search grid) is the same setting.
lgb_model = lgb.LGBMRegressor(learning_rate=0.05, max_depth=-1, min_data_in_leaf=20, num_iterations=50,
                              num_leaves=31, verbosity=-1)
lgb_cv_scores, lgb_cv_rmse = _cv_rmse(lgb_model, X_train, y_train)
lgb_model.fit(X_train, y_train)

# max_features='auto' was removed in scikit-learn 1.3; for tree regressors it
# meant "all features", which is spelled 1.0.
et_model = ExtraTreesRegressor(max_depth=10, max_features=1.0, n_estimators=200)
et_cv_scores, et_cv_rmse = _cv_rmse(et_model, X_train, y_train)
et_model.fit(X_train, y_train)

# --- SVM, again on the standardised features --------------------------------
svm_model = SVR(C=10, epsilon=0.1, gamma='scale', kernel='linear')
svm_cv_scores, svm_cv_rmse = _cv_rmse(svm_model, X_train_scaled, y_train)
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the testing data. Each model must see test features in
# the same representation it was trained on: the Naive Bayes, KNN and SVM
# models were fitted on scaled data, so they predict from X_test_scaled.
dt_pred = dt_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
gbr_pred = gbr_model.predict(X_test)
nb_pred = nb_model.predict(X_test_scaled)
knn_pred = knn_model.predict(X_test_scaled)
xgb_pred = xgb_model.predict(X_test)
lgb_pred = lgb_model.predict(X_test)
svm_pred = svm_model.predict(X_test_scaled)
et_pred = et_model.predict(X_test)

print("Decision Tree Cross-validation RMSE:", dt_cv_rmse)
print("Random Forest Cross-validation RMSE:", rf_cv_rmse)
print("Gradient Boosting Cross-validation RMSE:", gbr_cv_rmse)
print("Naive Bayes Cross-validation RMSE:", nb_cv_rmse)
print("Extra Tree Cross-validation RMSE:", et_cv_rmse)
print("K-Nearest Neighbors Cross-validation RMSE:", knn_cv_rmse)
print("XGboost Cross-validation RMSE:", xgb_cv_rmse)
print("Light GBM Cross-validation RMSE:", lgb_cv_rmse)
print("SVM Cross-validation RMSE:", svm_cv_rmse)

In [None]:
# R2 of every model on the held-out test rows of the first split.
dt_r2 = r2_score(y_test, dt_pred)
rf_r2 = r2_score(y_test, rf_pred)
gbr_r2 = r2_score(y_test, gbr_pred)
nb_r2 = r2_score(y_test, nb_pred)
knn_r2 = r2_score(y_test, knn_pred)
xgb_r2 = r2_score(y_test, xgb_pred)
lgb_r2 = r2_score(y_test, lgb_pred)
# Score the SVM on its own predictions; this previously reused dt_pred, so the
# "SVM R2" line merely repeated the decision-tree score.
svm_r2 = r2_score(y_test, svm_pred)
et_r2 = r2_score(y_test, et_pred)
print("Decision Tree R2:", dt_r2)
print("Random Forest R2:", rf_r2)
print("Gradient Boosting R2:", gbr_r2)
print("Naive Bayes R2:", nb_r2)
print("Extra Tree R2:", et_r2)
print("K-Nearest Neighbors R2:", knn_r2)
print("XGboost R2:", xgb_r2)
print("Light GBM R2:", lgb_r2)
print("SVM R2:", svm_r2)

In [None]:
# Make a second train/test split of the same X and y: same 20% test fraction,
# different random seed (123), so the rows land in different partitions.
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Fit the grids to the training data with a progress bar.
# The same GridSearchCV objects are re-fitted here, so their best_params_ are
# overwritten with the results for this second split. Only the tree-based and
# boosting grids are included in this list.
for grid in tqdm([dt_grid, rf_grid, gbr_grid, et_grid, xgb_grid, lgb_grid]):
    grid.fit(X2_train, y2_train)

In [None]:
# Report the winning hyperparameter combination of each grid search after the
# re-fit on the second split, one line per model.
_best_param_report_2 = (
    ("Decision Tree best params:", dt_grid),
    ("Random Forest best params:", rf_grid),
    ("Gradient Boosting Regressor best params:", gbr_grid),
    ("XGBoost best params:", xgb_grid),
    ("Light GBM best params:", lgb_grid),
    ("Extra Tree best params:", et_grid),
)
for _caption, _search in _best_param_report_2:
    print(_caption, _search.best_params_)

In [None]:
# Train six tree-based / boosting models on the second split (X2_train,
# y2_train) and report their 5-fold cross-validation RMSE. Hyperparameters are
# hard-coded here rather than read from the *_grid.best_params_ attributes.
# cross_val_score returns negated RMSE values, hence the abs() before averaging.
dt2_model = DecisionTreeRegressor(max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=5)
dt2_cv_scores = cross_val_score(dt2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
dt2_cv_rmse = np.mean(np.abs(dt2_cv_scores))
dt2_model.fit(X2_train, y2_train)

rf2_model = RandomForestRegressor(bootstrap=False, max_depth=5, max_features='sqrt',
                                 min_samples_leaf=1, min_samples_split=2, n_estimators=200)
rf2_cv_scores = cross_val_score(rf2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
rf2_cv_rmse = np.mean(np.abs(rf2_cv_scores))
rf2_model.fit(X2_train, y2_train)

gbr2_model = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, max_features='sqrt',
                                      min_samples_leaf=4, min_samples_split=10, n_estimators=50)
gbr2_cv_scores = cross_val_score(gbr2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
gbr2_cv_rmse = np.mean(np.abs(gbr2_cv_scores))
gbr2_model.fit(X2_train, y2_train)

xgb2_model = xgb.XGBRegressor(colsample_bytree=0.75, learning_rate=0.1, max_depth=5,
                             min_child_weight=2, n_estimators=50, subsample=1)
xgb2_cv_scores = cross_val_score(xgb2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
xgb2_cv_rmse = np.mean(np.abs(xgb2_cv_scores))
xgb2_model.fit(X2_train, y2_train)

lgb2_model = lgb.LGBMRegressor(learning_rate=0.05, max_depth=5, min_data_in_leaf=20, num_iterations=50,
                               num_leaves=31, verbosity=-1)
lgb2_cv_scores = cross_val_score(lgb2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
lgb2_cv_rmse = np.mean(np.abs(lgb2_cv_scores))
lgb2_model.fit(X2_train, y2_train)

# max_features='auto' was removed in scikit-learn 1.3; for tree regressors it
# meant "all features", which is spelled 1.0.
et2_model = ExtraTreesRegressor(max_depth=10, max_features=1.0, n_estimators=200)
et2_cv_scores = cross_val_score(et2_model, X2_train, y2_train, cv=5, scoring='neg_root_mean_squared_error')
et2_cv_rmse = np.mean(np.abs(et2_cv_scores))
et2_model.fit(X2_train, y2_train)

print("Decision Tree2 Cross-validation RMSE:", dt2_cv_rmse)
print("Random Forest2 Cross-validation RMSE:", rf2_cv_rmse)
print("Gradient Boosting2 Cross-validation RMSE:", gbr2_cv_rmse)
print("XGboost2 Cross-validation RMSE:", xgb2_cv_rmse)
print("Light GBM2 Cross-validation RMSE:", lgb2_cv_rmse)
print("Extra Tree2 Cross-validation RMSE:", et2_cv_rmse)

In [None]:
# Evaluate the second-split models on their own held-out rows (X2_test, y2_test).
# This cell previously re-scored the first-split predictions against y_test, so
# it repeated the earlier numbers and never evaluated the *2 models at all.
dt2_pred = dt2_model.predict(X2_test)
rf2_pred = rf2_model.predict(X2_test)
gbr2_pred = gbr2_model.predict(X2_test)
xgb2_pred = xgb2_model.predict(X2_test)
lgb2_pred = lgb2_model.predict(X2_test)
et2_pred = et2_model.predict(X2_test)

dt2_r2 = r2_score(y2_test, dt2_pred)
rf2_r2 = r2_score(y2_test, rf2_pred)
gbr2_r2 = r2_score(y2_test, gbr2_pred)
xgb2_r2 = r2_score(y2_test, xgb2_pred)
lgb2_r2 = r2_score(y2_test, lgb2_pred)
et2_r2 = r2_score(y2_test, et2_pred)

# Labels carry the "2" suffix to match the cross-validation report for this split.
print("Decision Tree2 R2:", dt2_r2)
print("Random Forest2 R2:", rf2_r2)
print("Gradient Boosting2 R2:", gbr2_r2)
print("Extra Tree2 R2:", et2_r2)
print("XGboost2 R2:", xgb2_r2)
print("Light GBM2 R2:", lgb2_r2)

In [None]:
# Create a stack model: the tuned models from both train/test splits are the base
# regressors and a LinearRegression is the meta-regressor. The stacker's internal
# cross-validation uses 5 shuffled folds with a fixed seed (21).
lr_model = LinearRegression()
stack_model = StackingCVRegressor(regressors=(lgb_model,gbr_model,rf_model,et_model,xgb_model,
                                              rf2_model, gbr2_model, xgb2_model, lgb2_model, et2_model),
                                  meta_regressor=lr_model, cv=KFold(n_splits=5, shuffle=True, random_state=21))

# 5-fold cross-validated RMSE of the whole stack (scores are negated RMSE, hence abs).
# NOTE(review): X4_train / y4_train / X4_test / y4_test are not defined in any cell
# visible in this notebook -- only the X_*/y_* and X2_*/y2_* splits are. As written
# this cell raises NameError on a fresh run; confirm which split is intended.
sm_cv_scores = cross_val_score(stack_model, X4_train, y4_train, cv=5, scoring='neg_root_mean_squared_error')
sm_cv_rmse = np.mean(np.abs(sm_cv_scores))

# Fit the stack on the training rows and score it (R2) on the held-out rows.
stack_model.fit(X4_train, y4_train)
stack_pred = stack_model.predict(X4_test)
stack_r2 = r2_score(y4_test, stack_pred)

In [None]:
# Report the stacked ensemble's hold-out R2 and its cross-validated RMSE.
for _caption, _value in (
    ("Stacking R2Score:", stack_r2),
    ("Cross-validation RMSE:", sm_cv_rmse),
):
    print(_caption, _value)