# Business Understanding
Smallholder farmers are crucial contributors to global food production, yet in India they are often among those who suffer most from poverty and malnutrition. These farmers face challenges such as limited access to modern agricultural practices, unpredictable weather, and resource constraints. To help tackle these issues, Digital Green collected survey data that offers insights into farming practices, environmental conditions, and crop yields.

#### Objective
`The objective of this challenge is to create a machine learning solution to predict the crop yield per acre of rice or wheat crops in India. Our goal is to empower these farmers and break the cycle of poverty and malnutrition.`
A crop yield model could revolutionise Indian agriculture, and serve as a global model for smallholder farmers. Accurate yield predictions empower smallholder farmers to make informed planting and resource allocation decisions, reducing poverty and malnutrition and improving food security. As climate change intensifies, adaptive farming practices become crucial, making precise yield predictions even more valuable. Solutions developed here can drive sustainable agriculture and ensure a stable food supply for the world's growing population. This challenge offers data scientists and machine learning enthusiasts a unique chance to make a real difference in vulnerable populations' lives while advancing global food security in a concise, impactful way.

In [1]:
# Colab-only step: expose Google Drive inside the runtime so files can be read from it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
import string
import random


# Fix the seeds of both global RNGs (NumPy and the stdlib) for reproducibility
SEED = 2023
np.random.seed(SEED)
random.seed(SEED)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

In [None]:
!pip install xgboost
!pip install catboost
!pip install optuna

Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/util

In [20]:
# Read the four competition CSVs from the mounted Drive folder
data_path = "/content/drive/MyDrive/Digital Green Crop Yield Estimate Challenge/"
train, test, sample_submission, var_desc = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv', 'VariableDescription.csv')
)

# Display options: show full cell contents and never truncate rows
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [21]:
# Peek at the first five rows of the training data
train.head(n=5)

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
0,ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,...,machine,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600
1,ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-18,5,Manual_PuddledRandom,2022-06-20,...,hand,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600
2,ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,...,hand,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225
3,ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,...,hand,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468
4,ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,...,machine,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550


Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
0,ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,...,machine,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600
1,ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-18,5,Manual_PuddledRandom,2022-06-20,...,hand,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600
2,ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,...,hand,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225
3,ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,...,hand,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468
4,ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,...,machine,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550


In [22]:
# Peek at the first five rows of the competition test data
test.head(n=5)

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,MineralFertAppMethod.1,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre
0,ID_F9XXEXN2ADR2,Jamui,Khaira,20,13,TractorPlough,2022-07-24,4,Manual_PuddledLine,2022-07-03,...,Broadcasting,hand,2022-12-02,500.0,2022-12-28,hand,22,10,plowed_in_soil,0.272727
1,ID_SO3VW2X4QO93,Jamui,Khaira,25,25,TractorPlough,2022-07-24,5,Manual_PuddledRandom,2022-07-08,...,Broadcasting,hand,2022-11-12,150.0,2022-12-20,machine,28,10,plowed_in_soil,0.227273
2,ID_UKUQ7JM8E894,Nalanda,Rajgir,30,30,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-07,5,Manual_PuddledRandom,2022-06-24,...,Broadcasting,hand,2022-11-24,1200.0,2022-12-05,machine,28,10,plowed_in_soil,0.28125
3,ID_QUISMWEZR2H4,Vaishali,Mahua,15,10,WetTillagePuddling TractorPlough BullockPlough FourWheelTracRotavator,2022-07-04,5,Manual_PuddledRandom,2022-06-20,...,SoilApplied,hand,2022-10-15,300.0,2022-10-25,hand,30,10,plowed_in_soil,0.136364
4,ID_25JGI455VKCZ,Nalanda,Rajgir,30,10,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-21,3,Manual_PuddledRandom,2022-06-24,...,Broadcasting,machine,2022-11-20,,2022-11-20,machine,26,40,plowed_in_soil,0.3125


Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,MineralFertAppMethod.1,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre
0,ID_F9XXEXN2ADR2,Jamui,Khaira,20,13,TractorPlough,2022-07-24,4,Manual_PuddledLine,2022-07-03,...,Broadcasting,hand,2022-12-02,500.0,2022-12-28,hand,22,10,plowed_in_soil,0.272727
1,ID_SO3VW2X4QO93,Jamui,Khaira,25,25,TractorPlough,2022-07-24,5,Manual_PuddledRandom,2022-07-08,...,Broadcasting,hand,2022-11-12,150.0,2022-12-20,machine,28,10,plowed_in_soil,0.227273
2,ID_UKUQ7JM8E894,Nalanda,Rajgir,30,30,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-07,5,Manual_PuddledRandom,2022-06-24,...,Broadcasting,hand,2022-11-24,1200.0,2022-12-05,machine,28,10,plowed_in_soil,0.28125
3,ID_QUISMWEZR2H4,Vaishali,Mahua,15,10,WetTillagePuddling TractorPlough BullockPlough FourWheelTracRotavator,2022-07-04,5,Manual_PuddledRandom,2022-06-20,...,SoilApplied,hand,2022-10-15,300.0,2022-10-25,hand,30,10,plowed_in_soil,0.136364
4,ID_25JGI455VKCZ,Nalanda,Rajgir,30,10,WetTillagePuddling TractorPlough FourWheelTracRotavator,2022-07-21,3,Manual_PuddledRandom,2022-06-24,...,Broadcasting,machine,2022-11-20,,2022-11-20,machine,26,40,plowed_in_soil,0.3125


In [23]:
# Peek at the expected submission layout (ID, Yield)
sample_submission.head(n=5)

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,0
1,ID_SO3VW2X4QO93,0
2,ID_UKUQ7JM8E894,0
3,ID_QUISMWEZR2H4,0
4,ID_25JGI455VKCZ,0


Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,0
1,ID_SO3VW2X4QO93,0
2,ID_UKUQ7JM8E894,0
3,ID_QUISMWEZR2H4,0
4,ID_25JGI455VKCZ,0


In [24]:
# Model the target on the log1p scale; predictions are mapped back with np.expm1
# in the Predictions section.
# NOTE: 'Yield' is overwritten in place, so re-running this cell applies the log twice.
train['Yield'] = train['Yield'].pipe(np.log1p)

# Feature engineering and data splitting

In [25]:
from sklearn.preprocessing import PolynomialFeatures

# Absolute correlation of every numeric column with the (log-transformed) target.
# numeric_only=True is needed on pandas >= 2.0, where corr() no longer drops
# string/date columns silently and raises instead.
correlations = train.corr(numeric_only=True)['Yield'].abs()

# Keep the WEAKLY correlated features (|r| <= 0.1). 'Yield' itself (|r| = 1) and
# columns whose correlation is NaN fail the comparison and are therefore excluded.
selected_features = correlations[correlations <= 0.1].index.tolist()

# Selected columns of 'train', with missing values replaced by 0
selected_features_df_train = train[selected_features].fillna(0)

# Degree-2 expansion (linear terms, squares and pairwise products; no bias column).
# The transformer is fitted on 'train' only and re-used for 'test' below.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
selected_features_poly_train = poly.fit_transform(selected_features_df_train)

# Names of the generated columns. The degree-1 outputs carry the same name as their
# input column, so they get a "2" suffix to avoid clashing with the columns already
# present in 'train'; squares and products keep the names produced by scikit-learn.
original_feature_names = selected_features_df_train.columns
poly_feature_names = [
    name + '2' if name in original_feature_names else name
    for name in poly.get_feature_names_out(input_features=original_feature_names)
]

# Wrap the array in a DataFrame that shares the index of 'train' so the axis=1
# concat below lines rows up by label even if 'train' does not have a default index.
selected_features_poly_df_train = pd.DataFrame(
    selected_features_poly_train, columns=poly_feature_names, index=train.index
)

# Drop repeated column names (if any), keeping the first occurrence
selected_features_poly_df_train = selected_features_poly_df_train.loc[
    :, ~selected_features_poly_df_train.columns.duplicated()
]

# Append the generated columns to 'train'
train = pd.concat([train, selected_features_poly_df_train], axis=1)

# Apply the same (already fitted) expansion to 'test', which has no target column
selected_features_df_test = test[selected_features].fillna(0)
selected_features_poly_test = poly.transform(selected_features_df_test)

selected_features_poly_df_test = pd.DataFrame(
    selected_features_poly_test, columns=poly_feature_names, index=test.index
)
selected_features_poly_df_test = selected_features_poly_df_test.loc[
    :, ~selected_features_poly_df_test.columns.duplicated()
]

# Append the generated columns to 'test'
test = pd.concat([test, selected_features_poly_df_test], axis=1)

In [26]:
from sklearn.preprocessing import PolynomialFeatures

# Absolute correlation of every numeric column currently in 'train' with the target.
# numeric_only=True is needed on pandas >= 2.0, where corr() no longer drops
# string/date columns silently and raises instead.
correlations = train.corr(numeric_only=True)['Yield'].abs()

# Keep the features with |r| <= 0.2. 'Yield' itself (|r| = 1) and columns whose
# correlation is NaN fail the comparison and are therefore excluded.
selected_features = correlations[correlations <= 0.2].index.tolist()

# Selected columns of 'train', with missing values replaced by 0
selected_features_df_train = train[selected_features].fillna(0)

# Degree-2 expansion (linear terms, squares and pairwise products; no bias column).
# The transformer is fitted on 'train' only and re-used for 'test' below.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
selected_features_poly_train = poly.fit_transform(selected_features_df_train)

# Names of the generated columns. Any generated name that equals an input column
# name gets a "3" suffix so it does not clash with a column already in 'train';
# the remaining names are kept as produced by scikit-learn.
original_feature_names = selected_features_df_train.columns
poly_feature_names = [
    name + '3' if name in original_feature_names else name
    for name in poly.get_feature_names_out(input_features=original_feature_names)
]

# Wrap the array in a DataFrame that shares the index of 'train' so the axis=1
# concat below lines rows up by label even if 'train' does not have a default index.
selected_features_poly_df_train = pd.DataFrame(
    selected_features_poly_train, columns=poly_feature_names, index=train.index
)

# Drop repeated column names (if any), keeping the first occurrence
selected_features_poly_df_train = selected_features_poly_df_train.loc[
    :, ~selected_features_poly_df_train.columns.duplicated()
]

# Append the generated columns to 'train'
train = pd.concat([train, selected_features_poly_df_train], axis=1)

# Apply the same (already fitted) expansion to 'test', which has no target column
selected_features_df_test = test[selected_features].fillna(0)
selected_features_poly_test = poly.transform(selected_features_df_test)

selected_features_poly_df_test = pd.DataFrame(
    selected_features_poly_test, columns=poly_feature_names, index=test.index
)
selected_features_poly_df_test = selected_features_poly_df_test.loc[
    :, ~selected_features_poly_df_test.columns.duplicated()
]

# Append the generated columns to 'test'
test = pd.concat([test, selected_features_poly_df_test], axis=1)

In [27]:
# Feature matrix: every numeric column except the identifier and the target
X = train.drop(columns=['ID', 'Yield']).select_dtypes(include=np.number)
y = train['Yield']

# Hold out 20% of the rows for local evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1234
)


In [28]:
# from sklearn.preprocessing import StandardScaler

# # Standardize your data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


# Random Forest

In [12]:
# Baseline: random forest with default hyper-parameters
model = RandomForestRegressor(random_state = 1234)

# Fit on the training split (missing values are replaced with 0)
model.fit(X_train.fillna(0), y_train)

# Predict the hold-out split
preds = model.predict(X_test.fillna(0))

# Hold-out RMSE on the log1p scale. Computed as sqrt(MSE) because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
float(np.sqrt(mean_squared_error(y_test, preds)))

0.4049654605108804

# XGBoost

In [13]:
from xgboost import XGBRegressor

# XGBoost with default hyper-parameters
xgb = XGBRegressor(random_state = 1234)

# Fit on the training split (missing values are replaced with 0)
xgb.fit(X_train.fillna(0), y_train)

# Predict both splits so the train/hold-out gap is visible
train_preds = xgb.predict(X_train.fillna(0))
test_preds = xgb.predict(X_test.fillna(0))

# RMSE on the log1p scale. Computed as sqrt(MSE) because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'Train RMSE - {np.sqrt(mean_squared_error(y_train, train_preds))}')
print(f'Test RMSE - {np.sqrt(mean_squared_error(y_test, test_preds))}')

Train RMSE - 0.054368130686815765
Test RMSE - 0.3618441661330071


In [14]:
from xgboost import XGBRegressor

# XGBoost with hand-set hyper-parameters: shallower trees, column/row subsampling
# and L1/L2 regularisation
xgb_tuned = XGBRegressor(random_state = 1234,colsample_bytree = 0.6,learning_rate = 0.1,
                   max_depth=4, n_estimators = 400, reg_alpha =1.0,reg_lambda=1.0,subsample=0.9)

# Fit on the training split (missing values are replaced with 0)
xgb_tuned.fit(X_train.fillna(0), y_train)

# Predict both splits so the train/hold-out gap is visible
train_preds = xgb_tuned.predict(X_train.fillna(0))
test_preds = xgb_tuned.predict(X_test.fillna(0))

# RMSE on the log1p scale. Computed as sqrt(MSE) because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'Train RMSE - {np.sqrt(mean_squared_error(y_train, train_preds))}')
print(f'Test RMSE - {np.sqrt(mean_squared_error(y_test, test_preds))}')

Train RMSE - 0.13570118533703748
Test RMSE - 0.3390442967161887


In [15]:
from sklearn.ensemble import VotingRegressor

# Ensemble of the XGBoost and random-forest estimators.
# Note: this uses the default-parameter `xgb`, not `xgb_tuned`.
voting_regressor = VotingRegressor(estimators=[('xgb', xgb), ('rf', model)])

# Fit the ensemble on the training split (missing values are replaced with 0)
voting_regressor.fit(X_train.fillna(0), y_train)

# Predict both splits so the train/hold-out gap is visible
train_preds = voting_regressor.predict(X_train.fillna(0))
test_preds = voting_regressor.predict(X_test.fillna(0))

# RMSE on the log1p scale. Computed as sqrt(MSE) because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

print(f'Train RMSE - {train_rmse}')
print(f'Test RMSE - {test_rmse}')

Train RMSE - 0.08172518358034166
Test RMSE - 0.3707461149157848


In [16]:
# from xgboost import XGBRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error

# param_grid = {
#     'n_estimators': [100, 200, 300, 400],
#     'max_depth': [3, 4, 5, 6],
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'subsample': [0.8, 0.9, 1.0],  # Fraction of samples used for fitting trees
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],  # Fraction of features used for each tree
#     'reg_alpha': [0.0, 0.01, 0.1, 1.0],  # L1 regularization term
#     'reg_lambda': [0.0, 0.01, 0.1, 1.0],  # L2 regularization term
#     'min_child_weight': [1, 2, 3, 4],  # Minimum sum of instance weight (Hessian) needed in a child
#     'gamma': [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required to make a further partition on a leaf node
#     'scale_pos_weight': [1, 2, 3, 4],  # Controls the balance of positive and negative weights
# }


# # Create a GridSearchCV object
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# # Fit the GridSearchCV object to the data
# grid_search.fit(X_train.fillna(0), y_train)

# # Get the best hyperparameters and the best model
# best_params = grid_search.best_params_
# best_xgb = grid_search.best_estimator_

# # Train the best model on the full training data
# best_xgb.fit(X_train.fillna(0), y_train)

# # Calculate train RMSE
# train_preds = best_xgb.predict(X_train.fillna(0))
# train_rmse = mean_squared_error(y_train, train_preds, squared=False)

# # Make predictions on the test data
# test_preds = best_xgb.predict(X_test.fillna(0))

# # Measure model performance using RMSE for test data
# test_rmse = mean_squared_error(y_test, test_preds, squared=False)

# # Print the best hyperparameters and RMSE for both train and test data
# print("Best Hyperparameters:", best_params)
# print("Train RMSE with best model:", train_rmse)
# print("Test RMSE with best model:", test_rmse)

# CatBoost

In [17]:
from catboost import CatBoostRegressor

# CatBoost with default hyper-parameters; silent=True suppresses the training log
cat = CatBoostRegressor(random_state = 1234,silent=True)

# Fit on the training split (missing values are replaced with 0)
cat.fit(X_train.fillna(0), y_train)

# Predict the hold-out split
preds = cat.predict(X_test.fillna(0))

# Hold-out RMSE on the log1p scale. Computed as sqrt(MSE) because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
float(np.sqrt(mean_squared_error(y_test, preds)))

0.32102163278500756

# Predictions

In [18]:
# Score the Zindi test set with the CatBoost model, using exactly the columns the
# model was trained on, then undo the log1p transform of the target
test_df = test.loc[:, X.columns]
preds = np.expm1(cat.predict(test_df.fillna(0)))

# Create the submission file to be uploaded to Zindi for scoring
sub = pd.DataFrame({'ID': test['ID'], 'Yield': preds})
sub.to_csv(path_or_buf='BenchmarkSubmission.csv', index=False)

sub.head(n=5)

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,606.933358
1,ID_SO3VW2X4QO93,444.981892
2,ID_UKUQ7JM8E894,485.206549
3,ID_QUISMWEZR2H4,300.731296
4,ID_25JGI455VKCZ,560.360783


In [19]:
from google.colab import files

# Write the submission CSV to the runtime's working directory;
# the browser download below is left disabled.
sub.to_csv(path_or_buf='BenchmarkSubmission.csv', index=False)
# files.download('BenchmarkSubmission.csv')