

> Al-Powered Crop Yield Prediction and Optimization





# Importing Libraries


In [1]:
!pip install pandas numpy scikit-learn xgboost



In [2]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pickle
import warnings
warnings.filterwarnings('ignore')

# Data Collection

In [3]:
df = pd.read_csv('crop_yield.csv')

In [4]:
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [5]:
df.shape

(19689, 10)

# Data Analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [7]:
df.describe(include='all')

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689,19689.0,19689,19689,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
unique,55,,6,30,,,,,,
top,Rice,,Kharif,Karnataka,,,,,,
freq,1197,,8232,1432,,,,,,
mean,,2009.127584,,,179926.6,16435940.0,1437.755177,24103310.0,48848.35,79.954009
std,,6.498099,,,732828.7,263056800.0,816.909589,94946000.0,213287.4,878.306193
min,,1997.0,,,0.5,0.0,301.3,54.17,0.09,0.0
25%,,2004.0,,,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,,2010.0,,,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,,2015.0,,,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889


In [8]:
df.isnull().sum()

Crop               0
Crop_Year          0
Season             0
State              0
Area               0
Production         0
Annual_Rainfall    0
Fertilizer         0
Pesticide          0
Yield              0
dtype: int64

No Null Values

In [9]:
df.duplicated().sum()

np.int64(0)

No Duplicate Values

In [10]:
X = df[['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
        'Annual_Rainfall', 'Fertilizer', 'Pesticide']]
y = df['Yield']

categorical_features = ['Crop', 'Season', 'State']
numeric_features = ['Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']


# Categorical Handling and Scaling //Preprocessing

In [11]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

In [12]:
X = df[categorical_features + numeric_features]

X_encoded = preprocessor.fit_transform(X)

feature_names = preprocessor.get_feature_names_out()

encoded_df = pd.DataFrame(X_encoded, columns=feature_names)

encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 97 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   cat__Crop_Arecanut               19689 non-null  float64
 1   cat__Crop_Arhar/Tur              19689 non-null  float64
 2   cat__Crop_Bajra                  19689 non-null  float64
 3   cat__Crop_Banana                 19689 non-null  float64
 4   cat__Crop_Barley                 19689 non-null  float64
 5   cat__Crop_Black pepper           19689 non-null  float64
 6   cat__Crop_Cardamom               19689 non-null  float64
 7   cat__Crop_Cashewnut              19689 non-null  float64
 8   cat__Crop_Castor seed            19689 non-null  float64
 9   cat__Crop_Coconut                19689 non-null  float64
 10  cat__Crop_Coriander              19689 non-null  float64
 11  cat__Crop_Cotton(lint)           19689 non-null  float64
 12  cat__Crop_Cowpea(L

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Building

In [14]:
model_rfr = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [15]:
model_xgb = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])

In [16]:
model_knn = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', KNeighborsRegressor())
])

In [17]:
model_ada = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', AdaBoostRegressor(random_state=42))
])

In [18]:
model_gbr = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

In [19]:
model_dtr = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [20]:
model_mlp = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42))
])

# RFR

In [21]:
model_rfr.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
y_pred_rfr = model_rfr.predict(X_test)

In [23]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_rfr))
print('R2 score:', r2_score(y_test, y_pred_rfr))
print("cross_val_score:", cross_val_score(model_rfr, X_test, y_test, cv=4).mean())

RMSE: 98.18022510596349
R2 score: 0.9879694495693093
cross_val_score: 0.8105052873574623


# XGB

In [24]:
model_xgb.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
y_pred_xgb = model_xgb.predict(X_test)

In [26]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_xgb))
print('R2 score:', r2_score(y_test, y_pred_xgb))
print("cross_val_score:", cross_val_score(model_xgb, X_test, y_test, cv=4).mean())

RMSE: 245.66052314614342
R2 score: 0.9246803669218919
cross_val_score: 0.9040915283250951


# KNN

In [27]:
model_knn.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [28]:
y_pred_knn = model_knn.predict(X_test)

In [29]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_knn))
print('R2 score:', r2_score(y_test, y_pred_knn))
print("cross_val_score:", cross_val_score(model_knn, X_test, y_test, cv=4).mean())

RMSE: 247.02778015473746
R2 score: 0.9238396305016969
cross_val_score: 0.6599682833146745


# ADA

In [30]:
model_ada.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,loss,'linear'
,random_state,42


In [31]:
y_pred_ada = model_ada.predict(X_test)

In [32]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_ada))
print('R2 score:', r2_score(y_test, y_pred_ada))
print("cross_val_score:", cross_val_score(model_ada, X_test, y_test, cv=4).mean())

RMSE: 268.6706145776475
R2 score: 0.909909750751352
cross_val_score: 0.8016471592765568


# GBR

In [33]:
model_gbr.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [34]:
y_pred_gbr = model_gbr.predict(X_test)

In [35]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_gbr))
print('R2 score:', r2_score(y_test, y_pred_gbr))
print("cross_val_score:", cross_val_score(model_gbr, X_test, y_test, cv=4).mean())

RMSE: 107.95738977092307
R2 score: 0.9854540462643023
cross_val_score: 0.8584037627003777


# DTR

In [36]:
model_dtr.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [37]:
y_pred_dtr = model_dtr.predict(X_test)

In [38]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_dtr))
print('R2 score:', r2_score(y_test, y_pred_dtr))
print("cross_val_score:", cross_val_score(model_dtr, X_test, y_test, cv=4).mean())

RMSE: 199.60950958959853
R2 score: 0.9502721244116253
cross_val_score: 0.872210697045118


# MLP

In [39]:
model_mlp.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500


In [40]:
y_pred_mlp = model_mlp.predict(X_test)

In [41]:
print('RMSE:', root_mean_squared_error(y_test, y_pred_mlp))
print('R2 score:', r2_score(y_test, y_pred_mlp))
print("cross_val_score:", cross_val_score(model_mlp, X_test, y_test, cv=4).mean())

RMSE: 178.89252954286786
R2 score: 0.9600587308147563
cross_val_score: 0.7139627426701635


# Model testing

In our model evaluation results, RFR model shows the best result. So we choose RFR

In [42]:
sample = pd.DataFrame({
    'Crop': ['Wheat'],
    'Crop_Year': [2023],
    'Season': ['Rabi'],
    'State': ['Punjab'],
    'Area': [250],
    'Production': [5000],
    'Annual_Rainfall': [800],
    'Fertilizer': [50],
    'Pesticide': [10]
})


In [43]:

predicted_yield = model_rfr.predict(sample)

print('Predicted Yield:', predicted_yield[0])

Predicted Yield: 7.30157731715


# Model Deployment

In [44]:
with open('ml_model.pk1', 'wb') as file:
  pickle.dump(model_rfr, file)
print("Model saved to file")

Model saved to file
