<a href="https://colab.research.google.com/github/harshbansal7/profarm-dvh/blob/main/Ensemble_Methods_for_Yield_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Crop Yield Prediction with Decision Tree Regression and Ensemble Methods

#### Importing all required libraries and CSV Dataset

In [1]:
import numpy as np
import pandas as pd # to make dataframes
import matplotlib.pyplot as plt # to visualize data (using plots)
import seaborn as sns # also for visualization
from sklearn import model_selection
from sklearn.model_selection import train_test_split # splitting data into test-train sections
from sklearn.tree import DecisionTreeRegressor # the actual Decision Maker
from sklearn import metrics # to calculate Accuracy of the Model

# Load the dataset CSV directly from the project's GitHub repository.
DATASET_URL = 'https://raw.githubusercontent.com/harshbansal7/profarm-dvh/main/Datasets%20and%20Notebooks/main2.csv'
df = pd.read_csv(filepath_or_buffer=DATASET_URL)

# First the dtype / non-null summary, then the number of distinct values
# per column (the latter is the value displayed by the cell).
df.info()
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676425 entries, 0 to 676424
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   State             676425 non-null  object 
 1   District          676425 non-null  object 
 2   Crop              675458 non-null  object 
 3   Year              676425 non-null  object 
 4   Season            676425 non-null  object 
 5   Area              675458 non-null  float64
 6   Area Units        675458 non-null  object 
 7   Production        665684 non-null  float64
 8   Production Units  675458 non-null  object 
 9   Yield             675458 non-null  float64
dtypes: float64(3), object(7)
memory usage: 51.6+ MB


State                   35
District               709
Crop                    56
Year                    23
Season                   6
Area                 47390
Area Units               1
Production           67906
Production Units         3
Yield               193205
dtype: int64

#### Cleaning Data 

1. Removing Rows with any NaN entry.
2. Sorting data in Descending Order of Year for better understanding.
3. The `Production`, `Production Units`, `Area` and `Area Units` columns are dropped.


In [2]:
# Keep only complete rows, then order them by the 'Year' label, descending.
df = df.dropna().sort_values(by='Year', ascending=False)

# Remove the area / production measurements and their unit labels in one call.
df = df.drop(columns=['Area Units', 'Production', 'Production Units', 'Area'])

# Optional filter (disabled): keep only years whose first part is 2010 or later.
# df = df[df['Year'].str.split('-').str[0].astype(int) >= 2010]

#### More Cleaning and handling String Entries (using Categories)

- Season cleaned up to remove disparities. 
- The cleaned data is saved to CSV so that a lightweight dataset is available for quicker processing.
- Converting `Year, State, Season, District, Crop` to Categorical Variables for better analysis.
- Printing Arrays of Categorical Entries for reference 

In [3]:
# Fold the alternative season labels into their counterpart names.
season_aliases = {'Autumn': 'Kharif', 'Summer': 'Zaid', 'Winter': 'Rabi'}
df = df.replace({'Season': season_aliases})

# Snapshot of the cleaned table while the labels are still human-readable.
df.to_csv('yield_cleaned_2.csv')

# Convert each descriptive column to an ordered categorical and print its
# categories, so the integer codes derived from them can be mapped back to labels.
for label_column in ['Year', 'State', 'District', 'Season', 'Crop']:
    df[label_column] = pd.Categorical(df[label_column], ordered=True)
    print(df[label_column].cat.categories)

df

Index(['1997-98', '1998-99', '1999-00', '2000-01', '2001-02', '2002-03',
       '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09',
       '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15',
       '2015-16', '2016-17', '2017-18', '2018-19', '2019-20'],
      dtype='object')
Index(['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh',
       'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
       'Dadra and Nagar Haveli', 'Daman and Diu', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
       'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype='object')
Index(['24 PARAGANAS NORTH', '24 PARAGANAS SOUTH', 'ADILABAD', 'AGAR MALWA',
       'AGRA', 'AHMADABAD', 'AHMEDNAGAR', 

Unnamed: 0,State,District,Crop,Year,Season,Yield
287384,Tamil Nadu,KARUR,Jowar,2019-20,Rabi,0.407208
621290,Madhya Pradesh,REWA,Banana,2019-20,Whole Year,35.300000
285525,Nagaland,DIMAPUR,Sesamum,2019-20,Kharif,0.617691
285524,Nagaland,ZUNHEBOTO,Rice,2019-20,Kharif,2.336540
621340,Madhya Pradesh,ANUPPUR,Coriander,2019-20,Whole Year,1.800008
...,...,...,...,...,...,...
659314,Odisha,SAMBALPUR,Urad,1997-98,Rabi,0.261665
659313,Odisha,SAMBALPUR,Urad,1997-98,Zaid,0.571429
659312,Odisha,SAMBALPUR,Urad,1997-98,Kharif,0.311966
333429,Uttar Pradesh,HARDOI,Small millets,1997-98,Kharif,0.777778


#### Converting Categorical Columns to Integer Codes

In [4]:
# Replace every categorical column by the integer code of its category.
cat_columns = df.select_dtypes(include=['category']).columns
for cat_column in cat_columns:
    df[cat_column] = df[cat_column].cat.codes

# Snapshot of the fully numeric table.
df.to_csv('yield_processing_1.csv')

df

Unnamed: 0,State,District,Crop,Year,Season,Yield
287384,29,325,21,22,1,0.407208
621290,18,544,3,22,2,35.300000
285525,23,173,44,22,0,0.617691
285524,23,708,41,22,0,2.336540
621340,18,29,10,22,2,1.800008
...,...,...,...,...,...,...
659314,24,561,53,0,1,0.261665
659313,24,561,53,0,3,0.571429
659312,24,561,53,0,0,0.311966
333429,32,244,45,0,0,0.777778


#### Splitting the Data into Training and Test Sets and Training the Models

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def scores_print(name_of_regr, regr, x_test, y_test):
  """Print MAE, MSE and R^2 of a fitted regressor on the given data.

  Parameters:
    name_of_regr: label placed at the start of the printed line.
    regr: fitted estimator exposing ``predict``.
    x_test: features passed to ``regr.predict``.
    y_test: true target values the predictions are compared against.

  Returns:
    None; the scores are only printed.
  """
  predictions = regr.predict(x_test)
  summary = " MAE : {:.3f} | MSE : {:.3f} | R^2 Score : {:.5f}".format(
      mean_absolute_error(y_test, predictions),
      mean_squared_error(y_test, predictions),
      r2_score(y_test, predictions),
  )
  print(name_of_regr + summary)

In [6]:
# Everything except the target column is a feature; 'Yield' is the target.
X = df.drop(columns='Yield')
Y = df['Yield']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# Baseline model: a single decision tree with no depth limit.
regr = DecisionTreeRegressor(random_state=42)
regr.fit(X=X_train, y=Y_train)

DecisionTreeRegressor(random_state=42)

In [7]:
# Held-out scores of the baseline decision tree.
scores_print(name_of_regr='Decision Tree Regressor', regr=regr, x_test=X_test, y_test=Y_test)

Decision Tree Regressor MAE : 4.001 | MSE : 22513.754 | R^2 Score : 0.974


In [8]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 10 members using the library's default base learner.
bagging_settings = dict(n_estimators=10, random_state=42)
regr2 = BaggingRegressor(**bagging_settings)
regr2.fit(X=X_train, y=Y_train)

BaggingRegressor(random_state=42)

In [9]:
# Held-out scores of the default bagging ensemble.
scores_print(name_of_regr='Bagging Regression DT', regr=regr2, x_test=X_test, y_test=Y_test)

Bagging Regression DT MAE : 7.859 | MSE : 26070.935 | R^2 Score : 0.970


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor

# Candidate base learners; each one is fitted and scored on its own below,
# and r2, r3 and r4 are also combined in a voting ensemble.
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=42)
r3 = KNeighborsRegressor()
# Seeded like the other stochastic models in this notebook: a decision tree
# permutes the features at every split, so without a fixed random_state the
# fitted tree (and any ensemble cloned from it) can differ between runs.
r4 = DecisionTreeRegressor(max_depth=10, random_state=42)

In [11]:
# Voting ensemble over the random forest, the k-NN model and the depth-10 tree
# (the linear model r1 is not one of its members).
voting_members = [('rf', r2), ('r3', r3), ('dtr', r4)]
er = VotingRegressor(estimators=voting_members)
er.fit(X=X_train, y=Y_train)

VotingRegressor(estimators=[('rf',
                             RandomForestRegressor(n_estimators=10,
                                                   random_state=42)),
                            ('r3', KNeighborsRegressor()),
                            ('dtr', DecisionTreeRegressor(max_depth=10))])

In [12]:
# Fit the linear model on the training split and report its held-out scores.
r1.fit(X=X_train, y=Y_train)
scores_print(name_of_regr='LinearRegression', regr=r1, x_test=X_test, y_test=Y_test)

LinearRegression MAE : 199.172 | MSE : 837703.211 | R^2 Score : 0.024


In [13]:
# Fit the random forest on the training split and report its held-out scores.
r2.fit(X=X_train, y=Y_train)
scores_print(name_of_regr='RFR', regr=r2, x_test=X_test, y_test=Y_test)

RFR MAE : 7.882 | MSE : 26126.525 | R^2 Score : 0.970


In [14]:
# Fit the k-nearest-neighbours model on the training split and report its held-out scores.
r3.fit(X=X_train, y=Y_train)
scores_print(name_of_regr='KNN', regr=r3, x_test=X_test, y_test=Y_test)

KNN MAE : 35.138 | MSE : 158864.054 | R^2 Score : 0.815


In [15]:
# Fit the depth-limited tree on the training split and report its held-out scores.
r4.fit(X=X_train, y=Y_train)
scores_print(name_of_regr='DTR', regr=r4, x_test=X_test, y_test=Y_test)

DTR MAE : 18.379 | MSE : 77273.048 | R^2 Score : 0.910


In [16]:
# Held-out scores of the voting ensemble.
scores_print(name_of_regr='Voting Regressor', regr=er, x_test=X_test, y_test=Y_test)

Voting Regressor MAE : 17.996 | MSE : 49200.103 | R^2 Score : 0.943


In [17]:
# Depth reached by the baseline tree `regr`, which was created without a max_depth limit.
regr.get_depth()

48

In [18]:
# Depth-limited tree. It is seeded like the other stochastic models in this
# notebook: a decision tree permutes the features at every split, so without
# a fixed random_state the fitted tree can differ between runs.
r5 = DecisionTreeRegressor(max_depth=10, random_state=42)
r5.fit(X_train, Y_train)

DecisionTreeRegressor(max_depth=10)

In [19]:
# Held-out scores of the depth-10 tree.
scores_print(name_of_regr='DT D =10', regr=r5, x_test=X_test, y_test=Y_test)

DT D =10 MAE : 18.379 | MSE : 77273.048 | R^2 Score : 0.910


In [36]:
from sklearn.ensemble import AdaBoostRegressor
# Depth-limited tree used as AdaBoost's base learner. It is seeded so that
# every later use of `dtr6` (e.g. as a stacking final estimator) is reproducible.
dtr6 = DecisionTreeRegressor(max_depth=10, random_state=42)

# The base learner is passed positionally on purpose: scikit-learn renamed the
# `base_estimator` keyword to `estimator` (deprecated in 1.2, removed in 1.4),
# but it has always been the first positional parameter, so this call works
# on both old and new releases.
adaregr = AdaBoostRegressor(dtr6, random_state=42, n_estimators=10)
adaregr.fit(X_train, Y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                  n_estimators=10, random_state=42)

In [37]:
# Held-out scores of the AdaBoost ensemble.
scores_print(name_of_regr='AdaBoostRegressor', regr=adaregr, x_test=X_test, y_test=Y_test)

AdaBoostRegressor MAE : 17.432 | MSE : 58182.921 | R^2 Score : 0.932


In [33]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 10 copies of the depth-10 tree `r4`.
# The base learner is passed positionally on purpose: scikit-learn renamed the
# `base_estimator` keyword to `estimator` (deprecated in 1.2, removed in 1.4),
# but it has always been the first positional parameter, so this call works
# on both old and new releases.
bag_regr = BaggingRegressor(r4, n_estimators=10, random_state=42)
bag_regr.fit(X_train, Y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                 random_state=42)

In [34]:
# Held-out scores of the bagging ensemble built on the depth-10 tree.
scores_print(name_of_regr='Bagging Regressor with DT 10 Depth', regr=bag_regr, x_test=X_test, y_test=Y_test)

Bagging Regressor with DT 10 Depth MAE : 17.367 | MSE : 68348.659 | R^2 Score : 0.920


In [39]:
from sklearn.ensemble import StackingRegressor

# First-level models of the stack: the depth-10 tree plus the bagging and
# AdaBoost ensembles; `dtr6` is the model that combines their predictions.
estimators = [('dtr', r5), ('bag', bag_regr), ('adab', adaregr)]
sregr = StackingRegressor(estimators=estimators, final_estimator=dtr6)
sregr.fit(X=X_train, y=Y_train)


StackingRegressor(estimators=[('dtr', DecisionTreeRegressor(max_depth=10)),
                              ('bag',
                               BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                                                random_state=42)),
                              ('adab',
                               AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                                                 n_estimators=10,
                                                 random_state=42))],
                  final_estimator=DecisionTreeRegressor(max_depth=10))

In [40]:
# Held-out scores of the three-member stacking ensemble.
scores_print(name_of_regr='Stacking Regressor', regr=sregr, x_test=X_test, y_test=Y_test)

Stacking Regressor MAE : 18.004 | MSE : 71071.810 | R^2 Score : 0.917


In [47]:
# Second stack: only the bagging and AdaBoost ensembles as first-level models
# (the standalone depth-10 tree is left out).
estimators2 = [
    ('bag', bag_regr),
    ('adab', adaregr),
]
# Shallower final estimator. It is seeded like the other stochastic models in
# this notebook, because the stack fits a clone of it and an unseeded tree can
# differ between runs.
dtr7 = DecisionTreeRegressor(max_depth=5, random_state=42)
sregr2 = StackingRegressor(estimators2, final_estimator=dtr7)
sregr2.fit(X_train, Y_train)

StackingRegressor(estimators=[('bag',
                               BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                                                random_state=42)),
                              ('adab',
                               AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                                                 n_estimators=10,
                                                 random_state=42))],
                  final_estimator=DecisionTreeRegressor(max_depth=3))

In [48]:
# Score the second stack on the training split first, then on the held-out
# split, so the two lines can be compared for signs of over-fitting.
for split_label, split_features, split_targets in (
    ('Stacking Regressor without DT Training', X_train, Y_train),
    ('Stacking Regressor without DT', X_test, Y_test),
):
    scores_print(split_label, sregr2, split_features, split_targets)

Stacking Regressor without DT Training MAE : 21.601 | MSE : 57970.013 | R^2 Score : 0.934
Stacking Regressor without DT MAE : 23.125 | MSE : 68631.145 | R^2 Score : 0.920
