## Crop Yield Prediction based on DecisionTreeRegression

#### Importing all required libraries and CSV Dataset

In [1]:
import numpy as np
import pandas as pd # to make dataframes
import matplotlib.pyplot as plt # to visualize data (using plots)
import seaborn as sns # also for visualization
from sklearn import model_selection
from sklearn.model_selection import train_test_split # splitting data into test-train sections
from sklearn.tree import DecisionTreeRegressor # the actual Decision Maker
from sklearn import metrics # to calculate Accuracy of the Model

df = pd.read_csv('main2.csv') # importing dataset to a dataframe
df.info()
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676425 entries, 0 to 676424
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   State             676425 non-null  object 
 1   District          676425 non-null  object 
 2   Crop              675458 non-null  object 
 3   Year              676425 non-null  object 
 4   Season            676425 non-null  object 
 5   Area              675458 non-null  float64
 6   Area Units        675458 non-null  object 
 7   Production        665684 non-null  float64
 8   Production Units  675458 non-null  object 
 9   Yield             675458 non-null  float64
dtypes: float64(3), object(7)
memory usage: 51.6+ MB


State                   35
District               709
Crop                    56
Year                    23
Season                   6
Area                 47390
Area Units               1
Production           67906
Production Units         3
Yield               193205
dtype: int64

#### Cleaning Data 

1. Removing Rows with any NaN entry.
2. Sorting data in Descending Order of Year for better understanding.
3. `Production, Production Units, Area, Area Units` colums were dropped.


In [3]:
df = df.dropna()
df = df.sort_values(ascending=False, by='Year') 

df = df.drop('Area Units', axis=1)
df = df.drop('Production', axis=1)
df = df.drop('Production Units', axis=1)
df = df.drop('Area', axis=1)

# df = df[df['Year'].str.split('-').str[0].astype(int) >= 2010]

#### More Cleaning and handling String Entries (using Categories)

- Season cleaned up to remove disparities. 
- CSV Saved to have a Light Weight Dataset for quicker processing. 
- Converting `Year, State, Season, District, Crop` to Categorical Variables for better analysis.
- Printing Arrays of Categorical Entries for reference 

In [4]:
df = df.replace({'Season':{'Autumn':'Kharif','Summer':'Zaid','Winter':'Rabi'}})

df.to_csv('yield_cleaned_2.csv')

df['Year'] = pd.Categorical(df['Year'], ordered = True)
df['State'] = pd.Categorical(df['State'], ordered = True)
df['District'] = pd.Categorical(df['District'], ordered = True)
df['Season'] = pd.Categorical(df['Season'], ordered = True)
df['Crop'] = pd.Categorical(df['Crop'], ordered = True)

print (df['Year'].cat.categories)
print (df['State'].cat.categories)
print (df['District'].cat.categories)
print (df['Season'].cat.categories)
print (df['Crop'].cat.categories)

df

Index(['1997-98', '1998-99', '1999-00', '2000-01', '2001-02', '2002-03',
       '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09',
       '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15',
       '2015-16', '2016-17', '2017-18', '2018-19', '2019-20'],
      dtype='object')
Index(['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh',
       'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
       'Dadra and Nagar Haveli', 'Daman and Diu', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
       'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype='object')
Index(['24 PARAGANAS NORTH', '24 PARAGANAS SOUTH', 'ADILABAD', 'AGAR MALWA',
       'AGRA', 'AHMADABAD', 'AHMEDNAGAR', 

Unnamed: 0,State,District,Crop,Year,Season,Yield
287384,Tamil Nadu,KARUR,Jowar,2019-20,Rabi,0.407208
621290,Madhya Pradesh,REWA,Banana,2019-20,Whole Year,35.300000
285525,Nagaland,DIMAPUR,Sesamum,2019-20,Kharif,0.617691
285524,Nagaland,ZUNHEBOTO,Rice,2019-20,Kharif,2.336540
621340,Madhya Pradesh,ANUPPUR,Coriander,2019-20,Whole Year,1.800008
...,...,...,...,...,...,...
659314,Odisha,SAMBALPUR,Urad,1997-98,Rabi,0.261665
659313,Odisha,SAMBALPUR,Urad,1997-98,Zaid,0.571429
659312,Odisha,SAMBALPUR,Urad,1997-98,Kharif,0.311966
333429,Uttar Pradesh,HARDOI,Small millets,1997-98,Kharif,0.777778


#### Converting Categorical Columns to Integer Codes

In [5]:
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

df.to_csv('yield_processing_1.csv')

df

Unnamed: 0,State,District,Crop,Year,Season,Yield
287384,29,325,21,22,1,0.407208
621290,18,544,3,22,2,35.300000
285525,23,173,44,22,0,0.617691
285524,23,708,41,22,0,2.336540
621340,18,29,10,22,2,1.800008
...,...,...,...,...,...,...
659314,24,561,53,0,1,0.261665
659313,24,561,53,0,3,0.571429
659312,24,561,53,0,0,0.311966
333429,32,244,45,0,0,0.777778


#### Finally Dividing the Data into Two parts and doing the ML part

In [7]:
X = df.drop(columns=['Yield'], axis=1)
Y = df['Yield']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state=42)
    
regr = DecisionTreeRegressor(random_state = 42) # Do not use fit_intercept = False if you have removed 1 column after dummy encoding
regr.fit(X_train, Y_train)

DecisionTreeRegressor(random_state=42)

#### Accuracy Check 
using R Squared Score, Mean Absolute Error, Mean Square Error

In [8]:
predicted = regr.predict(X_test)

kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
scoring = 'neg_mean_absolute_error'
results = model_selection.cross_val_score(regr, X, Y, cv=kfold, scoring=scoring)
print("Mean Absolute Error: %.3f (%.3f)" % (results.mean(), results.std()))

rval = np.sqrt(metrics.mean_squared_error(Y_test,predicted))
print('Root Mean Square Error :%.3f' % rval)

print('R^2 Score :%.3f' % metrics.r2_score(Y_test,predicted))

Mean Absolute Error: -1.587 (0.407)
Root Mean Square Error :150.045
R^2 Score :0.974
