In [1]:
import pandas as pd
import numpy as np
# Machine Learning Libraries
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
# warranings
import warnings
warnings.filterwarnings('ignore')
# display all rows and columns
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)

In [9]:
df = pd.read_csv('C:/Users/User/Desktop/ML PROJECT/yield_df.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [11]:
print(f"Total number of rows is {df.shape[0]} and total number of columns is {df.shape[1]}")

Total number of rows is 28242 and total number of columns is 8


In [14]:
print(df['Unnamed: 0'].nunique())
print(df['Unnamed: 0'].duplicated().sum())

28242
0


In [15]:
df.drop('Unnamed: 0', axis=1,inplace=True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28242 entries, 0 to 28241
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           28242 non-null  object 
 1   Item                           28242 non-null  object 
 2   Year                           28242 non-null  int64  
 3   hg/ha_yield                    28242 non-null  int64  
 4   average_rain_fall_mm_per_year  28242 non-null  float64
 5   pesticides_tonnes              28242 non-null  float64
 6   avg_temp                       28242 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.5+ MB


In [19]:
df.describe(include='all')

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
count,28242,28242,28242.0,28242.0,28242.0,28242.0,28242.0
unique,101,10,,,,,
top,India,Potatoes,,,,,
freq,4048,4276,,,,,
mean,,,2001.544296,77053.332094,1149.05598,37076.909344,20.542627
std,,,7.051905,84956.612897,709.81215,59958.784665,6.312051
min,,,1990.0,50.0,51.0,0.04,1.3
25%,,,1995.0,19919.25,593.0,1702.0,16.7025
50%,,,2001.0,38295.0,1083.0,17529.44,21.51
75%,,,2008.0,104676.75,1668.0,48687.88,26.0


In [20]:
df.rename(columns={'Area':'Country','Item':'Crop', 'hg/ha_yield':'hg_ha_yield' }, inplace=True)

In [23]:
col = ['Year', 'average_rain_fall_mm_per_year','pesticides_tonnes', 'avg_temp', 'Country', 'Crop', 'hg_ha_yield']
df = df[col]
X = df.drop(columns='hg_ha_yield',axis=1)
y = df['hg_ha_yield']

In [24]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, shuffle=True)

In [25]:
X_train.head()

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Country,Crop
5493,2005,1604.0,829.59,25.36,Cameroon,Sorghum
10969,1992,1083.0,70791.0,25.91,India,Soybeans
2001,1997,1292.0,484.59,25.81,Bahamas,Cassava
22157,1997,494.0,16936.0,23.76,Pakistan,Potatoes
311,2005,1010.0,40.0,24.41,Angola,Sweet potatoes


In [26]:
ohe = OneHotEncoder(drop='first')
sc = StandardScaler()
preprocesser = ColumnTransformer(
    transformers=[
        ('StandardScale', sc, [0, 1, 2, 3]),
        ('OneHotEncoder', ohe, [4,5]),
    ],
    remainder='passthrough'
)

In [27]:
X_train = preprocesser.fit_transform(X_train)
X_test = preprocesser.fit_transform(X_test)

In [30]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge' : Ridge(),
    'DecisionTreeRegressor' :DecisionTreeRegressor()
}
for name, model in models.items():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(f"{name} : mae : {mean_absolute_error(y_test,y_pred)} score : {r2_score(y_test,y_pred)}")

LinearRegression : mae : 29582.642594872577 score : 0.7551898506730755
Lasso : mae : 29567.231138377196 score : 0.7551536231326947
Ridge : mae : 29530.1108430509 score : 0.7552700451351657
DecisionTreeRegressor : mae : 6433.794476898566 score : 0.9543355133344664


In [32]:
DTR = DecisionTreeRegressor()
DTR.fit(X_train,y_train)
y_pred = DTR.predict(X_test)

In [34]:
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Country, Crop):
    features = np.array([[Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Country, Crop]], dtype=object)
    transformed_features = preprocesser.transform(features)
    predicted_yield = DTR.predict(transformed_features).reshape(1, -1)

    return predicted_yield[0]

Year = 1990
average_rain_fall_mm_per_year =1485.0
pesticides_tonnes = 121.00
avg_temp = 16.37
Area = 'Albania'
Item = 'Maize'
result = prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item)
result

array([24876.])

In [42]:
import joblib

joblib.dump(DTR, 'cropYeild_model.pkl')

['cropYeild_model.pkl']

In [43]:
DTR_loaded = joblib.load('cropYeild_model.pkl')
result = DTR_loaded.predict(transformed_features)

In [44]:
Year = 2000
average_rain_fall_mm_per_year = 1000.0
pesticides_tonnes = 50.0
avg_temp = 24.0
Country = 'India'
Crop = 'Wheat'

features = np.array([[Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Country, Crop]], dtype=object)
transformed_features = preprocesser.transform(features)

result = DTR_loaded.predict(transformed_features)

print(f"Predicted crop yield for {Crop} in {Country} ({Year}): {result[0]}")

Predicted crop yield for Wheat in India (2000): 27785.0


In [45]:
Year = 2010
average_rain_fall_mm_per_year = 1500.0
pesticides_tonnes = 90.0
avg_temp = 18.5
Country = 'Brazil'
Crop = 'Maize'

features = np.array([[Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Country, Crop]], dtype=object)
transformed_features = preprocesser.transform(features)

result = DTR_loaded.predict(transformed_features)

print(f"Predicted crop yield for {Crop} in {Country} ({Year}): {result[0]}")

Predicted crop yield for Maize in Brazil (2010): 16569.0
