In [11]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error



import warnings
warnings.simplefilter('ignore')

In [12]:
df = pd.read_csv("../dataset.csv", index_col=0)
df

Unnamed: 0,Year,Item,Yield (tonnes/ha),Harvest (ha),Production (tonnes),Change (C),Annual Mean Precipitation,Air Temp Mean,Frost Days,Hot Days,...,Monthly Average,Devil,Gustnado,Hail,Ice,Precip,Snow,Tornado,Wind,Total
0,1980,Asparagus,2.2023,6381,14053.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
1,1980,Barley,4.3104,2970671,12804914.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
2,1980,"Broad beans and horse beans, green",10.4409,1016,10608.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
3,1980,Cabbages,29.6605,24663,731518.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
4,1980,Carrots and turnips,31.5252,13021,410490.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,2021,Sugar beet,81.7645,390700,31945400.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1593,2021,Sunflower seed,2.6031,38300,99700.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1594,2021,Tomatoes,254.4250,400,101770.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1595,2021,Triticale,5.8136,328300,1908600.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002


## Creating working data frame

In [13]:
crops = ['Wheat', 'Barley', 'Maize', 'Sugar beet', 'Potatoes', 'Rye', 'Rape', 'Apples', 'Grapes', 'Cabbages']
mask = df["Item"].str.contains("|".join(crops))
df = df[mask]
df.reset_index(drop=True, inplace=True)
crop_names = df["Item"].unique()

In [14]:
df

Unnamed: 0,Year,Item,Yield (tonnes/ha),Harvest (ha),Production (tonnes),Change (C),Annual Mean Precipitation,Air Temp Mean,Frost Days,Hot Days,...,Monthly Average,Devil,Gustnado,Hail,Ice,Precip,Snow,Tornado,Wind,Total
0,1980,Barley,4.3104,2970671,12804914.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
1,1980,Cabbages,29.6605,24663,731518.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
2,1980,Maize (corn),5.6435,119749,675807.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
3,1980,Potatoes,20.9328,819108,17146192.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
4,1980,Rape or colza seed,2.6132,262441,685801.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,2021,Potatoes,43.7944,258300,11312100.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
379,2021,Rape or colza seed,3.5014,1000900,3504600.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
380,2021,Rye,5.2704,631000,3325600.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
381,2021,Sugar beet,81.7645,390700,31945400.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002


## Creating the metrics data frame

In [15]:
## Calulcating metrics
def calculate_metrics(df, crop_names, y_test, y_pred):
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test,y_pred)
    df.loc[crop_names] = [mae, mse, rmse, r2]

In [16]:
quality_metrics = ['MLR_MAE', 'MLR_MSE', 'MLR_RMSE', "MLR_R2"]
error_df = pd.DataFrame(columns = quality_metrics, index = crop_names)
#display(error_df)

## Exploring the data

In [17]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
display(missing_value_df)

Unnamed: 0,column_name,percent_missing
Year,Year,0.0
Item,Item,0.0
Yield (tonnes/ha),Yield (tonnes/ha),0.0
Harvest (ha),Harvest (ha),0.0
Production (tonnes),Production (tonnes),0.0
Change (C),Change (C),0.0
Annual Mean Precipitation,Annual Mean Precipitation,0.0
Air Temp Mean,Air Temp Mean,0.0
Frost Days,Frost Days,0.0
Hot Days,Hot Days,0.0


## Multiple Linear Regression

In [18]:
df[df["Item"] == "Apples"].shape

(30, 24)

In [19]:
for crop in crop_names:
   crop_df = df[df['Item'] == crop]
   x = crop_df[[
      'Year', 'Harvest (ha)','Production (tonnes)', 
      'Change (C)', 'Annual Mean Precipitation',
      'Air Temp Mean', 'Frost Days', 'Hot Days', 
      'Ice Days', 'Summer Days','Sunshine Duration', 
      'Tropical Nights', 'Monthly Average', "Devil",	
      'Gustnado',	'Hail',	'Ice',	'Precip',	'Snow',	
      'Tornado',	'Wind',	'Total']]
   y = crop_df[['Yield (tonnes/ha)']]
   
   scaler = MinMaxScaler()
   regressor = LinearRegression()

   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
   x_train = scaler.fit_transform(x_train)
   x_test = scaler.transform(x_test)
   
   regressor.fit(x_train, y_train)

   #prediction of new value
   y_pred = regressor.predict(x_test)
   crop_c = crop
   calculate_metrics(error_df, crop_c , y_test, y_pred)


## Results

In [20]:
error_df

Unnamed: 0,MLR_MAE,MLR_MSE,MLR_RMSE,MLR_R2
Barley,0.194854,0.079483,0.281928,0.928597
Cabbages,1.46722,3.490144,1.868193,0.862586
Maize (corn),0.283686,0.136367,0.369279,0.934619
Potatoes,2.247503,8.796367,2.965867,0.885421
Rape or colza seed,0.152459,0.027216,0.164973,0.880005
Rye,0.249007,0.086387,0.293917,0.893509
Sugar beet,0.858772,1.242931,1.114868,-0.532309
Wheat,0.062329,0.004897,0.069978,0.97639
Apples,0.929904,1.405756,1.185646,0.934915
Grapes,0.073647,0.009141,0.095608,0.904202
