In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv("../dataset.csv", index_col=0)
df

Unnamed: 0,Year,Item,Yield (tonnes/ha),Harvest (ha),Production (tonnes),Change (C),Annual Mean Precipitation,Air Temp Mean,Frost Days,Hot Days,...,Monthly Average,Devil,Gustnado,Hail,Ice,Precip,Snow,Tornado,Wind,Total
0,1980,Asparagus,2.2023,6381,14053.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
1,1980,Barley,4.3104,2970671,12804914.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
2,1980,"Broad beans and horse beans, green",10.4409,1016,10608.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
3,1980,Cabbages,29.6605,24663,731518.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
4,1980,Carrots and turnips,31.5252,13021,410490.0,-0.190,771.58,7.63,103.61,1.45,...,338.762500,0,0,6,0,8,1,27,12,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,2021,Sugar beet,81.7645,390700,31945400.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1593,2021,Sunflower seed,2.6031,38300,99700.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1594,2021,Tomatoes,254.4250,400,101770.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002
1595,2021,Triticale,5.8136,328300,1908600.0,1.304,772.58,9.16,89.36,4.53,...,416.450833,2,0,464,13,1535,186,40,3762,6002


## Creating working data frame

In [3]:
crops = ['Wheat', 'Barley', 'Maize', 'Sugar beet', 'Potatoes', 'Rye', 'Rape', 'Apples', 'Grapes', 'Cabbages']
mask = df["Item"].str.contains("|".join(crops))
df = df[mask]
df.reset_index(drop=True, inplace=True)
crop_names = df["Item"].unique()

## Creating the metrics data frame

In [4]:
## Calulcating metrics
def calculate_metrics(df, crop_names, y_test, y_pred):
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test,y_pred)
    df.loc[crop_names] = [mae, mse, rmse, r2]

In [5]:
quality_metrics = ['Random_forest_MAE', 'Random_forest_MSE', 'Random_forest_RMSE', "Random_Forest_R2"]
error_df = pd.DataFrame(columns = quality_metrics, index = crop_names)
#display(error_df)

## Exploring the data

In [6]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
display(missing_value_df)

Unnamed: 0,column_name,percent_missing
Year,Year,0.0
Item,Item,0.0
Yield (tonnes/ha),Yield (tonnes/ha),0.0
Harvest (ha),Harvest (ha),0.0
Production (tonnes),Production (tonnes),0.0
Change (C),Change (C),0.0
Annual Mean Precipitation,Annual Mean Precipitation,0.0
Air Temp Mean,Air Temp Mean,0.0
Frost Days,Frost Days,0.0
Hot Days,Hot Days,0.0


## Random Forest Regression

In [7]:
for crop in crop_names:
   crop_df = df[df['Item'] == crop]
   x = crop_df[[
      'Year', 'Harvest (ha)','Production (tonnes)', 
      'Change (C)', 'Annual Mean Precipitation',
      'Air Temp Mean', 'Frost Days', 'Hot Days', 
      'Ice Days', 'Summer Days','Sunshine Duration', 
      'Tropical Nights', 'Monthly Average', "Devil",	
      'Gustnado',	'Hail',	'Ice',	'Precip',	'Snow',	
      'Tornado',	'Wind',	'Total']]
   
   y = crop_df[['Yield (tonnes/ha)']]

   scaler = MinMaxScaler()
   regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
   x_train = scaler.fit_transform(x_train)
   x_test = scaler.transform(x_test)
   
   regressor.fit(x_train, y_train)


   #prediction of new value
   y_pred = regressor.predict(x_test)
   crop_c = crop
   calculate_metrics(error_df, crop_c , y_test, y_pred)


## Result

In [8]:
error_df

Unnamed: 0,Random_forest_MAE,Random_forest_MSE,Random_forest_RMSE,Random_Forest_R2
Barley,0.242443,0.083575,0.289093,0.915017
Cabbages,2.597408,9.648099,3.106139,0.880174
Maize (corn),0.403491,0.197693,0.444627,0.824808
Potatoes,3.126789,11.725467,3.424247,-0.206351
Rape or colza seed,0.435504,0.216726,0.465538,0.534829
Rye,0.23854,0.091152,0.301915,0.845104
Sugar beet,2.976268,14.061994,3.749932,0.904438
Wheat,0.182151,0.046643,0.215969,0.799264
Apples,3.857091,24.562376,4.956044,-0.059884
Grapes,1.139938,1.708444,1.307075,-0.439812
