In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score

import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv("../working_data/dataset.csv", index_col=0)

## Creating working data frame

In [3]:
crops = ['Wheat', 'Barley', 'Maize', 'Sugar beet', 'Potatoes', 'Rye', 'Rape', 'Apples', 'Grapes', 'Cabbages']
mask = df["Item"].str.contains("|".join(crops))
df = df[mask]
df.reset_index(drop=True, inplace=True)
crop_names = df["Item"].unique()

## Creating the metrics data frame

In [4]:
def calculate_metrics(df, crop_name, y_true, y_pred):
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    df.loc[crop_name] = [mae, mse, rmse, mape, r2, y_true.values[0][0], y_pred[0]]

In [5]:
quality_metrics = ['xgb_MAE', 'xgb_MSE', 'xgb_RMSE', "xgb_MAPE","xgb_R2", "Actual Yield", "Predicted Yield"]
error_df = pd.DataFrame(columns = quality_metrics, index = crop_names)
#display(error_df)

## XGBoost (eXtreme Gradient Boosting)

In [6]:
for crop in crop_names:
   crop_df = df[df['Item'] == crop]
   x = crop_df[[
      'Year', 
      'Change (C)', 'Annual Mean Precipitation',
      'Air Temp Mean', 'Frost Days', 'Hot Days', 
      'Ice Days', 'Summer Days','Sunshine Duration', 
      'Tropical Nights', 'Mean CO2 (ppm)', "Devil",	
      'Gustnado',	'Hail',	'Ice',	'Precip',	'Snow',	
      'Tornado',	'Wind',	'Total']]
   
   y = crop_df[['Yield (tonnes/ha)']]

   scaler = MinMaxScaler()
   regressor = xgb.XGBRegressor(n_estimators=200, objective="reg:squarederror", random_state=15)

   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
   x_train = scaler.fit_transform(x_train)
   x_test = scaler.transform(x_test)
   
   regressor.fit(x_train, y_train)

   #prediction of new value
   y_pred = regressor.predict(x_test)
   crop_c = crop
   calculate_metrics(error_df, crop_c , y_test, y_pred)

## Result

In [7]:
error_df

Unnamed: 0,xgb_MAE,xgb_MSE,xgb_RMSE,xgb_MAPE,xgb_R2,Actual Yield,Predicted Yield
Barley,0.516198,0.377799,0.614653,0.100414,-1.695563,5.4651,6.538125
Cabbages,2.763236,12.858989,3.585943,0.0553,-0.308046,55.8827,55.757137
Maize (corn),0.367697,0.154853,0.393513,0.049043,0.939602,9.4475,9.186249
Potatoes,2.706513,11.668918,3.41598,0.099745,-0.779689,32.9512,32.530441
Rape or colza seed,0.223571,0.082944,0.288,0.067718,0.799141,2.4255,2.648148
Rye,0.322685,0.197058,0.443912,0.07369,0.741192,5.7857,5.213073
Sugar beet,4.381952,24.318702,4.9314,0.075317,0.851003,56.3591,59.772633
Wheat,0.58477,0.364821,0.604004,0.082695,0.631607,6.2734,6.775066
Apples,4.547911,26.602158,5.157728,0.173707,-2.414847,24.1236,16.386459
Grapes,1.11286,2.379071,1.542424,0.089972,-7.574293,12.3493,14.530155
