In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from statsmodels.tsa.stattools import adfuller
from sklearn.impute import KNNImputer

import warnings
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("../working_data/dataset.csv", index_col=0)

In [3]:
df = df[df["Category"] != "Industrial Crops"]

## Creating working data frame

In [4]:
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
df.set_index("Year", inplace=True)
# getting list of names
categories = df["Category"].unique()

## Creating the metrics data frame

In [5]:
def calculate_metrics(df, crop_name, y_true, y_pred):
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    y_test_avg = pd.DataFrame(y_true).mean()
    y_pred_avg = y_pred.mean()

    df.loc[crop_name] = [mae, mse, rmse, r2, y_test_avg[0], y_pred_avg]

In [6]:
quality_metrics = ['RF_MAE', 'RF_MSE', 'RF_RMSE', "RF_R2", "Actual Yield", "Predicted Yield"]
error_df = pd.DataFrame(columns = quality_metrics, index = categories)
#display(error_df)

## Support Vector Regression

### Setting up dataframe

#### - Solving for stationarity

In [7]:
imputer = KNNImputer(n_neighbors=4)
adf_df = pd.DataFrame(columns=df.columns)

for cate in categories: 
    # Selects category
    crop_df = df[df["Category"] == cate]
    num_cols = crop_df.select_dtypes(include=[np.number])
    # Selects column for ADF Test
    for col in num_cols:
        if crop_df[col].count() < 5:
            continue
        adf_result = adfuller(crop_df[col])
        while adf_result[1] > 0.05:
            crop_df[col] = crop_df[col].diff()
            crop_df[num_cols.columns] = imputer.fit_transform(crop_df[num_cols.columns])
            adf_result = adfuller(crop_df[col])

    adf_df = pd.concat([adf_df, crop_df])

adf_df

Unnamed: 0,Item,Yield (tonnes/ha),Change (C),Annual Mean Precipitation,Air Temp Mean,Frost Days,Hot Days,Ice Days,Summer Days,Sunshine Duration,Tropical Nights,Mean CO2 (ppm),Devil,Gustnado,Hail,Ice,Precip,Snow,Tornado,Wind,Total,Category
1980-01-01,Asparagus,2.2023,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes
1980-01-01,"Beans, dry",2.4292,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes
1980-01-01,"Broad beans and horse beans, dry",2.4565,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes
1980-01-01,"Broad beans and horse beans, green",10.4409,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes
1980-01-01,Cabbages,29.6605,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-01,Pears,17.3645,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts
2021-01-01,Plums and sloes,9.1670,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts
2021-01-01,Raspberries,6.8824,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts
2021-01-01,Sour cherries,6.1278,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts


#### - Transforming time series data to supervised learning (adding lags to variables)

In [8]:
main_df = pd.DataFrame(columns=adf_df.columns)
imputer = KNNImputer(n_neighbors=4)

# Sliding Window
for cate in categories: 
    # Selects category
    crop_df = adf_df[adf_df["Category"] == cate]
    num_cols = crop_df.select_dtypes(include=[np.number])
    for col in num_cols.columns:   
        for i in range(1, 3):
            crop_df[f'{col}-{i}'] = crop_df[col].shift(i)

    main_df = pd.concat([main_df, crop_df])
    num_cols = main_df.select_dtypes(include=[np.number])
    main_df[num_cols.columns] = imputer.fit_transform(main_df[num_cols.columns])

main_df

Unnamed: 0,Item,Yield (tonnes/ha),Change (C),Annual Mean Precipitation,Air Temp Mean,Frost Days,Hot Days,Ice Days,Summer Days,Sunshine Duration,Tropical Nights,Mean CO2 (ppm),Devil,Gustnado,Hail,Ice,Precip,Snow,Tornado,Wind,Total,Category,Yield (tonnes/ha)-1,Yield (tonnes/ha)-2,Change (C)-1,Change (C)-2,Annual Mean Precipitation-1,Annual Mean Precipitation-2,Air Temp Mean-1,Air Temp Mean-2,Frost Days-1,Frost Days-2,Hot Days-1,Hot Days-2,Ice Days-1,Ice Days-2,Summer Days-1,Summer Days-2,Sunshine Duration-1,Sunshine Duration-2,Tropical Nights-1,Tropical Nights-2,Mean CO2 (ppm)-1,Mean CO2 (ppm)-2,Devil-1,Devil-2,Gustnado-1,Gustnado-2,Hail-1,Hail-2,Ice-1,Ice-2,Precip-1,Precip-2,Snow-1,Snow-2,Tornado-1,Tornado-2,Wind-1,Wind-2,Total-1,Total-2
1980-01-01,Asparagus,2.2023,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes,11.082025,12.393375,-0.190,-0.190,771.58,771.58,7.63,7.63,103.61,103.61,1.45,1.45,30.73,30.73,16.18,16.18,0.164829,0.164829,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-01-01,"Beans, dry",2.4292,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes,2.202300,9.896450,-0.190,-0.190,771.58,771.58,7.63,7.63,103.61,103.61,1.45,1.45,30.73,30.73,16.18,16.18,0.164829,0.164829,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-01-01,"Broad beans and horse beans, dry",2.4565,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes,2.429200,2.202300,-0.190,-0.190,771.58,771.58,7.63,7.63,103.61,103.61,1.45,1.45,30.73,30.73,16.18,16.18,0.164829,0.164829,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-01-01,"Broad beans and horse beans, green",10.4409,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes,2.456500,2.429200,-0.190,-0.190,771.58,771.58,7.63,7.63,103.61,103.61,1.45,1.45,30.73,30.73,16.18,16.18,0.164829,0.164829,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-01-01,Cabbages,29.6605,-0.190,771.58,7.63,103.61,1.45,30.73,16.18,0.164829,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vegetables and Legumes,10.440900,2.456500,-0.190,-0.190,771.58,771.58,7.63,7.63,103.61,103.61,1.45,1.45,30.73,30.73,16.18,16.18,0.164829,0.164829,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-01,Pears,17.3645,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts,3.116000,11.433500,1.304,1.304,772.58,772.58,9.16,9.16,89.36,89.36,4.53,4.53,15.70,15.70,37.06,37.06,0.186210,0.186210,0.29,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01,Plums and sloes,9.1670,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts,17.364500,3.116000,1.304,1.304,772.58,772.58,9.16,9.16,89.36,89.36,4.53,4.53,15.70,15.70,37.06,37.06,0.186210,0.186210,0.29,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01,Raspberries,6.8824,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts,9.167000,17.364500,1.304,1.304,772.58,772.58,9.16,9.16,89.36,89.36,4.53,4.53,15.70,15.70,37.06,37.06,0.186210,0.186210,0.29,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01,Sour cherries,6.1278,1.304,772.58,9.16,89.36,4.53,15.70,37.06,0.186210,0.29,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,Fruits and Nuts,6.882400,9.167000,1.304,1.304,772.58,772.58,9.16,9.16,89.36,89.36,4.53,4.53,15.70,15.70,37.06,37.06,0.186210,0.186210,0.29,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Running the model

In [9]:
scaler = MinMaxScaler()
for cat in categories:
   crop_df = main_df[main_df['Category'] == cat]
   encoded_df = pd.get_dummies(crop_df[["Item"]], prefix='', prefix_sep='')
   encoded_df = encoded_df.drop(encoded_df.columns[-1], axis=1)
   
   y = crop_df[['Yield (tonnes/ha)']]
   x = crop_df.drop(['Yield (tonnes/ha)', "Item", "Category"], axis=1)
   x = pd.concat([x, encoded_df], axis=1)
   

   mean = x.mean()
   std = x.std()
   lower_bound = mean - 2 * std
   upper_bound = mean + 2 * std
   x = x.clip(lower_bound, upper_bound, axis=1)

   x = scaler.fit_transform(x)

   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)
   regressor = SVR(kernel="rbf")#, C=1000, gamma=15)
   regressor.fit(x_train, y_train)

   #prediction of new value
   y_pred = regressor.predict(x_test)
   calculate_metrics(error_df, cat, y_test, y_pred)

## Result

In [10]:
error_df

Unnamed: 0,RF_MAE,RF_MSE,RF_RMSE,RF_R2,Actual Yield,Predicted Yield
Vegetables and Legumes,18.462871,1246.828514,35.310459,0.087316,30.011265,21.858832
Cereals,0.639196,0.888911,0.942821,0.814474,-0.027866,-0.034358
Fruits and Nuts,3.781063,34.279097,5.854835,0.315151,10.029524,8.354296
