In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from statsmodels.tsa.stattools import adfuller
from sklearn.impute import KNNImputer

import warnings
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("../working_data/dataset.csv", index_col=0)

In [3]:
df = df[df["Category"] != "Industrial Crops"]

## Creating working data frame

In [6]:
#df['Year'] = pd.to_datetime(df['Year'], format='%Y')
#df.set_index("Year", inplace=True)
# getting list of names
categories = df["Category"].unique()

## Creating the metrics data frame

In [7]:
def calculate_metrics(df, crop_name, y_true, y_pred):
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    y_test_avg = pd.DataFrame(y_true).mean()
    y_pred_avg = y_pred.mean()

    df.loc[crop_name] = [mae, mse, rmse, r2, y_test_avg[0], y_pred_avg]

In [8]:
quality_metrics = ['RF_MAE', 'RF_MSE', 'RF_RMSE', "RF_R2", "Actual Yield", "Predicted Yield"]
error_df = pd.DataFrame(columns = quality_metrics, index = categories)
#display(error_df)

## Random Forest Regression

### Setting up dataframe

#### - Solving for stationarity

In [9]:
imputer = KNNImputer(n_neighbors=4)
adf_df = pd.DataFrame(columns=df.columns)

for cate in categories: 
    # Selects category
    crop_df = df[df["Category"] == cate]
    num_cols = crop_df.select_dtypes(include=[np.number])
    # Selects column for ADF Test
    for col in num_cols:
        if crop_df[col].count() < 5:
            continue
        adf_result = adfuller(crop_df[col])
        while adf_result[1] > 0.05:
            crop_df[col] = crop_df[col].diff()
            crop_df[num_cols.columns] = imputer.fit_transform(crop_df[num_cols.columns])
            adf_result = adfuller(crop_df[col])

    adf_df = pd.concat([adf_df, crop_df])

#### - Transforming time series data to supervised learning (adding lags to variables)

In [10]:
main_df = pd.DataFrame(columns=adf_df.columns)
imputer = KNNImputer(n_neighbors=4)

# Sliding Window
for cate in categories: 
    # Selects category
    crop_df = adf_df[adf_df["Category"] == cate]
    num_cols = crop_df.select_dtypes(include=[np.number])
    for col in num_cols.columns:   
         for i in range(1, 3):
            crop_df[f'{col}-{i}'] = crop_df[col].shift(i)

    main_df = pd.concat([main_df, crop_df])
    num_cols = main_df.select_dtypes(include=[np.number])
    main_df[num_cols.columns] = imputer.fit_transform(main_df[num_cols.columns])

## Running the model

In [15]:
scaler = MinMaxScaler()
for cat in categories:
    crop_df = main_df[main_df['Category'] == cat]
    encoded_df = pd.get_dummies(crop_df[["Season"]], prefix='', prefix_sep='')
    encoded_df = encoded_df.drop(encoded_df.columns[-1], axis=1)

    y = crop_df[['Yield (tonnes/ha)']]
    x = crop_df.drop(['Yield (tonnes/ha)', "Item", "Category", "Season"], axis=1)#, "Mean CO2 (ppm)", "Frost Days", "Summer Days"], axis=1)
    x = pd.concat([x, encoded_df], axis=1)
    

    mean = x.mean()
    std = x.std()
    lower_bound = mean - 2 * std
    upper_bound = mean + 2 * std
    x = x.clip(lower_bound, upper_bound, axis=1)

    x = scaler.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)
    regressor = RandomForestRegressor(n_estimators = 400, random_state = 42)
    regressor.fit(x_train, y_train)
    
    #prediction of new value
    y_pred = regressor.predict(x_test)
    calculate_metrics(error_df, cat , pd.DataFrame(y_test), y_pred)

## Result

In [16]:
error_df

Unnamed: 0,RF_MAE,RF_MSE,RF_RMSE,RF_R2,Actual Yield,Predicted Yield
Vegetables and Legumes,17.565769,908.087257,30.134486,0.727086,45.004377,37.644662
Cereals,0.59522,0.608178,0.779858,0.904829,0.022847,0.133734
Fruits and Nuts,3.881348,29.0069,5.385805,0.468004,9.75563,10.810689
