In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

import catboost as cb

In [2]:
# Load the full data set; later cells read its 'year', 'state', 'crop' and 'target' columns.
df = pd.read_csv(filepath_or_buffer='./data/data.csv')

### Linear regression

In [28]:
# Hold out every row whose 'year' is 2018-2021 as the test partition;
# all remaining rows form the training partition.
df_test = df.loc[df['year'].isin([2018, 2019, 2020, 2021])]
df_train = df.loc[~df['year'].isin([2018, 2019, 2020, 2021])]

# Separate the predictors from the 'target' column in each partition.
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

In [66]:
# 'state' and 'crop' are handled as categorical; every other predictor column
# (in its original order) is handled as numeric.
categorical_features = ['state', 'crop']
numeric_features = X_train.columns.drop(categorical_features).tolist()

In [67]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

# Baseline model: the preprocessing step followed by a linear regression.
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    ('regression', LinearRegression())
])

# Fit on the training partition, then score on the held-out partition.
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 41.081826325629216
R2: 0.5729797577343121


### Decision Tree

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. It is built once (the cell previously constructed the
# identical transformer twice in a row).
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    # A fixed seed makes the search and the refitted tree repeatable between runs.
    ('tree', DecisionTreeRegressor(random_state=0))
])

# Hyper-parameter candidates; the 'tree__' prefix routes each key to the
# pipeline step named 'tree'.
param_grig = {"tree__criterion": ["squared_error", "absolute_error"],
              "tree__min_samples_split": [5, 10, 20, 40, 50],
              "tree__max_depth": [2, 6, 8, 10, 15, 30, 50],
              "tree__min_samples_leaf": [5, 10, 20, 40, 100],
              "tree__max_leaf_nodes": [5, 10, 20, 40, 100]}

# Exhaustive search scored by R2 on all available cores (n_jobs=-1), using
# GridSearchCV's default cross-validation scheme.
search = GridSearchCV(pipeline, param_grig, n_jobs=-1, scoring='r2')

search.fit(X_train, y_train)


In [91]:
# The search was created without a `refit` argument, so GridSearchCV has
# already refitted the best pipeline on the whole training set; it is used
# directly instead of being fitted a second time.
best_tree = search.best_estimator_
model_tree = best_tree
y_pred = model_tree.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 13.514757520357902
R2: 0.9537868582884859


### Gradient Boosting

In [None]:
# Wrap both partitions in CatBoost pools, marking the columns listed in
# `categorical_features` as categorical.
train_dataset = cb.Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_dataset = cb.Pool(data=X_test, label=y_test, cat_features=categorical_features)

In [111]:
# CatBoost regressor with the RMSE loss; every other setting keeps its default.
model = cb.CatBoostRegressor(
    loss_function='RMSE',
)

In [112]:
# Candidate hyper-parameter values for the CatBoost grid search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

In [None]:
# Search the grid on the training pool; the cell's result is whatever
# grid_search returns.
model.grid_search(param_grid=grid, X=train_dataset)

In [115]:
# Score the searched CatBoost model on the held-out partition.
pred = model.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Report both metrics rounded to two decimals.
print('Test preformance',
      'RMSE: {:.2f}'.format(rmse),
      'R2: {:.2f}'.format(r2),
      sep='\n')

Test preformance
RMSE: 11.93
R2: 0.96


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


# March model
Only data values up to March were taken

In [6]:
# Load the March data set (semicolon-separated) into `df`, replacing the
# previously loaded frame.
# NOTE(review): this path starts with '.data/' whereas the full data set is
# read from './data/' - confirm the directory name is intended.
df = pd.read_csv(filepath_or_buffer='.data/data_march.csv', sep=';')

In [44]:
# Rows whose 'year' is 2018-2021 are held out for testing; the rest are used
# for training.
df_test = df.loc[df['year'].isin([2018, 2019, 2020, 2021])]
df_train = df.loc[~df['year'].isin([2018, 2019, 2020, 2021])]

# Predictors are every column except 'target'; 'target' is the response.
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

In [45]:
# 'state' and 'crop' are handled as categorical; every other predictor column
# (in its original order) is handled as numeric.
categorical_features = ['state', 'crop']
numeric_features = X_train.columns.drop(categorical_features).tolist()

In [46]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

# Baseline model: the preprocessing step followed by a linear regression.
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    ('regression', LinearRegression())
])

# Fit on the training partition, then score on the held-out partition.
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 30.177339858926647
R2: 0.7695849269154751


### Decision Tree

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. It is built once (the cell previously constructed the
# identical transformer twice in a row).
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    # A fixed seed makes the search and the refitted tree repeatable between runs.
    ('tree', DecisionTreeRegressor(random_state=0))
])

# Hyper-parameter candidates; the 'tree__' prefix routes each key to the
# pipeline step named 'tree'.
param_grig = {"tree__criterion": ["squared_error", "absolute_error"],
              "tree__min_samples_split": [5, 10, 20, 40, 50],
              "tree__max_depth": [2, 6, 8, 10, 15, 30, 50],
              "tree__min_samples_leaf": [5, 10, 20, 40, 100],
              "tree__max_leaf_nodes": [5, 10, 20, 40, 100]}

# Exhaustive search scored by R2 on all available cores (n_jobs=-1), using
# GridSearchCV's default cross-validation scheme.
search = GridSearchCV(pipeline, param_grig, n_jobs=-1, scoring='r2')

search.fit(X_train, y_train)

In [49]:
# The search was created without a `refit` argument, so GridSearchCV has
# already refitted the best pipeline on the whole training set; it is used
# directly instead of being fitted a second time.
best_tree = search.best_estimator_
model_tree = best_tree
y_pred = model_tree.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 16.282918203169075
R2: 0.9329168479804021


### Gradient Boosting

In [None]:
# Wrap both partitions in CatBoost pools, marking the columns listed in
# `categorical_features` as categorical.
train_dataset = cb.Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_dataset = cb.Pool(data=X_test, label=y_test, cat_features=categorical_features)

In [57]:
# CatBoost regressor that optimises RMSE and reports R2 as its evaluation metric.
model = cb.CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='R2',
)

In [62]:
# Candidate hyper-parameter values for the CatBoost grid search.
grid = dict(
    iterations=[100, 150, 200, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3, 5, 10, 100],
)

In [None]:
# Search the grid on the training pool; plot=True asks CatBoost to draw its
# training plot while the search runs.
model.grid_search(param_grid=grid, X=train_dataset, plot=True)

In [None]:
# `grid_search` was called with its default refit behaviour, which leaves
# `model` trained with the best parameters it found, so the model is reused
# as-is rather than being trained a second time on the same pool.
best_model = model

In [77]:
# Score the tuned CatBoost model on the held-out partition.
pred = best_model.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Report both metrics rounded to two decimals.
print('Test preformance',
      'RMSE: {:.2f}'.format(rmse),
      'R2: {:.2f}'.format(r2),
      sep='\n')

Test preformance
RMSE: 12.12
R2: 0.96


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


# June model
Only data values up to June were taken

In [9]:
# Load the June data set (semicolon-separated) into `df`, replacing the
# previously loaded frame.
# NOTE(review): this path starts with '.data/' whereas the full data set is
# read from './data/' - confirm the directory name is intended.
df = pd.read_csv(filepath_or_buffer='.data/data_june.csv', sep=';')

In [91]:
# Rows whose 'year' is 2018-2021 are held out for testing; the rest are used
# for training.
df_test = df.loc[df['year'].isin([2018, 2019, 2020, 2021])]
df_train = df.loc[~df['year'].isin([2018, 2019, 2020, 2021])]

# Predictors are every column except 'target'; 'target' is the response.
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

# 'state' and 'crop' are handled as categorical; every other predictor column
# (in its original order) is handled as numeric.
categorical_features = ['state', 'crop']
numeric_features = X_train.columns.drop(categorical_features).tolist()

In [92]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

# Baseline model: the preprocessing step followed by a linear regression.
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    ('regression', LinearRegression())
])

# Fit on the training partition, then score on the held-out partition.
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 37.54385127096785
R2: 0.6433628179491475


### Decision Tree

In [93]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. It is built once (the cell previously constructed the
# identical transformer twice in a row).
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    # A fixed seed makes the search and the refitted tree repeatable between runs.
    ('tree', DecisionTreeRegressor(random_state=0))
])

# Hyper-parameter candidates; the 'tree__' prefix routes each key to the
# pipeline step named 'tree'.
param_grig = {"tree__criterion": ["squared_error", "absolute_error"],
              "tree__min_samples_split": [5, 10, 20, 40, 50],
              "tree__max_depth": [2, 6, 8, 10, 15, 30, 50],
              "tree__min_samples_leaf": [5, 10, 20, 40, 100],
              "tree__max_leaf_nodes": [5, 10, 20, 40, 100]}

# Exhaustive search scored by R2 on all available cores (n_jobs=-1), using
# GridSearchCV's default cross-validation scheme.
search = GridSearchCV(pipeline, param_grig, n_jobs=-1, scoring='r2')

search.fit(X_train, y_train)

# No `refit` argument was passed, so the search has already refitted the best
# pipeline on the whole training set; it is used directly instead of being
# fitted a second time.
best_tree = search.best_estimator_
model_tree = best_tree
y_pred = model_tree.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 16.95784166821787
R2: 0.927240428933661


### Gradient Boosting

In [None]:
# Wrap both partitions in CatBoost pools, marking the columns listed in
# `categorical_features` as categorical.
train_dataset = cb.Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_dataset = cb.Pool(data=X_test, label=y_test, cat_features=categorical_features)

# Regressor trained with the RMSE loss; every other setting keeps its default.
model = cb.CatBoostRegressor(
    loss_function='RMSE',
)

# Candidate hyper-parameter values explored by the search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

# Search the grid on the training pool; plot=True asks CatBoost to draw its
# training plot while the search runs.
model.grid_search(param_grid=grid, X=train_dataset, plot=True)

In [None]:
# `grid_search` was called with its default refit behaviour, which leaves
# `model` trained with the best parameters it found, so the model is reused
# as-is rather than being trained a second time on the same pool.
best_model_june = model

In [96]:
# Score the tuned June CatBoost model on the held-out partition.
pred = best_model_june.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Report both metrics rounded to two decimals.
print('Test preformance',
      'RMSE: {:.2f}'.format(rmse),
      'R2: {:.2f}'.format(r2),
      sep='\n')

Test preformance
RMSE: 11.38
R2: 0.97


# August model
Only data values up to August were taken

In [11]:
# Load the August data set (semicolon-separated) into `df`, replacing the
# previously loaded frame.
# NOTE(review): this path starts with '.data/' whereas the full data set is
# read from './data/' - confirm the directory name is intended.
df = pd.read_csv(filepath_or_buffer='.data/data_august.csv', sep=';')

In [98]:
# Rows whose 'year' is 2018-2021 are held out for testing; the rest are used
# for training.
df_test = df.loc[df['year'].isin([2018, 2019, 2020, 2021])]
df_train = df.loc[~df['year'].isin([2018, 2019, 2020, 2021])]

# Predictors are every column except 'target'; 'target' is the response.
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

# 'state' and 'crop' are handled as categorical; every other predictor column
# (in its original order) is handled as numeric.
categorical_features = ['state', 'crop']
numeric_features = X_train.columns.drop(categorical_features).tolist()

In [99]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

# Baseline model: the preprocessing step followed by a linear regression.
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    ('regression', LinearRegression())
])

# Fit on the training partition, then score on the held-out partition.
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 36.37961055393746
R2: 0.6651386149117529


### Decision Tree

In [100]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. It is built once (the cell previously constructed the
# identical transformer twice in a row).
column_trainsformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_trainsformer),
    # A fixed seed makes the search and the refitted tree repeatable between runs.
    ('tree', DecisionTreeRegressor(random_state=0))
])

# Hyper-parameter candidates; the 'tree__' prefix routes each key to the
# pipeline step named 'tree'.
param_grig = {"tree__criterion": ["squared_error", "absolute_error"],
              "tree__min_samples_split": [5, 10, 20, 40, 50],
              "tree__max_depth": [2, 6, 8, 10, 15, 30, 50],
              "tree__min_samples_leaf": [5, 10, 20, 40, 100],
              "tree__max_leaf_nodes": [5, 10, 20, 40, 100]}

# Exhaustive search scored by R2 on all available cores (n_jobs=-1), using
# GridSearchCV's default cross-validation scheme.
search = GridSearchCV(pipeline, param_grig, n_jobs=-1, scoring='r2')

search.fit(X_train, y_train)

# No `refit` argument was passed, so the search has already refitted the best
# pipeline on the whole training set; it is used directly instead of being
# fitted a second time.
best_tree = search.best_estimator_
model_tree = best_tree
y_pred = model_tree.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and subsequently remove.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')


RMSE: 13.276292572298692
R2: 0.9554033119735253


### Gradient Boosting

In [None]:
# Wrap both partitions in CatBoost pools, marking the columns listed in
# `categorical_features` as categorical.
train_dataset = cb.Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_dataset = cb.Pool(data=X_test, label=y_test, cat_features=categorical_features)

# Regressor trained with the RMSE loss; every other setting keeps its default.
model = cb.CatBoostRegressor(
    loss_function='RMSE',
)

# Candidate hyper-parameter values explored by the search.
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)

# Search the grid on the training pool; the cell's result is whatever
# grid_search returns.
model.grid_search(param_grid=grid, X=train_dataset)

In [None]:
# `grid_search` was called with its default refit behaviour, which leaves
# `model` trained with the best parameters it found, so the model is reused
# as-is rather than being trained a second time on the same pool.
best_model_august = model

In [104]:
# Score the August CatBoost model on the held-out partition.
# NOTE(review): this reads `model`, not `best_model_august`; presumably both
# names refer to the same fitted object - confirm.
pred = model.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Report both metrics rounded to two decimals.
print('Test preformance',
      'RMSE: {:.2f}'.format(rmse),
      'R2: {:.2f}'.format(r2),
      sep='\n')

Test preformance
RMSE: 11.94
R2: 0.96


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
