In [1]:
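## create and activate the virtual environment from a terminal BEFORE launching Jupyter
## (a `!source ...` line inside a cell runs in a throwaway subshell, so it cannot
## change the environment of the kernel that is already running)
# !source ./devops/create_env  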

In [2]:
import pandas as pd
import numpy as np
import os
from os import pardir, path
import sys
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics

from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [3]:
# Put the current working directory on the import path (once) so that the
# project-local `package` module imported below can be resolved.
mod_path = os.getcwd()
already_importable = mod_path in sys.path
if not already_importable:
    sys.path.append(mod_path)
print(sys.path)

# Project-local helpers; this import relies on the sys.path tweak above.
from package import Model, return_tot

['/Users/karimkhalil/Coding/development/commodity', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python39.zip', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9/lib-dynload', '', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9/site-packages']


In [4]:
## mapping

# Month letter -> zero-padded month number ("F" -> "01", ..., "Z" -> "12").
# The letters are listed in calendar order, so the month number is simply the
# 1-based position of the letter in the string.
mat_codes = {
    letter: f"{month_no:02d}"
    for month_no, letter in enumerate("FGHJKMNQUVXZ", start=1)
}

## 1. Data Preparation

In [5]:
# Load the raw observations and derive the calendar helper columns.
df = pd.read_csv('commodities.csv')
df['datetime'] = pd.to_datetime(df['date'], utc=True)
df['date_strf'] = df['datetime'].dt.strftime('%Y%m%d')
df['dayofweek'] = df['datetime'].dt.strftime("%a")
df['month'] = df['datetime'].dt.month

# `maturity` is decoded as: first character = month letter (translated through
# `mat_codes`), last four characters = year. The parsed timestamp (first day of
# that month) is then moved back by one day.
maturity_yyyymm = df['maturity'].str[-4:] + df['maturity'].str[0].map(mat_codes)
df['datetime_maturity'] = (
    pd.to_datetime(maturity_yyyymm, format='%Y%m', utc=True) - pd.Timedelta(1, "d")
)
df['date_strf_maturity'] = df['datetime_maturity'].dt.strftime('%Y%m%d')

# Days until maturity, floored at 0 for observations dated after the maturity
# timestamp; the monthly figure is the day count divided by 30 and rounded.
df['time2maturity_d'] = (df['datetime_maturity'] - df['datetime']).dt.days.clip(lower=0)
df['time2maturity_m'] = (df['time2maturity_d'] / 30).round()

# Keep settlement observations, then the CBOT.ZS instrument only.
df_settle = df.loc[df['observation'] == 'Settle']

# Sort as part of the expression instead of `inplace=True` on a frame that was
# sliced from another frame: that pattern triggers pandas' chained-assignment
# warning (hidden here by the global warnings filter) and makes the cell
# harder to re-run safely.
df_soy = (
    df_settle.loc[df_settle['instrument'] == 'CBOT.ZS']
    .sort_values(['datetime', 'datetime_maturity'], ascending=[True, False])
)


In [6]:
## select prices with only 6 months maturity for comparability
## (if a date has no 6-month contract, the next longer available maturity is used)

TARGET_MATURITY_M = 6

# Only contracts at or beyond the target maturity can be selected.
eligible = df_soy.loc[df_soy['time2maturity_m'] >= TARGET_MATURITY_M]

# Smallest eligible maturity per observation date. `time2maturity_m` holds
# rounded (whole-number) values, so this is the same contract the previous
# "try 6, then 7, then 8, ..." search found. Unlike that search it cannot spin
# forever when a date has no contract at or beyond the target (or a missing
# date key): such dates are simply left out.
nearest_m = eligible.groupby('date_strf')['time2maturity_m'].transform('min')

df_soy_6m = (
    eligible.loc[eligible['time2maturity_m'] == nearest_m]
    .set_index('date_strf', drop=True)
    # A stable sort keeps the row order deterministic for dates that
    # contribute more than one row.
    .sort_index(kind='mergesort')
)

### 1.1 Create Features

In [7]:
## lagged price: the previous row's value
df_soy_6m['value_t-1'] = df_soy_6m['value'].shift(1)

## returns: predicted variable (row-over-row percentage change of the value)
df_soy_6m['pct_t-1'] = df_soy_6m['value'].pct_change(1)

## percentage change of the lagged price over the previous 1..7 rows
lagged_price = df_soy_6m['value_t-1']
for horizon in range(1, 8):
    df_soy_6m[f'lagged_pct_t-{horizon}'] = lagged_price.pct_change(horizon)

In [8]:
## rolling averages for prices and returns

vals = ['value', 'lagged_pct_t-1']

for window in [7, 15, 30, 60]:
    # Returns keep the historical column name `roll_avg_pct_<window>`; this is
    # what the column ended up holding before, because the returns series was
    # written last.
    df_soy_6m[f'roll_avg_pct_{window}'] = df_soy_6m['lagged_pct_t-1'].rolling(window).mean()
    # Prices get their own column. Previously both series were written to the
    # same `roll_avg_pct_<window>` name, so the price average was silently
    # overwritten and never reached the frame.
    # NOTE(review): these are additional columns; they are NaN for the first
    # window-1 rows, which the later `dropna()` takes into account.
    df_soy_6m[f'roll_avg_value_{window}'] = df_soy_6m['value'].rolling(window).mean()

## expanding (all rows so far) mean of each series
for col in vals:
    df_soy_6m[f'exp_avg_{col}'] = df_soy_6m[col].expanding(1).mean()

In [9]:
# Dump the feature frame to Excel for manual inspection. Columns whose dtype is
# 'datetime64[ns, UTC]' are left out of the export
# (presumably because the Excel writer rejects tz-aware datetimes -- confirm).
exportable_cols = [
    col for col in df_soy_6m.columns
    if df_soy_6m[col].dtype != 'datetime64[ns, UTC]'
]
export_path = os.path.join(os.getcwd(), 'df_soy_6m_v2.xlsx')
df_soy_6m[exportable_cols].to_excel(export_path)

In [10]:
## drop every row that has at least one missing value (e.g. the leading rows
## of the shifted / rolling feature columns); the original frame is untouched
df_soy_6m_clean = df_soy_6m.dropna(axis=0, how='any')

### 1.2 Descriptive Statistics

In [11]:
# 2x2 overview: price line, return histogram + KDE, and box plots of price and
# return.
df_sorted = df_soy_6m_clean.sort_values('date')
fig = make_subplots(rows=2, cols=2, subplot_titles=["Price", "Daily Return Distribution", 'Price Distribution', 'Return Distribution'])

# Build the distplot once and take both traces from it (index 0 = histogram,
# index 1 = density curve); it was previously computed twice with identical
# arguments.
returns_distplot = ff.create_distplot([df_soy_6m_clean['pct_t-1'].values.tolist()], [''], bin_size=.01)
hist = returns_distplot.data[0]
dist = returns_distplot.data[1]

# Single price trace. The earlier, immediately overwritten `px.line` call
# (which also had x and y swapped) was dead code and has been removed.
price = px.line(y=df_sorted.value, x=df_sorted.date, title='Bean Price', labels=dict(x='Date', y='USD')).data[0]

fig.add_trace(go.Scatter(dist, line=dict(color='red')), row=1, col=2)
fig.add_trace(hist, row=1, col=2)
fig.add_trace(go.Scatter(price), row=1, col=1)
fig.add_trace(go.Box(x=df_sorted.value), row=2, col=1)
fig.add_trace(go.Box(x=df_sorted['pct_t-1']), row=2, col=2)

fig['layout'].update(height=800, width=1500, title='Returns Descriptive Statistics')

fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="USD", row=1, col=1)

fig.update_xaxes(tickmode='array', row=1, col=2)

fig.show()

## 2. Model

In [12]:
# Wrap the cleaned feature frame in the project-local `Model` helper; the
# cells below read the frame back through `model.df` and run the backtests
# through this object.
model = Model(df_soy_6m_clean)

object instanciated


In [13]:
# Candidate estimators; the tree ensembles share one fixed seed so repeated
# runs give the same fits.
# NOTE: despite its name, `xgb` is scikit-learn's GradientBoostingRegressor,
# not the XGBoost library.
RANDOM_STATE = 137

regr = LinearRegression()
xgb = ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE)
randforest = ensemble.RandomForestRegressor(random_state=RANDOM_STATE)

In [14]:
# Explanatory variables: the seven lagged returns followed by the 7/15/30-row
# rolling averages (the 60-row average is deliberately not listed here).
lag_features = [f'lagged_pct_t-{lag}' for lag in range(1, 8)]
rolling_features = [f'roll_avg_pct_{window}' for window in (7, 15, 30)]
cols_x = lag_features + rolling_features

# Target variable.
cols_y = 'pct_t-1'

In [15]:
# Backtest the linear regression with the project helper: 16-row training
# window, 1-row test window, no gap, non-expanding window.
print('Linear regression error stats:')
df_lr, stat_lr = model.skbacktest(
    model.df, regr, cols_x, cols_y, 'date', 'value',
    train_window=16,
    test_window=1,
    test_gap=0,
    expanding=False,
    print_iter=False,
)

Linear regression error stats:
Average MSE train: 6.49543362005166e-05
Average MSE test: 0.0011643675794354325


In [16]:
# Backtest the gradient-boosting regressor with the same window settings as
# the other models (16-row train, 1-row test, no gap, non-expanding).
print('XGB regression error stats:')
df_xgb, stat_xgb = model.skbacktest(
    model.df, xgb, cols_x, cols_y, 'date', 'value',
    train_window=16,
    test_window=1,
    test_gap=0,
    expanding=False,
    print_iter=False,
)

XGB regression error stats:
Average MSE train: 1.3086016713875701e-10
Average MSE test: 0.0004497235256369112


In [17]:

# Backtest the random forest with the same window settings as the other
# models (16-row train, 1-row test, no gap, non-expanding).
print('Random forest regression error stats:')
df_rf, stat_rf = model.skbacktest(
    model.df, randforest, cols_x, cols_y, 'date', 'value',
    train_window=16,
    test_window=1,
    test_gap=0,
    expanding=False,
    print_iter=False,
)

Random forest regression error stats:
Average MSE train: 3.706141730061759e-05
Average MSE test: 0.00031504957578602956


In [18]:
## price of beans using random forest prediction
# Calls the project-local `backtest` helper on the random-forest results with
# the 'value' and 'predict' columns and the output name 'value_rf'; the plots
# and statistics further down read 'value_rf' and 'value_strat' from the
# returned frame (presumably both are added by this call -- confirm in `package`).
# NOTE(review): `df_rf` is rebound to the helper's return value, so re-running
# this cell feeds an already-processed frame back into `backtest`.
df_rf = model.backtest(df_rf, 'value', 'predict', 'value_rf')

In [19]:
## for validation only
# df_rf[['value', 'pct_t-1' ,'predict' , 'MSE', 'strat_return', 'value_strat']].to_excel(os.path.join(os.getcwd(), 'rf_backtest2.xlsx'))
# df_rf[['date', 'value', 'value_t-1', 'MSE', 'iteration',  'pct_t-1', 'predict','strat_return', 'value_strat']]
# model.backtest(df_rf,'value', 'pct_t-1', 'test')[['value', 'test']]

In [24]:
# Single-panel comparison of the actual price with the random-forest price
# path. The secondary y-axis is declared but only used by the optional MSE
# trace that is commented out at the bottom.
fig = make_subplots(specs=[[{"secondary_y": True}]])

actual_trace = go.Scatter(
    x=df_rf['date'],
    y=df_rf['value'],
    name='Actual',
    line={'width': 3},
)
fig.add_trace(actual_trace)

## to hide when showing strategy
predicted_trace = go.Scatter(
    x=df_rf['date'],
    y=df_rf['value_rf'],
    name='Random forest price prediction',
)
fig.add_trace(predicted_trace)
# Horizontal reference line at a hard-coded price level.
fig.add_hline(
    y=1362,
    annotation_text='Baseline',
    line_dash="dot",
    annotation_position="bottom right",
)

######################## To un-hide ########################
# fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['value_strat'], name='backtest random forest price'))
# # fig.add_vrect(x0="2021-5-13", x1="2021-6-10", fillcolor="green",  opacity=0.25, line_width=0 , annotation_text="below base line", annotation_position="top left")

# fig.add_annotation(x="2021-6-14", y=1700, text="Plunge prediction", showarrow=True, arrowhead=1)
# fig.add_annotation(x="2021-7-2", y=1700, text="", showarrow=True, arrowhead=1, ax=-50)
# fig.add_vrect(x0="2021-6-10", x1="2021-6-17", fillcolor="orange",  opacity=0.25, line_width=0)
# fig.add_vrect(x0="2021-7-2", x1="2021-7-7", fillcolor="orange",  opacity=0.25, line_width=0)
############################################################

fig.update_layout(height=600, width=1600, title='Actual vs Predicted Value')
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="USD")

# fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['MSE'], name='MSE'), secondary_y=True)

fig.show()


In [21]:
## Actual vs Strategy statistics

# Row-over-row return of the strategy value, next to the actual return column.
df_rf['pct_t-1_strat'] = df_rf['value_strat'].pct_change(1)

cols_desc = ['value', 'pct_t-1', 'pct_t-1_strat']
desc = df_rf[cols_desc].describe()

# Pull the summary rows once and read the per-column figures from them.
mean_row = desc.loc['mean']
std_row = desc.loc['std']

avg_return = mean_row['pct_t-1']
avg_return_strat = mean_row['pct_t-1_strat']

# Mean return divided by its standard deviation, as taken from `describe()`
# (no annualisation or risk-free adjustment is applied here).
sharpe = avg_return / std_row['pct_t-1']
sharpe_strat = avg_return_strat / std_row['pct_t-1_strat']

# Total return over the sample via the project-local `return_tot` helper.
tot_return_act = return_tot(df_rf, 'date', 'value', 'value')
tot_return_predict = return_tot(df_rf, 'date', 'value', 'value_strat')

print(f'Sharpe ratio (actual): {100 * sharpe:.2f} %')
print(f'Sharpe ratio (strategy): {100 * sharpe_strat:.2f} %')
print()

print(f'Average daily return (actual): {100 * avg_return:.2f} %')
print(f'Average daily return (strategy): {100 * avg_return_strat:.2f} %')

print()
print(f'Total return (actual): {100 * tot_return_act:.2f} %')
print(f'Total return (strategy): {100 * tot_return_predict:.2f} %')

desc


Sharpe ratio (actual): -2.54 %
Sharpe ratio (strategy): 3.78 %

Average daily return (actual): -0.04 %
Average daily return (strategy): 0.06 %

Total return (actual): -8.79 %
Total return (strategy): 7.27 %


Unnamed: 0,value,pct_t-1,pct_t-1_strat
count,153.0,153.0,152.0
mean,1325.263072,-0.000386,0.000576
std,55.905567,0.015239,0.015232
min,1212.0,-0.066481,-0.047742
25%,1279.25,-0.006965,-0.007315
50%,1318.75,0.0,-0.00165
75%,1362.0,0.007299,0.006729
max,1461.0,0.063438,0.066481


In [22]:
# Backtest statistics in the order linear regression, gradient boosting,
# random forest, and the 'MSEtest' entry of each for the bar chart.
stats = [stat_lr, stat_xgb, stat_rf]
mse = [model_stats['MSEtest'] for model_stats in stats]

In [23]:

def _kde_curve(series):
    """Return the (x, y) arrays of the density curve plotly builds for `series`.

    The distplot is built once per series; trace index 1 of the resulting
    figure is the curve (index 0 is the histogram).
    """
    curve = ff.create_distplot([series.values.tolist()], [''], bin_size=.01).data[1]
    return curve['x'], curve['y']


# 2x2 summary: prices, return distributions, MSE over time, MSE per model.
fig = make_subplots(rows=2, cols=2, subplot_titles=["Prices", "Returns Distribution", 'MSE time series', 'MSE by Model'])

# Each distribution used to be computed twice (once for x, once for y).
# The MSE curve is not plotted below; it is still computed so the names stay
# available for inspection.
x_dist_mse, y_dist_mse = _kde_curve(df_rf['MSE'])
x_dist_act, y_dist_act = _kde_curve(df_rf['pct_t-1'])
x_dist_predict, y_dist_predict = _kde_curve(df_rf['predict'])

# Row 1, col 1: actual price vs. strategy value.
fig.add_trace(go.Scatter(x=df_rf['date'], y=df_rf['value'], name='actual price'), row=1, col=1)
fig.add_trace(go.Scatter(x=df_rf['date'], y=df_rf['value_strat'], name='random forest strategy'), row=1, col=1)

# Row 1, col 2: density of actual vs. predicted returns.
fig.add_trace(go.Scatter(x=x_dist_act, y=y_dist_act, name='actual returns distribution'), row=1, col=2)
# Legend label fixed: it previously read 'random forestreturns distribution'.
fig.add_trace(go.Scatter(x=x_dist_predict, y=y_dist_predict, name='random forest returns distribution'), row=1, col=2)

# Row 2, col 1: per-date MSE of each model.
fig.add_trace(go.Scatter(x=df_rf['date'], y=df_rf['MSE'], name='random forest MSE'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_lr['date'], y=df_lr['MSE'], name='linear regr MSE'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_xgb['date'], y=df_xgb['MSE'], name='XGB regr MSE'), row=2, col=1)

# Row 2, col 2: test MSE per model.
fig.add_trace(go.Bar(x=['Linear', 'XGB', 'Random Forest'], y=mse, name='MSE comparison'), row=2, col=2)

fig['layout'].update(height=900, width=1500, title='Model Highlights')
fig.show()