In [1]:
## create and activate virtual environment
# !source ./devops/create_env  

In [2]:
import pandas as pd
import numpy as np
import os
from os import pardir, path
import sys
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics

from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [3]:
mod_path = os.getcwd()
if mod_path not in sys.path:
    sys.path.append(mod_path)
print(sys.path)

from package import Model

['/Users/karimkhalil/Coding/development/commodity', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python39.zip', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9/lib-dynload', '', '/Users/karimkhalil/Coding/development/commodity/commod_env/lib/python3.9/site-packages']


In [4]:
## mapping

mat_codes = {
    "F": "01",
    "G": "02", 
    "H": "03", 
    "J": "04", 
    "K": "05",
    "M": "06",
    "N": "07", 
    "Q": "08",
    "U" : "09",
    "V": "10",
    "X": "11",
    "Z": "12"
}

## 1. Data Preparation

In [5]:
df = pd.read_csv('commodities.csv')
df['datetime'] = pd.to_datetime(df['date'], utc=True)
df['date_strf']=df.datetime.dt.strftime('%Y%m%d')
df['dayofweek'] = df['datetime'].dt.strftime("%a")
df['month'] = df['datetime'].dt.month

df['datetime_maturity'] = pd.to_datetime(df.maturity.str[-4:] + df.maturity.str[0].map(mat_codes), format='%Y%m', utc=True)

df['datetime_maturity'] = df['datetime_maturity']-pd.Timedelta(1, "d")
df['date_strf_maturity']=df.datetime_maturity.dt.strftime('%Y%m%d')
df['time2maturity_d'] = (df.datetime_maturity-df.datetime).dt.days

df.loc[df['time2maturity_d'] <0, 'time2maturity_d'] = 0

df['time2maturity_m'] = (df.time2maturity_d/30).round()
df_settle = df.loc[df['observation'] == 'Settle']
df_soy = df_settle.loc[df_settle['instrument'] == 'CBOT.ZS']
df_soy.sort_values(['datetime' , 'datetime_maturity'], ascending = [True, False], inplace=True)


In [6]:
## select prices with only 6 months maturity for comparability

dates = set(df_soy['date_strf'])

concat = []

for i in dates:
    duration = 6
    data = df_soy.loc[(df_soy['date_strf'] == i) & (df_soy['time2maturity_m'] == duration)]
    while data.shape[0] ==0:
        duration +=1
        data = df_soy.loc[(df_soy['date_strf'] == i) & (df_soy['time2maturity_m'] == duration)]
    concat.append(data)

df_soy_6m = pd.concat(concat)
df_soy_6m.set_index('date_strf' , inplace=True, drop=True)
df_soy_6m.sort_index(inplace=True)

### 1.1 Create Features

In [7]:
## calculate returns for the previous 7 days

for i in range(7):
    df_soy_6m[f'pct_t-{i+1}'] = df_soy_6m.value.pct_change(i+1)

## rolling averages for prices and returns

vals = ['value', 'pct_t-1']

for i in [7, 15, 30 , 60]:
    for j in vals:
        df_soy_6m[f'roll_avg_pct_{i}'] = df_soy_6m[j].rolling(i).mean()

for i in vals:
    df_soy_6m[f'exp_avg_{i}'] = df_soy_6m[i].expanding(1).mean()

In [8]:
df_soy_6m[[i for i in df_soy_6m.columns if df_soy_6m[i].dtype != 'datetime64[ns, UTC]']].to_excel(os.path.join(os.getcwd(), 'df_soy_6m.xlsx'))

In [9]:
## drop missing values
df_soy_6m_clean = df_soy_6m.dropna()

### 1.2 Descriptive Statistics

In [10]:
desc = df_soy_6m_clean.describe()
desc

Unnamed: 0,value,month,time2maturity_d,time2maturity_m,pct_t-1,pct_t-2,pct_t-3,pct_t-4,pct_t-5,pct_t-6,pct_t-7,roll_avg_pct_7,roll_avg_pct_15,roll_avg_pct_30,roll_avg_pct_60,exp_avg_value,exp_avg_pct_t-1
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,1320.261696,7.508772,194.94152,6.508772,0.000423,0.000711,0.001131,0.001545,0.001937,0.002345,0.002804,0.000419,0.000242,0.00015,4.7e-05,1315.436391,0.000286
std,55.533812,2.324722,17.504187,0.535432,0.015322,0.021192,0.026772,0.030715,0.033647,0.03529,0.036951,0.005274,0.002823,0.001888,0.001142,12.715297,0.000377
min,1212.0,3.0,165.0,6.0,-0.066481,-0.086664,-0.100876,-0.12756,-0.139802,-0.133126,-0.138179,-0.020777,-0.007644,-0.003393,-0.00189,1286.934932,-0.000457
25%,1273.125,6.0,180.0,6.0,-0.006777,-0.010777,-0.016196,-0.017538,-0.018498,-0.021231,-0.021902,-0.003106,-0.001641,-0.001278,-0.000881,1311.638898,1.2e-05
50%,1308.75,8.0,195.0,6.0,0.001292,0.001661,0.001881,0.000965,-0.000396,0.000918,0.0,6.3e-05,-0.000108,-0.000211,-0.000122,1320.301802,0.000179
75%,1356.375,9.5,210.0,7.0,0.007343,0.01336,0.016537,0.020027,0.020845,0.022471,0.025343,0.003619,0.002086,0.001139,0.001105,1325.244961,0.000495
max,1461.0,12.0,225.0,8.0,0.063438,0.065332,0.098921,0.097939,0.100883,0.087014,0.079443,0.011018,0.007952,0.006661,0.002231,1328.049051,0.00129


In [11]:
def return_tot(df, date, value):
    period_beg = df.date.min()
    period_end = df.date.max()

    price_beg = df.loc[df.date == period_beg, value].values[0]
    price_end = df.loc[df.date == period_end, value].values[0]

    ret = (price_end-price_beg)/ price_beg
    return ret

sharpe = desc.loc['mean', 'pct_t-1'] / desc.loc['std', 'pct_t-1']
avg_return = desc.loc['mean', 'pct_t-1']
tot_return = return_tot(df_soy_6m, 'date', 'value')

print(f'Sharpe ratio: {100*(sharpe):.2f} %')
print(f'Average daily return: {100*(avg_return):.2f} %')
print(f'Total return: {100*(tot_return):.2f} %')

Sharpe ratio: 2.76 %
Average daily return: 0.04 %
Total return: -4.12 %


In [12]:
df_sorted = df_soy_6m_clean.sort_values('date')
fig = make_subplots(rows=2, cols=2, subplot_titles=["Price", "Daily Return Distribution", 'Price Distribution', 'Return Distribution'])

dist = ff.create_distplot([df_soy_6m_clean['pct_t-1'].values.tolist()], [''], bin_size=.01).data[1]
hist = ff.create_distplot([df_soy_6m_clean['pct_t-1'].values.tolist()], [''], bin_size=.01).data[0]
price = px.line(x=df_soy_6m_clean.value, y=df_soy_6m_clean.index)
price = px.line(y=df_sorted.value, x=df_sorted.date, title='Bean Price', labels=dict(x='Date', y='USD')).data[0]
# dist.update_layout(width=700, height=700)

fig.add_trace(go.Scatter(dist, line=dict(color='red')), row=1, col=2)
fig.add_trace(hist, row=1, col=2)
fig.add_trace(go.Scatter(price), row=1,col=1)
fig.add_trace(go.Box(x=df_sorted.value), row=2,col=1)
fig.add_trace(go.Box(x=df_sorted['pct_t-1']), row=2,col=2)

fig['layout'].update(height=800, width=1500, title='Returns Descriptive Statistics')

fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="USD", row=1, col=1)

fig.update_xaxes(tickmode='array', row=1, col=2)

# fig.update_xaxes(ticktext =[f'{i:.4f} %' for i in range(-7,7 , 1)], tickvals =[i/100 for i in range(-7,7 , 1)], row=1, col=2)

# fig.add_trace(dist, row=1, col=1)
fig.show()

## 2. Model

In [13]:
model = Model(df_soy_6m_clean)

object instanciated


In [14]:
regr = LinearRegression()
xgb = ensemble.GradientBoostingRegressor()
histgrad = ensemble.HistGradientBoostingRegressor()
randforest = ensemble.RandomForestRegressor()

In [15]:
cols_x = ['pct_t-2', 'pct_t-3', 'pct_t-4', 'pct_t-5', 'pct_t-6', 'pct_t-7','roll_avg_pct_7', 'roll_avg_pct_15', 'roll_avg_pct_30']
cols_y = 'pct_t-1'

In [16]:
print('Linear regression error stats:')
df_lr, stat_lr = model.skbacktest(model.df, regr, cols_x, cols_y, 'date' ,'value', train_window=16 , test_window=1, test_gap = 0, expanding=False, print_iter=False)
df_lr['value_lr'] = model.backtest(df_lr, 'value', 'predict', 'pct_t-1', 'predict')['value_strat']

Linear regression error stats:
Average MSE train: 6.780155412598385e-05
Average MSE test: 0.0007198151268960125


In [17]:
print('XGB regression error stats:')
df_xgb , stat_xgb = model.skbacktest(model.df, xgb, cols_x, cols_y, 'date' ,'value', train_window=16 , test_window=1, test_gap = 0, expanding=False, print_iter=False)
df_xgb['value_xgb'] = model.backtest(df_xgb, 'value', 'predict', 'pct_t-1', 'predict')['value_strat']

XGB regression error stats:
Average MSE train: 4.745971013168183e-10
Average MSE test: 0.00025850093541433


In [18]:

print('Random forest regression error stats:')
df_rf, stat_rf = model.skbacktest(model.df, randforest, cols_x, cols_y, 'date' ,'value', train_window=16 , test_window=1, test_gap = 0, expanding=False, print_iter=False)
df_rf['value_rf'] = model.backtest(df_rf, 'value', 'predict', 'pct_t-1', 'predict')['value_strat']
df_rf['value_ma'] = model.backtest(df_rf, 'value', 'roll_avg_pct_7', 'pct_t-1', 'predict')['value_strat']
# df_result[['value', 'pct_t-1' ,'predict' , 'MSE', 'strat_return', 'value_strat']].to_excel(os.path.join(os.getcwd(), 'backtest.xlsx'))

Random forest regression error stats:
Average MSE train: 3.493121554221e-05
Average MSE test: 0.00022344934017467643


In [19]:
## only for validation

# df_test = df_rf[['value', 'pct_t-1' , 'predict', 'MSE']]
# df_test.loc[df_test['predict'] > 0, 'strat1_return'] = df_test['pct_t-1']
# df_test.loc[df_test['predict'] < 0, 'strat1_return'] = -df_test['pct_t-1']

# df_test.sort_index(inplace=True)
# df_test['validate'] = df_test['value'].shift(1) * (1+df_test['pct_t-1'])
# df_test

In [20]:
# fig = go.Figure()
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['value'], name='actual'))
fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['value_rf'], name='random forest model'))

# fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['MSE'], name='MSE'), secondary_y=True)

fig['layout'].update(height=600, width=1500, title='Actual vs Predicted Value')

fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="USD")

fig.show()

In [21]:
stats = [stat_lr, stat_xgb, stat_rf]

mse = [i['MSEtest'] for i in stats]

In [22]:

fig = make_subplots(rows=2, cols=2, subplot_titles=["Prices", "Returns Distribution", 'MSE time series', '4'])

x_dist_mse = ff.create_distplot([df_rf['MSE'].values.tolist()], [''], bin_size=.01).data[1]['x']
y_dist_mse = ff.create_distplot([df_rf['MSE'].values.tolist()], [''], bin_size=.01).data[1]['y']

x_dist_act = ff.create_distplot([df_rf['pct_t-1'].values.tolist()], [''], bin_size=.01).data[1]['x']
y_dist_act = ff.create_distplot([df_rf['pct_t-1'].values.tolist()], [''], bin_size=.01).data[1]['y']

x_dist_predict = ff.create_distplot([df_rf['predict'].values.tolist()], [''], bin_size=.01).data[1]['x']
y_dist_predict = ff.create_distplot([df_rf['predict'].values.tolist()], [''], bin_size=.01).data[1]['y']

fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['value'], name='actual price'), row=1, col=1)
fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['value_rf'], name='random forest price'), row=1, col=1)

fig.add_trace(go.Scatter(x=x_dist_act, y = y_dist_act, name='actual returns distribution'), row=1, col=2)
fig.add_trace(go.Scatter(x=x_dist_predict, y = y_dist_predict, name='random forestreturns distribution'), row=1, col=2)

fig.add_trace(go.Scatter(x=df_rf['date'], y = df_rf['MSE'], name='random forest MSE'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_lr['date'], y = df_lr['MSE'], name='linear regr MSE'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_xgb['date'], y = df_xgb['MSE'], name='XGB regr MSE'), row=2, col=1)
fig.add_trace(go.Bar(x=['Linear' , 'XGB', 'Random Forest'], y = mse, name='MSE comparison'), row=2, col=2)

fig['layout'].update(height=800, width=1500, title='Model Statistics')

fig.show()