In [1]:
import pandas as pd
import os
from os import pardir, path
import sys
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics

from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
mod_path = os.getcwd()
if mod_path not in sys.path:
    sys.path.append(mod_path)
print(sys.path)

from func import Model

['/Users/karimkhalil/Coding/development/commodity', '/Users/karimkhalil/opt/anaconda3/lib/python39.zip', '/Users/karimkhalil/opt/anaconda3/lib/python3.9', '/Users/karimkhalil/opt/anaconda3/lib/python3.9/lib-dynload', '', '/Users/karimkhalil/opt/anaconda3/lib/python3.9/site-packages', '/Users/karimkhalil/opt/anaconda3/lib/python3.9/site-packages/aeosa']


In [3]:
## mapping

mat_codes = {
    "F": "01",
    "G": "02", 
    "H": "03", 
    "J": "04", 
    "K": "05",
    "M": "06",
    "N": "07", 
    "Q": "08",
    "U" : "09",
    "V": "10",
    "X": "11",
    "Z": "12"
}

## 1. Data Preparation

In [4]:
df = pd.read_csv('commodities.csv')
df['datetime'] = pd.to_datetime(df['date'], utc=True)
df['date_strf']=df.datetime.dt.strftime('%Y%m%d')
df['dayofweek'] = df['datetime'].dt.strftime("%a")
df['month'] = df['datetime'].dt.month

df['datetime_maturity'] = pd.to_datetime(df.maturity.str[-4:] + df.maturity.str[0].map(mat_codes), format='%Y%m', utc=True)

df['datetime_maturity'] = df['datetime_maturity']-pd.Timedelta(1, "d")
df['date_strf_maturity']=df.datetime_maturity.dt.strftime('%Y%m%d')
df['time2maturity_d'] = (df.datetime_maturity-df.datetime).dt.days

df.loc[df['time2maturity_d'] <0, 'time2maturity_d'] = 0

df['time2maturity_m'] = (df.time2maturity_d/30).round()
df_settle = df.loc[df['observation'] == 'Settle']
df_soy = df_settle.loc[df_settle['instrument'] == 'CBOT.ZS']
df_soy.sort_values(['datetime' , 'datetime_maturity'], ascending = [True, False], inplace=True)




In [5]:
## select prices with only 6 months maturity for comparability

dates = set(df_soy['date_strf'])

concat = []

for i in dates:
    duration = 6
    data = df_soy.loc[(df_soy['date_strf'] == i) & (df_soy['time2maturity_m'] == duration)]
    while data.shape[0] ==0:
        duration +=1
        data = df_soy.loc[(df_soy['date_strf'] == i) & (df_soy['time2maturity_m'] == duration)]
    concat.append(data)

df_soy_6m = pd.concat(concat)
df_soy_6m.set_index('date_strf' , inplace=True, drop=True)

### Create Features

In [6]:
## calculate returns for the previous 7 days

for i in range(7):
    df_soy_6m[f'pct_t-{i+1}'] = df_soy_6m.value.pct_change(i+1)

## rolling averages for prices and returns

vals = ['value', 'pct_t-1']

for i in [7, 15, 30 , 60]:
    for j in vals:
        df_soy_6m[f'roll_avg_pct_{i}'] = df_soy_6m[j].rolling(i).mean()

for i in vals:
    df_soy_6m[f'exp_avg_{i}'] = df_soy_6m[i].expanding(1).mean()


In [7]:
## drop missing values
df_soy_6m.dropna(inplace=True)

In [8]:
desc = df_soy_6m.describe()
desc

Unnamed: 0,value,month,time2maturity_d,time2maturity_m,pct_t-1,pct_t-2,pct_t-3,pct_t-4,pct_t-5,pct_t-6,pct_t-7,roll_avg_pct_7,roll_avg_pct_15,roll_avg_pct_30,roll_avg_pct_60,exp_avg_value,exp_avg_pct_t-1
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,1315.73538,5.918129,192.274854,6.403509,0.002308,0.002216,0.002038,0.002355,0.002396,0.002849,0.00336,0.002083,0.001903,0.00179,0.001773,1311.186356,0.001976
std,54.674267,3.095399,17.876057,0.526687,0.061522,0.060495,0.05461,0.061017,0.060726,0.058995,0.059511,0.008637,0.003855,0.002014,0.001061,2.707478,0.00038
min,1186.25,1.0,165.0,6.0,-0.146367,-0.173682,-0.112055,-0.108886,-0.135352,-0.150555,-0.121886,-0.017201,-0.008203,-0.00379,-0.0012,1305.221875,0.000773
25%,1273.875,3.0,177.0,6.0,-0.04256,-0.038182,-0.0395,-0.041383,-0.040895,-0.038522,-0.038046,-0.003951,-0.000741,0.000355,0.001028,1309.001117,0.00172
50%,1304.0,6.0,190.0,6.0,0.002904,0.003349,-0.000204,0.001539,-0.000574,0.005703,-0.002839,0.001431,0.001752,0.001948,0.001906,1312.22433,0.001908
75%,1348.0,8.0,208.0,7.0,0.033745,0.039721,0.036763,0.045718,0.036287,0.040142,0.038494,0.008521,0.004269,0.00304,0.002522,1313.110659,0.002125
max,1461.0,11.0,225.0,8.0,0.20087,0.138604,0.138176,0.181083,0.165648,0.153166,0.171989,0.028178,0.012533,0.008122,0.003984,1315.024845,0.003266


In [9]:
def return_tot(df, date, value):
    period_beg = df.date.min()
    period_end = df.date.max()

    price_beg = df.loc[df.date == period_beg, value].values[0]
    price_end = df.loc[df.date == period_end, value].values[0]

    ret = (price_end-price_beg)/ price_beg
    return ret

sharpe = desc.loc['mean', 'pct_t-1'] / desc.loc['std', 'pct_t-1']
avg_return = desc.loc['mean', 'pct_t-1']
tot_return = return_tot(df_soy_6m, 'date', 'value')

print(f'Sharpe ratio: {100*(sharpe):.2f} %')
print(f'Average daily return: {100*(avg_return):.2f} %')
print(f'Total return: {100*(tot_return):.2f} %')

Sharpe ratio: 3.75 %
Average daily return: 0.23 %
Total return: -4.72 %


In [10]:
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

df_sorted = df_soy_6m.sort_values('date')
fig = make_subplots(rows=2, cols=2, subplot_titles=["Price", "Daily Return Distribution", 'Price Distribution', 'Return Distribution'])


dist = ff.create_distplot([df_soy_6m['pct_t-1'].values.tolist()], [''], bin_size=.01).data[1]
hist = ff.create_distplot([df_soy_6m['pct_t-1'].values.tolist()], [''], bin_size=.01).data[0]
price = px.line(x=df_soy_6m.value, y=df_soy_6m.index)
price = px.line(y=df_sorted.value, x=df_sorted.date, title='Bean Price', labels=dict(x='Date', y='USD')).data[0]
# dist.update_layout(width=700, height=700)

fig.add_trace(go.Scatter(dist, line=dict(color='red')), row=1, col=2)
fig.add_trace(hist, row=1, col=2)
fig.add_trace(go.Scatter(price), row=1,col=1)
fig.add_trace(go.Box(x=df_sorted.value), row=2,col=1)
fig.add_trace(go.Box(x=df_sorted['pct_t-1']), row=2,col=2)

fig['layout'].update(height=800, width=1500, title='Returns Descriptive Statistics')

fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="USD", row=1, col=1)

# fig.add_trace(dist, row=1, col=1)
fig.show()

In [11]:
model = Model(df_soy_6m)

object instanciated


In [12]:
model.df

Unnamed: 0_level_0,date,instrument,maturity,observation,value,currency,datetime,dayofweek,month,datetime_maturity,...,pct_t-4,pct_t-5,pct_t-6,pct_t-7,roll_avg_pct_7,roll_avg_pct_15,roll_avg_pct_30,roll_avg_pct_60,exp_avg_value,exp_avg_pct_t-1
date_strf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20210702,2021-07-02T00:00:00.000Z,CBOT.ZS,F2022,Settle,1402.25,USD,2021-07-02 00:00:00+00:00,Fri,7,2021-12-31 00:00:00+00:00,...,0.029930,0.120679,0.153166,0.053729,0.010075,0.003660,0.004299,0.003266,1305.942623,0.003266
20210809,2021-08-09T00:00:00.000Z,CBOT.ZS,H2022,Settle,1331.25,USD,2021-08-09 00:00:00+00:00,Mon,8,2022-02-28 00:00:00+00:00,...,0.040649,-0.022218,0.063936,0.094778,0.015160,0.004825,0.002858,0.002666,1306.350806,0.002382
20210824,2021-08-24T00:00:00.000Z,CBOT.ZS,H2022,Settle,1338.50,USD,2021-08-24 00:00:00+00:00,Tue,8,2022-02-28 00:00:00+00:00,...,0.053315,0.046316,-0.016893,0.069730,0.011797,0.006337,0.003786,0.002773,1306.861111,0.002432
20210416,2021-04-16T00:00:00.000Z,CBOT.ZS,X2021,Settle,1274.00,USD,2021-04-16 00:00:00+00:00,Fri,4,2021-10-31 00:00:00+00:00,...,0.027834,0.002558,-0.004104,-0.064267,-0.007674,0.002141,0.000050,0.003149,1306.347656,0.001628
20210115,2021-01-15T00:00:00.000Z,CBOT.ZS,N2021,Settle,1401.00,USD,2021-01-15 00:00:00+00:00,Fri,1,2021-06-30 00:00:00+00:00,...,-0.000891,0.130294,0.102499,0.095173,0.015197,0.005342,0.002385,0.002704,1307.803846,0.003161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20210409,2021-04-09T00:00:00.000Z,CBOT.ZS,X2021,Settle,1263.25,USD,2021-04-09 00:00:00+00:00,Fri,4,2021-10-31 00:00:00+00:00,...,-0.106770,-0.135352,-0.032919,-0.022441,0.002705,0.002534,0.003284,0.001832,1312.709251,0.001750
20210722,2021-07-22T00:00:00.000Z,CBOT.ZS,H2022,Settle,1352.00,USD,2021-07-22 00:00:00+00:00,Thu,7,2022-02-28 00:00:00+00:00,...,0.119901,-0.044016,-0.074606,0.035024,0.011194,0.004976,0.003503,0.002951,1312.881579,0.002051
20210429,2021-04-29T00:00:00.000Z,CBOT.ZS,X2021,Settle,1318.75,USD,2021-04-29 00:00:00+00:00,Thu,4,2021-10-31 00:00:00+00:00,...,-0.090360,0.092359,-0.067527,-0.097365,-0.009244,0.006408,0.004800,0.002250,1312.907205,0.001934
20210222,2021-02-22T00:00:00.000Z,CBOT.ZS,U2021,Settle,1256.50,USD,2021-02-22 00:00:00+00:00,Mon,2,2021-08-31 00:00:00+00:00,...,-0.057919,-0.133299,0.040795,-0.111543,-0.011416,0.001040,-0.000038,0.001293,1312.661957,0.001720


In [13]:
l = model.train(df = model.df, col_period='date' , train_window=10 , test_window=1, test_gap = 0, expanding=False)

In [14]:
regr = LinearRegression()
ridge_regr = Ridge()
xgb = ensemble.GradientBoostingRegressor()
histgrad = ensemble.HistGradientBoostingRegressor()
randforest = ensemble.RandomForestRegressor()

In [15]:
df_soy_6m.columns

Index(['date', 'instrument', 'maturity', 'observation', 'value', 'currency',
       'datetime', 'dayofweek', 'month', 'datetime_maturity',
       'date_strf_maturity', 'time2maturity_d', 'time2maturity_m', 'pct_t-1',
       'pct_t-2', 'pct_t-3', 'pct_t-4', 'pct_t-5', 'pct_t-6', 'pct_t-7',
       'roll_avg_pct_7', 'roll_avg_pct_15', 'roll_avg_pct_30',
       'roll_avg_pct_60', 'exp_avg_value', 'exp_avg_pct_t-1'],
      dtype='object')

In [16]:
cols_x = ['pct_t-2', 'pct_t-3', 'pct_t-4', 'pct_t-5', 'pct_t-6', 'pct_t-7','roll_avg_pct_7', 'roll_avg_pct_15', 'roll_avg_pct_30']
cols_y = 'pct_t-1'

In [17]:
# model.skpredict(df_soy_6m, df_soy_6m, xgb, cols_x, cols_y, printstat=True)

In [18]:
df_result = model.skpredict_window(xgb, cols_x, cols_y, 'date' , train_window=4 , test_window=1, test_gap = 0, expanding=False, print_iter=False)[1]

Average MSE train: 1.1251211208160035e-12
Average MSE test: 0.0032314870927195203


In [19]:
df_result = model.skpredict_window(randforest, cols_x, cols_y, 'date' , train_window=4 , test_window=1, test_gap = 0, expanding=False, print_iter=False)[1]

Average MSE train: 0.0002905234448656446
Average MSE test: 0.0029644905923524745


In [28]:
for i in range(4,30,2):
    print(i)
    df_result = model.skpredict_window(randforest, cols_x, cols_y, 'date' , train_window=i , test_window=1, test_gap = 0, expanding=True, print_iter=False)[1]

4
Average MSE train: 0.0003386569411160924
Average MSE test: 0.0026535206920864788
6
Average MSE train: 0.00033857455559237084
Average MSE test: 0.0026002534309166
8
Average MSE train: 0.0003375875044445592
Average MSE test: 0.002627659602573164
10
Average MSE train: 0.0003388226834032392
Average MSE test: 0.00267509528569125
12
Average MSE train: 0.00033908139212218523
Average MSE test: 0.0025847467717817077
14
Average MSE train: 0.0003361622524377091
Average MSE test: 0.002559481561776129
16
Average MSE train: 0.00033805944752726496
Average MSE test: 0.0025545007046037286
18
Average MSE train: 0.0003394163038592217
Average MSE test: 0.0026135643679079403
20
Average MSE train: 0.00033871580630943243
Average MSE test: 0.0026882956138096553
22
Average MSE train: 0.00033686798431650897
Average MSE test: 0.0025618031476485983
24
Average MSE train: 0.00033772588327721
Average MSE test: 0.002607564629878796
26
Average MSE train: 0.00033915603203052083
Average MSE test: 0.0026728991415894358

In [27]:
for i in range(4,30,2):
    print(i)
    df_result = model.skpredict_window(randforest, cols_x, cols_y, 'date' , train_window=i , test_window=1, test_gap = 0, expanding=False, print_iter=False)[1]


4
Average MSE train: 0.0002978819966759724
Average MSE test: 0.002975647956351363
6
Average MSE train: 0.00031959306549231005
Average MSE test: 0.00272670279425826
8
Average MSE train: 0.0003201402091673382
Average MSE test: 0.0026814011673452357
10
Average MSE train: 0.0003280063869575518
Average MSE test: 0.0026699957114362785
12
Average MSE train: 0.0003262025479192083
Average MSE test: 0.0027731118532005254
14
Average MSE train: 0.0003303223041636231
Average MSE test: 0.0027213598969626634
16
Average MSE train: 0.00033550425479509857
Average MSE test: 0.002763918486131556
18
Average MSE train: 0.0003379464067530177
Average MSE test: 0.002690026920540326
20
Average MSE train: 0.0003401293548310919
Average MSE test: 0.0027842089887157246
22
Average MSE train: 0.00033656547826853436
Average MSE test: 0.002792801361542621
24
Average MSE train: 0.0003364674037628191
Average MSE test: 0.002711803261924129
26
Average MSE train: 0.0003367458589918841
Average MSE test: 0.002771077751998066


In [20]:
df_result = model.skpredict_window(regr, cols_x, cols_y, 'date' , train_window=4 , test_window=1, test_gap = 0, expanding=False, print_iter=False)[1]

Average MSE train: 1.7854125015062226e-33
Average MSE test: 0.004598746815074255
