In [9]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import polars as pl
import pandas as pd
from joblib import Parallel, delayed

In [3]:
%%time
num_cores = -1  # Use all available cores, you can adjust this value based on your system resources

# Use joblib to parallelize the reading of files
files = pd.concat(Parallel(n_jobs=num_cores, backend='multiprocessing')(delayed(read_file)(file_name) for file_name in files_name))

CPU times: user 17.3 s, sys: 1min 5s, total: 1min 23s
Wall time: 1min 45s


In [6]:
import polars as pl

df = pl.from_pandas(files)

In [1220]:
temp_df = temp_df.with_columns(temp_df['rec_epoch'].apply(lambda x: datetime.fromtimestamp(x).astimezone(timezone('Asia/Hong_Kong'))).alias('time'))

In [995]:
for i in range(1, 6):
    temp_df = temp_df.with_column((A/A.shift(1) - 1).rename(f'A{i}'))

In [1]:
import pandas as pd
import numpy as np
import time
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from math import ceil
from sklearn.linear_model import LinearRegression, RidgeCV

pio.renderers.default = 'notebook'

import plotly.io as pio
pio.renderers.default = 'iframe' # or 'colab' or 'iframe' or 'iframe_connected' or '


test_cols = list(set(new_cols + ['alpha', 'new_alpha_linear']))
start_date = 20210701
end_train_date = 20220101
sel_freq = 60

############################# 
temp_df_full = temp_df.clone().fill_null(0)
temp_df_full = temp_df_full.sort('date')
temp_df = temp_df_full.filter(temp_df_full['date']>= start_date)

## check terms correlation
# Compute the correlation matrix using Polars
correlation_matrix = temp_df[new_cols].pearson_corr()
column_names = correlation_matrix.columns

# np.fill_diagonal(correlation_matrix, 0)

# Convert the correlation matrix to a NumPy array
# correlation_values = correlation_matrix.to_list()
# Create the heatmap
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 10},  xticklabels=column_names, yticklabels=column_names)

# Add title and show the plot
plt.title('Correlation Heatmap')
plt.show()

print('start to train linear regression model')
start_time = time.time()
############################# fitter
temp_df_train, temp_df_test = temp_df.filter(temp_df['date']<=end_train_date), temp_df.filter(temp_df['date']>end_train_date)
X_train, X_test = pl.concat([temp_df_train[new_cols], temp_df_train['alpha'].to_frame()], how = 'horizontal').transpose(), pl.concat([temp_df_test[new_cols], temp_df_test['alpha'].to_frame()], how = 'horizontal').transpose()
# X_train, X_test = temp_df_train[new_cols].transpose(), temp_df_test[new_cols].transpose()
y_train, y_test = temp_df_train['ret60s'].to_frame().transpose(), temp_df_test['ret60s'].to_frame().transpose()

# linear model
model = LinearRegression()
model.fit(X_train, y_train)
new_alpha = np.vstack([model.predict(X_train), model.predict(X_test)])
temp_df = temp_df.with_columns(pl.Series('new_alpha_linear', new_alpha.flatten()))

print(f'finish training, use {time.time() - start_time} s')
# # ridge model
# model = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
# model.fit(X_train, y_train)
# new_alpha = np.vstack([model.predict(X_train), model.predict(X_test)])
# temp_df = temp_df.with_columns(pl.Series('new_alpha_ridge', new_alpha.flatten()))


#### scatter plots
n = len(new_cols)
n_perrow = 4
fig, axs = plt.subplots(ceil(n/n_perrow), n_perrow, figsize = (12 * n_perrow, 8 * int(n / n_perrow)))
for i, term in enumerate(new_cols):
    n_row = i // n_perrow
    n_col = i - n_row *4 
    sns.scatterplot(x = temp_df[term], y = temp_df['ret60s'], label = term, ax = axs[n_row][n_col])
    min_, max_ = temp_df[term].min(), temp_df[term].max()  
    x = np.linspace(min_, max_)
    sns.lineplot(x = x, y = np.zeros_like(x),  ax = axs[n_row][n_col], color = 'r')
plt.show()

### start to backtest
# Group the DataFrame by the "date" column
groups = temp_df.groupby('date')
# Iterate over each group and apply the FR function
for i, test_col in enumerate(test_cols):
    results = pd.DataFrame()
    for group in list(groups):
        ret_cols = sorted([col for col in group.columns if col.find('ret')!=-1])
        date = int(group['date'].unique()[0])
        x = group[test_col].fill_null(0)
        result = dict()
        result['date'] = date
        for ret_col in ret_cols:
            y = group[ret_col]
            result[f'{test_col}_FR_{ret_col[:-1]}'] = FR(x, y)
        result = pd.DataFrame([result])
        results = results.append(result, ignore_index=True)
    if i == 0:
        full_panel = results
    else:
        full_panel = full_panel.merge(results, left_on = 'date', right_on = 'date')
    
full_panel = full_panel.sort_values('date').reset_index(drop = True)
# Print the results
FR_cols = [col for col in full_panel.columns if col.find('FR')!=-1]
FR_cum = pd.concat([full_panel['date'], full_panel[FR_cols].cumsum()], axis = 1)
FR_cum['date'] = pd.to_datetime(FR_cum['date'], format='%Y%m%d')
# Create a plotly figure
fig = go.Figure()

# Add each line plot for each FR column
if sel_freq!= None:
    cum_plot_cols = [col for col in FR_cols if col.endswith(str(sel_freq))]
else:
    cum_plot_cols = FR_cols
    
for FR_col in cum_plot_cols:
    fig.add_trace(go.Scatter(x=FR_cum['date'], y=FR_cum[FR_col], mode='lines', name=FR_col))

fig.add_trace(go.Scatter(x=FR_cum['date'], y=FR_cum['new_alpha_linear_FR_ret60'] - FR_cum['alpha_FR_ret60'], mode='lines', name='Improvement'))
# fig.add_trace(go.Scatter(x=FR_cum['date'], y=FR_cum['MI_0.5_FR_ret60'] - FR_cum['tf.ABmaxshs_FR_ret60'], mode='lines', name='compare'))

# Add a vertical dashed line at end_train_date
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=pd.to_datetime(end_train_date, format='%Y%m%d'),
        x1=pd.to_datetime(end_train_date, format='%Y%m%d'),
        y0=0,
        y1=FR_cum.iloc[-1, 1:].max(),
        xref='x',
        yref='paper',
        line=dict(color='black', dash='dash')
    )
)

# Update the layout of the figure
fig.update_layout(
    title='Cumulative Sum of FR Columns',
    xaxis_title='Date',
    yaxis_title='Cumulative Sum',
    showlegend=True,
    legend_title_text='FR Columns'
)

# Show the plot
fig.show()

print(FR_cum['new_alpha_linear_FR_ret60'].iloc[-1] /FR_cum['alpha_FR_ret60'].iloc[-1] -1)

ModuleNotFoundError: No module named 'plotly'

In [1208]:
# Create a plotly figure
fig = go.Figure()

for test_col in test_cols:
    collect_cum = []
    for col in [col for col in FR_cum.columns if col.startswith(test_col)]:
        collect_cum.append((int(col.split('ret')[-1]), FR_cum[col].iloc[-1]))
    collect_cum = sorted(collect_cum, key=lambda x: x[0])
    x_vals = [item[0] for item in collect_cum]
    y_vals = [item[1] for item in collect_cum]

    # Add the line plot
    fig.add_trace(go.Scatter(x=x_vals, y=y_vals, mode='lines', name=f'Cumulative Sum of {test_col}'))
    print(test_col)

# Update the layout of the figure
fig.update_layout(
    title='Cumulative Sum for Different Time Windows',
    xaxis_title='Time Window',
    yaxis_title='Cumulative Sum',
    showlegend=True,
)

# Show the plot
fig.show()

of.B_A_diff_max.lvl
new_alpha_linear
tf.RetM
cf.B_A_diff_norders
alpha
tf.B_A_diff_max.age_exp_neg_lvl
mtni
mti
PDN
cf.ABmaxshsdiff
tf.B_A_diff_notl
OPD
cf.B_A_diff_max.age_exp_neg_lvl
tf.B_A_diff_max.age
of.B_A_diff_max.notl
tf.B_A_diff_norders
mtnid
tf.ABMPrcD
tf.B_A_diff_max.notl
coIO
bk.ABD1
CPD
bk.ABR2
of.B_A_diff_norders
cf.B_A_diff_notl
of.B_A_diff_max.age_exp_neg_lvl
cf.B_A_diff_max.age_exp_neg
OI2
tf.ABmaxshsdiff
tf.B_A_diff_max.shs
of.B_A_diff_max.age_exp_neg
OI1
bk.mid_price_var_1m
tf.B_A_diff_max.age_exp_neg
of.B_A_diff_max.age
PDG
of.B_A_diff_max.shs
of.B_A_diff_notl
of.ABmaxshsdiff
cf.B_A_diff_max.lvl
PDG2
PDM
tf.B_A_diff_max.lvl
cf.B_A_diff_max.shs
cf.B_A_diff_max.age
cf.B_A_diff_max.notl
tf.DPrcRGL1


### timezome change

In [18]:
from pytz import timezone

### select conditions
hkt = timezone('Asia/Hong_Kong')
df['rec_epoch'] = pd.to_datetime(df['rec_epoch'], unit = 's').dt.tz_localize('UTC').dt.tz_convert(hkt)

TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got Series)