In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import pickle

import matplotlib.pyplot as plt
plt.style.use("ggplot")
%config InlineBackend.figure_format='retina'

import QuantTrading.ImpactFitting as IF
from scipy.optimize import minimize

def load_from_pickle(filename, path='../pkl_dump/'):
    """Load and return the object stored in the pickle file ``path + filename``.

    Parameters
    ----------
    filename : str
        Name of the pickle file, e.g. ``'px_df.pkl'``.
    path : str, optional
        Directory prefix that is concatenated directly in front of
        ``filename`` (so it must end with a path separator).
        Defaults to ``'../pkl_dump/'``, the location used by this notebook.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If ``path + filename`` does not exist.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this on dumps produced by trusted code.
    with open(path + filename, 'rb') as f:
        return pickle.load(f)

# Load data: the four pickled inputs are read in the listed order.
(traded_volume_df,
 px_df,
 daily_stock_info_df,
 monthly_scaling_factor) = map(load_from_pickle, (
    'traded_volume_df.pkl',
    'px_df.pkl',
    'daily_stock_info_df.pkl',
    'monthly_scaling_factor.pkl',
))
# Distinct values of the "stock" level/column, in order of first appearance.
stocks = traded_volume_df.reset_index()["stock"].unique()




In [2]:
# Empty results table; one row is appended per fitted (stock, half_life, model_type).
_summary_columns = [
    'stock',
    'half_life',
    'model_type',
    'beta_estimate',
    'alpha_estimate',
    'valid_loss',
    'best_eta',
]
ridge_summary = pd.DataFrame(columns=_summary_columns)

# Shared settings used by the fitting cells further down.
in_sample_month, explanation_horizon_periods = 5, 6

### Ridge model 1: global coef $\bar \lambda$ as the impact coef of the "stock index"

$$
\text{Loss} = \sum_i (y^{\text{real}}_i - \lambda x_i - \alpha)^2 + \eta (\lambda - \bar\lambda^{\text{index}})^2,
$$

for each stock $s_j$.

In [12]:
def loss_function(params, x, y, global_coef, eta):
    """Penalised squared-error loss for a single-stock linear fit.

    params      : pair ``(coef, intercept)`` of the line ``coef * x + intercept``.
    x, y        : regressor and target values.
    global_coef : reference coefficient the slope is shrunk towards.
    eta         : weight of the shrinkage penalty.

    Returns the sum of squared residuals plus
    ``eta * (global_coef - coef) ** 2``.
    """
    slope, offset = params
    fitted = slope * x + offset
    squared_error = np.sum((y - fitted) ** 2)
    shrinkage = eta * (global_coef - slope) ** 2
    return squared_error + shrinkage

In [13]:
# Month whose rows are used for fitting (re-stated so the cell is self-contained).
in_sample_month = 5
half_life_list = np.array([60, 300, 900, 3600])
# Candidate penalty weights; identical for every fit, so defined once.
eta_list = [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

# Result rows are collected in a plain list and turned into a DataFrame once
# at the end: concatenating inside the loop is quadratic and triggers the
# pandas FutureWarning about concatenating with an empty frame.
ridge_rows = []

for model_type in ['sqrt', 'linear']:
    for half_life in half_life_list:
        impact_px_df = IF.get_impact_state(traded_volume_df, monthly_scaling_factor, half_life, model_type)
        req_stat_df = IF.impact_regression_statistics(impact_px_df, explanation_horizon_periods, px_df)
        # Keep only rows with y >= 1e-4.
        req_stat_df = req_stat_df.loc[req_stat_df["y"] >= 1e-4].copy()
        req_stat_df["date"] = pd.to_datetime(req_stat_df["date"])
        reg_summary_naive = IF.regression_result_by_stock(req_stat_df, in_sample_month)

        # None of the arguments depend on the stock, so the index coefficient
        # is computed once per (model_type, half_life) instead of per stock.
        # NOTE(review): assumes IF.get_index_impact_coef has no per-call side effects - confirm.
        global_coef = IF.get_index_impact_coef(traded_volume_df, px_df, monthly_scaling_factor,
                                               half_life, model_type, in_sample_month)

        # Month filter is also stock-independent; the per-stock mask below is
        # taken from this same frame so mask and frame always share an index.
        in_sample_df = req_stat_df.loc[req_stat_df["date"].dt.month == in_sample_month]

        for stock in stocks:
            # Naive per-stock regression estimates are the optimiser's starting point.
            initial_param = reg_summary_naive.loc[stock][["beta_estimate", "alpha_estimate"]].values

            req_stat_df_in_sample_month = in_sample_df.loc[in_sample_df["stock"] == stock].copy()
            x, y, x_valid, y_valid = IF.train_validation_split(req_stat_df_in_sample_month)

            eta_info = IF.eta_info(x, y, x_valid, y_valid, eta_list, initial_param, global_coef, loss_function)

            # eta_info[eta] is indexed as [0] -> fitted params, [1] -> validation loss;
            # pick the eta with the smallest validation loss.
            best_eta = min(eta_info, key=lambda eta: eta_info[eta][1])
            best_param, valid_loss = eta_info[best_eta][0], eta_info[best_eta][1]

            ridge_rows.append([stock, half_life, model_type, best_param[0], best_param[1], valid_loss, best_eta])

new_rows = pd.DataFrame(ridge_rows, columns=ridge_summary.columns)
# Append to whatever ridge_summary already holds; skip the concat when it is
# still empty so no empty frame takes part in dtype resolution.
if ridge_summary.empty:
    ridge_summary = new_rows
else:
    ridge_summary = pd.concat([ridge_summary, new_rows], ignore_index=True)

  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  result = minimize(fun=loss_function, x0=initial_param,
  ridge_summary = pd.concat([ridge_summary, new_row], ignore_index=True)
  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  result = minimize(fun=loss_function, x0=initial_param,
  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  result = minimize(fun=loss_function, x0=initial_param,
  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  result = minimize(fun=loss_function, x0=initial_param,
  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns
  result = minimize(fun=loss_function, x0=initial_param,
  cum_impact = pre

In [14]:
# Persist the summary table as a pickle under ./impact_model_summary/.
path = "./impact_model_summary/"
filename = "ridge1_summary.pkl"
with open(f"{path}{filename}", 'wb') as out_file:
    pickle.dump(ridge_summary, out_file)

In [20]:
# Display the accumulated summary table (one row per stock / half_life / model_type fit).
ridge_summary

Unnamed: 0,stock,half_life,model_type,beta_estimate,alpha_estimate,valid_loss,best_eta
0,A,60,sqrt,3.833531,0.000562,0.000848,0
1,AAL,60,sqrt,7.469310,0.000638,0.001093,0
2,AAP,60,sqrt,5.403316,0.000578,0.000956,0
3,AAPL,60,sqrt,14.279143,0.000468,0.000722,0
4,ABBV,60,sqrt,7.121990,0.000487,0.000607,0
...,...,...,...,...,...,...,...
395,APC,3600,linear,128.342007,0.000318,0.000308,0
396,APD,3600,linear,122.543174,0.000428,0.000392,0
397,APH,3600,linear,152.688662,0.000407,0.000432,0
398,APTV,3600,linear,150.433040,0.000551,0.001155,0


### Ridge model 2: global coef $\bar \lambda$ as average stock-wise $\lambda_j$

$$
\text{Loss} = \sum_{i, j} (y^{j, \text{real}}_i - \lambda_j x^j_i - \alpha_j)^2 + \eta \sum_j (\lambda_j - \bar\lambda)^2
$$

summed over all stocks $s_j$, where $\bar\lambda$ is the average of the stock-wise $\lambda_j$.

In [3]:
# Single configuration used for the joint (all-stock) ridge model.
in_sample_month = 5
half_life = 3600
model_type = 'linear'

# Same preparation pipeline as the per-stock model: impact state -> regression
# statistics -> keep rows with y >= 1e-4 -> parse dates -> naive per-stock fit.
impact_px_df = IF.get_impact_state(traded_volume_df, monthly_scaling_factor, half_life, model_type)
req_stat_df = IF.impact_regression_statistics(impact_px_df, explanation_horizon_periods, px_df)
req_stat_df = req_stat_df.loc[req_stat_df["y"] >= 1e-4].copy()
req_stat_df["date"] = pd.to_datetime(req_stat_df["date"])
reg_summary_naive = IF.regression_result_by_stock(req_stat_df, in_sample_month)

# Explicit copy: this frame is sorted in place further down, so it must not be
# a slice of reg_summary_naive (avoids SettingWithCopy issues and accidental aliasing).
initial_param = reg_summary_naive[["beta_estimate", "alpha_estimate"]].copy()  # n * 2 matrix
# Rows whose date falls in the in-sample month, for all stocks.
req_stat_df_in_sample_month = req_stat_df.loc[req_stat_df["date"].dt.month == in_sample_month].copy()


  cum_impact = pre_ewm.ewm(alpha=1-decay_factor, adjust=False, axis="columns").mean()  # Across columns


In [4]:
# train validation split
list_x, list_y, list_x_valid, list_y_valid = [], [], [], []
for stock in stocks:
    df_temp = req_stat_df_in_sample_month.loc[req_stat_df_in_sample_month['stock'] == stock][['stock', 'x', 'y']]
    x, y, x_valid, y_valid = IF.train_validation_split(df_temp)
    # add stocks column
    x = pd.DataFrame(x, columns=['x'])
    y = pd.DataFrame(y, columns=['y'])
    x_valid = pd.DataFrame(x_valid, columns=['x'])
    y_valid = pd.DataFrame(y_valid, columns=['y'])
    x['stock'] = stock
    y['stock'] = stock
    x_valid['stock'] = stock
    y_valid['stock'] = stock
    list_x.append(x)
    list_y.append(y)
    list_x_valid.append(x_valid)
    list_y_valid.append(y_valid)
x, y, x_valid, y_valid = pd.concat(list_x), pd.concat(list_y), pd.concat(list_x_valid), pd.concat(list_y_valid)
x.sort_values(by='stock', inplace=True)
y.sort_values(by='stock', inplace=True)
x_valid.sort_values(by='stock', inplace=True)
y_valid.sort_values(by='stock', inplace=True)

stocks = x.stock.unique()
initial_param.sort_index(inplace=True)

In [23]:
# Scratch (50, 2) parameter matrix with entries drawn uniformly from [0, 1).
# A seeded generator keeps the draw reproducible across kernel restarts.
# NOTE(review): presumably one row per stock - confirm 50 matches len(stocks).
rng = np.random.default_rng(0)
params = rng.random((50, 2))

In [8]:
# Join regressors and targets row-by-row on their shared index, keep a single
# stock column, and give the three remaining columns plain names.
data = (
    pd.merge(x, y, left_index=True, right_index=True)
    .drop(columns='stock_y')
)
data.columns = ['x', 'stock', 'y']

In [9]:
def loss_function(params, data, eta):
    '''
    Joint loss over all stocks: squared error of per-stock linear fits plus a
    penalty on the spread of the per-stock slopes.

    params: n * 2 matrix (or a flat vector of length 2n, as passed by
            scipy.optimize.minimize); row i holds (slope, intercept) for the
            i-th stock in order of first appearance in ``data['stock']``.
    data:   DataFrame with columns 'x', 'stock' and 'y' (one row per observation).
    eta:    weight of the penalty ``eta * var(slopes)``.

    Returns the total loss as a float.
    '''
    # Accept DataFrames, nested lists and flattened optimiser vectors alike.
    params = np.asarray(params, dtype=float).reshape(-1, 2)

    # codes[k] is the row of `params` belonging to observation k; -1 marks a
    # missing stock label, which has no parameter row and is skipped.
    codes, _ = pd.factorize(data['stock'])
    known = codes >= 0
    codes = codes[known]
    x_vals = data['x'].to_numpy(dtype=float)[known]
    y_vals = data['y'].to_numpy(dtype=float)[known]

    residuals = y_vals - (x_vals * params[codes, 0] + params[codes, 1])
    loss = np.sum(residuals ** 2)

    # Population variance of the slopes pulls them towards their common mean.
    loss += eta * params[:, 0].var()
    return loss

Unnamed: 0,x,stock,y
232382,8.500482e-07,A,0.000944
223886,-3.678436e-09,A,0.000504
236112,6.424101e-07,A,0.000744
222782,-2.786737e-07,A,0.000865
202396,2.102980e-07,A,0.000518
...,...,...,...
28670202,1.661375e-06,ARE,0.001094
28684955,1.645592e-08,ARE,0.000174
28641577,3.010775e-07,ARE,0.000629
28644073,-1.255611e-07,ARE,0.000424


In [None]:
# NOTE(review): disabled code. It follows the per-stock (model 1) call pattern:
# it passes `global_coef` and per-stock `stock` / `best_param[0]` / `best_param[1]`,
# whereas the loss_function defined for model 2 takes (params, data, eta).
# TODO: adapt to the joint model or delete this cell.
# eta_list = [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# eta_info = IF.eta_info(x, y, x_valid, y_valid, eta_list, initial_param, global_coef, loss_function)

# best_eta = min(eta_info, key=lambda x: eta_info.get(x)[1])
# best_param, valid_loss = eta_info[best_eta][0], eta_info[best_eta][1]

# new_row = pd.DataFrame([[stock, half_life, model_type, best_param[0], best_param[1], valid_loss, best_eta]],
#                     columns=ridge_summary.columns)
# ridge_summary = pd.concat([ridge_summary, new_row], ignore_index=True)
