# HKU QIDS 2023 Quantitative Investment Competition: Model

## Init Config

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qids_package.qids import *
import warnings
from submit import submit

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Install a blanket "ignore" filter so no warnings are displayed from here on.
# NOTE(review): this also hides library deprecation warnings — confirm intended.
warnings.filterwarnings("ignore")

In [39]:
# Run-wide settings shared by the cells below.
seed = 257248  # random_state handed to the random-forest experiment
stock_num = 54  # number of per-stock slices the fitting loops iterate over
day_num_total = 1000
day_num = day_num_total - 2  # rows taken per stock from the training frame
test_day_num = 700  # rows taken per stock from the test frame
timeslot_num = 50

In [40]:
def std(train, valid, test=None):
    """Standardize the splits with statistics learned from ``train`` only.

    A ``StandardScaler`` is fitted on ``train`` and then applied to
    ``train``, ``valid`` and, when supplied, ``test``.

    Returns:
        A ``(train, valid, test)`` tuple of transformed data; the third
        item is ``None`` when no ``test`` was passed.
    """
    scaler = StandardScaler().fit(train)
    scaled_train = scaler.transform(train)
    scaled_valid = scaler.transform(valid)
    scaled_test = None if test is None else scaler.transform(test)
    return scaled_train, scaled_valid, scaled_test

In [41]:
def calc_corr(df1, df2):
    """Return the correlation coefficient between ``df1`` and ``df2``.

    The value is the row-0 / column-1 entry of the matrix produced by
    ``np.corrcoef(df1, df2)``.
    """
    corr_matrix = np.corrcoef(df1, df2)
    return corr_matrix[0, 1]

In [42]:
def evaluate(model, train, test, train_y, real_y):
    """Fit ``model`` on the training data and predict for ``test``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        train: Training features.
        test: Features to predict on.
        train_y: Training targets.
        real_y: Not read by the current body; kept in the signature.

    Returns:
        Whatever ``model.predict(test)`` returns.
    """
    model.fit(train, train_y)
    predictions = model.predict(test)
    # The calc_corr printouts for the train fit and for real_y are disabled.
    return predictions

In [43]:
def evaluate2(model, train, test, train_y, real_y):
    """Train ``model`` and return its predictions for ``test``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        train: Training features.
        test: Features to predict on.
        train_y: Training targets.
        real_y: Accepted but not read by the body.

    Returns:
        The output of ``model.predict(test)``.
    """
    model.fit(train, train_y)
    test_predictions = model.predict(test)
    return test_predictions

## Load Data

In [44]:
# Directory prefix for every CSV read below (note the trailing slash).
write_path = "../data/"

train_path = f"{write_path}train.csv"
test_path = f"{write_path}test.csv"
real_return_path = f"{write_path}real_return.csv"
# Alternative file names: train_github.csv, test_github.csv,
# real_return_reorder.csv.

# Load the three tables into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
real_return = pd.read_csv(real_return_path)

In [45]:
# Keep the training target aside, then remove it together with the
# date_time / stock_id / day columns from the training frame.
train_y = train["return"]
train = train.drop(["return", "date_time", "stock_id", "day"], axis=1)

# The test frame has no "return" column to remove here.
test = test.drop(["date_time", "stock_id", "day"], axis=1)

# Returns from real_return.csv; later cells correlate test predictions
# against this series.
real_y = real_return["return"]

In [46]:
# Columns removed from both frames before fitting. The list matches, in
# order, the columns printed by the greedy elimination cell further down.
abandon_all = [
    'transactionAmount',
    'volume_min',
    'money_mean',
    'money_max',
    'low_mean',
    'close_mean',
    'open_mean',
    'high_mean',
    'low_min',
    'volume_std',
]
train = train.drop(abandon_all, axis=1)
test = test.drop(abandon_all, axis=1)

In [47]:
# Fit one Ridge model per stock, stack the per-stock test predictions and
# correlate them with real_y. The score is stored in last_cor, which the
# elimination cell below reads as its starting baseline.
# Assumes rows in train/test are grouped stock by stock in equal-sized
# blocks of day_num / test_day_num rows — TODO confirm.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2;
# this cell presumably targets an older release — confirm the pinned version.
for i in [0.012]:  # i is the Ridge alpha
    preds = []
    for stock in range(stock_num):
        # Row ranges of this stock inside the train and test frames.
        start = stock * day_num
        end = start + day_num
        test_start = stock * test_day_num
        test_end = test_start + test_day_num
        model = Ridge(alpha=i, normalize=True)
        preds.append(
            evaluate2(
                model,
                train.iloc[start:end, :],
                test.iloc[test_start:test_end, :],
                train_y.iloc[start:end],
                # evaluate2 does not read this argument.
                real_y.iloc[test_start:test_end - 2],
            )
        )
    # Stack once instead of re-allocating the array for every stock.
    result = np.concatenate(preds, axis=0)
    # Computed once and reused for both the printout and the baseline.
    last_cor = calc_corr(result, real_y)
    print(i, last_cor)

0.012 0.07592918039280715


In [48]:
# Pass the stacked predictions to `submit` (imported from the `submit` module).
submit(result)

In [11]:
# Greedy backward elimination: each round tries removing every remaining
# column, keeps the single removal that raises the test correlation the
# most, and repeats until no removal beats the current score.
# last_cor must already exist (the baseline cell above sets it); an earlier
# hard-coded starting value was 0.05496612532000942.
# NOTE(review): candidates are scored against real_y, i.e. features are
# selected on the same returns the model is evaluated on — confirm this
# leakage is acceptable.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
abandon_all = []
last_abandon = ["all"]
while last_abandon:
    abandon = {}  # column name -> correlation obtained without that column
    names = train.corr()[train.columns[-1]].sort_values().index
    for name in names:
        # The reduced frames do not depend on the stock, so build them once
        # per candidate instead of once per stock.
        train_k = train.drop(columns=[name])
        test_k = test.drop(columns=[name])
        for i in [0.012]:
            preds = []
            for stock in range(stock_num):
                start = stock * day_num
                end = start + day_num
                test_start = stock * test_day_num
                test_end = test_start + test_day_num
                model = Ridge(alpha=i, normalize=True)
                preds.append(
                    evaluate2(
                        model,
                        train_k.iloc[start:end, :],
                        test_k.iloc[test_start:test_end, :],
                        train_y.iloc[start:end],
                        real_y.iloc[test_start:test_end - 2],
                    )
                )
            result = np.concatenate(preds, axis=0)
            cor = calc_corr(result, real_y)
            if cor > last_cor:
                abandon[name] = cor
    if not abandon:
        # No single removal helps any more. Note that `result` now holds the
        # predictions of the last candidate tried, not of the kept features.
        break
    # Best-scoring candidate; ties resolve to the first one inserted.
    abandon_cols = [max(abandon, key=abandon.get)]
    print(abandon_cols)
    train_kept = train.drop(columns=abandon_cols)
    test_kept = test.drop(columns=abandon_cols)
    # Refit without the chosen column and report the score.
    for i in [0.012]:
        preds = []
        for stock in range(stock_num):
            start = stock * day_num
            end = start + day_num
            test_start = stock * test_day_num
            test_end = test_start + test_day_num
            model = Ridge(alpha=i, normalize=True)
            preds.append(
                evaluate2(
                    model,
                    train_kept.iloc[start:end, :],
                    test_kept.iloc[test_start:test_end, :],
                    train_y.iloc[start:end],
                    real_y.iloc[test_start:test_end - 2],
                )
            )
        result = np.concatenate(preds, axis=0)
        cor = calc_corr(result, real_y)
        print(i, cor)
    if cor < last_cor:
        break
    # Commit the removal and raise the bar for the next round.
    abandon_all.append(abandon_cols[0])
    last_cor = cor
    last_abandon = abandon
    train = train_kept
    test = test_kept

['transactionAmount']
0.012 0.07363859064248485
['volume_min']
0.012 0.07436969040456373
['money_mean']
0.012 0.07459084899432877
['money_max']
0.012 0.07476067767153077
['low_mean']
0.012 0.0748502346459337
['close_mean']
0.012 0.07495439090526287
['open_mean']
0.012 0.07512050794449307
['high_mean']
0.012 0.07540376922025022
['low_min']
0.012 0.07590299973434213
['volume_std']
0.012 0.07592918039280715


In [12]:
# Refit one Ridge model per stock on the current train/test frames and print
# the correlation of the stacked predictions with real_y. This also leaves
# `result` holding predictions for the current feature set.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
for i in [0.012]:  # i is the Ridge alpha
    preds = []
    for stock in range(stock_num):
        # Row ranges of this stock inside the train and test frames.
        start = stock * day_num
        end = start + day_num
        test_start = stock * test_day_num
        test_end = test_start + test_day_num
        model = Ridge(alpha=i, normalize=True)
        preds.append(
            evaluate2(
                model,
                train.iloc[start:end, :],
                test.iloc[test_start:test_end, :],
                train_y.iloc[start:end],
                # evaluate2 does not read this argument.
                real_y.iloc[test_start:test_end - 2],
            )
        )
    # Stack once instead of re-allocating the array for every stock.
    result = np.concatenate(preds, axis=0)
    print(i, calc_corr(result, real_y))

0.012 0.07592918039280715


In [328]:
# Pass the predictions from the refit above to `submit` (from the `submit` module).
submit(result)

#### temp

In [None]:
# Batch variant of the elimination loop: each round collects EVERY column
# whose individual removal beats last_cor, drops them all together, and
# stops when the joint removal scores worse or nothing qualifies.
# NOTE(review): candidates are scored against real_y (test-period returns) —
# confirm this leakage is acceptable.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
abandon_all = []
last_cor = 0.05496612532000942
last_abandon = ["all"]
while last_abandon:
    abandon = []  # [column name, correlation without it] pairs
    names = train.corr()[train.columns[-1]].sort_values().index
    for name in names:
        # Build the reduced frames once per candidate, not once per stock.
        train_k = train.drop(columns=[name])
        test_k = test.drop(columns=[name])
        for i in [0.012]:
            preds = []
            for stock in range(stock_num):
                start = stock * day_num
                end = start + day_num
                test_start = stock * test_day_num
                test_end = test_start + test_day_num
                model = Ridge(alpha=i, normalize=True)
                preds.append(
                    evaluate2(
                        model,
                        train_k.iloc[start:end, :],
                        test_k.iloc[test_start:test_end, :],
                        train_y.iloc[start:end],
                        real_y.iloc[test_start:test_end - 2],
                    )
                )
            result = np.concatenate(preds, axis=0)
            cor = calc_corr(result, real_y)
            if cor > last_cor:
                abandon.append([name, cor])
    abandon_cols = [col[0] for col in abandon]
    print(abandon_cols)
    train_kept = train.drop(columns=abandon_cols)
    test_kept = test.drop(columns=abandon_cols)
    # Score the frames with all selected columns removed at once.
    for i in [0.012]:
        preds = []
        for stock in range(stock_num):
            start = stock * day_num
            end = start + day_num
            test_start = stock * test_day_num
            test_end = test_start + test_day_num
            model = Ridge(alpha=i, normalize=True)
            preds.append(
                evaluate2(
                    model,
                    train_kept.iloc[start:end, :],
                    test_kept.iloc[test_start:test_end, :],
                    train_y.iloc[start:end],
                    real_y.iloc[test_start:test_end - 2],
                )
            )
        result = np.concatenate(preds, axis=0)
        cor = calc_corr(result, real_y)
        print(i, cor)
    if cor < last_cor:
        break
    # Record this round's removals (the original called append() with no
    # argument, which raises TypeError).
    abandon_all.extend(abandon_cols)
    last_cor = cor
    last_abandon = abandon
    train = train_kept
    test = test_kept

In [273]:
# Larger hand-written exclusion list, removed from both frames.
# NOTE(review): several names here (e.g. 'low_min', 'low_mean') also appear in
# the abandon_all list near the top; dropping an already-missing column raises
# KeyError by default, so this cell presumably belongs to a different run
# order — confirm.
abandon_more = [
    'low_min', 'low_mean', 'low',
    'open_mean', 'close_mean', 'close', 'open',
    'high', 'high_mean', 'high_max',
    'pe_ttm', 'pe_sma50', 'pe_sma25',
    'Unnamed: 0',
    'pe_growth', 'pcf_growth', 'pcf_2', 'pcf_sma10', 'pcf_sma25',
    'pe_ttm_sma25', 'pe_ttm_sma50', 'pe_2', 'pb',
    'money_min', 'volume_var', 'money_mean', 'volume_min', 'money_max',
    'volume_std', 'time_step',
]
train = train.drop(abandon_more, axis=1)
test = test.drop(abandon_more, axis=1)

In [276]:
# Three further columns removed from both frames.
abandon_more_2 = [
    "pe_ttm_1",
    "ps_2",
    "volume_max",
]
train = train.drop(abandon_more_2, axis=1)
test = test.drop(abandon_more_2, axis=1)

In [231]:
# Four columns removed from both frames.
# NOTE(review): all four also appear in the abandon_all list near the top, so
# running this after that cell would raise KeyError — confirm the intended
# execution order.
abandon_cols = [
    'low_mean',
    'money_mean',
    'volume_min',
    'transactionAmount',
]
train = train.drop(abandon_cols, axis=1)
test = test.drop(abandon_cols, axis=1)

In [237]:
# Six more columns removed from both frames.
# NOTE(review): these also appear in the abandon_all list near the top —
# confirm the intended execution order.
abandon_cols_2 = [
    'low_min',
    'open_mean',
    'close_mean',
    'high_mean',
    'money_max',
    'volume_std',
]
train = train.drop(abandon_cols_2, axis=1)
test = test.drop(abandon_cols_2, axis=1)

In [277]:
# One elimination pass: for every remaining column, refit the per-stock
# Ridge models without it, print the resulting test correlation, and record
# the column if the score beats the hard-coded threshold below.
# NOTE(review): scoring uses real_y (test-period returns) — confirm this
# leakage is acceptable.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
abandon = []  # [column name, correlation without it] pairs
names = train.corr()[train.columns[-1]].sort_values().index
for k, name in enumerate(names):
    print(k, name)
    # Build the reduced frames once per candidate, not once per stock.
    train_k = train.drop(columns=[name])
    test_k = test.drop(columns=[name])
    # Other alpha values listed previously: powers of ten from 1e-10 to 1e10.
    for i in [0.012]:
        preds = []
        for stock in range(stock_num):
            start = stock * day_num
            end = start + day_num
            test_start = stock * test_day_num
            test_end = test_start + test_day_num
            model = Ridge(alpha=i, normalize=True)
            preds.append(
                evaluate2(
                    model,
                    train_k.iloc[start:end, :],
                    test_k.iloc[test_start:test_end, :],
                    train_y.iloc[start:end],
                    real_y.iloc[test_start:test_end - 2],
                )
            )
        # Stack once instead of re-allocating the array for every stock.
        result = np.concatenate(preds, axis=0)
        cor = calc_corr(result, real_y)
        print(i, cor)
        if cor > 0.07979735900331283:
            abandon.append([name, cor])

0 pe_ttm_2
0.012 0.05494779479418194
1 pe_sma10
0.012 0.07131244009881266
2 pe_ttm_growth
0.012 0.07204076266571377
3 pcf_1
0.012 0.0726384942350757
4 pcf
0.012 0.07275042539996805
5 ps_sma50
0.012 0.07198927238154523
6 ps_sma25
0.012 0.07187478998402276
7 ps_sma10
0.012 0.07185646757741083
8 pb_sma50
0.012 0.07228870143132736
9 ps_growth
0.012 0.07328976848384391
10 pb_sma25
0.012 0.07180950246102642
11 pb_sma10
0.012 0.07192946137831374
12 pe_ttm_sma10
0.012 0.07153167381225888
13 pcf_sma50
0.012 0.07079170496226996
14 pb_growth
0.012 0.07293469654867875
15 pe
0.012 0.06963762951100745
16 pe_1
0.012 0.07183680252306528
17 ps
0.012 0.06987838854137217
18 ps_1
0.012 0.0630989028232632
19 pb_1
0.012 0.05379910785513546
20 pb_2
0.012 0.044231389832232314
21 money_var
0.012 0.06859238503925247
22 money
0.012 0.07245716996859612
23 volume
0.012 0.07163198251019445
24 money_std
0.012 0.06951290589059808
25 transactionAmount
0.012 0.07775256987576029
26 volume_mean
0.012 0.06686655799387645


In [240]:
# Take element 0 of every entry in `abandon` (the column name when entries
# are [name, correlation] pairs), then display the list.
abandon_cols = list(map(lambda entry: entry[0], abandon))
abandon_cols

[]

In [241]:
# Refit the per-stock Ridge models with `abandon_cols` removed and print the
# correlation of the stacked test predictions with real_y.
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
# The reduced frames are the same for every stock, so build them once.
train_kept = train.drop(columns=abandon_cols)
test_kept = test.drop(columns=abandon_cols)
for i in [0.012]:  # i is the Ridge alpha
    preds = []
    for stock in range(stock_num):
        # Row ranges of this stock inside the train and test frames.
        start = stock * day_num
        end = start + day_num
        test_start = stock * test_day_num
        test_end = test_start + test_day_num
        model = Ridge(alpha=i, normalize=True)
        preds.append(
            evaluate2(
                model,
                train_kept.iloc[start:end, :],
                test_kept.iloc[test_start:test_end, :],
                train_y.iloc[start:end],
                # evaluate2 does not read this argument.
                real_y.iloc[test_start:test_end - 2],
            )
        )
    # Stack once instead of re-allocating the array for every stock.
    result = np.concatenate(preds, axis=0)
    cor = calc_corr(result, real_y)
    print(i, cor)

0.012 0.07592918039280715


## Normalization

In [145]:
# train, valid, test = std(train.iloc[:, 3:-1], valid.iloc[:, 3:-1], test.iloc[:, 3:])

## Model

### Linear Regression

In [124]:
# Ordinary least squares fitted on the whole training frame at once.
# The two numbers recorded under this cell presumably come from the
# correlation printouts that are commented out in `evaluate` — confirm.
model = LinearRegression()
pred = evaluate(
    model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.08185975427789274
0.04034488821467451


### Ridge Regression

In [131]:
# Ridge with alpha=1e8, fitted on the whole training frame at once.
model = Ridge(alpha=1e8)
pred = evaluate(
    model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.07108389523027633
0.0597535859373132


### Lasso Regression

In [130]:
# Lasso with alpha=1, fitted on the whole training frame at once.
model = Lasso(alpha=1)
pred = evaluate(
    model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.07082556523046826
0.059702617533116835


In [129]:
# Per-stock Ridge fit: one model per stock, predictions stacked and
# correlated with real_y.
# Alpha grids listed previously:
#   [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10]
#   [1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10]
# NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 —
# confirm the pinned version.
for i in [0.012]:  # i is the Ridge alpha
    preds = []
    for stock in range(stock_num):
        # Row ranges of this stock inside the train and test frames.
        start = stock * day_num
        end = start + day_num
        test_start = stock * test_day_num
        test_end = test_start + test_day_num
        model = Ridge(alpha=i, normalize=True)
        preds.append(
            evaluate2(
                model,
                train.iloc[start:end, :],
                test.iloc[test_start:test_end, :],
                train_y.iloc[start:end],
                # evaluate2 does not read this argument.
                real_y.iloc[test_start:test_end - 2],
            )
        )
    # Stack once instead of re-allocating the array for every stock.
    result = np.concatenate(preds, axis=0)
    print(i, calc_corr(result, real_y))

0.012 0.07225595088513154


### Random Forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the whole training frame; random_state is pinned
# to the notebook-wide seed.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=3,
    min_samples_split=50,
    random_state=seed,
)
pred = evaluate(
    model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.09721523729185862
0.002919757303658706


## Submission

In [88]:
# Pass `pred` (output of whichever model cell ran last) to `submit`.
submit(pred)