In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import sweetviz as sv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [112]:
_Xtrain = pd.read_csv("x_train.csv", index_col='ID')
_Xtest = pd.read_csv('x_test.csv', index_col='ID')
_y = pd.read_csv("y_train.csv", index_col='ID')

In [155]:
print('Number of elements in Sector: ', _Xtrain['SECTOR'].nunique())
print('Number of elements in Industries: ', _Xtrain['INDUSTRY'].nunique())
print('Number of elements in Industries_Group: ', _Xtrain['INDUSTRY_GROUP'].nunique())
print('Number of elements in Sub_Industries: ', _Xtrain['SUB_INDUSTRY'].nunique())

Number of elements in Sector:  12
Number of elements in Industries:  72
Number of elements in Industries_Group:  26
Number of elements in Sub_Industries:  175


In [113]:
# Feature engineering
new_features = []

# Conditional aggregated features
shifts = [1]  # Choose some different shifts
statistics = ['mean']  # the type of stat
#gb_features_list = [['SECTOR', 'DATE'], ['INDUSTRY_GROUP', 'DATE']]
gb_features_list = [['SECTOR', 'DATE']]

target_features = ['RET','VOLUME']
for target_feature in target_features:
    for gb_features in gb_features_list:
        tmp_name = '_'.join(gb_features)
        for shift in shifts:
            for stat in statistics:
                name = f'{target_feature}_{shift}_{tmp_name}_{stat}'
                feat = f'{target_feature}_{shift}'
                new_features.append(name)
                for data in [_Xtrain, _Xtest]:
                    data[name] = data.groupby(gb_features)[feat].transform(stat)

In [114]:
shifts = [5, 20]
statistics = ['mean']  # the type of stat
#gb_features_list = [['SECTOR', 'DATE'], ['INDUSTRY_GROUP', 'DATE']]
gb_features_list = [['STOCK', 'DATE']]

target_features = ['RET', 'VOLUME']
for target_feature in target_features:
    for gb_features in gb_features_list:
        tmp_name = '_'.join(gb_features)
        for shift in shifts:
            for stat in statistics:
                name = f'{shift}_day_{stat}_MA_{gb_features[0]}_{target_feature}'
                feat = f'{target_feature}_{shift}'
                new_features.append(name)
                for data in [_Xtrain, _Xtest]:
                    data[name] = data.groupby(gb_features)[[f'{target_feature}_{day}' for day in range(1,shift+1)]].transform(stat).mean(axis=1)

In [115]:
COLUMNS = ['DATE','STOCK', 'SECTOR', 'INDUSTRY_GROUP', 'INDUSTRY', 'SUB_INDUSTRY']
#COLUMNS = []
for day in range(6,21):
    COLUMNS.append(f'RET_{day}')
    COLUMNS.append(f'VOLUME_{day}')

_Xtrain.drop(columns=COLUMNS, inplace=True)

_Xtest.drop(columns=COLUMNS, inplace=True)

In [116]:
_Xtrain

Unnamed: 0_level_0,RET_1,VOLUME_1,RET_2,VOLUME_2,RET_3,VOLUME_3,RET_4,VOLUME_4,RET_5,VOLUME_5,RET_1_SECTOR_DATE_mean,VOLUME_1_SECTOR_DATE_mean,5_day_mean_MA_STOCK_RET,20_day_mean_MA_STOCK_RET,5_day_mean_MA_STOCK_VOLUME,20_day_mean_MA_STOCK_VOLUME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,-0.015748,0.147931,-0.015504,0.179183,0.010972,0.033832,-0.014672,-0.362868,0.016483,-0.972920,0.009178,0.006771,-0.003694,0.016318,-0.194968,0.457365
1,0.003984,,-0.090580,,0.018826,,-0.025540,,-0.038062,,0.006477,0.009725,-0.026274,-0.005572,,
2,0.000440,-0.096282,-0.058896,0.084771,-0.009042,-0.298777,0.024852,-0.157421,0.009354,0.091455,0.013449,0.124151,-0.006658,-0.001043,-0.075251,-0.209642
3,0.031298,-0.429540,0.007756,-0.089919,-0.004632,-0.639737,-0.019677,-0.940163,0.003544,-0.882464,0.017253,-0.121974,0.003658,-0.003154,-0.596365,-0.539049
4,0.027273,-0.847155,-0.039302,-0.943033,0.000000,-1.180629,0.000000,-1.313896,0.022321,-1.204398,0.006241,-0.190113,0.002059,-0.003551,-1.097822,0.606319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,0.021843,-0.217823,-0.021703,-0.125333,0.011141,-0.674800,0.021379,-0.150397,0.001727,-0.026910,0.016746,-0.241373,0.006877,0.002034,-0.239053,-0.079617
418591,-0.006920,-0.375251,0.000000,-0.029437,0.006968,3.189102,0.009143,0.403533,0.009226,-0.840418,0.012887,-0.296513,0.003683,-0.000310,0.469506,-0.130105
418592,0.021869,-0.978856,-0.005929,-1.026267,0.010989,-0.724158,0.006030,-0.784385,-0.015826,-0.701318,0.024201,0.190739,0.003427,0.005549,-0.842997,0.297771
418593,0.012248,-0.627169,0.010925,-0.842108,0.008076,-0.460447,0.014903,0.040382,0.006752,-0.039305,0.016746,-0.241373,0.010581,0.009756,-0.385729,-0.240608


In [131]:
model = LGBMClassifier(boosting_type='gbdt', n_estimators=1000,max_depth=100,random_state=0,num_leaves=100, feature_fraction=0.8)
model.fit(_Xtrain,_y.values.reshape(-1))
accuracy_score(model.predict(_Xtrain),_y.values.reshape(-1))

[LightGBM] [Info] Number of positive: 208846, number of negative: 209749
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 418595, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498921 -> initscore=-0.004314
[LightGBM] [Info] Start training from score -0.004314


0.8238416607938461

In [132]:
model.feature_importances_

array([6176, 6086, 6505, 5804, 6293, 5664, 6673, 5646, 6660, 5897, 7169,
       7444, 5385, 6316, 4661, 6621])

In [119]:
parameters = {
    'boosting_type': ['rf','gbdt'],
    'n_estimators': [500],
    'max_depth': [2**3, 2**4],
    'min_child_samples': [50,1000],
    'num_leaves': [100],
    'random_state': [0],
    'n_jobs': [-1], 
    'feature_fraction': [0.5,0.8], 
    'objective': ['binary'],
    'verbose': [-1]
}

In [120]:
search = GridSearchCV(LGBMClassifier(), param_grid=parameters, n_jobs=-1, scoring='accuracy', verbose=2)
search.fit(_Xtrain, _y.values.reshape(-1))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [126]:
search.best_params_

{'boosting_type': 'rf',
 'feature_fraction': 0.5,
 'max_depth': 16,
 'min_child_samples': 50,
 'n_estimators': 500,
 'n_jobs': -1,
 'num_leaves': 100,
 'objective': 'binary',
 'random_state': 0,
 'verbose': -1}

In [127]:
search.best_estimator_.feature_importances_

array([2195, 1892, 2679, 1453, 2788, 1818, 3574, 1514, 3343, 1218, 9158,
       9316, 2291, 2682, 1887, 1692])

In [129]:
# mix industry_date for MA and sector_date for past day 
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_feature_fraction,param_max_depth,param_min_child_samples,param_n_estimators,param_n_jobs,...,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,98.926106,1.632066,1.819633,0.22607,rf,0.5,16,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.519344,0.528721,0.519297,0.503745,0.501188,0.514459,0.010408,1
0,62.756028,12.794661,1.738171,0.348985,rf,0.5,8,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.520324,0.529008,0.520181,0.502323,0.500364,0.51444,0.011178,2
6,91.431783,0.793597,1.343985,0.032222,rf,0.8,16,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.518102,0.525377,0.519703,0.506456,0.500424,0.514012,0.009165,3
3,105.667028,6.066559,2.104545,0.276228,rf,0.5,16,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.521769,0.526774,0.516514,0.500352,0.501893,0.513461,0.010595,4
1,57.012313,15.115247,1.670854,0.270737,rf,0.5,8,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.519022,0.527049,0.517409,0.501654,0.499898,0.513007,0.010521,5
4,87.374178,6.451481,1.310774,0.072065,rf,0.8,8,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.514734,0.522223,0.51981,0.509992,0.49819,0.51299,0.008519,6
5,65.596519,4.393061,1.057972,0.074035,rf,0.8,8,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.513635,0.522796,0.520646,0.503745,0.502311,0.512627,0.008415,7
7,92.526387,1.925039,1.488797,0.042447,rf,0.8,16,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.515677,0.522271,0.518305,0.500878,0.50476,0.512378,0.008175,8
12,74.52665,4.447636,1.197993,0.044061,gbdt,0.8,8,50,500,-1,...,-1,"{'boosting_type': 'gbdt', 'feature_fraction': ...",0.517206,0.521614,0.51164,0.505512,0.505011,0.512197,0.006487,9
9,41.588604,0.827185,1.161193,0.031945,gbdt,0.5,8,1000,500,-1,...,-1,"{'boosting_type': 'gbdt', 'feature_fraction': ...",0.519834,0.524433,0.507806,0.498465,0.506659,0.511439,0.00942,10


In [130]:
y_pred = search.predict(_Xtrain)
print('Classic predict: ', accuracy_score(_y, y_pred))

Classic predict:  0.5691109545025621


In [137]:
y_pred = search.predict_proba(_Xtrain)[:, 1]
sub = _Xtrain.copy()
sub['pred'] = y_pred
y_pred = sub['pred'].transform(lambda x: x > x.median()).values
print('Proba predict: ', accuracy_score(_y, y_pred))

Proba predict:  0.569079898230987


In [135]:
print('Accuracy', accuracy_score(_y, search.best_estimator_.predict(_Xtrain)))

Accuracy 0.5691109545025621


In [110]:
_ypred = search.best_estimator_.predict(_Xtest)
df_results = pd.DataFrame({'RET': _ypred})
df_results.index = _Xtest.index
df_results.to_csv('y_test.csv')

In [111]:
df_results

Unnamed: 0_level_0,RET
ID,Unnamed: 1_level_1
418595,True
418596,False
418597,True
418598,True
418599,True
...,...
617019,False
617020,False
617021,False
617022,False
