In [187]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import sweetviz as sv

In [243]:
_Xtrain = pd.read_csv("x_train.csv", index_col='ID')
_y = pd.read_csv("y_train.csv", index_col='ID')
train = pd.concat([_Xtrain, _y], axis=1)
test = pd.read_csv('x_test.csv', index_col='ID')

In [189]:
print('Number of elements in Sector: ', _Xtrain['SECTOR'].nunique())
print('Number of elements in Industries: ', _Xtrain['INDUSTRY'].nunique())
print('Number of elements in Industries_Group: ', _Xtrain['INDUSTRY_GROUP'].nunique())
print('Number of elements in Sub_Industries: ', _Xtrain['SUB_INDUSTRY'].nunique())

Number of elements in Sector:  12
Number of elements in Industries:  72
Number of elements in Industries_Group:  26
Number of elements in Sub_Industries:  175


In [244]:
for column in train:
    train[column] = train[column].fillna(train[column].mean())

In [245]:
# Feature engineering
new_features = []

# Conditional aggregated features
shifts = [1,2,3,4]  # Choose some different shifts
statistics = ['mean']  # the type of stat
#gb_features_list = [['SECTOR', 'DATE'], ['INDUSTRY_GROUP', 'DATE']]
gb_features_list = [['SECTOR', 'DATE']]

target_features = ['RET']
for target_feature in target_features:
    for gb_features in gb_features_list:
        tmp_name = '_'.join(gb_features)
        for shift in shifts:
            for stat in statistics:
                name = f'{target_feature}_{shift}_{tmp_name}_{stat}'
                feat = f'{target_feature}_{shift}'
                new_features.append(name)
                for data in [train, test]:
                    data[name] = data.groupby(gb_features)[feat].transform(stat)

In [246]:
weeks = 4
statistics = ['mean', 'std']  # the type of stat
gb_features_list = [['STOCK', 'DATE']]

target_features = ['RET', 'VOLUME']
for target_feature in target_features:
    for gb_features in gb_features_list:
        tmp_name = '_'.join(gb_features)
        for stat in statistics:
            for week in range(weeks):
                name = f'{stat}_{target_feature}_WEEK_{week+1}'
                new_features.append(name)
                for data in [train, test]:
                    if stat == 'mean':
                        data[name] = data[[f'{target_feature}_{week*5 + day}' for day in range(1,6)]].mean(axis=1)
                    elif stat == 'std':
                        data[name] = data[[f'{target_feature}_{week*5 + day}' for day in range(1,6)]].std(axis=1)


In [247]:
shifts = [1,2,3,4]  # Choose some different shifts
statistics = ['sum']  # the type of stat
#gb_features_list = [['SECTOR', 'DATE'], ['INDUSTRY_GROUP', 'DATE']]
gb_features_list = [['SECTOR', 'DATE']]
target_features = ['mean_VOLUME_WEEK']
for target_feature in target_features:
    for gb_features in gb_features_list:
        tmp_name = '_'.join(gb_features)
        for shift in shifts:
            for stat in statistics:
                name = f'{target_feature}_{shift}_/total_VOLUME_of_DATE'
                feat = f'{target_feature}_{shift}'
                new_features.append(name)
                for data in [train, test]:
                    data[name] = data[feat]/data.groupby(gb_features)[feat].transform('sum')

In [248]:
target = 'RET'

n_shifts = 5  # If you don't want all the shifts to reduce noise
features = ['RET_%d' % (i + 1) for i in range(n_shifts)]
features += ['VOLUME_%d' % (i + 1) for i in range(n_shifts)]
features += new_features  # The conditional features
train[new_features].head()

Unnamed: 0_level_0,RET_1_SECTOR_DATE_mean,RET_2_SECTOR_DATE_mean,RET_3_SECTOR_DATE_mean,RET_4_SECTOR_DATE_mean,mean_RET_WEEK_1,mean_RET_WEEK_2,mean_RET_WEEK_3,mean_RET_WEEK_4,std_RET_WEEK_1,std_RET_WEEK_2,...,mean_VOLUME_WEEK_3,mean_VOLUME_WEEK_4,std_VOLUME_WEEK_1,std_VOLUME_WEEK_2,std_VOLUME_WEEK_3,std_VOLUME_WEEK_4,mean_VOLUME_WEEK_1_/total_VOLUME_of_DATE,mean_VOLUME_WEEK_2_/total_VOLUME_of_DATE,mean_VOLUME_WEEK_3_/total_VOLUME_of_DATE,mean_VOLUME_WEEK_4_/total_VOLUME_of_DATE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.009178,-0.017313,-0.000148,-0.000655,-0.003694,0.059023,-0.001328,0.01127,0.016027,0.093068,...,-0.760344,0.062895,0.485623,3.506872,0.75043,0.375799,0.005078,-0.090636,-0.086208,0.003915
1,0.006477,-0.0233,-0.008673,-0.003666,-0.026274,0.038943,-0.022391,-0.012566,0.042498,0.072515,...,-0.079773,-0.078571,0.011118,0.006062,0.005748,0.005193,0.002006,0.002056,0.001294,0.002065
2,0.013422,-0.044869,-0.011669,-0.000975,-0.006658,0.010191,-0.00087,-0.006835,0.031757,0.031354,...,-0.216767,-0.309817,0.16625,0.468279,0.124713,0.185567,-0.001381,-0.004922,0.002533,0.005756
3,0.017253,0.001304,-0.014219,-0.009237,0.003658,-0.011622,0.005285,-0.009936,0.018668,0.022146,...,-0.87658,-0.427098,0.348658,0.530622,0.728515,0.286219,0.024334,0.010494,0.029748,0.038814
4,0.006241,-0.014918,-0.004146,-0.000578,0.002059,-0.00247,0.002895,-0.016689,0.026294,0.021603,...,3.913261,0.245354,0.194741,0.414731,8.385123,0.440387,0.103003,-0.739234,0.291141,0.107857


# Model and score

In [249]:
X_train = train[features]
y_train = train[target]
# A quiet large number of trees with low depth to prevent overfits
'''parameters = {
    'boosting_type': 'rf',
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1, 
    'feature_fraction': np.sqrt(X_train.shape[1])/X_train.shape[1], 
    'objective': 'binary',
    'verbose': -1
}'''

rf_params = {
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1
}
train_dates = train['DATE'].unique()
test_dates = test['DATE'].unique()

n_splits = 4
scores = []
models = []

splits = KFold(n_splits=n_splits, random_state=0,
               shuffle=True).split(train_dates) # Generates the splits of the indexes to use as train / test

for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
    local_train_dates = train_dates[local_train_dates_ids]
    local_test_dates = train_dates[local_test_dates_ids]

    local_train_ids = train['DATE'].isin(local_train_dates)
    local_test_ids = train['DATE'].isin(local_test_dates)

    X_local_train = X_train.loc[local_train_ids]
    y_local_train = y_train.loc[local_train_ids]
    X_local_test = X_train.loc[local_test_ids]
    y_local_test = y_train.loc[local_test_ids]

    #X_local_train = X_local_train.fillna(0)
    #X_local_test = X_local_test.fillna(0)

    #model = LGBMClassifier(**parameters)
    model = RandomForestClassifier(**rf_params)
    model.fit(X_local_train, y_local_train.values.reshape(-1))

    y_local_pred = model.predict_proba(X_local_test)[:, 1]
    
    sub = _Xtrain.loc[local_test_ids].copy()
    sub['pred'] = y_local_pred
    y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

    models.append(model)
    score = accuracy_score(y_local_test, y_local_pred)
    scores.append(score)
    print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

mean = np.mean(scores)*100
std = np.std(scores)*100
u = (mean + std)
l = (mean - std)
print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')

Fold 1 - Accuracy: 52.52%
Fold 2 - Accuracy: 50.52%


KeyboardInterrupt: 

In [258]:
X_train = train[features]
y_train = train[target]
# A quiet large number of trees with low depth to prevent overfits
parameters = {
    'boosting_type': 'rf',
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1, 
    'feature_fraction': np.sqrt(X_train.shape[1])/X_train.shape[1], 
    'objective': 'binary',
    'verbose': -1
}

'''rf_params = {
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1
}'''
train_dates = train['DATE'].unique()
test_dates = test['DATE'].unique()

n_splits = 4
scores = []
models = []

splits = KFold(n_splits=n_splits, random_state=0,
               shuffle=True).split(train_dates) # Generates the splits of the indexes to use as train / test

for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
    local_train_dates = train_dates[local_train_dates_ids]
    local_test_dates = train_dates[local_test_dates_ids]

    local_train_ids = train['DATE'].isin(local_train_dates)
    local_test_ids = train['DATE'].isin(local_test_dates)

    X_local_train = X_train.loc[local_train_ids]
    y_local_train = y_train.loc[local_train_ids]
    X_local_test = X_train.loc[local_test_ids]
    y_local_test = y_train.loc[local_test_ids]

    #X_local_train = X_local_train.fillna(0)
    #X_local_test = X_local_test.fillna(0)

    model = LGBMClassifier(**parameters)
    #model = RandomForestClassifier(**rf_params)
    model.fit(X_local_train, y_local_train.values.reshape(-1))

    y_local_pred = model.predict_proba(X_local_test)[:, 1]
    
    sub = _Xtrain.loc[local_test_ids].copy()
    sub['pred'] = y_local_pred
    y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

    models.append(model)
    score = accuracy_score(y_local_test, y_local_pred)
    scores.append(score)
    print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

mean = np.mean(scores)*100
std = np.std(scores)*100
u = (mean + std)
l = (mean - std)
print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')

Fold 1 - Accuracy: 52.01%
Fold 2 - Accuracy: 50.74%


In [119]:
parameters = {
    'boosting_type': ['rf','gbdt'],
    'n_estimators': [500],
    'max_depth': [2**3, 2**4],
    'min_child_samples': [50,1000],
    'num_leaves': [100],
    'random_state': [0],
    'n_jobs': [-1], 
    'feature_fraction': [0.5,0.8], 
    'objective': ['binary'],
    'verbose': [-1]
}

In [120]:
search = GridSearchCV(LGBMClassifier(), param_grid=parameters, n_jobs=-1, scoring='accuracy', verbose=2)
search.fit(_Xtrain, _y.values.reshape(-1))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [126]:
search.best_params_

{'boosting_type': 'rf',
 'feature_fraction': 0.5,
 'max_depth': 16,
 'min_child_samples': 50,
 'n_estimators': 500,
 'n_jobs': -1,
 'num_leaves': 100,
 'objective': 'binary',
 'random_state': 0,
 'verbose': -1}

In [127]:
search.best_estimator_.feature_importances_

array([2195, 1892, 2679, 1453, 2788, 1818, 3574, 1514, 3343, 1218, 9158,
       9316, 2291, 2682, 1887, 1692])

In [129]:
# mix industry_date for MA and sector_date for past day 
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_feature_fraction,param_max_depth,param_min_child_samples,param_n_estimators,param_n_jobs,...,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,98.926106,1.632066,1.819633,0.22607,rf,0.5,16,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.519344,0.528721,0.519297,0.503745,0.501188,0.514459,0.010408,1
0,62.756028,12.794661,1.738171,0.348985,rf,0.5,8,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.520324,0.529008,0.520181,0.502323,0.500364,0.51444,0.011178,2
6,91.431783,0.793597,1.343985,0.032222,rf,0.8,16,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.518102,0.525377,0.519703,0.506456,0.500424,0.514012,0.009165,3
3,105.667028,6.066559,2.104545,0.276228,rf,0.5,16,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.521769,0.526774,0.516514,0.500352,0.501893,0.513461,0.010595,4
1,57.012313,15.115247,1.670854,0.270737,rf,0.5,8,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.519022,0.527049,0.517409,0.501654,0.499898,0.513007,0.010521,5
4,87.374178,6.451481,1.310774,0.072065,rf,0.8,8,50,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.514734,0.522223,0.51981,0.509992,0.49819,0.51299,0.008519,6
5,65.596519,4.393061,1.057972,0.074035,rf,0.8,8,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.513635,0.522796,0.520646,0.503745,0.502311,0.512627,0.008415,7
7,92.526387,1.925039,1.488797,0.042447,rf,0.8,16,1000,500,-1,...,-1,"{'boosting_type': 'rf', 'feature_fraction': 0....",0.515677,0.522271,0.518305,0.500878,0.50476,0.512378,0.008175,8
12,74.52665,4.447636,1.197993,0.044061,gbdt,0.8,8,50,500,-1,...,-1,"{'boosting_type': 'gbdt', 'feature_fraction': ...",0.517206,0.521614,0.51164,0.505512,0.505011,0.512197,0.006487,9
9,41.588604,0.827185,1.161193,0.031945,gbdt,0.5,8,1000,500,-1,...,-1,"{'boosting_type': 'gbdt', 'feature_fraction': ...",0.519834,0.524433,0.507806,0.498465,0.506659,0.511439,0.00942,10


In [130]:
y_pred = search.predict(_Xtrain)
print('Classic predict: ', accuracy_score(_y, y_pred))

Classic predict:  0.5691109545025621


In [137]:
y_pred = search.predict_proba(_Xtrain)[:, 1]
sub = _Xtrain.copy()
sub['pred'] = y_pred
y_pred = sub['pred'].transform(lambda x: x > x.median()).values
print('Proba predict: ', accuracy_score(_y, y_pred))

Proba predict:  0.569079898230987


In [135]:
print('Accuracy', accuracy_score(_y, search.best_estimator_.predict(_Xtrain)))

Accuracy 0.5691109545025621


In [110]:
_ypred = search.best_estimator_.predict(_Xtest)
df_results = pd.DataFrame({'RET': _ypred})
df_results.index = _Xtest.index
df_results.to_csv('y_test.csv')

In [111]:
df_results

Unnamed: 0_level_0,RET
ID,Unnamed: 1_level_1
418595,True
418596,False
418597,True
418598,True
418599,True
...,...
617019,False
617020,False
617021,False
617022,False
