I have previously applied several regression models and analyzed their results. They were not reliable enough to be used for real-world portfolio management.

Another way to approach the main problem of stock market investing is to treat it as a classification problem: can a model predict well enough whether a stock price will increase by 5% or more on the next trading day?

Let's investigate by building the appropriate dataset and training various classifiers.

In [2]:
!pip install -r requirements.txt



In [3]:
import utils
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, fbeta_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle

In [4]:
X_df = utils.get_stock_feature_dataset('ALNOV.PA')

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [5]:
X_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,...,others_dlr,others_cr,cac40_Open,cac40_High,cac40_Low,cac40_Close,sbf120_Open,sbf120_High,sbf120_Low,sbf120_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-12,8.7,8.7,8.7,8.7,8.7,450.0,0.0,450.0,0.0,-24.874197,...,-2.561258,0.0,3401.26001,3419.0,3389.080078,3389.080078,2607.149902,2607.149902,2607.149902,2607.149902
2012-10-15,8.48,8.48,8.48,8.48,8.48,730.0,0.0,-280.0,0.0,-24.874197,...,-2.561258,-2.528736,3387.76001,3439.52002,3387.76001,3420.280029,2630.100098,2630.100098,2630.100098,2630.100098
2012-10-16,8.7,8.7,8.7,8.7,8.7,3000.0,0.0,2720.0,0.0,-24.874197,...,2.561258,0.0,3443.600098,3500.939941,3428.969971,3500.939941,2686.030029,2686.030029,2686.030029,2686.030029
2012-10-17,8.72,8.72,8.72,8.72,8.72,800.0,0.0,3520.0,0.0,-24.874197,...,0.229621,0.229885,3503.820068,3530.790039,3494.350098,3527.5,2704.060059,2704.060059,2704.060059,2704.060059
2012-10-18,8.83,8.83,8.83,8.83,8.83,138.0,0.0,3658.0,0.0,-24.874197,...,1.253578,1.494253,3527.76001,3542.580078,3508.399902,3535.179932,2709.370117,2709.370117,2709.370117,2709.370117


In [6]:
def make_labels_dataset(X_df, increase=0.05, label_name='increase_tomorrow'):
    '''
        Build binary "price increases on the next row" labels for a stock dataset.

        A row is labeled 1 when the 'Adj Close' of the following row is strictly
        greater than this row's 'Adj Close' raised by `increase` (e.g. +5%), and 0
        otherwise. The last row has no following row, so it is left out of both
        returned frames.

        The DataFrame passed in is NOT modified: a truncated copy is returned.

        X_df: DataFrame with an 'Adj Close' column (rows are assumed to be in
              chronological order, one row per market day)
        increase: float between 0 and 1, equivalent to the desired % increase when multiplied by 100
        label_name: name for the column containing labels

        Returns a tuple (features, labels): `features` is X_df without its last
        row, `labels` is a one-column integer DataFrame sharing the same index.
    '''
    adj_close = X_df['Adj Close']

    # Price a row must be exceeded by, on the following row, to be labeled 1
    increase_threshold = adj_close + increase * adj_close

    # Align every row with the price of the row that follows it
    next_adj_close = adj_close.shift(-1)

    # Comparisons involving NaN evaluate to False, so missing prices yield label 0
    labels = (next_adj_close > increase_threshold).astype(int)
    y_df = labels.to_frame(name=label_name)

    # Drop last row, for which there is no label (copies keep the caller's data intact)
    return X_df.iloc[:-1].copy(), y_df.iloc[:-1].copy()

In [7]:
X_df, y_df = make_labels_dataset(X_df)

In [8]:
y_df.tail()

Unnamed: 0_level_0,increase_tomorrow
Date,Unnamed: 1_level_1
2020-06-29,0
2020-06-30,0
2020-07-01,0
2020-07-02,1
2020-07-03,0


In [9]:
# Sanity check of the generated labels on a known large move:
# the stock price jumped on the market day that followed April 9th 2020.
day_before = '2020-04-09'
day_after = '2020-04-14'

close_before = X_df.loc[day_before]['Adj Close']
close_after = X_df.loc[day_after]['Adj Close']
print("Let's check if this {:.2f}% increase is correctly labeled...".format((close_after - close_before) / close_before * 100))

# Positional index of April 9th 2020 (the day before the increase):
# it is the last row of the slice that ends on that date
idx = len(X_df.loc[:day_before]) - 1
assert np.array_equal(X_df.loc[day_before].values, X_df.iloc[idx].values)

# The label stored at that position must be 1
assert y_df.iloc[idx]['increase_tomorrow'] == 1

print('Good!')

Let's check if this 31.62% increase is correctly labeled...
Good!


Let's split the dataset into training and testing sets, and normalize them:

In [10]:
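# Split dataset into 90-10% training-testing sets.
# Rows are shuffled, on the assumption that the specificities of
# time series are not relevant anymore for our classification task.
# NOTE(review): consecutive days presumably carry very similar feature values
# (prices, cumulative/rolling indicators), so a shuffled split may leak
# information between the training and testing sets -- worth comparing
# against a chronological split before trusting the scores.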
train_X, train_y, test_X, test_y = utils.split_dataset(X_df, y_df, train_size=0.9, do_shuffle=True)

In [11]:
# Check if there are both labels in the training and testing sets
print('training set contains {:.2f}% records labeled as 1'.format((train_y.values.sum()/train_y.shape[0] * 100)))
print('testing set contains {:.2f}% records labeled as 1'.format(test_y.values.sum()/test_y.shape[0] * 100))

training set contains 7.33% records labeled as 1
testing set contains 8.63% records labeled as 1


In [12]:
# Bring every feature to a common range. The scaler is fitted on the
# training features only and then applied to both sets.
X_scaler = MinMaxScaler()
X_scaler.fit(train_X.values)

train_X_scaled, test_X_scaled = (X_scaler.transform(part.values) for part in (train_X, test_X))

# Flatten the label frames into 1-D arrays
train_y, test_y = train_y.values.reshape(-1), test_y.values.reshape(-1)

I will first use the benchmark algorithm 'DummyClassifier', and then apply and test LinearSVC, LogisticRegression, SVC, KNeighborsClassifier, RandomForestClassifier and AdaBoostClassifier.

The metrics I will evaluate for this classification task are accuracy, precision and Fbeta-score with beta=0.5 to penalize false positives more.

In [13]:
def print_metrics(y_true, y_pred):
    '''
        Print accuracy and precision (as percentages) and the F-beta score
        (beta=0.5) of predictions `y_pred` against ground truth `y_true`.
    '''
    # Each metric is computed right before it is printed, one line per metric
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    print('\taccuracy: {:.2f}%'.format(accuracy_pct))

    precision_pct = precision_score(y_true, y_pred) * 100
    print('\tprecision: {:.2f}%'.format(precision_pct))

    fbeta = fbeta_score(y_true, y_pred, beta=0.5)
    print('\tfbeta: {:.3f}'.format(fbeta))

def train_eval(model, train_X, train_y, test_X, test_y):
    '''
        Fit `model` on (train_X, train_y), predict on test_X and print the
        resulting metrics against test_y.

        model: object exposing fit(X, y) and predict(X)
    '''
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)

    print('Results:')
    print_metrics(test_y, predictions)

In [14]:
train_eval(DummyClassifier(), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 82.74%
	precision: 0.00%
	fbeta: 0.000




In [15]:
train_eval(LinearSVC(class_weight='balanced', max_iter=100000), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 74.62%
	precision: 17.65%
	fbeta: 0.204


In [16]:
train_eval(SGDClassifier(class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 78.68%
	precision: 17.95%
	fbeta: 0.202


In [17]:
train_eval(LogisticRegression(class_weight='balanced', max_iter=100000), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 75.63%
	precision: 19.61%
	fbeta: 0.226


In [18]:
train_eval(SVC(class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 75.63%
	precision: 19.61%
	fbeta: 0.226


In [19]:
train_eval(KNeighborsClassifier(), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 91.37%
	precision: 50.00%
	fbeta: 0.303


In [20]:
train_eval(RandomForestClassifier(n_estimators=2000, class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 91.88%
	precision: 66.67%
	fbeta: 0.345


In [21]:
weak_learner = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
ada_model = AdaBoostClassifier(weak_learner, algorithm="SAMME", n_estimators=2000)
train_eval(ada_model, train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 90.86%
	precision: 40.00%
	fbeta: 0.270


The precision scores obtained by RandomForestClassifier and AdaBoostClassifier are fairly encouraging, but their F-beta scores are not, and their accuracy results suggest that both models are overfitting.

Let's try to reduce overfitting and improve the precision and F-beta scores by aggregating many more stocks than just one in our training and testing sets, and re-training all these models on many more data points:

In [22]:
symbol_list = ['AI.PA', 'SAF.PA', 'GNFT.PA', 'ALNOV.PA', 'FDJ.PA', 'ETL.PA', 'DBV.PA',
              'BN.PA', 'KER.PA', 'AIR.PA', 'ENGI.PA', 'FP.PA', 'DG.PA', 'VIV.PA',
              'UG.PA', 'SU.PA', 'VIE.PA', 'ALPHA.PA', 'ALBIO.PA', 'CRI.PA', 'ALERS.PA']

# Per-symbol frames are collected and concatenated once after the loop:
# DataFrame.append is deprecated (removed in pandas 2.0) and copied all the
# rows accumulated so far on every call.
X_parts = []
y_parts = []
total_rows = 0

for symbol in symbol_list:
    print('Processing {}...'.format(symbol))
    symbol_X_df = utils.get_stock_feature_dataset(symbol)
    symbol_X_df, symbol_y_df = make_labels_dataset(symbol_X_df)

    X_parts.append(symbol_X_df)
    y_parts.append(symbol_y_df)

    # Running shape of the aggregated dataset; the column count is taken from
    # the current symbol (assumes every symbol yields the same columns)
    total_rows += len(symbol_X_df)
    print('Done! new X_df shape: {}, new y_df shape: {}'.format(
        (total_rows, symbol_X_df.shape[1]), (total_rows, symbol_y_df.shape[1])))
    print('')

# ignore_index=True discards the dates, which are not required for classification
X_df = pd.concat(X_parts, ignore_index=True)
y_df = pd.concat(y_parts, ignore_index=True)

Processing AI.PA...


  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


Done! new X_df shape: (5272, 86), new y_df shape: (5272, 1)

Processing SAF.PA...
Done! new X_df shape: (10544, 86), new y_df shape: (10544, 1)

Processing GNFT.PA...
Done! new X_df shape: (14003, 86), new y_df shape: (14003, 1)

Processing ALNOV.PA...
Done! new X_df shape: (15974, 86), new y_df shape: (15974, 1)

Processing FDJ.PA...
Done! new X_df shape: (16131, 86), new y_df shape: (16131, 1)

Processing ETL.PA...
Done! new X_df shape: (19859, 86), new y_df shape: (19859, 1)

Processing DBV.PA...
Done! new X_df shape: (21968, 86), new y_df shape: (21968, 1)

Processing BN.PA...
Done! new X_df shape: (29739, 86), new y_df shape: (29739, 1)

Processing KER.PA...
Done! new X_df shape: (35011, 86), new y_df shape: (35011, 1)

Processing AIR.PA...
Done! new X_df shape: (39848, 86), new y_df shape: (39848, 1)

Processing ENGI.PA...
Done! new X_df shape: (45120, 86), new y_df shape: (45120, 1)

Processing FP.PA...
Done! new X_df shape: (50392, 86), new y_df shape: (50392, 1)

Processing DG

In [23]:
# Make every column numeric, then turn infinite values into missing values
X_df = X_df.astype(float)
X_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing values by interpolating down the rows, in both directions.
# NOTE(review): X_df stacks several symbols, so a gap at a symbol boundary is
# filled using rows of the neighbouring symbol -- confirm this is acceptable.
X_df.interpolate(axis=0, limit_direction='both', inplace=True)

In [24]:
X_df.isna().sum().sum()

0

In [25]:
X_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,...,others_dlr,others_cr,cac40_Open,cac40_High,cac40_Low,cac40_Close,sbf120_Open,sbf120_High,sbf120_Low,sbf120_Close
0,34.854301,36.306599,34.771301,35.061798,10.226519,904282.0,-562079.8,904282.0,-0.402545,-833509.084988,...,-4.539366,0.0,6024.379883,6102.120117,5901.77002,5917.370117,4035.110107,4035.110107,4035.110107,4035.110107
1,35.061798,34.9995,32.613701,33.505798,9.772677,1381445.0,-910426.0,-477163.0,-0.402545,-833509.084988,...,-4.539366,-4.437879,5922.22998,5925.069824,5657.200195,5672.02002,3873.149902,3873.149902,3873.149902,3873.149902
2,32.779701,33.4021,32.2817,33.194599,9.681908,853763.0,-372901.3,-1330926.0,-0.402545,-833509.084988,...,-0.933132,-5.325451,5521.830078,5589.5,5461.589844,5479.700195,3743.870117,3743.870117,3743.870117,3743.870117
3,32.7589,36.223598,32.696701,35.580399,10.377778,1387137.0,508292.3,56211.0,-0.402545,-833509.084988,...,6.940771,1.479106,5485.930176,5530.259766,5388.850098,5450.109863,3728.080078,3728.080078,3728.080078,3728.080078
4,35.580399,37.136398,34.958,35.144798,10.250728,2198233.0,-1312943.0,-2142022.0,-0.402545,-833509.084988,...,-1.231828,0.236725,5423.879883,5561.689941,5423.879883,5539.609863,3794.070068,3794.070068,3794.070068,3794.070068


In [26]:
y_df.head()

Unnamed: 0,increase_tomorrow
0,0
1,0
2,1
3,0
4,0


In [27]:
train_X, train_y, test_X, test_y = utils.split_dataset(X_df, y_df, train_size=0.9, do_shuffle=True)

In [28]:
# Check if there are both labels in the training and testing sets
print('training set contains {:.2f}% records labeled as 1'.format((train_y.values.sum()/train_y.shape[0] * 100)))
print('testing set contains {:.2f}% records labeled as 1'.format(test_y.values.sum()/test_y.shape[0] * 100))

training set contains 4.01% records labeled as 1
testing set contains 3.73% records labeled as 1


In [29]:
# Same scaling as for the single-stock run: fit the scaler on the training
# features, then transform the training and testing features with it.
X_scaler = MinMaxScaler()
X_scaler.fit(train_X.values)

train_X_scaled = X_scaler.transform(train_X.values)
test_X_scaled = X_scaler.transform(test_X.values)

# Labels as flat 1-D arrays
train_y, test_y = (labels.values.reshape(-1) for labels in (train_y, test_y))

In [30]:
%%time
train_eval(DummyClassifier(), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 92.55%
	precision: 2.87%
	fbeta: 0.029
Wall time: 15.6 ms




In [31]:
%%time
train_eval(LinearSVC(class_weight='balanced', max_iter=100000), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 83.49%
	precision: 15.48%
	fbeta: 0.184
Wall time: 6min 55s


In [32]:
%%time
train_eval(LogisticRegression(class_weight='balanced', max_iter=100000), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 82.76%
	precision: 14.90%
	fbeta: 0.178
Wall time: 5.33 s


In [33]:
%%time
train_eval(SVC(class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 88.82%
	precision: 21.36%
	fbeta: 0.249
Wall time: 8min 35s


In [34]:
%%time
train_eval(SGDClassifier(class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 84.73%
	precision: 16.51%
	fbeta: 0.196
Wall time: 1.16 s


In [35]:
%%time
train_eval(KNeighborsClassifier(), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 97.78%
	precision: 95.24%
	fbeta: 0.763
Wall time: 1min 27s


In [36]:
%%time
train_eval(RandomForestClassifier(n_estimators=2000, class_weight='balanced'), train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 97.88%
	precision: 99.31%
	fbeta: 0.790
Wall time: 24min 58s


In [37]:
%%time
weak_learner = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
ada_model = AdaBoostClassifier(weak_learner, algorithm="SAMME", n_estimators=2000)
train_eval(ada_model, train_X_scaled, train_y, test_X_scaled, test_y)

Results:
	accuracy: 96.91%
	precision: 60.07%
	fbeta: 0.579
Wall time: 1min 47s


KNeighborsClassifier, RandomForestClassifier and AdaBoostClassifier got very good results. Let's try to tune their hyperparameters with a grid search:

In [38]:
scorer = make_scorer(fbeta_score, beta=0.5)

In [None]:
# Grid search over KNeighborsClassifier hyperparameters, scored with `scorer`
kn_clf = KNeighborsClassifier()
# NOTE(review): `leaf_size` presumably only affects neighbour-search speed and
# memory, not the predictions -- confirm whether it needs to be in the grid.
parameters = {
    'n_neighbors': [3, 5, 10],
    'leaf_size': [10, 30, 50],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]
}
grid_obj = GridSearchCV(estimator=kn_clf, param_grid=parameters, scoring=scorer)
grid_fit = grid_obj.fit(train_X_scaled, train_y)

# Evaluate the best estimator found on the held-out testing set
best_kn_clf = grid_fit.best_estimator_
preds = best_kn_clf.predict(test_X_scaled)
print_metrics(test_y, preds)

# Save model to disk (the context manager closes the file even if pickling fails)
with open('best-kneighbors-clf.pickle', 'wb') as outfile:
    pickle.dump(best_kn_clf, outfile)

In [None]:
# Plot the feature importance used by KNeighborsClassifier.
# KNeighborsClassifier does not expose a `feature_importances_` attribute, so
# the importances are estimated by permutation instead: how much the score
# drops when the values of one feature are shuffled.
from sklearn.inspection import permutation_importance

# Permutation importance needs one prediction pass per feature and per repeat,
# which is slow with KNN: work on a fixed random subsample of the testing set.
rng = np.random.RandomState(42)
sample_idx = rng.choice(len(test_X_scaled), size=min(2000, len(test_X_scaled)), replace=False)

kn_importances = permutation_importance(
    best_kn_clf,
    test_X_scaled[sample_idx],
    test_y[sample_idx],
    scoring=scorer,
    n_repeats=3,
    n_jobs=-1,
    random_state=42,
)

# NOTE(review): assumes utils.plot_feature_importance accepts any per-feature
# array of scores (permutation importances can be negative) -- confirm.
utils.plot_feature_importance(kn_importances.importances_mean, train_X)

In [None]:
# Grid search over RandomForestClassifier hyperparameters, scored with `scorer`
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)
parameters = {
    'n_estimators': [100, 500, 2000],
    'max_depth': [10, 20, 30]
}
grid_obj = GridSearchCV(estimator=rf_clf, param_grid=parameters, scoring=scorer)
grid_fit = grid_obj.fit(train_X_scaled, train_y)

# Evaluate the best estimator found on the held-out testing set
best_rf_clf = grid_fit.best_estimator_
preds = best_rf_clf.predict(test_X_scaled)
print_metrics(test_y, preds)

# Save model to disk (the context manager closes the file even if pickling fails)
with open('best-randomforest-clf.pickle', 'wb') as outfile:
    pickle.dump(best_rf_clf, outfile)

In [None]:
# Plot the feature importance used by RandomForestClassifier
utils.plot_feature_importance(best_rf_clf.feature_importances_, train_X)

In [None]:
# Grid search over AdaBoostClassifier hyperparameters and those of its
# decision-tree weak learner (the `base_estimator__*` keys), scored with `scorer`
weak_learner = DecisionTreeClassifier(class_weight='balanced')
ada_clf = AdaBoostClassifier(weak_learner)
parameters = {
    'algorithm': ['SAMME', 'SAMME.R'],
    'learning_rate': [0.5, 1.0, 2],
    'n_estimators': [100, 500, 2000],
    'base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__splitter': ['best', 'random'],
    'base_estimator__max_depth': [10, 20, 30]
}
grid_obj = GridSearchCV(estimator=ada_clf, param_grid=parameters, scoring=scorer)
grid_fit = grid_obj.fit(train_X_scaled, train_y)

# Evaluate the best estimator found on the held-out testing set
best_ada_clf = grid_fit.best_estimator_
preds = best_ada_clf.predict(test_X_scaled)
print_metrics(test_y, preds)

# Save model to disk (the context manager closes the file even if pickling fails)
with open('best-adaboost-clf.pickle', 'wb') as outfile:
    pickle.dump(best_ada_clf, outfile)

In [None]:
# Plot the feature importance used by AdaBoostClassifier
utils.plot_feature_importance(best_ada_clf.feature_importances_, train_X)