In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report
from sklearn import svm
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Apple

In [2]:
# Apple
X_train_AAPL = pd.read_csv('data/AAPL/X_train_AAPL.csv', index_col=0)
y_train_AAPL = pd.read_csv('data/AAPL/y_train_AAPL.csv', index_col=0)
X_test_AAPL = pd.read_csv('data/AAPL/X_test_AAPL.csv', index_col=0)
y_test_AAPL = pd.read_csv('data/AAPL/y_test_AAPL.csv', index_col=0)

In [3]:
# Scale
scaler = MinMaxScaler()
X_train_AAPL = scaler.fit_transform(X_train_AAPL)
X_test_AAPL = scaler.transform(X_test_AAPL)

In [4]:
# model
apple =  HistGradientBoostingClassifier(random_state=42, max_depth=12, learning_rate=0.02, max_iter=100, min_samples_leaf=4, l2_regularization=0.1, class_weight='balanced')
apple.fit(X_train_AAPL, y_train_AAPL.values.ravel())


In [5]:
apple.score(X_train_AAPL, y_train_AAPL)

0.9018118059614261

In [6]:
print(classification_report(y_test_AAPL, apple.predict(X_test_AAPL)))
roc_auc_score(y_test_AAPL, apple.predict(X_test_AAPL))

              precision    recall  f1-score   support

           0       0.48      0.91      0.63       203
           1       0.54      0.09      0.16       224

    accuracy                           0.48       427
   macro avg       0.51      0.50      0.39       427
weighted avg       0.51      0.48      0.38       427



0.5025400246305419

In [7]:
apple.predict_proba(X_test_AAPL)

array([[0.57870102, 0.42129898],
       [0.67340424, 0.32659576],
       [0.56685205, 0.43314795],
       [0.60448399, 0.39551601],
       [0.58868908, 0.41131092],
       [0.51012869, 0.48987131],
       [0.59700898, 0.40299102],
       [0.20421129, 0.79578871],
       [0.54583421, 0.45416579],
       [0.43107285, 0.56892715],
       [0.53587781, 0.46412219],
       [0.46140135, 0.53859865],
       [0.55612865, 0.44387135],
       [0.49070334, 0.50929666],
       [0.52873245, 0.47126755],
       [0.50976425, 0.49023575],
       [0.46740485, 0.53259515],
       [0.42213299, 0.57786701],
       [0.4538859 , 0.5461141 ],
       [0.44259821, 0.55740179],
       [0.47390503, 0.52609497],
       [0.4189081 , 0.5810919 ],
       [0.40742873, 0.59257127],
       [0.38737531, 0.61262469],
       [0.32700797, 0.67299203],
       [0.53961953, 0.46038047],
       [0.31045267, 0.68954733],
       [0.32331448, 0.67668552],
       [0.28542635, 0.71457365],
       [0.42022685, 0.57977315],
       [0.

## Amazon

In [8]:
X_train_AMZN = pd.read_csv('data/AMZN/X_train_AMZN.csv', index_col=0)
y_train_AMZN = pd.read_csv('data/AMZN/y_train_AMZN.csv', index_col=0)
X_test_AMZN = pd.read_csv('data/AMZN/X_test_AMZN.csv', index_col=0)
y_test_AMZN = pd.read_csv('data/AMZN/y_test_AMZN.csv', index_col=0)

In [9]:
scaler = MinMaxScaler()
X_train_AMZN = scaler.fit_transform(X_train_AMZN)
X_test_AMZN = scaler.transform(X_test_AAPL)



In [10]:
amazon =  HistGradientBoostingClassifier(random_state=42, max_depth=12, learning_rate=0.02, max_iter=100, min_samples_leaf=4, l2_regularization=0.1)
amazon.fit(X_train_AMZN, y_train_AMZN.values.ravel())

In [11]:
amazon.score(X_train_AMZN, y_train_AMZN)

0.8872004675628288

In [12]:
print(classification_report(y_test_AMZN, amazon.predict(X_test_AMZN)))
roc_auc_score(y_test_AMZN, amazon.predict(X_test_AMZN))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       185
           1       0.57      1.00      0.72       242

    accuracy                           0.57       427
   macro avg       0.28      0.50      0.36       427
weighted avg       0.32      0.57      0.41       427



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.5

## GS

In [13]:
X_train_GS = pd.read_csv('data/GS/X_train_GS.csv', index_col=0)
y_train_GS = pd.read_csv('data/GS/y_train_GS.csv', index_col=0)
X_test_GS = pd.read_csv('data/GS/X_test_GS.csv', index_col=0)
y_test_GS = pd.read_csv('data/GS/y_test_GS.csv', index_col=0)

In [14]:
scalar = MinMaxScaler()
X_train_GS = scalar.fit_transform(X_train_GS)
X_test_GS = scalar.transform(X_test_GS)

In [15]:
GS = HistGradientBoostingClassifier(random_state=42, max_depth=12, learning_rate=0.02, max_iter=100, min_samples_leaf=4, l2_regularization=0.1)
GS.fit(X_train_GS, y_train_GS.values.ravel())

In [16]:
print(classification_report(y_test_GS, GS.predict(X_test_GS)))
roc_auc_score(y_test_GS, GS.predict(X_test_GS))

              precision    recall  f1-score   support

           0       0.48      0.33      0.39       199
           1       0.54      0.69      0.61       228

    accuracy                           0.52       427
   macro avg       0.51      0.51      0.50       427
weighted avg       0.51      0.52      0.50       427



0.507614828528608

## INTC


In [17]:
X_train_INTC = pd.read_csv('data/INTC/X_train_INTC.csv', index_col=0)
y_train_INTC = pd.read_csv('data/INTC/y_train_INTC.csv', index_col=0)
X_test_INTC = pd.read_csv('data/INTC/X_test_INTC.csv', index_col=0)
y_test_INTC = pd.read_csv('data/INTC/y_test_INTC.csv', index_col=0)

In [18]:
scalar = MinMaxScaler()
X_train_INTC = scalar.fit_transform(X_train_INTC)
X_test_INTC = scalar.transform(X_test_INTC)

In [19]:
INTC = HistGradientBoostingClassifier(random_state=42, max_depth=12, learning_rate=0.02, max_iter=100, min_samples_leaf=4, l2_regularization=0.1)
INTC.fit(X_train_INTC, y_train_INTC.values.ravel())

In [20]:
print(classification_report(y_test_INTC, INTC.predict(X_test_INTC)))
roc_auc_score(y_test_INTC, INTC.predict(X_test_INTC))

              precision    recall  f1-score   support

           0       0.46      0.61      0.52       198
           1       0.52      0.37      0.43       229

    accuracy                           0.48       427
   macro avg       0.49      0.49      0.48       427
weighted avg       0.49      0.48      0.48       427



0.49114507520621054

### MSFT


In [21]:
X_train_MSFT = pd.read_csv('data/MSFT/X_train_MSFT.csv', index_col=0)   
y_train_MSFT = pd.read_csv('data/MSFT/y_train_MSFT.csv', index_col=0)
X_test_MSFT = pd.read_csv('data/MSFT/X_test_MSFT.csv', index_col=0)
y_test_MSFT = pd.read_csv('data/MSFT/y_test_MSFT.csv', index_col=0)

In [22]:
scalar = MinMaxScaler()
X_train_MSFT = scalar.fit_transform(X_train_MSFT)
X_test_MSFT = scalar.transform(X_test_MSFT)

In [24]:
MSFT = HistGradientBoostingClassifier(random_state=42, max_depth=12, learning_rate=0.02, max_iter=100, min_samples_leaf=4, l2_regularization=0.1)
MSFT.fit(X_train_MSFT, y_train_MSFT.values.ravel())

In [25]:
print(classification_report(y_test_MSFT, MSFT.predict(X_test_MSFT)))
roc_auc_score(y_test_MSFT, MSFT.predict(X_test_MSFT))

              precision    recall  f1-score   support

           0       0.47      0.66      0.55       199
           1       0.55      0.36      0.43       228

    accuracy                           0.50       427
   macro avg       0.51      0.51      0.49       427
weighted avg       0.51      0.50      0.49       427



0.5092898704046548

In [31]:
prob_MSFT = MSFT.predict_proba(X_test_MSFT)
pred_MSFT = MSFT.predict(X_test_MSFT)

# Backtest

In [70]:
def single_side(row):
    if row['pred'] == 1:
        return row['Close'] - row['Open']
    else:
        return row['Open'] - row['Close']

In [71]:
def single_side_with_prob_cutoff(row):
    if row['pred'] == 1 and row['prob1'] > 0.6:
        return row['Close'] - row['Open']
    elif row['pred'] == 0 and row['prob0'] > 0.6:
        return row['Open'] - row['Close']
    else:
        return 0

In [72]:
def long_short(row):
    return (row['Close'] - row['Open']) * row['prob1'] + (row['Open'] - row['Close']) * row['prob0']

In [75]:
def dollar_cost_averaging(df, interval, capital):
    shares = 0
    capital_used = 0
    for index, row in df.iterrows():
        if index % interval == 0:
            shares += capital / row['Open']
            capital_used += capital

    
    return capital_used, shares, shares * df.iloc[-1]['Close']



In [73]:
prob_MSFT = pd.DataFrame(prob_MSFT, columns=['prob_0', 'prob_1'])
prob_MSFT

Unnamed: 0,prob_0,prob_1
0,0.449876,0.550124
1,0.327562,0.672438
2,0.596223,0.403777
3,0.668389,0.331611
4,0.549018,0.450982
...,...,...
422,0.499714,0.500286
423,0.438293,0.561707
424,0.506107,0.493893
425,0.539444,0.460556


In [74]:
backtest_MSFT = pd.read_csv('data/MSFT/X_test_MSFT.csv')[['Open', 'Close']]
backtest_MSFT['pred'] = pred_MSFT
backtest_MSFT[['prob0', 'prob1']] = prob_MSFT
backtest_MSFT['single'] = backtest_MSFT.apply(single_side, axis=1)
backtest_MSFT['single_with_prob_cutoff'] = backtest_MSFT.apply(single_side_with_prob_cutoff, axis=1)
backtest_MSFT['cumsum'] = backtest_MSFT['single'].cumsum()
backtest_MSFT['cumsum_with_prob_cutoff'] = backtest_MSFT['single_with_prob_cutoff'].cumsum()
backtest_MSFT['long_short'] = backtest_MSFT.apply(long_short, axis=1)
backtest_MSFT['cumsum_long_short'] = backtest_MSFT['long_short'].cumsum()
backtest_MSFT

Unnamed: 0,Open,Close,pred,prob0,prob1,single,single_with_prob_cutoff,cumsum,cumsum_with_prob_cutoff,long_short,cumsum_long_short
0,57.47,57.53,1,0.449876,0.550124,0.06,0.00,0.06,0.00,0.006015,0.006015
1,57.50,57.25,1,0.327562,0.672438,-0.25,-0.25,-0.19,-0.25,-0.086219,-0.080204
2,60.28,59.66,0,0.596223,0.403777,0.62,0.00,0.43,-0.25,0.119317,0.039113
3,59.94,61.00,0,0.668389,0.331611,-1.06,-1.06,-0.63,-1.31,-0.356986,-0.317873
4,60.85,60.99,0,0.549018,0.450982,-0.14,0.00,-0.77,-1.31,-0.013725,-0.331597
...,...,...,...,...,...,...,...,...,...,...,...
422,100.00,98.39,1,0.499714,0.500286,-1.61,0.00,-19.68,-17.57,-0.000920,-5.877308
423,98.82,99.08,1,0.438293,0.561707,0.26,0.00,-19.42,-17.57,0.032088,-5.845220
424,99.58,97.54,0,0.506107,0.493893,2.04,0.00,-17.38,-17.57,0.024916,-5.820304
425,97.38,98.63,0,0.539444,0.460556,-1.25,0.00,-18.63,-17.57,-0.098611,-5.918915


In [77]:
dollar_cost_averaging(backtest_MSFT, 5, 100)


(8600, 114.62599847424194, 11303.269709544997)