In [7]:
import torch
from torch import nn as nn
import torch.nn.functional as F
from torch.nn import Linear
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from datetime import datetime
import yfinance as yahooFinance
import pickle
import xgboost as xgb


In [2]:
# Fetch the full available price history (period="max") for the 'QQQ' ticker.
history = yahooFinance.Ticker('QQQ')
data = history.history(period="max")

# Open-to-close move of each row, expressed as a fraction of the open.
data['Change'] = data['Close'].sub(data['Open']).div(data['Open'])
# Copy the index into a regular 'Date' column so the .dt accessor can be used.
data['Date'] = pd.to_datetime(data.index)
data['day_of_year'] = data['Date'].dt.dayofyear

# Quick sanity check: first rows, last rows, overall shape (one item per line).
print(data.head(), data.tail(), data.shape, sep='\n')

                                Open       High        Low      Close  \
Date                                                                    
1999-03-10 00:00:00-05:00  43.768583  43.795337  43.046241  43.715076   
1999-03-11 00:00:00-05:00  44.036098  44.290256  43.072976  43.929085   
1999-03-12 00:00:00-05:00  43.768579  43.795332  42.511169  42.858963   
1999-03-15 00:00:00-05:00  43.179995  44.143118  42.725187  44.089611   
1999-03-16 00:00:00-05:00  44.276879  44.651427  43.795318  44.464153   

                            Volume  Dividends  Stock Splits  Capital Gains  \
Date                                                                         
1999-03-10 00:00:00-05:00  5232000        0.0           0.0            0.0   
1999-03-11 00:00:00-05:00  9688600        0.0           0.0            0.0   
1999-03-12 00:00:00-05:00  8743600        0.0           0.0            0.0   
1999-03-15 00:00:00-05:00  6369000        0.0           0.0            0.0   
1999-03-16 00:00:00-

In [3]:
# Build the supervised dataset: row r describes the W rows of `data` that
# precede row r + W, and y[r] is the 'Change' value of row r + W itself.
N = len(data)
W = 15            # look-back window length, in rows of `data`
n_attributes = 4  # feature slots reserved per look-back row (see note in loop)

# Pull each per-row series out of the frame once, as float arrays, instead of
# doing scalar .iloc lookups inside a nested Python loop.
high = data['High'].to_numpy(dtype=float)
low = data['Low'].to_numpy(dtype=float)
day_range = np.abs(high - low) / low          # (High - Low) relative to Low
volume = data['Volume'].to_numpy(dtype=float)
change = data['Change'].to_numpy(dtype=float)

# Layout of one row of X:
#   [lag1: range, volume, change, <unused>, lag2: ..., ..., lagW: ..., day_of_year]
X = np.zeros((N - W, n_attributes * W + 1))
y = change[W:].copy()

for j in range(W):
    # Lag j + 1: output row r (target row r + W) reads source row r + W - j - 1,
    # i.e. source rows W - j - 1 .. N - j - 2 for r = 0 .. N - W - 1.
    lagged = slice(W - j - 1, N - j - 1)
    X[:, j * n_attributes + 0] = day_range[lagged]
    X[:, j * n_attributes + 1] = volume[lagged]
    X[:, j * n_attributes + 2] = change[lagged]
    # Slot 3 of every group is never written and stays 0.0. The width is kept
    # at n_attributes so the 61-column layout is unchanged.
    # NOTE(review): confirm whether a fourth attribute was intended here.

# The last column is the day of year of the target row itself.
X[:, -1] = data['day_of_year'].to_numpy()[W:]

print(X.shape, y.shape)
print(X[:5])
print(y[:5])


(6201, 61) (6201,)
[[ 1.82675309e-02  4.94060000e+06 -5.83430572e-03  0.00000000e+00
   3.06122449e-02  5.11380000e+06  2.38663484e-02  0.00000000e+00
   1.81928441e-02  8.25620000e+06 -2.40963855e-03  0.00000000e+00
   2.74666257e-02  8.13380000e+06  2.07317073e-02  0.00000000e+00
   4.12371134e-02  8.44700000e+06  2.47461929e-02  0.00000000e+00
   3.65618987e-02  1.09624000e+07 -2.98136646e-02  0.00000000e+00
   2.10396040e-02  5.02480000e+06 -1.64034022e-02  0.00000000e+00
   4.02930403e-02  7.16040000e+06 -3.81455399e-02  0.00000000e+00
   2.15477997e-02  4.84840000e+06  2.06310680e-02  0.00000000e+00
   1.15501520e-02  3.96500000e+06 -7.22021661e-03  0.00000000e+00
   1.95479536e-02  4.90580000e+06  4.22960725e-03  0.00000000e+00
   3.31872260e-02  6.36900000e+06  2.10656753e-02  0.00000000e+00
   3.02076778e-02  8.74360000e+06 -2.07823961e-02  0.00000000e+00
   2.82608696e-02  9.68860000e+06 -2.43013366e-03  0.00000000e+00
   1.74021131e-02  5.23200000e+06 -1.22249389e-03  0.0000

In [4]:
# Hold out 10% of the rows for testing. shuffle=False keeps the rows in their
# original order, so the split is a head/tail cut rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
)
# Preview the first ten training rows and their targets.
print(X_train[:10], y_train[:10], sep='\n')

print('train size:', X_train.shape, y_train.shape)
print('test size:', X_test.shape, y_test.shape)

[[ 1.82675309e-02  4.94060000e+06 -5.83430572e-03  0.00000000e+00
   3.06122449e-02  5.11380000e+06  2.38663484e-02  0.00000000e+00
   1.81928441e-02  8.25620000e+06 -2.40963855e-03  0.00000000e+00
   2.74666257e-02  8.13380000e+06  2.07317073e-02  0.00000000e+00
   4.12371134e-02  8.44700000e+06  2.47461929e-02  0.00000000e+00
   3.65618987e-02  1.09624000e+07 -2.98136646e-02  0.00000000e+00
   2.10396040e-02  5.02480000e+06 -1.64034022e-02  0.00000000e+00
   4.02930403e-02  7.16040000e+06 -3.81455399e-02  0.00000000e+00
   2.15477997e-02  4.84840000e+06  2.06310680e-02  0.00000000e+00
   1.15501520e-02  3.96500000e+06 -7.22021661e-03  0.00000000e+00
   1.95479536e-02  4.90580000e+06  4.22960725e-03  0.00000000e+00
   3.31872260e-02  6.36900000e+06  2.10656753e-02  0.00000000e+00
   3.02076778e-02  8.74360000e+06 -2.07823961e-02  0.00000000e+00
   2.82608696e-02  9.68860000e+06 -2.43013366e-03  0.00000000e+00
   1.74021131e-02  5.23200000e+06 -1.22249389e-03  0.00000000e+00
   9.00000

In [5]:
# Regression model: predict the numeric 'Change' target from the window features.
params = {
    'max_depth': 5,
    'eta': 0.01,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 42,
    'n_estimators': 1200,
    'n_jobs': -1
}

# fit() returns the estimator itself, so construction and training can be chained.
model = xgb.XGBRegressor(**params).fit(X_train, y_train)

# Compare the first few predictions with the true values, one array per line.
y_pred = model.predict(X_test)
print(y_pred[:10], y_test[:10], sep='\n')

# Mean absolute error over the held-out rows.
print('Mean absolute error: ', np.abs(y_pred - y_test).mean())

[ 0.00059332  0.00139929  0.00027348  0.00077176 -0.00040054  0.0008291
 -0.00029746  0.00126914  0.00080263  0.0003845 ]
[-0.00166386  0.01130029  0.00462623 -0.00484629 -0.00393298  0.00937158
  0.00258551  0.00828265 -0.00549175 -0.0050475 ]
Mean absolute error:  0.010108220779139473


In [9]:
# Binary direction labels derived from the regression targets:
# 0 when the change is negative, 1 otherwise (an exactly flat row counts as 1).
# np.where labels the whole array at once instead of a per-index Python loop.
y2_train = np.where(np.asarray(y_train) < 0, 0, 1)
y2_test = np.where(np.asarray(y_test) < 0, 0, 1)

params2 = {
    'max_depth': 10,
    'eta': 0.01,
    'objective': 'binary:logistic',
    'seed': 42,
    'n_estimators': 1000,
    'n_jobs': -1
}

# NOTE(review): the output recorded for this cell is a TypeError complaining
# about an xgboost DMatrix input, yet this cell only passes X_train / X_test.
# Presumably those names were rebound in a cell that no longer exists; re-run
# the notebook from a fresh kernel to confirm the cell executes cleanly.
model2 = xgb.XGBClassifier(**params2)
model2.fit(X_train, y2_train)
y2_pred = model2.predict(X_test)
print(y2_pred[:20])
print(y2_test[:20])

# Held-out accuracy plus the per-class precision / recall / F1 breakdown.
print('Accuracy: ', accuracy_score(y2_test, y2_pred))
print(classification_report(y2_test, y2_pred))




TypeError: Not supported type for data.<class 'xgboost.core.DMatrix'>

In [800]:
# Persist the trained classifier as JSON; the path is relative to the working directory.
model2.save_model(fname='datasets/stocks/xgb_model_classification.json')