In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
# create data frame
data = pd.read_csv('stock_data.csv')

In [3]:
# transform date to datetime, get year,month,day from date column
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

print(data.head(10))

        date   open   high     low  close    volume Name   7_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755    A        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405    A        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731    A        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338    A        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245    A        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315    A        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141    A  44.127143   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183    A  43.721429   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149    A  43.297143   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862    A  42.894286   

   15_day_ma  30_day_ma  daily_returns  daily_volatility  target  year  month  \
0        NaN        NaN            NaN               NaN       0  2013      2   
1        NaN        NaN      -0.010648               NaN

In [4]:
#save names of all companies prior to on-hotencoding
unique_companies = data['Name'].unique()

# One-hot encode company tickers to differentiate between companies in training
data = pd.get_dummies(data, columns=['Name'])

print(data.head(10))

        date   open   high     low  close    volume   7_day_ma  15_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755        NaN        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405        NaN        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731        NaN        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338        NaN        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245        NaN        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315        NaN        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141  44.127143        NaN   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183  43.721429        NaN   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149  43.297143        NaN   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862  42.894286        NaN   

   30_day_ma  daily_returns  ...  Name_XL  Name_XLNX  Name_XOM  Name_XRAY  \
0        NaN            NaN  ...        0          0         0          0  

In [5]:
#create columns with lag
lag_columns = ['open', 'high', 'low', 'volume', '7_day_ma', '15_day_ma', '30_day_ma', 'daily_returns', 'daily_volatility']
#for every company
for ticker in unique_companies:
    ticker_data = data.loc[data[f'Name_{ticker}'] == 1]  #get all data fora  specific company
    for col in lag_columns: #for every column we're creating lag for
        for n in [1, 3, 5,7,15,30]: #create lag for days 1,3,5,7,15,30
            column_name = f'{col}_lag_{n}' #create new column with afforementioned lag
            data.loc[data[f'Name_{ticker}'] == 1, column_name] = ticker_data[col].shift(n)#shift data by that many days



In [6]:
print(data.head(10))

        date   open   high     low  close    volume   7_day_ma  15_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755        NaN        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405        NaN        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731        NaN        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338        NaN        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245        NaN        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315        NaN        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141  44.127143        NaN   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183  43.721429        NaN   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149  43.297143        NaN   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862  42.894286        NaN   

   30_day_ma  daily_returns  ...  daily_returns_lag_5  daily_returns_lag_7  \
0        NaN            NaN  ...                  NaN                  NaN

In [7]:
# drop NaN values
data = data.dropna()
print(data.head(10))

         date   open    high     low  close    volume   7_day_ma  15_day_ma  \
60 2013-05-07  42.18  42.410  41.900  42.40   3524022  41.662857  41.933333   
61 2013-05-08  42.40  42.950  42.300  42.94   2119765  41.874286  41.983333   
62 2013-05-09  42.97  43.195  42.630  43.16   3159293  42.120000  42.072000   
63 2013-05-10  43.12  43.850  43.040  43.63   4662252  42.451429  42.180667   
64 2013-05-13  43.43  43.560  42.720  43.04   4260335  42.674286  42.260000   
65 2013-05-14  42.98  44.060  42.882  43.97   6075845  43.020000  42.350667   
66 2013-05-15  44.90  46.490  44.890  45.68  10289000  43.545714  42.539333   
67 2013-05-16  45.43  45.840  44.970  44.99   4890962  43.915714  42.690000   
68 2013-05-17  45.02  45.830  44.990  45.56   3247851  44.290000  42.974000   
69 2013-05-20  45.48  47.450  45.390  46.34   5698804  44.744286  43.299333   

    30_day_ma  daily_returns  ...  daily_returns_lag_5  daily_returns_lag_7  \
60  42.018333       0.009524  ...            -0.000

In [8]:
#drop one-hot encoded columns
columns_to_drop = [col for col in data.columns if col.startswith('Name_')]
data = data.drop(columns=columns_to_drop)

In [9]:

#define features, target vars

#a list of columns to exclude. This should be all the columns with data from a day d for which we are trying 
#to make predictions for (because we will not have access to this data in practice)
exclude = [col for col in lag_columns if any(f'{col}_lag_' in c for c in data.columns)]
#a list of the features to include. This is all columns that are not the date, target, or included
#in our list of columns to exclude
features = [col for col in data.columns if col not in ['date', 'target'] + exclude]

In [10]:
#split into features and target
X = data[features]
y = data['target']

In [11]:
# create TimeSeriesSplit
# split the data into five folds in chronological order based on time.
#first fold: (train)(test)
#second fold: (train)(train)(test)
#third fold: (train)(train)(train)(test)
#etc...
tscv = TimeSeriesSplit(n_splits=5)

In [12]:
#for every fold get the training indice and testing indice
#the training index will be an array used to get all data up to a point x and grows with each fold
#the testing indices are all the data from a point x to a point y
#the size of [x:y] is constant throughout all folds
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    #get non-hot-encoded columns
    non_hot_cols = [col for col in X_train.columns if not col.startswith('Name_')]

    # standardize the features
    #new data will have mean of 0 and std. dev. of 1
    #this help model more correctly handle different data of various magnitudes
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    # Scale only the nont-hot-encoded features
    X_train_scaled[non_hot_cols] = scaler.fit_transform(X_train[non_hot_cols])
    X_test_scaled[non_hot_cols] = scaler.transform(X_test[non_hot_cols])

    #initialize,train model
    model = GaussianNB()
    model.fit(X_train_scaled, y_train)

    #make predictions
    predictions = model.predict(X_test_scaled)
    print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.50      0.09      0.16     47136
           1       0.52      0.92      0.66     50814

    accuracy                           0.52     97950
   macro avg       0.51      0.50      0.41     97950
weighted avg       0.51      0.52      0.42     97950

              precision    recall  f1-score   support

           0       0.49      0.13      0.21     47087
           1       0.52      0.87      0.65     50863

    accuracy                           0.52     97950
   macro avg       0.50      0.50      0.43     97950
weighted avg       0.50      0.52      0.44     97950

              precision    recall  f1-score   support

           0       0.49      0.25      0.33     47112
           1       0.52      0.76      0.62     50838

    accuracy                           0.52     97950
   macro avg       0.51      0.51      0.48     97950
weighted avg       0.51      0.52      0.48     97950

              preci

\******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

In this next section we will attempt to find hyperparamaters that increase the accuracy of our model

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [14]:
# Define the parameter grid for 'var_smoothing'
#create different values for the var_smoothing paramater
#These values will range from very small 1 x 10^-10 to larger 1 x 10^-1
param_grid = {'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01]}

In [15]:
# Initialize Gaussian Naive Bayes classifier
gnb = GaussianNB()

In [16]:
# Initialize GridSearchCV
#GridSearchCV looks through the afforementioned hyper-paramaters
#to find the most optimal one for the model
grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on scaled data
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06,
                                           1e-05, 0.0001, 0.001, 0.01, 0.1]},
             scoring='accuracy')

In [17]:
# Print best paramater,score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Test best model on test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_scaled)
print(classification_report(y_test, predictions))

Best parameters: {'var_smoothing': 0.001}
Best score: 0.4927942827973456
              precision    recall  f1-score   support

           0       0.48      0.86      0.62     46853
           1       0.53      0.14      0.23     51097

    accuracy                           0.49     97950
   macro avg       0.51      0.50      0.42     97950
weighted avg       0.51      0.49      0.41     97950

