# Dow Jones Stock Market Projections

### Imports

In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

### Setting up the DataFrame

In [2]:
Apple = pd.read_csv(r'C:\Users\hammy\Downloads\451060_852234_bundle_archive\AAPL.csv')

In [3]:
Apple.head(10)

Unnamed: 0,date,open,high,low,close,volume,unadjustedVolume,change,changePercent,vwap,label,changeOverTime
0,2014-02-21,69.9727,70.2061,68.8967,68.9821,69757247,9965321,-0.774858,-1.111,69.4256,"Feb 21, 14",0.0
1,2014-02-24,68.7063,69.5954,68.6104,69.2841,72364950,10337850,0.302061,0.438,69.1567,"Feb 24, 14",0.004378
2,2014-02-25,69.5245,69.5488,68.4239,68.5631,58247350,8321050,-0.72101,-1.041,68.9153,"Feb 25, 14",-0.006074
3,2014-02-26,68.7667,68.9492,67.7147,67.9446,69131286,9875898,-0.618575,-0.902,68.1373,"Feb 26, 14",-0.01504
4,2014-02-27,67.917,69.4457,67.7738,69.2999,75557321,10793903,1.3553,1.995,68.8615,"Feb 27, 14",0.004607
5,2014-02-28,69.4851,69.9671,68.571,69.1121,93074653,13296379,-0.187807,-0.271,69.2731,"Feb 28, 14",0.001885
6,2014-03-03,68.7417,69.6913,68.6616,69.3117,59667923,8523989,0.199626,0.289,69.1371,"Mar 3, 14",0.004778
7,2014-03-04,69.7372,69.9526,69.313,69.7688,64884834,9269262,0.457035,0.659,69.694,"Mar 4, 14",0.011404
8,2014-03-05,69.7267,70.2297,69.4916,69.9158,50065519,7152217,0.147093,0.211,71.9382,"Mar 5, 14",0.013535
9,2014-03-06,69.9723,70.189,69.3564,69.7044,46423111,6631873,-0.211448,-0.302,69.7696,"Mar 6, 14",0.010471


I just implement these sanity-checks at certain checkpoints I set for myself to make sure I'm doing what I intend to. It's not necessary for the code to run.

### Doing Some Data Prep

In [4]:
Apple = Apple.rename(columns = {'open' : 'openToday',
                                'high' : 'highYesterday',
                                'low' : 'lowYesterday',
                                'close' : 'closeYesterday',
                                'volume' : 'volumeYesterday'})

Apple['openYesterday'] = Apple.openToday.copy()
Apple['openYesterday'] = Apple['openYesterday'].shift(periods=1)
Apple['highYesterday'] = Apple['highYesterday'].shift(periods=1)
Apple['lowYesterday'] = Apple['lowYesterday'].shift(periods=1)
Apple['closeYesterday'] = Apple['closeYesterday'].shift(periods=1)
Apple['volumeYesterday'] = Apple['volumeYesterday'].shift(periods=1)
Apple.head()

Unnamed: 0,date,openToday,highYesterday,lowYesterday,closeYesterday,volumeYesterday,unadjustedVolume,change,changePercent,vwap,label,changeOverTime,openYesterday
0,2014-02-21,69.9727,,,,,9965321,-0.774858,-1.111,69.4256,"Feb 21, 14",0.0,
1,2014-02-24,68.7063,70.2061,68.8967,68.9821,69757247.0,10337850,0.302061,0.438,69.1567,"Feb 24, 14",0.004378,69.9727
2,2014-02-25,69.5245,69.5954,68.6104,69.2841,72364950.0,8321050,-0.72101,-1.041,68.9153,"Feb 25, 14",-0.006074,68.7063
3,2014-02-26,68.7667,69.5488,68.4239,68.5631,58247350.0,9875898,-0.618575,-0.902,68.1373,"Feb 26, 14",-0.01504,69.5245
4,2014-02-27,67.917,68.9492,67.7147,67.9446,69131286.0,10793903,1.3553,1.995,68.8615,"Feb 27, 14",0.004607,68.7667


Here, I'm removing some redundant or unnecessary columns, and simplifying the change percent to simply tell whether the stocks rose or fell. It's a much more simple hypothesis for a computer to make, and I don't want to start off with the most complex thing to play around with.

In [5]:
Apple = Apple.drop(0)
Apple = Apple.drop(columns=['changePercent', 'vwap','changeOverTime','label','unadjustedVolume'])
Apple['trade'] = pd.cut(Apple['change'], bins = [-100, 0, 100], labels = [0, 1])
Apple = Apple.drop(columns = ['change'])
Apple.head()

Unnamed: 0,date,openToday,highYesterday,lowYesterday,closeYesterday,volumeYesterday,openYesterday,trade
1,2014-02-24,68.7063,70.2061,68.8967,68.9821,69757247.0,69.9727,1
2,2014-02-25,69.5245,69.5954,68.6104,69.2841,72364950.0,68.7063,0
3,2014-02-26,68.7667,69.5488,68.4239,68.5631,58247350.0,69.5245,0
4,2014-02-27,67.917,68.9492,67.7147,67.9446,69131286.0,68.7667,1
5,2014-02-28,69.4851,69.4457,67.7738,69.2999,75557321.0,67.917,0


### Now to the fun stuff, Neural Networks!

I know that Neural Networks might not be the best thing to use for this dataset since it's not too complex, but I'm not comfortable enough with other systems to use them as well as I'd like to. If neural networks are a complete flop, I'll switch to some other predictor. 

In [6]:
mlpc = MLPClassifier(hidden_layer_sizes=(6,6,6), max_iter=1000)
X = Apple.drop(columns = ['trade','date'])
y = Apple['trade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)

Testing both train and test data to more easily detect under- or overfitting problems

In [7]:
print(classification_report(y_train, mlpc.predict(X_train)))
print(confusion_matrix(y_train, mlpc.predict(X_train)))

              precision    recall  f1-score   support

           0       0.45      0.02      0.04       467
           1       0.54      0.98      0.69       538

    accuracy                           0.53      1005
   macro avg       0.49      0.50      0.37      1005
weighted avg       0.50      0.53      0.39      1005

[[ 10 457]
 [ 12 526]]


In [8]:
print(classification_report(y_test, pred_mlpc))
print(confusion_matrix(y_test, pred_mlpc))

              precision    recall  f1-score   support

           0       0.43      0.02      0.04       131
           1       0.48      0.97      0.64       121

    accuracy                           0.48       252
   macro avg       0.45      0.49      0.34       252
weighted avg       0.45      0.48      0.33       252

[[  3 128]
 [  4 117]]


That's pretty terrible accuracy. It basically just tells you to buy every time. Based on the accuracies for the training and test data, it looks like an underfitting problem. Reducing the regularization parameter likely won't help because it's already so small (0.0001), so I'll try adding more features. That shouldn't be too hard since I can just pull from earlier values in the table.

In [9]:
Apple['open-2'] = Apple.openToday.copy()
Apple['high-2'] = Apple.highYesterday.copy()
Apple['low-2'] = Apple.lowYesterday.copy()
Apple['close-2'] = Apple.closeYesterday.copy()
Apple['volume-2'] = Apple.volumeYesterday.copy()
Apple['trade2'] = Apple.trade.copy()

Apple['open-2'] = Apple['open-2'].shift(periods=1)
Apple['high-2'] = Apple['high-2'].shift(periods=1)
Apple['low-2'] = Apple['low-2'].shift(periods=1)
Apple['close-2'] = Apple['close-2'].shift(periods=1)
Apple['volume-2'] = Apple['volume-2'].shift(periods=1)
Apple = Apple.drop(columns=['trade'])
Apple = Apple.rename(columns={'trade2' : 'trade'})
Apple = Apple.drop(1)
Apple.head()

Unnamed: 0,date,openToday,highYesterday,lowYesterday,closeYesterday,volumeYesterday,openYesterday,open-2,high-2,low-2,close-2,volume-2,trade
2,2014-02-25,69.5245,69.5954,68.6104,69.2841,72364950.0,68.7063,68.7063,70.2061,68.8967,68.9821,69757247.0,0
3,2014-02-26,68.7667,69.5488,68.4239,68.5631,58247350.0,69.5245,69.5245,69.5954,68.6104,69.2841,72364950.0,0
4,2014-02-27,67.917,68.9492,67.7147,67.9446,69131286.0,68.7667,68.7667,69.5488,68.4239,68.5631,58247350.0,1
5,2014-02-28,69.4851,69.4457,67.7738,69.2999,75557321.0,67.917,67.917,68.9492,67.7147,67.9446,69131286.0,0
6,2014-03-03,68.7417,69.9671,68.571,69.1121,93074653.0,69.4851,69.4851,69.4457,67.7738,69.2999,75557321.0,1


That's more like it! Now the dataframe has data from both yesterday and the day before. Setting up a new neural network with the new dataframe:

In [24]:
mlpc = MLPClassifier(hidden_layer_sizes=(9,9,9,9), max_iter=5000, solver='lbfgs', random_state=4)
X2 = Apple.drop(columns=['trade', 'date'])
y2 = Apple['trade']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.1, random_state=41)
X_train2 = sc.fit_transform(X_train2)
X_test2 = sc.transform(X_test2)
mlpc.fit(X_train2, y_train2)
better_pred_mlpc = mlpc.predict(X_test2)

Checking accuracy on both training and test data for the same reason as above:

In [25]:
print(classification_report(y_train2, mlpc.predict(X_train2)))
print(confusion_matrix(y_train2, mlpc.predict(X_train2)))

              precision    recall  f1-score   support

           0       0.74      0.73      0.73       546
           1       0.75      0.76      0.75       584

    accuracy                           0.74      1130
   macro avg       0.74      0.74      0.74      1130
weighted avg       0.74      0.74      0.74      1130

[[397 149]
 [143 441]]


In [26]:
print(classification_report(y_test2, better_pred_mlpc))
print(confusion_matrix(y_test2, better_pred_mlpc))

              precision    recall  f1-score   support

           0       0.56      0.58      0.57        52
           1       0.69      0.68      0.68        74

    accuracy                           0.63       126
   macro avg       0.62      0.63      0.63       126
weighted avg       0.64      0.63      0.64       126

[[30 22]
 [24 50]]


A bit more accurate, but not by much. It's still wrong about 36% of the time. The thing is I'm not sure how accurate I can get using a neural network. I'll try adding one more set of features in a more generalized program.

In [77]:
def predict_stock_market_prices(company_stock_filepath):
    
    company = pd.read_csv(company_stock_filepath)
    
    company = company.rename(columns = {'open' : 'openToday',
                                        'high' : 'highYesterday',
                                        'low' : 'lowYesterday',
                                        'close' : 'closeYesterday',
                                        'volume' : 'volumeYesterday'})
    shifted_indices = ['openYesterday',
                       'highYesterday',
                       'lowYesterday',
                       'closeYesterday',
                       'volumeYesterday']

    company['openYesterday'] = company.openToday.copy()
    
    for index in shifted_indices: company[index] = company[index].shift(periods=1)
        
    company = company.drop(0)
    company = company.drop(columns=['changePercent', 'vwap','changeOverTime','label','unadjustedVolume'])
    company['trade'] = pd.cut(company['change'], bins = [-100, 0, 100], labels = [0, 1])
    company = company.drop(columns = ['change'])
    
    company['open-2'] = company.openToday.copy()
    company['high-2'] = company.highYesterday.copy()
    company['low-2'] = company.lowYesterday.copy()
    company['close-2'] = company.closeYesterday.copy()
    company['volume-2'] = company.volumeYesterday.copy()
    company['trade2'] = company.trade.copy()
    
    shifted_indices = ['open-2',
                       'high-2',
                       'low-2',
                       'close-2',
                       'volume-2']
    
    for index in shifted_indices: company[index] = company[index].shift(periods=1)
        
    company = company.drop(columns=['trade'])
    company = company.rename(columns={'trade2':'trade'})
    company = company.drop(1)
    
    company['open-3'] = company.openToday.copy()
    company['high-3'] = company.highYesterday.copy()
    company['low-3'] = company.lowYesterday.copy()
    company['close-3'] = company.closeYesterday.copy()
    company['volume-3'] = company.volumeYesterday.copy()
    company['trade2'] = company.trade.copy()
    
    shifted_indices = ['open-3',
                       'high-3',
                       'low-3',
                       'close-3',
                       'volume-3']
    
    for index in shifted_indices: company[index] = company[index].shift(periods=2)
        
    company = company.drop(columns=['trade'])
    company = company.rename(columns={'trade2':'trade'})
    company = company.drop(2)
    company = company.drop(3)
    company.head()
    
    mlpc = MLPClassifier(hidden_layer_sizes=(14,14,14,14), max_iter=10000, solver='lbfgs', random_state=42, alpha=1.5)
    X = company.drop(columns=['trade', 'date'])
    y = company['trade']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    mlpc.fit(X_train, y_train)
    
    print('Training accuracy:')
    print(classification_report(y_train, mlpc.predict(X_train)))
    print(confusion_matrix(y_train, mlpc.predict(X_train)))
    print('Test accuracy:')
    print(classification_report(y_test, mlpc.predict(X_test)))
    print(confusion_matrix(y_test, mlpc.predict(X_test)))

In [82]:
predict_stock_market_prices(r'C:\Users\hammy\Downloads\451060_852234_bundle_archive\DIS.csv')

Training accuracy:
              precision    recall  f1-score   support

           0       0.76      0.79      0.78       467
           1       0.81      0.79      0.80       536

    accuracy                           0.79      1003
   macro avg       0.79      0.79      0.79      1003
weighted avg       0.79      0.79      0.79      1003

[[370  97]
 [114 422]]
Test accuracy:
              precision    recall  f1-score   support

           0       0.61      0.58      0.60       130
           1       0.57      0.61      0.59       121

    accuracy                           0.59       251
   macro avg       0.59      0.59      0.59       251
weighted avg       0.59      0.59      0.59       251

[[75 55]
 [47 74]]
