# Classifying Stocks Using Machine Learning
## Classifying Stock Direction
<hr>

The purpose of this notebook is to test machine learning techniques on our current dataset. 

### Objectives
 - Remove columns not necessary to ML
 - develop pipeline to normalize numeric data
 - run multiple classifications
 - review accuracy of predictions
 - review the weights of classifiers

In [1]:
import math
import os
import requests
from datetime import datetime, timedelta
import time
import warnings
import pandas as pd
import numpy as np
from collections import Counter 

In [16]:
df = pd.read_hdf('../../Data/stocksCleaned.h5')

In [17]:
df.head()

Unnamed: 0,ticker,date,headline,news_sentiment_score,source,url,amount_of_articles,open,close,volume,social_sentiments,mentions,news_sentiment,close_better,tomorrow_better
0,FB,2016-08-09,[Onetime Home of Warner Bros.’ Harry Warner As...,3.0,[DowJones],[https://finnhub.io/api/news?id=7dbe5db9757dda...,1,125.34,125.06,19620967,-0.199921,0,0.5,0,1
1,FB,2021-03-15,[Rupert Murdoch's News Corp strikes deal as Fa...,2.0,"[The Guardian, https://nypost.com, https://www...",[https://finnhub.io/api/news?id=61c0d589cb8bf9...,70,269.08,273.75,16856746,0.066288,45,0.0,1,1
2,FB,2021-03-16,[NetApp reformula a organização de vendas glob...,1.91,"[businesswire, benzinga, businesswire, busines...",[https://finnhub.io/api/news?id=6479351ac59fa2...,89,276.085,279.28,22437665,-0.339269,85,-0.045,1,1
3,FB,2021-03-17,[Facebook Promises More Support For Human Righ...,1.89,"[https://www.forbes.com, businesswire, busines...",[https://finnhub.io/api/news?id=ad0559e9f8ae60...,58,275.705,284.01,21315044,-0.589213,135,-0.055,1,0
4,FB,2021-03-18,[Take A Sneak Peek At The Weirdly-Shaped New P...,1.85,"[benzinga, benzinga, benzinga, businesswire, b...",[https://finnhub.io/api/news?id=e851ef47ee28e6...,77,279.87,278.62,18754853,-0.361794,534,-0.075,0,1


In [18]:
df.columns

Index(['ticker', 'date', 'headline', 'news_sentiment_score', 'source', 'url',
       'amount_of_articles', 'open', 'close', 'volume', 'social_sentiments',
       'mentions', 'news_sentiment', 'close_better', 'tomorrow_better'],
      dtype='object')

In [19]:
cols = ['ticker','news_sentiment', 'amount_of_articles', 
        'open', 'close', 'volume', 'social_sentiments',
       'mentions', 'close_better', 'tomorrow_better']

In [20]:
df = df[cols]

In [21]:
df

Unnamed: 0,ticker,news_sentiment,amount_of_articles,open,close,volume,social_sentiments,mentions,close_better,tomorrow_better
0,FB,0.500,1,125.340,125.06,19620967,-0.199921,0,0,1
1,FB,0.000,70,269.080,273.75,16856746,0.066288,45,1,1
2,FB,-0.045,89,276.085,279.28,22437665,-0.339269,85,1,1
3,FB,-0.055,58,275.705,284.01,21315044,-0.589213,135,1,0
4,FB,-0.075,77,279.870,278.62,18754853,-0.361794,534,0,1
...,...,...,...,...,...,...,...,...,...,...
394,UBER,-0.375,8,46.540,45.82,16874229,0.000000,1,0,1
395,UBER,-0.085,24,45.880,46.14,19510262,-0.472390,4,1,0
396,UBER,-0.050,30,44.120,44.69,51033697,-0.155166,0,1,0
397,UBER,0.030,18,44.380,43.46,22194938,0.195272,40,0,1


In [186]:
df = df[df.ticker == 'ORCL']

In [187]:
# close_better label is very balanced
a = len(df[df['close_better'] == 0])
b = len(df[df['close_better'] == 1])
print(a)
print(b)
c=b+a
print(a/c)
print(b/c)

38
61
0.3838383838383838
0.6161616161616161


In [188]:
# tomorrow_better label is very unbalanced
a = len(df[df['tomorrow_better'] == 0])
b = len(df[df['tomorrow_better'] == 1])
print(a)
print(b)
c=b+a
print(a/c)
print(b/c)

0
99
0.0
1.0


In [209]:
cols = ['news_sentiment', 'amount_of_articles', 
        'open', 'close', 'volume', 'social_sentiments',
       'mentions']
target_A = ['close_better']
target_B = ['tomorrow_better']

In [210]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# apply SMOTE oversampling
X = df[cols]
y = df[target_A]

smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_resampled = pd.DataFrame(X_resampled, columns=X.columns)

In [211]:
ones=0
zeros=0
for i in y_resampled['close_better']:
    if i ==0:
        zeros=zeros+1
    if i ==1:
        ones=ones+1
        
print(ones)
print(zeros)

61
61


In [212]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, cols)])

In [213]:
# this one uses smote

from sklearn.model_selection import train_test_split

X_resampled['close_better'] = y_resampled
X_resampled

X_train, X_test = train_test_split(X_resampled, test_size=0.2, random_state=42, stratify = X_resampled["close_better"])

In [223]:
# no oversampling

from sklearn.model_selection import train_test_split

X_noresampled = X
X_noresampled['close_better'] = y

X_train, X_test = train_test_split(X_noresampled, test_size=0.2, random_state=2, stratify = X_noresampled["close_better"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_noresampled['close_better'] = y


In [224]:
y_train = X_train["close_better"]
y_test = X_test["close_better"]

del X_train["close_better"]
del X_test["close_better"]

In [225]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regression', LogisticRegression())])

In [226]:
clf = LogisticRegression().fit(X_train,y_train)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6

In [227]:
X_train

Unnamed: 0,news_sentiment,amount_of_articles,open,close,volume,social_sentiments,mentions
146,0.000,3,78.99,78.69,6643073,-0.147627,90
120,0.165,3,75.25,76.12,10634896,0.087299,139
193,-0.145,7,90.99,87.69,12680902,-0.731201,1
170,-0.165,9,78.06,78.68,9845790,0.153795,263
136,0.200,5,77.07,78.53,15316549,-0.095840,136
...,...,...,...,...,...,...,...
172,0.045,11,78.65,77.74,9623990,-0.035580,200
111,-0.200,5,70.21,71.13,15183697,0.036111,0
121,0.000,3,76.16,76.67,13142446,0.171101,257
126,0.125,4,78.70,79.24,12562254,-0.063235,145


In [228]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_test, y_test, cv=5)

In [229]:
scores

array([0.5 , 0.5 , 0.5 , 0.75, 0.75])

In [221]:
print(clf.coef_)

[[ 2.49125087e-15 -1.72091491e-13  6.39787715e-13  7.24323876e-13
   2.97141338e-08  5.46491543e-15  3.04104237e-13]]


In [222]:
Counter(y['close_better'])

Counter({1: 61, 0: 38})