 # Predicting Stocks

In [1]:
from pathlib import Path
#from path import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Location of the cleaned closing-price CSV, relative to the notebook.
data = Path('resources/cleaned_close1.csv')
# Load the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,symbol,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,p_status
0,PJUL,8/19/2020,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
1,PIM,8/19/2020,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0
2,PJT,8/19/2020,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
3,PKBK,8/19/2020,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
4,PICK,8/19/2020,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,Null


In [3]:
# Cleaning pipeline, applied as a single chain:
#   1. remove columns in which every value is missing,
#   2. remove rows that still contain any missing value,
#   3. remove rows whose p_status label is the literal string "Null",
#   4. remove the text columns symbol / industry / sector.
df = (
    df.dropna(axis='columns', how='all')
      .dropna()
      .loc[lambda frame: frame['p_status'] != "Null"]
      .drop(columns=["symbol", "industry", "sector"])
)
df.head()

Unnamed: 0,date,high,iexClose,low,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,p_status
0,8/19/2020,27.84,27.71,27.76,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
1,8/19/2020,4.26,4.24,4.23,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0
2,8/19/2020,59.67,58.885,58.6,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
3,8/19/2020,13.46,13.46,12.89,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
5,8/19/2020,8.77,8.605,8.44,2894906,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1


In [4]:
# Encode the calendar date numerically so it can be used as a model feature.
# Parse the raw date strings (e.g. "8/19/2020") into datetime values.
df['date'] = pd.to_datetime(df['date'])
# date_delta: number of days (as a float) elapsed since the earliest date in df.
df['date_delta'] = (df['date'] - df['date'].min()) / np.timedelta64(1, 'D')
# Cast the datetime column to int64 so that no datetime dtype remains in df.
# NOTE(review): this yields epoch-based integers (nanoseconds when the column
# is datetime64[ns]) that are many orders of magnitude larger than the other
# columns and carry the same information as date_delta -- consider dropping or
# scaling this column before fitting; confirm intent.
df = df.astype({"date": 'int64'})

  


In [5]:
# Display the dtype of every column in df.
df.dtypes

date                          int64
high                        float64
iexClose                    float64
low                         float64
volume                        int64
death                         int64
deathIncrease                 int64
hospitalizedIncrease          int64
hospitalizedCurrently         int64
negative                      int64
negativeIncrease              int64
positive                      int64
positiveIncrease              int64
totalTestResults              int64
totalTestResultsIncrease      int64
p_status                     object
date_delta                  float64
dtype: object

 ## Separate the Features (X) from the Target (y)

In [6]:
# Features: every column except the p_status label.
X = df.drop(columns=["p_status"])
# Target: the p_status label column.
y = df["p_status"]

 ## Split our data into training and testing

In [7]:
from sklearn.model_selection import train_test_split

# Partition features and labels into train/test subsets.
# stratify=y asks for the split to be stratified on the label values;
# random_state=1 fixes the shuffle so the split is repeatable.
# The split proportions are left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(427800, 16)

 ## Create a Logistic Regression Model

In [8]:
# Logistic regression using the lbfgs solver, capped at 200 iterations,
# with a fixed random_state.
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

 ## Fit (train) our model using the training data

In [9]:
# Train the classifier on the training partition only.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

 ## Make predictions

In [10]:
# Predict labels for the held-out test features.
y_pred = classifier.predict(X_test)
# Put predictions next to the true labels, then discard the original row
# index so the comparison table is numbered from 0.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(n=20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,0
3,1,1
4,1,0
5,1,0
6,1,0
7,1,0
8,1,1
9,1,0


In [11]:
# Fraction of test rows whose predicted label equals the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5521739130434783
