 # Predicting Stocks

In [1]:
from pathlib import Path
#from path import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import csv

In [2]:
data = Path('resources/logistic_regression_test.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,data_index,symbol,industry,sector,date,volume,volume_previousday,volume_greater_previousday,volume_deathIncrease_up,volume_hospitalizedIncrease_up,...,hospitalizedIncrease_greater_previousday,negativeIncrease,negativeIncrease_previousday,negativeIncrease_greater_previousday,positiveIncrease,positiveIncrease_previousday,positiveIncrease_greater_previousday,totalTestResultsIncrease,totalTestResultsIncrease_previousday,totalTestResultsIncrease_greater_previousday
0,0,A,Medical Specialties,Health Technology,8/20/2020,318382,1651518,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
1,1,AA,Aluminum,Non-Energy Minerals,8/20/2020,1543881,5460701,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
2,2,AAAU,Investment Trusts/Mutual Funds,Miscellaneous,8/20/2020,151621,584100,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
3,3,AACG,Miscellaneous Commercial Services,Commercial Services,8/20/2020,10963,36708,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
4,4,AADR,Investment Trusts/Mutual Funds,Miscellaneous,8/20/2020,177,1191,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1


In [3]:
# Convert date to int
float or np.ndarray(dtype=float)
df['date'] = pd.to_datetime(df['date'])    
df['date_delta'] = (df['date'] - df['date'].min())  / np.timedelta64(1,'D')
# converting to object
df = df.astype({"date":'int64'}) 

  


In [4]:
df = df.astype({"high":'int64', "high_previousday" :'int64', "high_previousday" :'int64', "iexClose" :'int64', "iexClose_previousday" :'int64', "date_delta":'int64'}) 

In [5]:
df.dtypes

data_index                                       int64
symbol                                          object
industry                                        object
sector                                          object
date                                             int64
volume                                           int64
volume_previousday                               int64
volume_greater_previousday                       int64
volume_deathIncrease_up                          int64
volume_hospitalizedIncrease_up                   int64
volume_negativeIncrease_up                       int64
volume_positiveIncrease_up                       int64
volume_totalTestResultsIncrease_up               int64
high                                             int64
high_previousday                                 int64
high_greater_previousday                         int64
high_deathIncrease_up                            int64
high_hospitalizedIncrease_up                     int64
high_negat

In [6]:
# Generate our categorical variable list
cat = df.dtypes[df.dtypes == "object"].index.tolist()

In [7]:
# Check the number of unique values in each column
df[cat].nunique()

symbol      7892
industry     617
sector        39
dtype: int64

In [8]:
df.dtypes

data_index                                       int64
symbol                                          object
industry                                        object
sector                                          object
date                                             int64
volume                                           int64
volume_previousday                               int64
volume_greater_previousday                       int64
volume_deathIncrease_up                          int64
volume_hospitalizedIncrease_up                   int64
volume_negativeIncrease_up                       int64
volume_positiveIncrease_up                       int64
volume_totalTestResultsIncrease_up               int64
high                                             int64
high_previousday                                 int64
high_greater_previousday                         int64
high_deathIncrease_up                            int64
high_hospitalizedIncrease_up                     int64
high_negat

In [9]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()



Unnamed: 0,symbol_A,symbol_AA,symbol_AAAU,symbol_AACG,symbol_AADR,symbol_AAL,symbol_AAMC,symbol_AAME,symbol_AAN,symbol_AAOI,...,sector_Producer Manufacturing,"sector_Professional, Scientific, and Technical Services",sector_Public Administration,sector_Real Estate and Rental and Leasing,sector_Retail Trade,sector_Technology Services,sector_Transportation,sector_Transportation and Warehousing,sector_Utilities,sector_Wholesale Trade
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
encode_df.dtypes

symbol_A                                 float64
symbol_AA                                float64
symbol_AAAU                              float64
symbol_AACG                              float64
symbol_AADR                              float64
                                          ...   
sector_Technology Services               float64
sector_Transportation                    float64
sector_Transportation and Warehousing    float64
sector_Utilities                         float64
sector_Wholesale Trade                   float64
Length: 8548, dtype: object

In [11]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


MemoryError: Unable to allocate 39.7 GiB for an array with shape (8548, 623468) and data type float64

In [None]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.drop(df.loc[df['iexClose_hospitalizedIncrease_up']== "Null"].index, inplace=True)
df= df.drop(columns = ["symbol", "industry", "sector"])
df.head()

In [None]:
# Convert date to int
float or np.ndarray(dtype=float)
df['date'] = pd.to_datetime(df['date'])    
df['date_delta'] = (df['date'] - df['date'].min())  / np.timedelta64(1,'D')
# converting to object
df = df.astype({"date":'int64'}) 

In [None]:
df.dtypes

 ## Separate the Features (X) from the Target (y)

In [None]:
# Remove target from features data
# y = df["volume_greater_previousday"]
# X = df.drop(columns="volume_greater_previousday")

y = df["iexClose_hospitalizedIncrease_up"].values
X = df.drop(columns="iexClose_hospitalizedIncrease_up").values

 ## Split our data into training and testing

In [None]:
# Split training/test datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
X_train.shape

 ## Create a Logistic Regression Model

In [None]:
# Define the logistic regression model
classifier = LogisticRegression(solver='lbfgs',max_iter=200, random_state=42)

 ## Fit (train) or model using the training data

In [None]:
# Train the model
classifier.fit(X_train, y_train)

In [None]:
# Preprocess numerical data for neural network

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Evaluate the model
y_pred = classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# Define the basic neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu", input_dim=8))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 ## Make predictions

In [None]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

In [None]:
print(accuracy_score(y_test, y_pred))