In [81]:
import warnings
warnings.filterwarnings('ignore')

In [82]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [83]:
# Loading and preprocessing Data

In [84]:
# Loading data
file_path = Path("../EDA/Resources/cleaned_close1.csv")
df_equities = pd.read_csv(file_path)
df_equities.head()

Unnamed: 0,symbol,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,p_status
0,PJUL,8/19/2020,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
1,PIM,8/19/2020,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0
2,PJT,8/19/2020,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
3,PKBK,8/19/2020,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
4,PICK,8/19/2020,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,Null


In [85]:
df_equities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631360 entries, 0 to 631359
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   symbol                    631360 non-null  object 
 1   date                      631360 non-null  object 
 2   high                      631360 non-null  float64
 3   iexClose                  631360 non-null  float64
 4   industry                  631360 non-null  object 
 5   low                       631360 non-null  float64
 6   sector                    631360 non-null  object 
 7   volume                    631360 non-null  int64  
 8   death                     631360 non-null  int64  
 9   deathIncrease             631360 non-null  int64  
 10  hospitalizedIncrease      631360 non-null  int64  
 11  hospitalizedCurrently     631360 non-null  int64  
 12  negative                  631360 non-null  int64  
 13  negativeIncrease          631360 non-null  i

In [86]:
# Drop the null columns where all values are null
df_equities = df_equities.dropna(axis='columns', how='all')

# Drop the null rows
df_equities = df_equities.dropna()
df_equities = df_equities[df_equities.p_status != "Null"]
df_equities = df_equities.drop(columns = ['symbol','date','industry','sector'])

In [87]:
# Create our features
X = pd.get_dummies(df_equities, columns=["death", "deathIncrease", "hospitalizedIncrease", "hospitalizedCurrently", "negative", "positive", "totalTestResults", "totalTestResultsIncrease"])

# Create our target
y = pd.DataFrame(df_equities["p_status"])
X.head()

Unnamed: 0,high,iexClose,low,volume,negativeIncrease,positiveIncrease,p_status,death_165088,death_166217,death_167336,...,totalTestResultsIncrease_1839996,totalTestResultsIncrease_1845354,totalTestResultsIncrease_1873837,totalTestResultsIncrease_1904020,totalTestResultsIncrease_1918739,totalTestResultsIncrease_1967578,totalTestResultsIncrease_1969509,totalTestResultsIncrease_1974590,totalTestResultsIncrease_2004886,totalTestResultsIncrease_2010951
0,27.84,27.71,27.76,3283,243232,45073,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.26,4.24,4.23,31824,243232,45073,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,59.67,58.885,58.6,23896,243232,45073,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,13.46,13.46,12.89,2365,243232,45073,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,8.77,8.605,8.44,2894906,243232,45073,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
X.describe()

Unnamed: 0,high,iexClose,low,volume,negativeIncrease,positiveIncrease,death_165088,death_166217,death_167336,death_169289,...,totalTestResultsIncrease_1839996,totalTestResultsIncrease_1845354,totalTestResultsIncrease_1873837,totalTestResultsIncrease_1904020,totalTestResultsIncrease_1918739,totalTestResultsIncrease_1967578,totalTestResultsIncrease_1969509,totalTestResultsIncrease_1974590,totalTestResultsIncrease_2004886,totalTestResultsIncrease_2010951
count,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,...,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0
mean,42.658513,44.23489,41.554962,923430.1,250182.4125,99462.8375,0.0125,0.0125,0.0125,0.0125,...,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125
std,107.594631,108.536875,104.526355,6664651.0,124213.055681,68021.723619,0.111103,0.111103,0.111103,0.111103,...,0.111103,0.111103,0.111103,0.111103,0.111103,0.111103,0.111103,0.111103,0.111103,0.111103
min,0.0,0.0229,0.0,0.0,-658774.0,22310.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.16,9.64,7.805,13537.75,212750.75,43793.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,22.34,23.46,21.75,91928.5,260824.5,63143.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,43.98,45.48,42.865,443370.2,304880.25,161495.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4530.0,4474.0,4382.55,1864680000.0,456078.0,236933.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [89]:
# Check the balance of our target values
y['p_status'].value_counts()

1    314960
0    255440
Name: p_status, dtype: int64

In [90]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [91]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [106]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=300, random_state=78) 

In [107]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [94]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [95]:
predictions

array(['1', '0', '0', ..., '1', '1', '0'], dtype=object)

In [99]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

1.0

In [104]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[64076,     0],
       [    0, 78524]], dtype=int64)

In [105]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00     64076
          1       0.55      1.00      0.00      0.71      0.00      0.00     78524

avg / total       0.30      0.55      0.45      0.39      0.00      0.00    142600



In [102]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([9.76040678e-03, 1.02222177e-02, 9.64567030e-03, 1.03594284e-02,
       1.36038666e-04, 1.54341435e-04, 9.56503191e-01, 5.99819387e-06,
       5.92052776e-06, 5.81034982e-06, 5.24484864e-06, 4.42161474e-06,
       5.79264567e-06, 5.77293369e-06, 4.85802646e-06, 5.08535133e-06,
       4.93544380e-06, 4.15948733e-06, 4.16809612e-06, 5.30315450e-06,
       5.49234196e-06, 4.71812276e-06, 4.59689622e-06, 3.94416920e-06,
       5.04120372e-06, 5.45581853e-06, 5.51931361e-06, 5.35395429e-06,
       5.84112504e-06, 5.03963370e-06, 5.16540104e-06, 4.71277625e-06,
       5.23571933e-06, 5.96558795e-06, 5.61883628e-06, 5.35001628e-06,
       6.22719201e-06, 6.04175806e-06, 5.29143428e-06, 4.72056667e-06,
       5.18372484e-06, 5.80292293e-06, 5.26622347e-06, 5.02011637e-06,
       5.43411956e-06, 4.02343611e-06, 5.69181651e-06, 3.83930307e-06,
       5.94681644e-06, 5.31395672e-06, 6.48890918e-06, 5.87489326e-06,
       5.07061846e-06, 5.39334242e-06, 3.92741639e-06, 5.46024773e-06,
      

In [103]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.9565031912474267, 'p_status'),
 (0.010359428449335017, 'volume'),
 (0.010222217712306839, 'iexClose'),
 (0.00976040677689494, 'high'),
 (0.009645670302824639, 'low'),
 (0.00015434143460724842, 'positiveIncrease'),
 (0.00013603866607738251, 'negativeIncrease'),
 (9.735125658508032e-06, 'hospitalizedIncrease_1983'),
 (9.021818177090903e-06, 'hospitalizedIncrease_3461'),
 (7.715575800296726e-06, 'positive_5485765'),
 (7.533665765764883e-06, 'hospitalizedCurrently_29891'),
 (7.467227503432107e-06, 'totalTestResults_123640600'),
 (7.139300589200592e-06, 'hospitalizedCurrently_77047'),
 (7.018630280674527e-06, 'deathIncrease_1129'),
 (7.005398744714245e-06, 'hospitalizedIncrease_1484'),
 (6.962886419674738e-06, 'hospitalizedCurrently_53380'),
 (6.949647377663586e-06, 'negative_35420950'),
 (6.9391225036069e-06, 'totalTestResults_80485262'),
 (6.803763982126524e-06, 'positive_8268021'),
 (6.786685149777114e-06, 'negative_32064596'),
 (6.729979812853005e-06, 'deathIncrease_1154'),
 (6.7076