In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score



In [3]:
# Loading and preprocessing Data

In [4]:
# Loading data
file_path = Path("../EDA/Resources/cleaned_close1.csv")
df_equities = pd.read_csv(file_path)
df_equities.head()

Unnamed: 0,symbol,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,p_status
0,PJUL,8/19/2020,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
1,PIM,8/19/2020,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0
2,PJT,8/19/2020,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
3,PKBK,8/19/2020,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,1
4,PICK,8/19/2020,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,Null


In [5]:
df_equities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631360 entries, 0 to 631359
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   symbol                    631360 non-null  object 
 1   date                      631360 non-null  object 
 2   high                      631360 non-null  float64
 3   iexClose                  631360 non-null  float64
 4   industry                  631360 non-null  object 
 5   low                       631360 non-null  float64
 6   sector                    631360 non-null  object 
 7   volume                    631360 non-null  int64  
 8   death                     631360 non-null  int64  
 9   deathIncrease             631360 non-null  int64  
 10  hospitalizedIncrease      631360 non-null  int64  
 11  hospitalizedCurrently     631360 non-null  int64  
 12  negative                  631360 non-null  int64  
 13  negativeIncrease          631360 non-null  i

In [6]:

# Drop the null columns where all values are null
df_equities = df_equities.dropna(axis='columns', how='all')

# Drop the null rows
df_equities = df_equities.dropna(how='any')
df_equities.drop(df_equities.loc[df_equities['p_status']== "Null"].index, inplace=True)
df_equities = df_equities.drop(columns = ['symbol','date','industry','sector'])

In [7]:
df_equities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 570400 entries, 0 to 631359
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   high                      570400 non-null  float64
 1   iexClose                  570400 non-null  float64
 2   low                       570400 non-null  float64
 3   volume                    570400 non-null  int64  
 4   death                     570400 non-null  int64  
 5   deathIncrease             570400 non-null  int64  
 6   hospitalizedIncrease      570400 non-null  int64  
 7   hospitalizedCurrently     570400 non-null  int64  
 8   negative                  570400 non-null  int64  
 9   negativeIncrease          570400 non-null  int64  
 10  positive                  570400 non-null  int64  
 11  positiveIncrease          570400 non-null  int64  
 12  totalTestResults          570400 non-null  int64  
 13  totalTestResultsIncrease  570400 non-null  i

In [8]:
# Create our features
#X = pd.get_dummies(df_equities, columns=["death", "deathIncrease", "hospitalizedIncrease", "hospitalizedCurrently", "negative", "positive", "totalTestResults", "totalTestResultsIncrease"])
X = df_equities.drop('p_status',axis=1)
# Create our target
y = pd.DataFrame(df_equities["p_status"])
X.head()

Unnamed: 0,high,iexClose,low,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease
0,27.84,27.71,27.76,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
1,4.26,4.24,4.23,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
2,59.67,58.885,58.6,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
3,13.46,13.46,12.89,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
5,8.77,8.605,8.44,2894906,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115


In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 570400 entries, 0 to 631359
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   high                      570400 non-null  float64
 1   iexClose                  570400 non-null  float64
 2   low                       570400 non-null  float64
 3   volume                    570400 non-null  int64  
 4   death                     570400 non-null  int64  
 5   deathIncrease             570400 non-null  int64  
 6   hospitalizedIncrease      570400 non-null  int64  
 7   hospitalizedCurrently     570400 non-null  int64  
 8   negative                  570400 non-null  int64  
 9   negativeIncrease          570400 non-null  int64  
 10  positive                  570400 non-null  int64  
 11  positiveIncrease          570400 non-null  int64  
 12  totalTestResults          570400 non-null  int64  
 13  totalTestResultsIncrease  570400 non-null  i

In [10]:
X.describe()

Unnamed: 0,high,iexClose,low,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease
count,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0,570400.0
mean,42.658513,44.23489,41.554962,923430.1,221402.55,1275.5875,2559.1875,56305.5,39225070.0,250182.4125,9540573.0,99462.8375,143890500.0,1276617.0
std,107.594631,108.536875,104.526355,6664651.0,38625.202612,801.742567,1341.234424,29470.053439,9169443.0,124213.055681,3574027.0,68021.723619,46745910.0,401736.9
min,0.0,0.0229,0.0,0.0,165088.0,246.0,-752.0,28849.0,24462670.0,-658774.0,5485765.0,22310.0,76356770.0,546859.0
25%,8.16,9.64,7.805,13537.75,192707.25,851.5,1518.25,32712.0,31776110.0,212750.75,6823470.0,43793.5,105340800.0,947256.0
50%,22.34,23.46,21.75,91928.5,212496.5,1049.0,2136.0,41083.0,38233870.0,260824.5,8174878.0,63143.0,135648300.0,1169490.0
75%,43.98,45.48,42.865,443370.2,242672.5,1371.25,3486.25,79775.75,46512870.0,304880.25,11585340.0,161495.25,177727800.0,1639021.0
max,4530.0,4474.0,4382.55,1864680000.0,317492.0,3453.0,5411.0,119463.0,56743140.0,456078.0,18367210.0,236933.0,241223900.0,2010951.0


In [11]:
# Check the balance of our target values
y['p_status'].value_counts()

1    314960
0    255440
Name: p_status, dtype: int64

In [12]:
y.head()

Unnamed: 0,p_status
0,1
1,0
2,1
3,1
5,1


In [13]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=200, random_state=78) 

In [16]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [17]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [18]:
predictions

array(['0', '0', '0', ..., '1', '1', '0'], dtype=object)

In [19]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

0.5282653844605326

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[26796, 37280],
       [28399, 50125]], dtype=int64)

In [21]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.20639424, 0.23271716, 0.20541893, 0.24388101, 0.0102327 ,
       0.0121559 , 0.01184118, 0.0108975 , 0.01026902, 0.01233703,
       0.0102261 , 0.01150768, 0.01021961, 0.01190195])

In [22]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.24388101486880567, 'volume'),
 (0.2327171584265753, 'iexClose'),
 (0.20639423604364684, 'high'),
 (0.20541893297913502, 'low'),
 (0.012337027120412757, 'negativeIncrease'),
 (0.012155899753590615, 'deathIncrease'),
 (0.01190194562891143, 'totalTestResultsIncrease'),
 (0.011841178116528276, 'hospitalizedIncrease'),
 (0.011507683736513693, 'positiveIncrease'),
 (0.0108974983861549, 'hospitalizedCurrently'),
 (0.010269019847264914, 'negative'),
 (0.010232699357882371, 'death'),
 (0.010226099766817447, 'positive'),
 (0.010219605967760679, 'totalTestResults')]

In [24]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.42      0.64      0.45      0.52      0.26     64076
          1       0.57      0.64      0.42      0.60      0.52      0.27     78524

avg / total       0.53      0.54      0.52      0.53      0.52      0.27    142600

