In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score



In [8]:
# Loading and preprocessing Data

In [10]:
# Loading data
file_path = Path("../apestogetherstrong/logistic_regression_test.csv")
df_equities = pd.read_csv(file_path)
df_equities.head()

Unnamed: 0,data_index,symbol,date,volume,volume_previousday,volume_greater_previousday,volume_deathIncrease_up,volume_hospitalizedIncrease_up,volume_negativeIncrease_up,volume_positiveIncrease_up,...,hospitalizedIncrease_greater_previousday,negativeIncrease,negativeIncrease_previousday,negativeIncrease_greater_previousday,positiveIncrease,positiveIncrease_previousday,positiveIncrease_greater_previousday,totalTestResultsIncrease,totalTestResultsIncrease_previousday,totalTestResultsIncrease_greater_previousday
0,0,A,8/20/2020,318382,1651518,0,0,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
1,1,AA,8/20/2020,1543881,5460701,0,0,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
2,2,AAAU,8/20/2020,151621,584100,0,0,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
3,3,AACG,8/20/2020,10963,36708,0,0,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1
4,4,AADR,8/20/2020,177,1191,0,0,0,0,0,...,0,208472,243232,0,43844,45073,0,864210,832115,1


In [11]:
df_equities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623468 entries, 0 to 623467
Data columns (total 42 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   data_index                                    623468 non-null  int64  
 1   symbol                                        623468 non-null  object 
 2   date                                          623468 non-null  object 
 3   volume                                        623468 non-null  int64  
 4   volume_previousday                            623468 non-null  int64  
 5   volume_greater_previousday                    623468 non-null  int64  
 6   volume_deathIncrease_up                       623468 non-null  int64  
 7   volume_hospitalizedIncrease_up                623468 non-null  int64  
 8   volume_negativeIncrease_up                    623468 non-null  int64  
 9   volume_positiveIncrease_up                    62

In [21]:
rf_df = pd.DataFrame().assign(symbol=df_equities['symbol'],date=df_equities['date'],high=df_equities['high'],iexClose=df_equities['iexClose'],deathIncrease=df_equities['deathIncrease'],negativeIncrease=df_equities['negativeIncrease'],positiveIncrease=df_equities['positiveIncrease'],totalTestResultsIncrease=df_equities['totalTestResultsIncrease'],iexClose_deathIncrease_up=df_equities['iexClose_deathIncrease_up'])
print(rf_df)

       symbol        date    high  iexClose  deathIncrease  negativeIncrease  \
0           A   8/20/2020  98.150    98.065           1129            208472   
1          AA   8/20/2020  15.120    14.965           1129            208472   
2        AAAU   8/20/2020  19.440    19.400           1129            208472   
3        AACG   8/20/2020   1.126     1.190           1129            208472   
4        AADR   8/20/2020  56.330    58.260           1129            208472   
...       ...         ...     ...       ...            ...               ...   
623463    ZVO  12/23/2020   4.270     4.130           3393            304962   
623464   ZYME  12/23/2020  53.700    51.030           3393            304962   
623465   ZYNE  12/23/2020   3.505     3.415           3393            304962   
623466   ZYXI  12/23/2020  14.490    13.820           3393            304962   
623467   TRUE  12/23/2020   4.610     4.505           3393            304962   

        positiveIncrease  totalTestResu

In [22]:
rf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623468 entries, 0 to 623467
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   symbol                     623468 non-null  object 
 1   date                       623468 non-null  object 
 2   high                       623468 non-null  float64
 3   iexClose                   623468 non-null  float64
 4   deathIncrease              623468 non-null  int64  
 5   negativeIncrease           623468 non-null  int64  
 6   positiveIncrease           623468 non-null  int64  
 7   totalTestResultsIncrease   623468 non-null  int64  
 8   iexClose_deathIncrease_up  623468 non-null  int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 42.8+ MB


In [23]:
rf_df = rf_df.drop(labels=["symbol", "date"], axis=1)
rf_df.head() 



Unnamed: 0,high,iexClose,deathIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,iexClose_deathIncrease_up
0,98.15,98.065,1129,208472,43844,864210,0
1,15.12,14.965,1129,208472,43844,864210,0
2,19.44,19.4,1129,208472,43844,864210,0
3,1.126,1.19,1129,208472,43844,864210,0
4,56.33,58.26,1129,208472,43844,864210,0


In [24]:
rf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623468 entries, 0 to 623467
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   high                       623468 non-null  float64
 1   iexClose                   623468 non-null  float64
 2   deathIncrease              623468 non-null  int64  
 3   negativeIncrease           623468 non-null  int64  
 4   positiveIncrease           623468 non-null  int64  
 5   totalTestResultsIncrease   623468 non-null  int64  
 6   iexClose_deathIncrease_up  623468 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 33.3 MB


In [26]:
# Create our features
#X = pd.get_dummies(df_equities, columns=["death", "deathIncrease", "hospitalizedIncrease", "hospitalizedCurrently", "negative", "positive", "totalTestResults", "totalTestResultsIncrease"])
X = rf_df.drop('iexClose_deathIncrease_up',axis=1)
# Create our target
y = pd.DataFrame(rf_df["iexClose_deathIncrease_up"])
X.head()

Unnamed: 0,high,iexClose,deathIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,98.15,98.065,1129,208472,43844,864210
1,15.12,14.965,1129,208472,43844,864210
2,19.44,19.4,1129,208472,43844,864210
3,1.126,1.19,1129,208472,43844,864210
4,56.33,58.26,1129,208472,43844,864210


In [27]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623468 entries, 0 to 623467
Data columns (total 6 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   high                      623468 non-null  float64
 1   iexClose                  623468 non-null  float64
 2   deathIncrease             623468 non-null  int64  
 3   negativeIncrease          623468 non-null  int64  
 4   positiveIncrease          623468 non-null  int64  
 5   totalTestResultsIncrease  623468 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 28.5 MB


In [28]:
X.describe()

Unnamed: 0,high,iexClose,deathIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
count,623468.0,623468.0,623468.0,623468.0,623468.0,623468.0
mean,41.163351,42.778524,1273.873418,250270.392405,100151.316456,1282244.0
std,103.949506,104.890778,806.655191,124994.255672,68173.329913,401126.9
min,0.0,0.007,246.0,-658774.0,22310.0,546859.0
25%,7.58,9.18,847.0,211097.0,43642.0,950483.0
50%,21.68,22.92,1047.0,263255.0,63430.0,1182695.0
75%,42.73925,44.16,1358.0,304962.0,166503.0,1653269.0
max,4530.0,4474.0,3453.0,456078.0,236933.0,2010951.0


In [29]:
# Check the balance of our target values
y['iexClose_deathIncrease_up'].value_counts()

0    480449
1    143019
Name: iexClose_deathIncrease_up, dtype: int64

In [30]:
y.head()

Unnamed: 0,iexClose_deathIncrease_up
0,0
1,0
2,0
3,0
4,0


In [31]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [32]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=200, random_state=78) 

In [34]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [35]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [36]:
predictions

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [37]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

0.692358353086765

In [38]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[104694,  15595],
       [ 17278,  18300]], dtype=int64)

In [39]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.33750412, 0.37641494, 0.11258301, 0.05506499, 0.04824311,
       0.07018983])

In [42]:
# We can sort the features by their importance.
importances = rf_model.feature_importances_
importances

array([0.33750412, 0.37641494, 0.11258301, 0.05506499, 0.04824311,
       0.07018983])

In [41]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.87      0.51      0.86      0.67      0.46    120289
          1       0.54      0.51      0.87      0.53      0.67      0.43     35578

avg / total       0.79      0.79      0.60      0.79      0.67      0.46    155867

