In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

In [2]:
# Loading data
file_path = Path("resources/cleaned_close.csv")
df_equities = pd.read_csv(file_path)
df_equities.head()

Unnamed: 0,symbol,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease
0,PJUL,8/19/2020,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
1,PIM,8/19/2020,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
2,PJT,8/19/2020,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
3,PKBK,8/19/2020,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115
4,PICK,8/19/2020,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115


In [3]:
float or np.ndarray(dtype=float)
# df = pd.read_csv("resources/cleaned_close.csv")
df_equities['date'] = pd.to_datetime(df_equities['date'])    
df_equities['date_delta'] = (df_equities['date'] - df_equities['date'].min())  / np.timedelta64(1,'D')
# city_data = df[df['city'] == 'London']
# result = sm.ols(formula = 'sales ~ date_delta', data = city_data).fit()

In [None]:
# df_equities = df_equities.drop(columns=["CEO","address","address2", "city" ])
# df_equities.head()

In [4]:
df_equities.head()

Unnamed: 0,symbol,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,date_delta
0,PJUL,2020-08-19,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
1,PIM,2020-08-19,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
2,PJT,2020-08-19,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
3,PKBK,2020-08-19,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
4,PICK,2020-08-19,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0


In [5]:
# Define features set
X = df_equities.copy()
X = X.drop("symbol", axis=1)
X.head()

Unnamed: 0,date,high,iexClose,industry,low,sector,volume,death,deathIncrease,hospitalizedIncrease,hospitalizedCurrently,negative,negativeIncrease,positive,positiveIncrease,totalTestResults,totalTestResultsIncrease,date_delta
0,2020-08-19,27.84,27.71,Investment Trusts/Mutual Funds,27.76,Miscellaneous,3283,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
1,2020-08-19,4.26,4.24,Investment Trusts/Mutual Funds,4.23,Miscellaneous,31824,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
2,2020-08-19,59.67,58.885,Investment Banks/Brokers,58.6,Finance,23896,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
3,2020-08-19,13.46,13.46,Regional Banks,12.89,Finance,2365,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0
4,2020-08-19,28.49,28.38,Investment Trusts/Mutual Funds,28.25,Miscellaneous,71208,165088,1411,1983,43406,24462674,243232,5485765,45073,76356767,832115,0.0


In [7]:
# Define target vector
y = df_equities["symbol"].values.reshape(-1, 1)
y[:5]

array([['PJUL'],
       ['PIM'],
       ['PJT'],
       ['PKBK'],
       ['PICK']], dtype=object)

In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(473520, 18)
(157840, 18)
(473520, 1)
(157840, 1)


In [10]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [11]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(505088, 18)
(126272, 18)
(505088, 1)
(126272, 1)


In [12]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [13]:
df_equities.dtypes

symbol                              object
date                        datetime64[ns]
high                               float64
iexClose                           float64
industry                            object
low                                float64
sector                              object
volume                               int64
death                                int64
deathIncrease                        int64
hospitalizedIncrease                 int64
hospitalizedCurrently                int64
negative                             int64
negativeIncrease                     int64
positive                             int64
positiveIncrease                     int64
totalTestResults                     int64
totalTestResultsIncrease             int64
date_delta                         float64
dtype: object

In [None]:
# df_equities[df_equities].nunique()


In [None]:
# converting to object
# df_equities = df_equities.astype({"date":'float', "high":'object', "iexClose":'object', "low":'object'}) 

In [14]:
df_equities.dtypes

symbol                              object
date                        datetime64[ns]
high                               float64
iexClose                           float64
industry                            object
low                                float64
sector                              object
volume                               int64
death                                int64
deathIncrease                        int64
hospitalizedIncrease                 int64
hospitalizedCurrently                int64
negative                             int64
negativeIncrease                     int64
positive                             int64
positiveIncrease                     int64
totalTestResults                     int64
totalTestResultsIncrease             int64
date_delta                         float64
dtype: object

In [15]:
df_equities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631360 entries, 0 to 631359
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   symbol                    631360 non-null  object        
 1   date                      631360 non-null  datetime64[ns]
 2   high                      631360 non-null  float64       
 3   iexClose                  631360 non-null  float64       
 4   industry                  631360 non-null  object        
 5   low                       631360 non-null  float64       
 6   sector                    631360 non-null  object        
 7   volume                    631360 non-null  int64         
 8   death                     631360 non-null  int64         
 9   deathIncrease             631360 non-null  int64         
 10  hospitalizedIncrease      631360 non-null  int64         
 11  hospitalizedCurrently     631360 non-null  int64         
 12  ne

In [16]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Fitting the Decision Tree Model

In [None]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [None]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

In [None]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

## Model Evaluation

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))
