In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import pandas_datareader.data as web
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix as matrix

In [2]:
#import and clean Covid data
covid_raw = pd.read_csv(r"C:\Users\sclark\OneDrive - Navigant Consulting Inc\Documents\UMD Data Science\Principals of Data Science\Final Project\Data\COVID\United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
variables = ["submission_date", "tot_cases", "new_case", "tot_death", "new_death"]
covid = covid_raw.loc[:,variables]
covid.corr()

FileNotFoundError: [Errno 2] File C:\Users\sclark\OneDrive - Navigant Consulting Inc\Documents\UMD Data Science\Principals of Data Science\Final Project\Data\COVID\United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv does not exist: 'C:\\Users\\sclark\\OneDrive - Navigant Consulting Inc\\Documents\\UMD Data Science\\Principals of Data Science\\Final Project\\Data\\COVID\\United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv'

In [None]:
#converting datetime and grouping weekly and by state
covid["submission_date"]=pd.to_datetime(covid["submission_date"])
covid.rename(columns={"submission_date":"Date"}, inplace=True)

In [None]:
#group weekly
weekly_covid = covid.groupby(pd.Grouper(key = "Date", freq ="W")).sum()
weekly_covid.reset_index(inplace=True)
weekly_covid.head()

#percent change
weekly_covid["percent_change_cases"] = weekly_covid["new_case"].diff().fillna(0)/weekly_covid["new_case"]
weekly_covid["percent_change_deaths"] = weekly_covid["new_death"].diff().fillna(0)/weekly_covid["new_death"]
weekly_covid["percent_change_deaths"] = weekly_covid["percent_change_deaths"].fillna(0)

In [None]:
#lists for stocks
energyStocks = ["CVX","XOM","KMI","WMB","LNG"]
retailStocks = ["WMT","AMZN","COST","BIG","ACI"]
bioStock = ["FBRX", "TRIL", "CODX", "ALIM", "BTAI"]
pharmStock = ["ABT", "RCUS", "LLY", "ELAN", "MRK"]

stocklist = energyStocks + bioStock + pharmStock

In [None]:
#cleaning stock data
start = "2020-01-22"
end = dt.now()

stocks = pd.DataFrame()

covid_stocks = pd.DataFrame()

for x in stocklist:
    df = web.DataReader(x, "yahoo", start, end)
    df.reset_index(inplace=True)
    
    weekly_stock = df.groupby([pd.Grouper(key="Date", freq="W")]).mean()
    weekly_stock["Stock"] = x
    weekly_stock.reset_index(inplace = True)
    
    weekly_stock["percent_change_stock"] = weekly_stock["Adj Close"].diff().fillna(0)/weekly_stock["Adj Close"]
    
    weekly_stock["Rise/Fall"] = np.where(weekly_stock["percent_change_stock"]>0, "Rise", "fall")
    
    stocks = stocks.append(weekly_stock[["Date","Stock", "percent_change_stock", "Rise/Fall"]], ignore_index=True)

    covid_stocks = pd.merge(stocks, weekly_covid, on="Date")    

covid_stocks["Category"] = ""
covid_stocks.loc[covid_stocks["Stock"].isin(energyStocks), "Category"] = "energyStocks"
covid_stocks.loc[covid_stocks["Stock"].isin(retailStocks), "Category"] = "retailStocks"
covid_stocks.loc[covid_stocks["Stock"].isin(bioStock), "Category"] = "bioStock"
covid_stocks.loc[covid_stocks["Stock"].isin(pharmStock), "Category"] = "pharmStock"


In [None]:
covid_stocks_dummies = pd.concat([covid_stocks, pd.get_dummies(covid_stocks["Category"], prefix="Dummy")], axis = 1)
covid_stocks_dummies.corr()

In [None]:
finalVars = ["Rise/Fall","percent_change_cases","percent_change_deaths","percent_change_stock","Dummy_bioStock", "Dummy_energyStocks","Dummy_pharmStock"]
final = covid_stocks_dummies.loc[:,finalVars]
final.corr()

In [None]:
#pairplots
import seaborn as sns

sns.pairplot(final, hue = "Rise/Fall", diag_kind = "kde", plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'})

In [None]:
final.corr()

In [None]:
#dropped date because i was getting an error
    #found out that date needs to be converted to categorical data to work
    #in the tree
Y = final["Rise/Fall"]
features = ["percent_change_cases","percent_change_deaths"]
X = final.loc[:,features]

In [None]:
# Spliting Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, Y_train)

In [None]:
#get sorted labels for plot 
import numpy as np
sorted = Y.unique()
sorted = np.sort(sorted)
sorted = list(map(str, sorted))
sorted

In [None]:
plt.subplots(figsize=(40, 40))
x = tree.plot_tree(clf,rounded=True,filled=True,class_names=sorted,feature_names=features) 


In [None]:
YPredicted = clf.predict(X_test)
YPredicted

In [None]:
from sklearn import metrics
accuracy = metrics.accuracy_score(Y_test, YPredicted)
accuracy

In [None]:
#setup plots for confusion matrix
from sklearn.metrics import plot_confusion_matrix as matrix
figSize = plt.rcParams["figure.figsize"]
figSize[0] = 30
figSize[1] = 5
plt.rcParams["figure.figsize"]=figSize
print(plt.rcParams.get('figure.figsize'))

#plot the confusion matrices 1 for normalzied the other un-normalized
values = ['true',None]
#cmap='cividis'
for x in values:
    disp = matrix(clf,X_test,Y_test,display_labels=sorted,normalize=x)
    disp.ax_.set_title("Confusion matrix with normalization = "+str(x))
print(disp.confusion_matrix)


In [None]:
TP = 52
FP = 29
FN = 33
TN = 71
recall = TP/(TP+FN)
percision= TP/(TP+FP)
fmeasure = (2*recall*percision)/(recall+percision)
print(recall, percision, fmeasure)