# Task 1: Acquire, pre-process and analyze the data
## Acquiring both datasets:
Dataset 1: [Search Trends](https://github.com/google-research/open-covid-19-data/blob/master/data/exports/search_trends_symptoms_dataset/README.md)

Dataset 2: [COVID hospitalization cases](https://github.com/google-research/open-covid-19-data)

In [None]:
# Imports

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data was collected during the week of 08/24/2020.
# Read both raw CSV exports into pandas dataframes.
st_df = pd.read_csv(
    '2020_US_weekly_symptoms_dataset.csv',
    low_memory=False,
)
hp_df = pd.read_csv(
    'aggregated_cc_by.csv',
    low_memory=False,
)

## Preprocess the datasets

## Target Weeks range: 2020-03-09 to 2020-09-21


In [None]:
# Search trends dataset Part I

# Drop every column that carries no data at all, i.e. is entirely NaN (clean COLUMN)
st_df = st_df.dropna(how='all', axis=1)

# Keep only the rows inside the target date window (clean ROW).
# The upper bound mirrors the one used for the hospitalization dataset so both
# datasets cover the same weeks; without it a CSV that extends past the window
# would leave extra rows here and break the later positional merge.
# NOTE(review): assumes 'date' holds ISO 'YYYY-MM-DD' strings, so string
# comparison is chronological (the original lower-bound check relied on this too).
st_df = st_df[(st_df['date'] >= '2020-03-04') & (st_df['date'] <= '2020-09-27')]

# Region names present in the search trends data, de-duplicated in first-seen order
nameList = list(dict.fromkeys(st_df['sub_region_1']))

In [None]:
# Hospitalization dataset Part I

# Column clean-up: retain the identifying fields and the hospitalization feature only
keptColumns = ['open_covid_region_code', 'region_name', 'date', 'hospitalized_new']
hp_df = hp_df[keptColumns]

# Row clean-up: regions that also appear in the search trends dataset,
# restricted to the valid date range
inKnownRegion = hp_df['region_name'].isin(nameList)
inDateRange = (hp_df['date'] >= '2020-03-09') & (hp_df['date'] <= '2020-09-27')
hp_df = hp_df[inKnownRegion & inDateRange]

# Renumber the rows 0..n-1 (the previous index is kept as an 'index' column)
hp_df.reset_index(inplace=True)

In [None]:
# Hospitalization dataset Part II
# Here we want to group dates in the same week together as one date (the weekdate)
hp_df1 = hp_df  # same object as hp_df: the date rewrite below is visible through both names
weekdate = '2020-03-09'  # first day of the first target week; every week is 7 days from here

# Map each daily date to the first day of its 7-day window, counted from `weekdate`.
# This is derived from the calendar date itself rather than from the row position,
# so it stays correct when a region has missing days or the rows are not perfectly
# ordered (a position-based `i % 7` rule silently shifts every later row in that case).
# NOTE(review): assumes 'date' holds ISO 'YYYY-MM-DD' strings — confirm against the CSV.
dayOffsets = (pd.to_datetime(hp_df1['date']) - pd.Timestamp(weekdate)).dt.days
weekStarts = pd.Timestamp(weekdate) + pd.to_timedelta(dayOffsets // 7 * 7, unit='D')
hp_df1['date'] = weekStarts.dt.strftime('%Y-%m-%d')

In [None]:
# Sum up hospitalized_new per region over the whole period.
# The aggregated frame is only used to get rid of regions that have insignificant
# hospitalization data, i.e. a total of 0 over all their dates.
def cleanRegions(df):
    """Return the names of the regions whose total hospitalized_new is non-zero.

    Args:
        df: DataFrame with at least the columns 'region_name' and
            'hospitalized_new'; any other column is carried along (first value
            per region) only so it shows up in the printed summary.

    Returns:
        List of region names, in the sorted order produced by the group-by,
        whose summed 'hospitalized_new' differs from 0.

    Side effects:
        Prints the per-region summary table of the regions that are kept.
    """
    agg_spec = dict.fromkeys(df.columns.difference(['region_name']), 'first')
    # Use the named pandas reduction instead of the Python builtin `sum`:
    # passing the builtin is deprecated and its NaN handling differs between
    # pandas versions. 'sum' always skips NaN, so a region with no reported
    # values totals 0 and is dropped.
    agg_spec['hospitalized_new'] = 'sum'

    totals = df.groupby('region_name', as_index=False).agg(agg_spec)
    totals = totals[totals.hospitalized_new != 0]
    print(totals.to_string())

    # Each region appears exactly once after the group-by, so no de-duplication is needed
    return totals['region_name'].tolist()

# Regions that remain once those with a total of 0 hospitalizations
# over all their dates have been removed
nameList2 = cleanRegions(hp_df1)

# Row clean-up: keep only the hospitalization rows of the remaining regions
keepRegion = hp_df1['region_name'].isin(nameList2)
hp_df2 = hp_df1[keepRegion]

In [None]:
# Hospitalization dataset Part III

# Collapse the daily rows of each (region, week) into one row holding the weekly
# total of hospitalized_new. The native group-by `.sum()` is used instead of
# `.apply(sum)`: passing the Python builtin is deprecated in pandas, and a plain
# Python sum would turn a week with a single missing day into NaN.
hp_df3 = hp_df2.groupby(['region_name', 'date'])['hospitalized_new'].sum().reset_index()

In [None]:
# Search trends dataset Part II

# Drop the three leading columns, which are not needed downstream
# (open_covid region_code, country_region_code, country_region) (clean COLUMN)
st_df1 = st_df.drop(st_df.columns[[0, 1, 2]], axis=1)

# Keep only the regions that survived the hospitalization clean-up (clean ROW).
# The mask is built from the frame being filtered.
st_df1 = st_df1[st_df1.sub_region_1.isin(nameList2)]

# Keep a column only if at least a fraction sp_num of the remaining rows hold a
# value (non-NaN) in it (clean COLUMN). The row count is taken from st_df1, the
# frame actually being filtered, not from the unfiltered st_df.
# NOTE(review): sp_num was tuned while the threshold was computed from len(st_df);
# re-check the ratio now that it is relative to the filtered rows.
sp_num = 0.24
minEntries = int(np.ceil(sp_num * len(st_df1)))
# .copy() makes st_df2 an independent frame, so adding a column to it later
# does not write into a slice of st_df1.
st_df2 = st_df1.dropna(thresh=minEntries, axis=1).copy()

## Merging the datasets 

In [None]:
# Attach the weekly hospitalization counts to the search trends frame as a new last column.
# NOTE(review): this is a positional merge — it relies on st_df2 and hp_df3 having
# the same number of rows in the same (region, week) order; verify.
hpData = pd.Series(hp_df3["hospitalized_new"])

st_df2['hospitalized_new'] = hpData.values
print(st_df2)

In [None]:
# Turn the merged dataframe into a plain numpy array (one row per dataframe row)
myarray = st_df2.to_numpy()

# Task 2: Visualize and Cluster the Data

## Visualizing the evolution of the search frequency of popular symptoms

In [None]:
# Imports
import matplotlib.pyplot as plt

In [None]:
# Overwrite every missing (NaN) cell of the merged array with 0, in place
missingMask = pd.isna(myarray)
myarray[missingMask] = 0

In [None]:
# Split the merged array by column position.
# NOTE(review): column 0 is presumably the region name and column 2 the week date — confirm against st_df2.columns.
regions, time = myarray[:, 0], myarray[:, 2]
# Everything between the leading columns and the last one is a search feature
features = myarray[:, 3:-1].astype(float)
# The last column is the hospitalized_new value appended during the merge
label = myarray[:, -1].astype(int)

nData, nFeat = features.shape

In [None]:
# Get list of symptom names: the feature column headers without their 'symptom:' prefix.
# The prefix is removed explicitly because str.lstrip('symptom:') strips any leading
# run of the characters s, y, m, p, t, o and ':' rather than the prefix itself, which
# also eats the start of the name (e.g. 'symptom:sore throat' -> 're throat').
symptomPrefix = 'symptom:'
nameSymptoms = np.array([
    name[len(symptomPrefix):] if name.startswith(symptomPrefix) else name
    for name in st_df2.columns.values[3:-1]
])

# print(nameSymptoms)

In [None]:
# Rank the searches by how many rows have a non-zero value and keep the top few
mostPop = 5

# Number of non-zero entries per feature column
arrInst = np.count_nonzero(features, axis=0)
# argpartition puts the mostPop largest counts in the last mostPop slots (unordered);
# the column indices are then sorted ascending
candidateInd = np.argpartition(arrInst, -mostPop)[-mostPop:]
topSearchInd = np.sort(candidateInd)
# print(topSearchInd)

In [None]:
# Symptom names of the most popular searches
namePopSymptoms = np.take(nameSymptoms, topSearchInd)

# Feature columns of the most popular searches only (all rows kept)
popFeatures = np.take(features, topSearchInd, axis=1)
# print(popFeatures.shape)

In [None]:
# Reshape search data 2D array into 3D [region, time, feature]

# np.unique returns the sorted distinct labels, the position of every row's label
# within them (inverse) and how many rows carry each label (counts).
uniqueRegions, regionIdx, ctRegions = np.unique(regions, return_inverse=True, return_counts=True)
uniqueTime, timeIdx, ctTime = np.unique(time, return_inverse=True, return_counts=True)

# Sizes come from the number of distinct labels, not from the row count of the
# first label, which only matches when every region has every week.
nRegions = len(uniqueRegions)
nTime = len(uniqueTime)

# Place each row at its own (region, time) cell instead of relying on row order,
# so slice [r, t] always belongs to uniqueRegions[r] / uniqueTime[t].
# Cells with no row stay 0, the same value used for missing data elsewhere.
sdArr = np.zeros((nRegions, nTime, mostPop))
sdArr[regionIdx, timeIdx] = popFeatures
# print(sdArr.shape)
# print(sdArr[0,:,2])

In [None]:
# Draw one region-by-week heatmap per popular symptom
for s in range(mostPop):
    heat = sdArr[:, :, s]  # rows: regions, columns: weeks

    fig, ax = plt.subplots(figsize=(20, 10))
    im = ax.imshow(heat, cmap='magma', vmin=0, vmax=100)
    plt.colorbar(im)

    # One tick per week on x and per region on y, labelled with the actual values
    ax.set_xticks(np.arange(nTime))
    ax.set_xticklabels(uniqueTime)
    ax.set_yticks(np.arange(nRegions))
    ax.set_yticklabels(uniqueRegions)

    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

    # Write the numeric value in the middle of every cell
    for row, col in np.ndindex(nRegions, nTime):
        ax.text(col, row, heat[row, col], ha="center", va="center", color="w")

    ax.set_title('Evolution of ' + namePopSymptoms[s])

    plt.show()

## Using PCA to reduce data dimensionality

In [None]:
# Imports
from sklearn.decomposition import PCA

In [None]:
# Fit a full PCA (all components kept) on the features
pca = PCA().fit(features)

# Share of variance explained by each component, and its running total in percent
pcaEVR = pca.explained_variance_ratio_
totalPC = pcaEVR.shape[0]
cumulativeVar = np.cumsum(pcaEVR) * 100

In [None]:
# Cumulative variance (in %) that the retained components must exceed
threshCVar = 95

# Index of the first component at which the cumulative variance passes the threshold
firstAbove = np.flatnonzero(cumulativeVar > threshCVar)[0]
cVar = cumulativeVar[firstAbove]
# Component count is the zero-based index plus one
nPC = firstAbove + 1

# print(cVar, nPC)

In [None]:
# Cumulative variance against the number of principal components, used to pick #PCs
fig = plt.figure()

pcAxis = np.linspace(1, totalPC, totalPC)  # 1, 2, ..., totalPC
plt.plot(pcAxis, threshCVar*np.ones((totalPC,)), 'c--')  # horizontal threshold line
plt.plot(pcAxis, cumulativeVar)
plt.plot(nPC, cVar, 'o')  # marker at the chosen number of components

plt.text(nPC-3, cVar-4, f'Optimal #PCs: {nPC}')
plt.text(
    totalPC,
    threshCVar,
    f'Cumulative Variance Threshold:{threshCVar}%',
    horizontalalignment='right',
)

plt.title("Cumulative Variance Explained vs Number of Principal Components")
plt.xlabel("Number of Principal Components")
plt.ylabel("% Variance Explained")

In [None]:
# With the number of components chosen, project the search trends features onto them
pcaRed = PCA(n_components=nPC).fit(features)
reducedFeat = pcaRed.transform(features)

# Scatter of the first two principal components
fig = plt.figure()
pc1, pc2 = reducedFeat[:, 0], reducedFeat[:, 1]
plt.scatter(pc1, pc2)
plt.title(f"Search Trends Dataset Reduced to {nPC}D, Visualizing in 2D")
plt.xlabel("PC #1")
plt.ylabel("PC #2")

## Using k-means clustering to evaluate groups in search trends data

In [None]:
# Imports
from sklearn.cluster import KMeans

In [None]:
nClusters = 6    # Number of k-means clusters used below; to be tuned

In [None]:
# K-means on the PCA-reduced features; fixed seed for reproducible clusters
kmeansRed = KMeans(n_clusters=nClusters, random_state=0).fit(reducedFeat)
# Cluster index assigned to every row of the reduced data
y_PredRed = kmeansRed.predict(reducedFeat)

In [None]:
# K-means on the original (non-reduced) features; same cluster count and seed
kmeansOri = KMeans(n_clusters=nClusters, random_state=0).fit(features)
# Cluster index assigned to every row of the original data
y_PredOri = kmeansOri.predict(features)

In [None]:
# Plot clustering results for both reduced and non-reduced data.
# Both panels show the rows in the plane of the first two principal components;
# only the colouring (cluster labels) differs.

fig = plt.figure(figsize=(20,10))

# Discrete colormap with one colour per cluster. plt.get_cmap is used because
# plt.cm.get_cmap is deprecated and no longer exists in recent Matplotlib releases.
clusterCmap = plt.get_cmap('viridis', nClusters)

panels = [
    (y_PredRed,
     "Cluster Labels for " + str(nPC) + "D K-Means with " + str(nClusters) + " Clusters"),
    (y_PredOri,
     "Cluster Labels for " + str(totalPC) + "D K-Means (Unreduced Data) with " + str(nClusters) + " Clusters"),
]

for panelNo, (clusterLabels, panelTitle) in enumerate(panels, start=1):
    plt.subplot(2, 1, panelNo)
    plt.scatter(reducedFeat[:,0], reducedFeat[:,1], c=clusterLabels, cmap=clusterCmap)
    plt.colorbar(ticks=range(nClusters))
    plt.xlabel("PC #1")
    plt.ylabel("PC #2")
    plt.title(panelTitle)