<a href="https://colab.research.google.com/github/j-chenn/COMP551_Project_1/blob/main/COMP551_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Acquire, pre-process and analyze the data
## Acquiring both datasets:
Dataset 1: [Search Trends](https://github.com/google-research/open-covid-19-data/blob/master/data/exports/search_trends_symptoms_dataset/README.md)

Dataset 2: [COVID hospitalization cases](https://github.com/google-research/open-covid-19-data)

In [1]:
# Imports

import numpy as np
import pandas as pd


In [2]:
# Data collection week: 08/24/2020
# Read both source CSV files into pandas dataframes
SEARCH_TRENDS_CSV = '2020_US_weekly_symptoms_dataset.csv'
HOSPITALIZATION_CSV = 'aggregated_cc_by.csv'
st_df = pd.read_csv(SEARCH_TRENDS_CSV, low_memory=False)
hp_df = pd.read_csv(HOSPITALIZATION_CSV, low_memory=False)

## Preprocess the datasets

In [3]:
# Search trends dataset Part I

# Clean COLUMNS: discard every column that is entirely NaN (symptoms with no data at all)
st_df = st_df.dropna(axis='columns', how='all')

# Clean ROWS: keep only the records dated with the chosen week (2020-08-24)
st_df = st_df.loc[st_df['date'].eq('2020-08-24')]

#print(st_df)

In [4]:
# Region names present in st_df, each listed once in order of first appearance
nameList = list(dict.fromkeys(st_df['sub_region_1']))
#print(nameList)

In [5]:
# Hospitalization dataset Part I

#TODO: Preprocessing

# Clean COLUMNS: retain the region identifiers, the date and the new-hospitalization count
hp_df = hp_df.loc[:, ['open_covid_region_code', 'region_name', 'date', 'hospitalized_new']]

# Clean ROWS: keep only the regions that also appear in the search trends dataset
hp_df = hp_df.loc[hp_df['region_name'].isin(nameList)]

# Clean ROWS: keep only the dates of the chosen week, both ends inclusive.
# The comparison is done on the raw date strings - assumes ISO yyyy-mm-dd values.
hp_df = hp_df.loc[hp_df['date'].between('2020-08-24', '2020-08-30')]

#print(hp_df)

In [6]:
# Hospitalization dataset Part II

# Clean out the regions that have missing data or too few days with new hospitalizations

def cleanRegion(mydf, threshold):
    '''Select the regions that report enough rows of new hospitalizations.

        A row counts towards its region only when its hospitalized_new value
        is at least 1; zero, negative and missing values are ignored.

        Parameters:
            mydf (Dataframe): frame with 'region_name' and 'hospitalized_new' columns
            threshold (int): minimum number of counted rows a region needs.
            For instance, threshold = 2 keeps the regions with at least 2 such rows.

        Returns:
            nameList (list): names of the regions meeting the threshold, each
            listed once, in order of first appearance.'''
    has_new_cases = mydf['hospitalized_new'] >= 1
    qualifying = (
        mydf.loc[has_new_cases]
        .groupby('region_name')
        .filter(lambda region_rows: len(region_rows) >= threshold)
    )
    # dict.fromkeys removes duplicates while preserving the row order
    return list(dict.fromkeys(qualifying['region_name']))

# Keep only the regions with at least 3 rows where hospitalized_new >= 1
cleanNames = cleanRegion(hp_df, threshold=3)
hp_df = hp_df.loc[hp_df['region_name'].isin(cleanNames)]

#print(hp_df.to_string())

In [7]:
# Search trends dataset Part II

# Restrict the search trends rows to the regions that passed the hospitalization filter
st_df = st_df.loc[st_df['sub_region_1'].isin(cleanNames)]

# Keep a column only if at least a fraction sp_num of its rows are non-missing
sp_num = 4/11  # 4/11 is the optimized ratio for cleaning, see report for relevant explanations
min_non_missing = sp_num * len(st_df)
st_df = st_df.dropna(axis='columns', thresh=min_non_missing)

# Drop the three leading columns, selected by position, as they are not needed
leading_columns = st_df.columns[[0, 1, 2]]
st_df = st_df.drop(columns=leading_columns)

In [8]:
# Show both cleaned frames, one after the other
print(st_df, hp_df, sep='\n')

      sub_region_1 sub_region_1_code        date  symptom:Adrenal crisis  \
147         Hawaii             US-HI  2020-08-24                     NaN   
185          Idaho             US-ID  2020-08-24                     NaN   
223          Maine             US-ME  2020-08-24                     NaN   
261        Montana             US-MT  2020-08-24                   13.48   
299   North Dakota             US-ND  2020-08-24                    8.53   
337       Nebraska             US-NE  2020-08-24                     NaN   
375  New Hampshire             US-NH  2020-08-24                     NaN   
413     New Mexico             US-NM  2020-08-24                     NaN   
451   Rhode Island             US-RI  2020-08-24                     NaN   
489   South Dakota             US-SD  2020-08-24                   13.17   
603        Wyoming             US-WY  2020-08-24                    4.42   

     symptom:Ageusia  symptom:Allergic conjunctivitis  symptom:Amenorrhea  \
147       

In [9]:
# Hospitalization dataset Part III

# Collapse the rows of each region into a single row: keep the first value of
# every other column and add up the week's hospitalized_new values.
f = dict.fromkeys(hp_df.columns.difference(['region_name']), 'first')
# Name the reduction with the string 'sum' instead of passing Python's builtin.
# pandas currently substitutes its own NaN-skipping sum for the builtin, but
# that aliasing is deprecated (FutureWarning since pandas 2.1); once the
# builtin is called directly, a single NaN would make a region's total NaN.
f['hospitalized_new'] = 'sum'

# groupby sorts the regions by name, so hp_df1 is ordered alphabetically
hp_df1 = hp_df.groupby('region_name', as_index=False).agg(f)
#print(hp_df1)
#print(hp_df1.shape)

## Merging the datasets 

In [10]:
# Weekly hospitalization totals, in hp_df1 row order (kept for later cells)
hpData = hp_df1["hospitalized_new"]

# Attach each region's weekly total to its search trends row by matching on the
# region NAME. A positional assignment is wrong here: hp_df1 comes out of a
# groupby and is sorted by region name, while st_df keeps the source file's
# order (e.g. North Dakota before Nebraska), so some regions would silently
# receive another region's total.
weekly_total_by_region = hp_df1.set_index('region_name')['hospitalized_new']
st_df = st_df.assign(
    hospitalized_new=st_df['sub_region_1'].map(weekly_total_by_region)
)

# Every search trends row must have found a matching hospitalization total
assert st_df['hospitalized_new'].notna().all(), \
    "some search-trend regions have no hospitalization total"


#new_df = pd.concat([st_df,hpData])


#print(st_df)


In [11]:
# st_df is already a DataFrame, so it can be converted to a NumPy array directly
st_df.to_numpy()

array([['Hawaii', 'US-HI', '2020-08-24', nan, nan, nan, nan, 29.67, nan,
        27.25, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 22.39,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, 17.53, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, 20.5, nan, nan, nan, nan,
        nan, nan, nan, 11.33, 120.0],
       ['Idaho', 'US-ID', '2020-08-24', nan, nan, nan, nan, nan, nan,
        14.79, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, 14.23, 119.0],
       ['Maine', 'US-ME', '2020-08-24', nan, nan, 15.16, nan, 

## End of Task 1