In [1]:
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
from deepdiff import DeepDiff
%matplotlib inline

In [152]:
# Community mobility report (the file name marks it as a 05/2021 snapshot).
df_mob = pd.read_csv(
    '/Users/parkj/Documents/pyDat/dataSet/COVID19_community_mobility_reports/Global_Mobility_Report_asof052021.csv',
    low_memory=False,
)

# New cases per million, from the jhu folder of the OWID covid-19-data checkout.
df_case = pd.read_csv(
    '/Users/parkj/Documents/pyDat/dataSet/covid_19_data_OWID/covid-19-data-master/public/data/jhu/new_cases_per_million.csv',
    low_memory=False,
)

# Vaccination table from the same OWID checkout.
df_vac = pd.read_csv(
    '/Users/parkj/Documents/pyDat/dataSet/covid_19_data_OWID/covid-19-data-master/public/data/vaccinations/vaccinations.csv',
    low_memory=False,
)

In [153]:
# Give each mobility time-series column a four-letter short name.
# rename() returns a new frame (the default, non-inplace behaviour), which is
# bound back to df_mob; columns not listed here keep their names.
df_mob = df_mob.rename(
    columns={
        'retail_and_recreation_percent_change_from_baseline': 'rtrc',
        'grocery_and_pharmacy_percent_change_from_baseline': 'grph',
        'parks_percent_change_from_baseline': 'prks',
        'transit_stations_percent_change_from_baseline': 'tran',
        'workplaces_percent_change_from_baseline': 'work',
        'residential_percent_change_from_baseline': 'resi',
    }
)

In [8]:
def matchDate(refDate,dateList):
    """Return the entries of `dateList` that also occur in `refDate`.

    Helper for aligning two sets of dates. The result follows the order of
    `dateList` and keeps any duplicates it contains.

    Parameters
    ----------
    refDate : iterable
        Reference dates (e.g. the index of the frame being filled).
    dateList : iterable
        Candidate dates (e.g. the index of the frame providing values).

    Returns
    -------
    (list, list)
        Two separate lists holding the same shared date values, one for
        indexing each of the two frames.
    """
    reference = list(refDate)
    candidates = list(dateList)

    # A set turns each membership test into a constant-time lookup instead of a
    # scan over the whole reference list. Unhashable entries cannot go into a
    # set, so fall back to the plain list in that case.
    try:
        lookup = set(reference)
    except TypeError:
        lookup = reference

    shared = [entry for entry in candidates if entry in lookup]
    # Hand back two independent list objects so a caller mutating one does not
    # affect the other.
    return list(shared), shared

In [161]:
# organize data in Global Mobility Report
def _attach_country_series(df_place, column, values):
    """Add `column` to `df_place`, filled from `values` on the dates both share.

    Parameters
    ----------
    df_place : pandas.DataFrame
        Rows of one place indexed by date. Modified in place.
    column : str
        Name of the column to create (overwritten if it already exists).
    values : pandas.Series or None
        Date-indexed values for the country. With ``None`` (country absent from
        the source table) the column is left entirely NaN.
    """
    df_place[column] = np.nan
    if values is None:
        return
    mob_dates, val_dates = matchDate(df_place.index, values.index) # dates present in both
    df_place.loc[mob_dates, column] = values.loc[val_dates]
    # Seed the first row with 0 when it has no value, so the forward-fill below
    # has something to propagate. `.iloc[0]` reads the first row by position;
    # plain `[0]` on a date-labelled Series is deprecated positional fallback.
    if pd.isna(df_place[column].iloc[0]):
        df_place.loc[df_place.index[0], column] = 0
    # Forward-fill regardless of whether the first value had to be seeded, so
    # gaps inside the series are filled for every country.
    df_place[column] = df_place[column].ffill()


places_id = df_mob.place_id.unique() # unique place ids
grouped = df_mob.groupby(df_mob.place_id)
dict_country = {} # country dict to contain the national-level data (ignore local regions)
country_label = defaultdict(list) # countries already stored; only key membership is used

# Loop-invariant lookups, built once instead of once per place.
vac_locations = set(df_vac["location"])
df_case_by_date = df_case.set_index("date") # one column per country

for place in places_id:
    if pd.isna(place):
        continue
    rows_place = grouped.get_group(place)
    countryId = rows_place["country_region"].iloc[0]
    if countryId in country_label:
        # Only the first place seen for a country is kept; the code relies on
        # that first place being the national-level entry.
        continue

    df_place = rows_place.set_index("date") # DataFrame per place

    # vaccination data
    vac_values = None
    if countryId in vac_locations:
        df_place_vac = df_vac.loc[df_vac["location"]==countryId].set_index("date")
        vac_values = df_place_vac["total_vaccinations_per_hundred"]
    _attach_country_series(df_place, "vac", vac_values)

    # case (per million) data
    case_values = None
    if countryId in df_case_by_date.columns:
        case_values = df_case_by_date[countryId]
    _attach_country_series(df_place, "case_mil", case_values)

    dict_country[countryId] = df_place # key: country name, value: its national-level frame
    country_label[countryId] = 0

In [200]:
# save data as pickle - a dictionary (dict_country)
filePath_pickle = Path('/Users/parkj/Documents/pyDat/dataSet/covid_countryData.pickle')
# The with-block closes the file even if pickle.dump raises part-way through.
with open(filePath_pickle, 'wb') as pickle_out:
    pickle.dump(dict_country, pickle_out)

In [2]:
# create new dictionary from pickle file
filePath_pickle = Path('/Users/parkj/Documents/pyDat/dataSet/covid_countryData.pickle')
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# The with-block makes sure the file handle is closed after loading.
with open(filePath_pickle, 'rb') as pickle_in:
    country_dict = pickle.load(pickle_in)
# To check the round trip against the in-memory dict_country, use
# DeepDiff(dict_country, country_dict); '==' does not work to compare the two
# dictionaries here.

In [8]:
# Display the frame stored under the "United States" key of the reloaded dictionary.
country_dict["United States"]

Unnamed: 0_level_0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,rtrc,grph,prks,tran,work,resi,vac,case_mil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-02-15,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,6.0,2.0,15.0,3.0,2.0,-1.0,0.00,0.000
2020-02-16,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,7.0,1.0,16.0,2.0,0.0,-1.0,0.00,0.000
2020-02-17,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,6.0,0.0,28.0,-9.0,-24.0,5.0,0.00,0.000
2020-02-18,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,0.0,-1.0,6.0,1.0,0.0,1.0,0.00,0.000
2020-02-19,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2.0,0.0,8.0,1.0,1.0,0.0,0.00,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-11,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,-8.0,4.0,25.0,-26.0,-29.0,7.0,78.68,101.643
2021-05-12,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,-6.0,3.0,27.0,-24.0,-29.0,7.0,79.14,108.274
2021-05-13,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,-5.0,3.0,43.0,-23.0,-29.0,6.0,79.71,115.032
2021-05-14,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,-7.0,0.0,39.0,-20.0,-28.0,6.0,80.26,127.670
