# Add [GDP, Urbanization, Median Age, Democracy Index, Gini Index, US State Population, Total Tests, Tests per 1M, Daily Tests, Change in Residential Mobility] Notebook


Adds the following data:
- GDP per capita in current USD per country (per US states and China provinces, with plans to add Australian, French, and Canadian provinces as well)
- Urbanization as population percentage per country (per US states and China provinces, with plans to add Australian, French, and Canadian provinces as well)
- Median Age per country and per US state (with plans to add sub region data for other countries like above)
- Democracy Index per country
- Gini Index per country (incomplete data)
- Population Size for US States
- Total number of tests administered over data interval
- Number of Daily tests per US state (we don't believe we will get accurate data for this metric for other subregions) (this data is added as a row in level 1 index)

Contact ShuliFinley@gmail.com for questions :)

In [184]:
import pandas as pd
import numpy as np
import pickle
import datetime

## _Import and load hopkins data_

In [258]:
# Pickle that the loading cell below reads to obtain the starting Hopkins frame.
PICKLE_PATH = '../augmented_datasets/pickles/hopkins_conf_withgr_augmented0605.pkl'
# Presumably where the frame is saved once the societal columns are added -- TODO confirm;
# it is not used in the cells immediately following.
RESULT_PATH = '../augmented_datasets/pickles/hopkins_conf_withgr_augmented0605_withsocietal.pkl'

In [186]:
# Load the pickled Hopkins frame that the rest of the notebook augments.
# NOTE(security): unpickling can execute arbitrary code -- only load pickles that were
# produced by this project's own pipeline.
# Updated pandas versions load the file with `pd.read_pickle`; old pandas versions
# used `pickle.load(file)` instead.
with open(PICKLE_PATH, 'rb') as file:
    hopkins_conf = pd.read_pickle(file)
hopkins_conf

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-41.4545, 145.9707)",data,Tasmania,Australia,14.008491,72.811321,0.0,17.955660,226.0,-0.1628,0.0000,0.2500,...,214.0,218.0,219.0,221.0,221.0,221.0,221.0,221.0,225.0,226.0
"(-41.4545, 145.9707)",avg_d_RH,,,,,,,,0.0000,0.0000,0.0000,...,79.0,87.0,95.0,91.0,90.0,62.0,66.0,74.0,88.0,86.0
"(-41.4545, 145.9707)",avg_d_precip,,,,,,,,0.0000,0.0000,0.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(-41.4545, 145.9707)",avg_d_tmp,,,,,,,,0.0000,0.0000,0.0000,...,11.1,11.7,11.6,9.7,8.2,10.3,7.8,9.1,8.4,11.9
"(-41.4545, 145.9707)",avg_d_wind,,,,,,,,0.0000,0.0000,0.0000,...,25.7,20.8,12.3,12.9,12.1,25.2,10.6,4.0,12.7,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(64.9631, -19.0208)",data,,Iceland,-0.080583,74.330097,0.0,17.497087,1799.0,-0.0264,0.0506,0.2312,...,1792.0,1795.0,1797.0,1797.0,1798.0,1798.0,1799.0,1799.0,1799.0,1799.0
"(64.9631, -19.0208)",avg_d_RH,,,,,,,,0.0000,0.0000,0.0000,...,70.0,69.0,76.0,81.0,79.0,72.0,52.0,48.0,39.0,56.0
"(64.9631, -19.0208)",avg_d_precip,,,,,,,,0.0000,0.0000,0.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(64.9631, -19.0208)",avg_d_tmp,,,,,,,,0.0000,0.0000,0.0000,...,4.1,4.2,3.0,3.1,2.2,0.8,2.2,8.8,10.5,7.2


In [187]:
# # uncomment to print all column names
# for item in list(hopkins_conf.columns):
#     print(item)

## _Preliminary checks on imported data_

In [188]:
# Report the size of the starting frame and how many of its 'data' rows
# (level 'information' of the row index) have no Country_Region.
print('Original df dimensions: {}'.format(hopkins_conf.shape))

data_rows = pd.DataFrame(hopkins_conf.xs('data', level='information', axis=0))
print('Number of \'data\' rows: {}'.format(len(data_rows.index)))

# `check` ends up holding only the offending rows (expected to be empty)
check = data_rows[data_rows['Country_Region'].isna()]
print('Number of \'data\' rows with NaN Country_Region: {}'.format(len(check.index)))

Original df dimensions: (9185, 130)
Number of 'data' rows: 1837
Number of 'data' rows with NaN Country_Region: 0


## _Load external datasets_

In [189]:
# ========== GDP BY COUNTRY ========
# source: https://data.worldbank.org/indicator/NY.GDP.PCAP.CD  - in current USD
# Columns are renamed by position, so the CSV must hold exactly two columns
# (region name, GDP value) in that order.
gdp_country_path = '../external_datasets/GDP_per_capita_countries.csv'
gdp_country = pd.read_csv(gdp_country_path)
gdp_country.columns=['Country_Region', 'GDP']


# ========== US STATES GDP ========
# source: https://en.wikipedia.org/wiki/List_of_U.S._states_by_GDP_per_capita#cite_note-3
# source: https://www.statista.com/statistics/248023/us-gross-domestic-product-gdp-by-state/ 
gdp_us_state_path = '../external_datasets/GDP_per_capita_states.csv'
gdp_us_state = pd.read_csv(gdp_us_state_path)
gdp_us_state.columns = ['Province_State', 'GDP']


# ========== CHINA PROVINCES GDP ========
#source: https://en.wikipedia.org/wiki/List_of_Chinese_administrative_divisions_by_GDP_per_capita
gdp_china_province_path = '../external_datasets/GDP_per_capita_china_provinces.csv'
gdp_china_province = pd.read_csv(gdp_china_province_path)
gdp_china_province.columns = ['Province_State', 'GDP']


# ========== GDP US STATES AND CHINA PROVINCES APPENDED ======== # for convenience
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
gdp_all_provinces = pd.concat([gdp_us_state, gdp_china_province], ignore_index=True)


# ========== URBANIZATION BY COUNTRY ========
# source: http://wdi.worldbank.org/table/4.2# (2018)
# Columns are renamed by position, so each CSV must hold exactly two columns
# (region name, urbanization value) in that order.
urban_country_path = '../external_datasets/urbanization_data.csv'
urban_country = pd.read_csv(urban_country_path)
urban_country.columns=['Country_Region', 'Urbanization']


# ========== US STATES URBANIZATION ========
# source: US census bureau (most updated was is from 2010)
urban_us_state_path = '../external_datasets/urbanization_states.csv'
urban_us_state = pd.read_csv(urban_us_state_path)
urban_us_state.columns = ['Province_State', 'Urbanization']


# ========== CHINA PROVINCES URBANIZATION ========
# source: https://en.wikipedia.org/wiki/Urbanization_in_China
urban_china_province_path = '../external_datasets/urbanization_china_provinces.csv'
urban_china_province = pd.read_csv(urban_china_province_path)
urban_china_province.columns = ['Province_State', 'Urbanization']


# ========== URBANIZATION US STATES AND CHINA PROVINCES APPENDED ======== # for convenience
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
urban_all_provinces = pd.concat([urban_us_state, urban_china_province], ignore_index=True)


# ========== MEDIAN AGE BY COUNTRY========
# source: https://ourworldindata.org/grapher/median-age?year=2020&time=2020
median_age_per_countries_2020_path = '../external_datasets/median_age_per_countries_2020.csv'
median_age_per_countries_2020 = pd.read_csv(median_age_per_countries_2020_path)
median_age_per_countries_2020.columns = ['Country_Region', 'code', 'year', 'Median Age']
# keep the 2020 rows of every country except 'United States', then drop the helper columns
median_age_per_countries_2020 = (
    median_age_per_countries_2020
    .loc[lambda ages: (ages['year'] == 2020) & (ages['Country_Region'] != 'United States')]
    .drop(['code', 'year'], axis = 1)
)


# ========== US STATES MEDIAN AGE ========
# source: https://worldpopulationreview.com/states/median-age-by-state/
median_age_per_US_state_2020_path = '../external_datasets/median_age_per_US_state_2020.csv'
median_age_state_col_list = ['State', 'MedianAge']
median_age_per_US_state_2020 = pd.read_csv(
    median_age_per_US_state_2020_path,
    usecols = median_age_state_col_list,
)
median_age_per_US_state_2020.columns = ['Province_State', 'Median Age']


# ========== DEMOCRACY INDEX BY COUNTRY ========
# source: https://en.wikipedia.org/wiki/Democracy_Index#cite_note-index2019-7
democracy_index_by_country_path = '../external_datasets/democracy_index_by_country.csv'
democracy_col_list = ['Country', 'Score']
democracy_index_by_country = pd.read_csv(
    democracy_index_by_country_path,
    usecols = democracy_col_list,
)
democracy_index_by_country.columns = ['Country_Region', 'Democracy']


# ========== DAILY TESTS ADMINISTERED US STATES ========
d_tests_us_states_path = '../external_datasets/US_daily_tests_filtered.csv'
target_cols = ['date', 'state', 'total']
d_tests_us_states_df = pd.read_csv(d_tests_us_states_path)[target_cols]


# ========== POPULATION PER STATE ========
us_states_population_path = '../external_datasets/population_us_states.csv'
us_states_population = pd.read_csv(us_states_population_path)
us_states_population.columns = ['Province_State', 'State Pop']


# ========== GINI PER COUNTRY ========
# source: en.wikipedia.org/wiki/List_of_countries_by_income_equality#cite_ref-3
gini_countries_path = '../external_datasets/gini_countries_data.csv'
target_cols = ['Country', 'World Bank Gini', 'CIA Gini']
# only the country column changes name; the two Gini columns keep theirs
gini_countries = (
    pd.read_csv(gini_countries_path)[target_cols]
    .rename(columns={'Country': 'Country_Region'})
)


# ========== TESTS PER 1 MILLION BY COUNTRY ========
# source: https://www.worldometers.info/coronavirus/#countries
tests_per_1M_path = '../external_datasets/tests_per_1M_countries.csv'
target_cols = ['Country, Other', 'Tests/1M pop']
tests_per_1M = (
    pd.read_csv(tests_per_1M_path)[target_cols]
    .rename(columns={'Country, Other': 'Country_Region', 'Tests/1M pop': 'Tests per 1M'})
)


# ========== Community Mobility Reports ========
# source: google.com/covid19/mobility/
community_mobility_reports_path = '../external_datasets/Global_Mobility_Report.csv'
community_mobility_reports = pd.read_csv(community_mobility_reports_path)
target_cols = ['country_region', 'sub_region_1', 'sub_region_2','date', 'residential_percent_change_from_baseline']
community_mobility_reports = community_mobility_reports[target_cols]
community_mobility_reports.columns = ['Country_Region', 'Province_State', 'sub_region_2', 'date', 'Mobility Change']
# removing subregion data per state because we're not ready for this resolution
community_mobility_reports = community_mobility_reports[community_mobility_reports['sub_region_2'].isna()]
# keep (1) every remaining United States row (national and per state) and
# (2) the national-level rows (no Province_State) of every country.
# NOTE: the US national rows match both filters, so they appear twice in the result.
community_mobility_reports_1 = community_mobility_reports[community_mobility_reports.Country_Region=='United States']
community_mobility_reports_2 = community_mobility_reports[community_mobility_reports.Province_State.isna()]
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
community_mobility_reports = pd.concat([community_mobility_reports_1, community_mobility_reports_2])
community_mobility_reports = community_mobility_reports.drop(['sub_region_2'], axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [190]:
community_mobility_reports

Unnamed: 0,Country_Region,Province_State,date,Mobility Change
103361,United States,,2020-02-15,-1.0
103362,United States,,2020-02-16,-1.0
103363,United States,,2020-02-17,5.0
103364,United States,,2020-02-18,1.0
103365,United States,,2020-02-19,0.0
...,...,...,...,...
335458,Zimbabwe,,2020-05-05,30.0
335459,Zimbabwe,,2020-05-06,31.0
335460,Zimbabwe,,2020-05-07,31.0
335461,Zimbabwe,,2020-05-08,31.0


In [191]:
# # mobility check
# state='California'
# country='United States'
# date='2020-02-15'
# community_mobility_reports[community_mobility_reports['Province_State']==state]
# community_mobility_reports

# _Filling missing data_

Comments describe missing data and sources

In [192]:
# ========== GDP ========

# Virgin Islands source: https://www.macrotrends.net/countries/VIR/virgin-islands-us/gdp-per-capita
missing_GDP_per_US_territory_values = {
    'Province_State': ['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands'], 
    'GDP': [11466.69071, 35712.56214, 23258.67586, 31651.34815, 35938]
}

missing_GDP_per_US_territory_df = pd.DataFrame.from_dict(missing_GDP_per_US_territory_values)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
# NOTE: this rebinds gdp_all_provinces, so re-running the cell adds the territories again.
gdp_all_provinces = pd.concat([gdp_all_provinces, missing_GDP_per_US_territory_df], ignore_index=True)


# ========== MEDIAN AGE ========

# Missing info (excluding Northern Mariana Islands) from median_age_per_countries_2020_path (were not listed as US states)
# Missing info for Northern Mariana Islands from https://en.wikipedia.org/wiki/List_of_countries_by_median_age
missing_median_age_per_US_territory_values = {
    'Province_State': ['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands'], 
    'Median Age': [22, 31.4, 33.6, 38.2, 42.2]
}

# Missing info from https://en.wikipedia.org/wiki/List_of_countries_by_median_age
# the 'US' entry is based on (37.0902, -95.7129) which is Kansas
# 'Holy See' - the Vatican. Was given the median age in Italy
missing_median_age_per_country_values = {
    'Country_Region': ['East Timor', 'Congo (Kinshasa)', 'Congo (Brazzaville)', 'Dominica', 'Andorra', 'Kosovo', 'Liechtenstein', 
                       'San Marino', 'Monaco', 'Saint Kitts and Nevis', 'US', 'Holy See'], 
    'Median Age': [18.9, 18.6, 19.7, 33.5, 44.3, 29.1, 43.2, 44.4, 53.1, 35, 36.5, 47.9]
}

# Appending missing "Median Age" values 
# (pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0;
#  both frames are rebound, so re-running the cell adds the rows again)
missing_median_ages_per_US_state_df = pd.DataFrame.from_dict(missing_median_age_per_US_territory_values)
median_age_per_US_state_2020 = pd.concat(
    [median_age_per_US_state_2020, missing_median_ages_per_US_state_df], ignore_index=True)

missing_median_ages_per_country_df = pd.DataFrame.from_dict(missing_median_age_per_country_values)
median_age_per_countries_2020 = pd.concat(
    [median_age_per_countries_2020, missing_median_ages_per_country_df], ignore_index=True)


# ========== DEMOCRACY INDEX BY COUNTRY ========

### Missing info from https://en.wikipedia.org/wiki/List_of_freedom_indices & https://www.transparency.org/cpi2019
# Saint Vincent and the Grenadines (7.9, based on corruption between South Korea and Cabo Verde)
# Seychelles (7.9, based on corruption between Chile and Taiwan)
# Sao Tome and Principe (6.8, based on corruption between Croatia and Argentina)
# Maldives (4, based on corruption between Guinea and Mexico)
# Brunei (7.16, based on corruption of Malaysia)
# Somalia (1, based on corruption index last 3 years)
# Holy See (7.52, same as Italy)
# Grenada (7.3, based on corruption between Italy and Malaysia)
# Belize (7.02, based on list of freedoms indices)
# Dominica (7.78, ")
# Monaco (8.12, same as France)
# Barbados (6.2, based on CORRUPTION PERCEPTIONS INDEX 2019 https://www.transparency.org/cpi2019)
# Bahamas (6.4, ")
# Saint Lucia (5.5, ")
# Kosovo (3.6, ")
# Andorra (7.3, based on list of freedoms indices - like Grenada)
# Antigua and Barbuda (")
# Saint Kitts and Nevis (")
# Liechtenstein (")
# Western Sahara (3, based on list of freedoms indices)
# San Marino (7.5, considered as fairly democratic. Based on https://news.un.org/en/story/2013/04/435902-world-can-learn-san-marinos-democratic-system-says-un-chief)
# The two lists below are parallel: the i-th score belongs to the i-th country.
missing_democracy_values = {
    'Country_Region': ['Saint Vincent and the Grenadines', 'Seychelles', 'Sao Tome and Principe', 'Maldives', 'Brunei', 
                       'Somalia', 'Holy See', 'Grenada', 'Belize', 'Dominica', 'Monaco', 'Barbados', 'Bahamas', 'Saint Lucia', 
                       'Kosovo', 'Andorra', 'Antigua and Barbuda', 'Saint Kitts and Nevis', 'Liechtenstein', 'Western Sahara', 'San Marino'],
    'Democracy': [7.9, 7.9, 6.8, 4, 7.16, 1, 7.52, 7.3, 7.02, 7.78, 8.12, 6.2, 6.4, 5.5, 3.6, 7.3, 7.3, 7.3, 7.3, 3, 7.5]
}

# Appending missing "Democracy" values 
# (pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0;
#  the frame is rebound, so re-running the cell adds the rows again)
missing_democracy_values_df = pd.DataFrame.from_dict(missing_democracy_values)
democracy_index_by_country = pd.concat(
    [democracy_index_by_country, missing_democracy_values_df], ignore_index=True)


# ========== GINI PER COUNTRY ========

### Missing info based on similar GDP and unemployment rate
# Afghanistan ()
# Andorra ()
# Bahamas ()
# Bahrain ()
# Barbados ()
# Brunei ()
# Cuba ()
# Eritrea ()
# Kuwait ()
# Liechtenstein ()
# Monaco ()
# Oman ()
# San Marino ()
# Saudi Arabia ()
# Qatar ()

# missing_gini_countries_values = {
#     'Country_Region': ['Afghanistan', 'Andorra', 'Bahamas', 'Bahrain', 'Barbados', 'Brunei', 'Cuba', 'Eritrea', 
#                        'Kuwait', 'Liechtenstein', 'Monaco', 'Oman', 'San Marino', 'Saudi Arabia', 'Qatar'],
#     'Other': [Afghanistan, Andorra, Bahamas, Bahrain, Barbados, Brunei, Cuba, Eritrea, 
#                        Kuwait, Liechtenstein, Monaco, Oman, San Marino, Saudi Arabia, Qatar]
# }


# # Appending missing countries' "Gini Index" values 
# missing_gini_countries_values_df = pd.DataFrame.from_dict(missing_gini_countries_values)
# gini_countries = gini_countries.append(missing_gini_countries_values_df, ignore_index = True)


# _Translation Dictionaries_


Used to standardize names in the original df and the external datasets

In [193]:
def standardize_us_states_names(df):
    """
    Return a copy of `df` in which two-letter US state / territory codes are
    replaced by their full names.

    The replacement is an exact-value match applied to every column of `df`
    (not only a state column). 'NA' is mapped to 'National'. Values that are
    not listed are left untouched, and `df` itself is not modified.
    """
    full_name_by_code = {
        'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa',
        'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
        'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
        'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky',
        'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine',
        'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri', 'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi', 'MT': 'Montana', 'NA': 'National', 'NC': 'North Carolina',
        'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
        'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio',
        'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico',
        'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee',
        'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia', 'VI': 'Virgin Islands',
        'VT': 'Vermont', 'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia',
        'WY': 'Wyoming',
    }
    return df.replace(full_name_by_code)

In [194]:
def standardize_names(external_df):
    """
    Return a copy of `external_df` in which alternative country spellings are
    replaced by the spelling used in the Hopkins frame (e.g. 'USA' -> 'US',
    'South Korea' -> 'Korea, South').

    The replacement is an exact-value match applied to every column of the
    frame; values that are not listed are left untouched and the input frame
    is not modified.
    """
    external_df = external_df.replace({
        'Bahamas, The': 'Bahamas',
        'Brunei Darussalam': 'Brunei',
        'Cape Verde': 'Cabo Verde',
        'Congo, Dem. Rep.': 'Congo (Kinshasa)',
        'Congo, Rep.': 'Congo (Brazzaville)',
        'Congo, Republic of the': 'Congo (Brazzaville)',
        'Côte d\'Ivoire': 'Cote d\'Ivoire',
        'Czech Republic':'Czechia',
        'Democratic Republic of the Congo': 'Congo (Kinshasa)',
        'DR Congo': 'Congo (Kinshasa)',
        'Egypt, Arab Rep.': 'Egypt',
        'Eswatini': 'Swaziland',
        'Gambia, The': 'Gambia',
        'Iran, Islamic Rep.': 'Iran',
        'Ivory Coast': 'Cote d\'Ivoire',
        'Korea, Rep.': 'Korea, South',
        'Kyrgyz Republic': 'Kyrgyzstan',
        'Macedonia': 'North Macedonia',
        'Myanmar': 'Burma',
        'Palestine': 'West Bank and Gaza',
        'Republic of the Congo': 'Congo (Brazzaville)',
        'Russian Federation': 'Russia',
        'Slovak Republic': 'Slovakia',
        'S. Korea': 'Korea, South',
        'South Korea': 'Korea, South',
        'South Korea[n 2]': 'Korea, South',
        'St. Kitts and Nevis': 'Saint Kitts and Nevis',
        'St. Lucia': 'Saint Lucia',
        'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
        'Syrian Arab Republic': 'Syria',
        'Taiwan*': 'Taiwan',
        'Timor Leste': 'East Timor',
        'Timor-Leste': 'East Timor',
        'UAE': 'United Arab Emirates',
        'UK': 'United Kingdom',
        # 'United States' used to be listed twice; a dict keeps only one entry per key
        'United States': 'US',
        'USA': 'US',
        'Venezuela, RB': 'Venezuela'
       })
    return external_df

In [195]:
# was used for standardizing China province names
def china_check(urban_or_gdp):
    """
    Print a comparison between the China province names in the notebook-level
    `hopkins_conf` frame and those in the notebook-level `gdp_china_province` frame.

    Prints the two sorted name lists side by side (truncated to the shorter
    one), followed by the names found only in the original frame and the
    names found only in the GDP frame.

    `urban_or_gdp` is accepted but not used: the GDP province table is always
    the one compared.
    """
    china_rows = hopkins_conf[hopkins_conf['Country_Region']=='China']
    original_provinces = set(china_rows['Province_State'])
    gdp_provinces = set(gdp_china_province.Province_State)

    sorted_original = sorted(original_provinces)
    sorted_gdp = sorted(gdp_provinces)
    for gdp_name, original_name in zip(sorted_gdp, sorted_original):
        print(gdp_name, original_name)

    print('orig_minus_gdp: {}'.format(original_provinces - gdp_provinces))
    print('gdp_minus_orig: {}'.format(gdp_provinces - original_provinces))

## _Functions for adding row to index level 1 (mobility change and US States daily tests)_

In [234]:
def add_index(df, row_name, gap):
    """
    Return a copy of `df` with one extra, empty row per level-0 key.

    Walks the row index in order; each time the level-0 value differs from the
    previous row's, the entry (level-0 value, `row_name`) is inserted into the
    index. The first insertion is at position `gap`, and every following one is
    `gap + 1` positions further, so the new row lands right after its group
    when every level-0 key owns exactly `gap` consecutive rows. The inserted
    rows are filled with NaN by the final reindex.
    """
    new_index = df.index
    insert_position = gap
    last_coordinate = (0,0)
    for row_key in df.index:
        coordinate = row_key[0]
        if coordinate != last_coordinate:
            new_index = new_index.insert(insert_position, (coordinate, row_name))
            insert_position = insert_position + gap + 1
            last_coordinate = coordinate
    return df.copy().reindex(new_index)


def string_to_datetime(df_old):
    """
    Return a copy of `df_old` whose 'date' column has been parsed into datetimes.

    Every value is stringified and parsed as 'YYYYMMDD'; if that raises a
    ValueError the whole column is parsed again as 'YYYY-MM-DD' instead (a
    ValueError from this second attempt propagates). `df_old` is not modified.
    """
    def parsed_copy(date_format):
        df_new = df_old.copy()
        df_new['date'] = df_new.apply(
            lambda row: datetime.datetime.strptime(str(row['date']), date_format),
            axis=1,
        )
        return df_new

    try:
        return parsed_copy('%Y%m%d')
    except (ValueError):
        return parsed_copy('%Y-%m-%d')

In [197]:
def add_daily_tests(df, tests_data):
    """
    Return a copy of `df` with the 'd_tests' row of every US coordinate filled in.

    `df` is indexed by (coordinate, row name) and has one column per date;
    `tests_data` is indexed by (date, state) and has a 'd_tests' column. For
    every coordinate whose 'data' row has Country_Region == 'US', and for every
    date present both in `tests_data` and in the columns of `df`, the value
    found for (date, Province_State) is written into the (coordinate, 'd_tests')
    row. A progress counter is printed every 50 coordinates.
    """
    coordinates = set(df.index.get_level_values(level=0).tolist())
    test_dates = set(tests_data.index.get_level_values(level=0))
    common_dates = test_dates.intersection(set(df.columns))
    return_df = df.copy()
    for counter, coor in enumerate(coordinates):
        if counter % 50 == 0:
            print(counter)
        data_row = return_df.loc[coor].loc['data']
        if data_row['Country_Region']=='US':
            state = data_row['Province_State']
            for date in common_dates:
                key = (date, state)
                if key in tests_data.index:
                    return_df.loc[(coor, 'd_tests'),date] = tests_data.loc[key]['d_tests']
    return return_df

In [225]:
def add_daily_mobility_change(df, community_mobility_reports):
    """
    Return a copy of `df` with the 'd_mob_change' row of every coordinate filled in.

    `df` is indexed by (coordinate, row name) and has one column per date;
    `community_mobility_reports` is indexed by (date, Country_Region) and has
    'Province_State' and 'Mobility Change' columns. Only dates present both in
    the reports and in the columns of `df` are considered.

    * US coordinates take the value from the report rows whose Province_State
      equals the coordinate's Province_State. A coordinate whose state is NaN
      matches no row and therefore receives nothing.
    * Every other coordinate takes the first value reported for
      (date, country), whatever its Province_State.

    A progress counter is printed every 50 coordinates.
    """
    def _first_value(selection):
        # .loc yields a Series when the (date, country) key is repeated and a
        # scalar when it is unique; .iloc avoids the deprecated positional
        # `series[0]` lookup on a label-indexed Series.
        if isinstance(selection, pd.Series):
            return selection.iloc[0]
        return selection

    coordinates = set(df.index.get_level_values(level=0).tolist())
    common_dates = set(community_mobility_reports.index.get_level_values(level=0)).intersection(set(df.columns))
    return_df = df.copy()
    for counter, coor in enumerate(coordinates):
        if counter % 50 == 0:
            print(counter)

        data_row = return_df.loc[coor].loc['data']
        country = data_row['Country_Region']
        if (country=='US'):
            state = data_row['Province_State']
            # filter once per coordinate instead of twice per date
            state_reports = community_mobility_reports[community_mobility_reports['Province_State']==state]
            for date in common_dates:
                if (date,country) in state_reports.index:
                    return_df.loc[(coor, 'd_mob_change'), date] = _first_value(
                        state_reports.loc[(date,country)]['Mobility Change'])

        else:
            for date in common_dates:
                if (date,country) in community_mobility_reports.index:
                    return_df.loc[(coor, 'd_mob_change'), date] = _first_value(
                        community_mobility_reports.loc[(date, country), 'Mobility Change'])

    return return_df

In [199]:
# preprocessing for daily tests US states data
# NOTE: this cell rebinds and mutates d_tests_us_states_df and us_states_population,
# so it is meant to run exactly once after the loading cell.

# convert dates formats
d_tests_us_states_df = string_to_datetime(d_tests_us_states_df)
# standardize names
d_tests_us_states_df = standardize_us_states_names(d_tests_us_states_df)
# order by state then date so that the per-state diff below runs over consecutive days
d_tests_us_states_df.sort_values(['state', 'date'], inplace=True)
# count daily from cumulated totals
# (the first row of every state has no predecessor and therefore gets NaN)
d_tests_us_states_df['d_tests'] = d_tests_us_states_df.groupby(['state'])['total'].transform(lambda x: x.diff()) 

# Re-format the parsed dates as month/day/year strings without zero padding (e.g. '3/7/2020').
# The two branches differ only in the flag used to suppress the padding ('-' vs '#');
# presumably this covers platform differences in strftime -- TODO confirm.
try:
    d_tests_us_states_df['date'] = d_tests_us_states_df.apply(lambda row: \
                                datetime.datetime.strftime(datetime.datetime.strptime(str(row['date']), '%Y-%m-%d %H:%M:%S'), '%-m/%-d/%Y'), axis=1)
except (ValueError):
    d_tests_us_states_df['date'] = d_tests_us_states_df.apply(lambda row: \
                                datetime.datetime.strftime(datetime.datetime.strptime(str(row['date']), '%Y-%m-%d %H:%M:%S'), '%#m/%#d/%Y'), axis=1)


# index the population table by state name so it can be looked up per row below
us_states_population = standardize_us_states_names(us_states_population)
us_states_population = us_states_population.set_index('Province_State')


# add total tests divided by total population column
# NOTE(review): .loc[row['state']] returns the whole population row rather than a scalar,
# so this relies on the population table having a single value column; a state missing
# from that table raises KeyError.
d_tests_us_states_df['total_tests_div_pop'] = d_tests_us_states_df.apply(lambda row: (row['total']/us_states_population.loc[row['state']]), axis=1)

# set multi index
d_tests_us_states_df = d_tests_us_states_df.set_index(['date','state'])

In [200]:
d_tests_us_states_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total,d_tests,total_tests_div_pop
date,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3/7/2020,Alabama,0.0,,0.000000
3/8/2020,Alabama,0.0,0.0,0.000000
3/9/2020,Alabama,0.0,0.0,0.000000
3/10/2020,Alabama,0.0,0.0,0.000000
3/11/2020,Alabama,10.0,10.0,0.000002
...,...,...,...,...
4/10/2020,Wyoming,5056.0,906.0,0.008736
4/11/2020,Wyoming,5459.0,403.0,0.009432
4/12/2020,Wyoming,5459.0,0.0,0.009432
4/13/2020,Wyoming,5964.0,505.0,0.010305


In [201]:
community_mobility_reports

Unnamed: 0,Country_Region,Province_State,date,Mobility Change
103361,United States,,2020-02-15,-1.0
103362,United States,,2020-02-16,-1.0
103363,United States,,2020-02-17,5.0
103364,United States,,2020-02-18,1.0
103365,United States,,2020-02-19,0.0
...,...,...,...,...
335458,Zimbabwe,,2020-05-05,30.0
335459,Zimbabwe,,2020-05-06,31.0
335460,Zimbabwe,,2020-05-07,31.0
335461,Zimbabwe,,2020-05-08,31.0


In [202]:
# preprocessing for the community mobility reports data

# convert dates formats (parse the 'date' column into datetimes)
community_mobility_reports = string_to_datetime(community_mobility_reports)
community_mobility_reports

Unnamed: 0,Country_Region,Province_State,date,Mobility Change
103361,United States,,2020-02-15,-1.0
103362,United States,,2020-02-16,-1.0
103363,United States,,2020-02-17,5.0
103364,United States,,2020-02-18,1.0
103365,United States,,2020-02-19,0.0
...,...,...,...,...
335458,Zimbabwe,,2020-05-05,30.0
335459,Zimbabwe,,2020-05-06,31.0
335460,Zimbabwe,,2020-05-07,31.0
335461,Zimbabwe,,2020-05-08,31.0


In [203]:
def _mobility_dates_as(output_format):
    """Return the mobility 'date' column re-rendered, row by row, with `output_format`."""
    return community_mobility_reports.apply(
        lambda row: datetime.datetime.strptime(str(row['date']), '%Y-%m-%d %H:%M:%S').strftime(output_format),
        axis=1,
    )


# Render the dates as month/day/year without zero padding (e.g. '2/15/2020'),
# the same shape as the date column labels of the Hopkins frame.
# The second format is only tried when the first one raises ValueError.
try:
    community_mobility_reports['date'] = _mobility_dates_as('%-m/%-d/%Y')
except (ValueError):
    community_mobility_reports['date'] = _mobility_dates_as('%#m/%#d/%Y')

community_mobility_reports

Unnamed: 0,Country_Region,Province_State,date,Mobility Change
103361,United States,,2/15/2020,-1.0
103362,United States,,2/16/2020,-1.0
103363,United States,,2/17/2020,5.0
103364,United States,,2/18/2020,1.0
103365,United States,,2/19/2020,0.0
...,...,...,...,...
335458,Zimbabwe,,5/5/2020,30.0
335459,Zimbabwe,,5/6/2020,31.0
335460,Zimbabwe,,5/7/2020,31.0
335461,Zimbabwe,,5/8/2020,31.0


In [204]:
# 'date' holds month/day/year strings at this point, so the date ordering is textual
community_mobility_reports = community_mobility_reports.sort_values(
    ['Country_Region', 'Province_State', 'date'])


In [205]:
# standardize the country names, then index the reports by (date, country)
community_mobility_reports = (
    standardize_names(community_mobility_reports)
    .set_index(['date', 'Country_Region'])
)
# check that 'United States' was renamed to the 'US' spelling
'US' in set(community_mobility_reports.index.get_level_values(level=1))

True

In [206]:
# set(community_mobility_reports.index.get_level_values(level=1))

In [207]:
# Date (same month/day/year string format as the index built above) used for the spot check below.
MAX_DATE = '4/7/2020'

# Spot check: total tests divided by state population for Alabama on MAX_DATE.
(d_tests_us_states_df.xs('Alabama', level='state').loc[MAX_DATE])['total_tests_div_pop']
# df.xs('Ai', level='name', drop_level=False)
# d_tests_us_states_df

0.003042104264880889

In [208]:
# d_tests_us_states_df.iloc[d_tests_us_states_df.index.get_level_values('state') == 'Missouri']

## _this is where the magic happens_

In [235]:
import math

# Column names shared by the Hopkins frame and the external datasets.
PROVINCE_STATE = 'Province_State'
COUNTRY_REGION = 'Country_Region'
# Level-1 row names; the same strings are written by add_daily_tests and
# add_daily_mobility_change respectively.
DAILY_TESTS = 'd_tests'
MOBILITY = 'd_mob_change'
# Same value as the MAX_DATE defined in the earlier spot-check cell.
MAX_DATE = '4/7/2020'


def _first_available(*lookups):
    """Return the first value found among ``(table, key, column)`` lookups.

    The lookups are tried in order. A lookup hits when ``key`` is a label of
    ``table.index``; ``table.loc[key, column]`` is then returned. ``np.nan`` is
    returned when no lookup hits.
    """
    for table, key, column in lookups:
        if key in table.index:
            return table.loc[key, column]
    return np.nan


def add_gdp_urban(original_df, args_datasets, max_date=None):
    """Add societal indicator columns and daily test / mobility rows to the Hopkins frame.

    Province-level figures are preferred; when a row's province is unknown to a
    dataset the country-level figure is used, and NaN when neither is available.

    Args:
        original_df: frame with ``Province_State`` and ``Country_Region`` columns.
        args_datasets: dict of auxiliary frames, read under the keys
            'gdp_country', 'gdp_all_provinces', 'urban_country',
            'urban_all_provinces', 'median_age_per_countries_2020',
            'median_age_per_US_state_2020', 'democracy_index_by_country',
            'gini_countries', 'd_tests_us_states_df', 'us_states_population',
            'community_mobility_reports' and 'tests_per_1M'.
        max_date: date label at which cumulative test figures are read from
            ``d_tests_us_states_df``. Defaults to the module-level ``MAX_DATE``.

    Returns:
        The augmented frame, with the nine indicator columns placed directly
        after ``Country_Region``.
    """
    if max_date is None:
        max_date = MAX_DATE

    # import argument dataframes from input dict
    gdp_country = args_datasets['gdp_country']
    gdp_all_provinces = args_datasets['gdp_all_provinces']
    urban_country = args_datasets['urban_country']
    urban_all_provinces = args_datasets['urban_all_provinces']
    median_age_per_countries_2020 = args_datasets['median_age_per_countries_2020']
    median_age_per_US_state_2020 = args_datasets['median_age_per_US_state_2020']
    democracy_index_by_country = args_datasets['democracy_index_by_country']
    gini_countries = args_datasets['gini_countries']
    d_tests_us_states_df = args_datasets['d_tests_us_states_df']
    us_states_population = args_datasets['us_states_population']
    community_mobility_reports = args_datasets['community_mobility_reports']
    tests_per_1M = args_datasets['tests_per_1M']

    # standardizing region names
    result = standardize_names(original_df)
    gdp_country = standardize_names(gdp_country)
    urban_country = standardize_names(urban_country)
    median_age_per_countries_2020 = standardize_names(median_age_per_countries_2020)
    democracy_index_by_country = standardize_names(democracy_index_by_country)
    gini_countries = standardize_names(gini_countries)
    community_mobility_reports = standardize_names(community_mobility_reports)
    tests_per_1M = standardize_names(tests_per_1M)

    # setting indices so every dataset can be queried by region label
    gdp_country = gdp_country.set_index(COUNTRY_REGION)
    gdp_all_provinces = gdp_all_provinces.set_index(PROVINCE_STATE)
    urban_country = urban_country.set_index(COUNTRY_REGION)
    urban_all_provinces = urban_all_provinces.set_index(PROVINCE_STATE)
    median_age_per_US_state_2020 = median_age_per_US_state_2020.set_index(PROVINCE_STATE)
    median_age_per_countries_2020 = median_age_per_countries_2020.set_index(COUNTRY_REGION)
    democracy_index_by_country = democracy_index_by_country.set_index(COUNTRY_REGION)
    gini_countries = gini_countries.set_index(COUNTRY_REGION)
    tests_per_1M = tests_per_1M.set_index(COUNTRY_REGION)

    def _gini_for(country):
        # Prefer the World Bank figure, fall back to the CIA one, else NaN.
        if country not in gini_countries.index:
            return np.nan
        for source in ('World Bank Gini', 'CIA Gini'):
            value = gini_countries.loc[country, source]
            if pd.notna(value):
                return value
        return np.nan

    # One snapshot per state is enough: both test columns read the same row.
    tests_snapshots = {}

    def _tests_at_max_date(state, column):
        # Only regions listed in us_states_population get test figures.
        if state not in us_states_population.index:
            return np.nan
        if state not in tests_snapshots:
            tests_snapshots[state] = \
                d_tests_us_states_df.xs(state, level='state').loc[max_date]
        return tests_snapshots[state][column]

    # (province, country) label pair of every row, in row order
    regions = list(zip(result[PROVINCE_STATE], result[COUNTRY_REGION]))

    # if there is no state data, take country data, and if there is also no
    # country data, put NaN
    result['GDP'] = [
        _first_available((gdp_all_provinces, state, 'GDP'),
                         (gdp_country, country, 'GDP'))
        for state, country in regions
    ]
    result['Urbanization'] = [
        _first_available((urban_all_provinces, state, 'Urbanization'),
                         (urban_country, country, 'Urbanization'))
        for state, country in regions
    ]
    result['Median Age'] = [
        _first_available((median_age_per_US_state_2020, state, 'Median Age'),
                         (median_age_per_countries_2020, country, 'Median Age'))
        for state, country in regions
    ]
    result['Democracy'] = [
        _first_available((democracy_index_by_country, country, 'Democracy'))
        for _, country in regions
    ]
    result['Gini Index'] = [_gini_for(country) for _, country in regions]
    result['State Population'] = [
        _first_available((us_states_population, state, 'State Pop'))
        for state, _ in regions
    ]
    result['Total Tests'] = [
        _tests_at_max_date(state, 'total') for state, _ in regions
    ]
    # The column label is the text "Tests \ Pop"; the backslash is escaped so the
    # literal is not an invalid escape sequence.
    result['Tests \\ Pop'] = [
        _tests_at_max_date(state, 'total_tests_div_pop') for state, _ in regions
    ]
    result['Tests per 1M'] = [
        _first_available((tests_per_1M, country, 'Tests per 1M'))
        for _, country in regions
    ]

    # adding daily_tests administered field to index level 1
    result = add_index(result, DAILY_TESTS, gap=5)

    # adding daily_mob_change to index level 1
    result = add_index(result, MOBILITY, gap=6)

    # adding tests administered per date per US State
    result = add_daily_tests(result, d_tests_us_states_df)

    # adding the daily mobility change values
    result = add_daily_mobility_change(result, community_mobility_reports)

    # reordering columns: the new indicator columns go right after Country_Region
    indicator_columns = [
        'GDP', 'Urbanization', 'Median Age', 'Democracy', 'Gini Index',
        'State Population', 'Total Tests', 'Tests \\ Pop', 'Tests per 1M',
    ]
    other_columns = [col for col in result.columns if col not in indicator_columns]
    insert_at = other_columns.index(COUNTRY_REGION) + 1
    new_columns = (other_columns[:insert_at]
                   + indicator_columns
                   + other_columns[insert_at:])

    return result[new_columns]

In [236]:
# Bundle every auxiliary dataset under the key add_gdp_urban reads it from.
args_datasets = dict(
    gdp_country=gdp_country,
    gdp_all_provinces=gdp_all_provinces,
    urban_country=urban_country,
    urban_all_provinces=urban_all_provinces,
    median_age_per_countries_2020=median_age_per_countries_2020,
    median_age_per_US_state_2020=median_age_per_US_state_2020,
    democracy_index_by_country=democracy_index_by_country,
    gini_countries=gini_countries,
    d_tests_us_states_df=d_tests_us_states_df,
    us_states_population=us_states_population,
    tests_per_1M=tests_per_1M,
    community_mobility_reports=community_mobility_reports,
)
new_hopkins_conf = add_gdp_urban(hopkins_conf, args_datasets)

new_hopkins_conf

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
0


  return runner(coro)
  return self._getitem_tuple(key)


50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800


Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-41.4545, 145.9707)",data,Tasmania,Australia,57373.68668,86.012,37.900002,9.09,34.4,,,,...,214.0,218.0,219.0,221.0,221.0,221.0,221.0,221.0,225.0,226.0
"(-41.4545, 145.9707)",avg_d_RH,,,,,,,,,,,...,79.0,87.0,95.0,91.0,90.0,62.0,66.0,74.0,88.0,86.0
"(-41.4545, 145.9707)",avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(-41.4545, 145.9707)",avg_d_tmp,,,,,,,,,,,...,11.1,11.7,11.6,9.7,8.2,10.3,7.8,9.1,8.4,11.9
"(-41.4545, 145.9707)",avg_d_wind,,,,,,,,,,,...,25.7,20.8,12.3,12.9,12.1,25.2,10.6,4.0,12.7,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(64.9631, -19.0208)",avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(64.9631, -19.0208)",avg_d_tmp,,,,,,,,,,,...,4.1,4.2,3.0,3.1,2.2,0.8,2.2,8.8,10.5,7.2
"(64.9631, -19.0208)",avg_d_wind,,,,,,,,,,,...,8.7,9.7,15.0,19.9,18.0,15.8,10.9,39.0,28.0,12.0
"(64.9631, -19.0208)",d_tests,,,,,,,,,,,...,,,,,,,,,,


In [243]:
new_hopkins_conf.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-41.4545, 145.9707)",data,Tasmania,Australia,57373.68668,86.012,37.900002,9.09,34.4,,,,...,214.0,218.0,219.0,221.0,221.0,221.0,221.0,221.0,225.0,226.0
"(-41.4545, 145.9707)",avg_d_RH,,,,,,,,,,,...,79.0,87.0,95.0,91.0,90.0,62.0,66.0,74.0,88.0,86.0
"(-41.4545, 145.9707)",avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(-41.4545, 145.9707)",avg_d_tmp,,,,,,,,,,,...,11.1,11.7,11.6,9.7,8.2,10.3,7.8,9.1,8.4,11.9
"(-41.4545, 145.9707)",avg_d_wind,,,,,,,,,,,...,25.7,20.8,12.3,12.9,12.1,25.2,10.6,4.0,12.7,19.0
"(-41.4545, 145.9707)",d_tests,,,,,,,,,,,...,,,,,,,,,,
"(-41.4545, 145.9707)",d_mob_change,,,,,,,,,,,...,17.0,16.0,11.0,18.0,18.0,13.0,11.0,17.0,16.0,16.0


## _Some tests_

In [241]:
# 'data' rows that still have no Gini Index after augmentation.
gini_test = (
    pd.DataFrame(new_hopkins_conf.xs('data', level='information', axis=0))
    .loc[lambda data_rows: data_rows['Gini Index'].isna()]
)
gini_test

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(1.5, 10.0)",,Equatorial Guinea,10261.76001,72.143,22.4,1.92,,,,,...,258.0,315.0,315.0,315.0,315.0,315.0,315.0,315.0,315.0,439.0
"(4.5353, 114.7277)",,Brunei,31628.32879,77.629,32.400002,7.16,,,,,...,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,139.0
"(5.1521, 46.1996)",,Somalia,314.544166,44.971,16.799999,1.0,,,,,...,480.0,528.0,582.0,601.0,601.0,671.0,722.0,756.0,835.0,873.0
"(12.1165, -61.679)",,Grenada,10640.49676,36.272,29.4,7.3,,,,,...,18.0,19.0,20.0,20.0,20.0,21.0,21.0,21.0,21.0,21.0
"(13.193909999999999, -59.5432)",,Barbados,17949.28151,31.147,39.799999,6.2,,,,,...,80.0,80.0,80.0,81.0,81.0,81.0,82.0,82.0,82.0,82.0
"(15.1794, 39.7823)",,Eritrea,396.0,40.1,19.299999,2.37,,,,,...,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0
"(17.0608, -61.7964)",,Antigua and Barbuda,16726.98081,24.599,32.099998,7.3,,,,,...,24.0,24.0,24.0,24.0,25.0,25.0,25.0,25.0,25.0,25.0
"(21.0, 57.0)",,Oman,16415.15728,84.539,30.700001,3.06,,,,,...,2049.0,2131.0,2274.0,2348.0,2447.0,2483.0,2568.0,2637.0,2735.0,2903.0
"(22.0, -80.0)",,Cuba,8821.818891,77.037,43.099998,2.84,,,,,...,1389.0,1437.0,1467.0,1501.0,1537.0,1611.0,1649.0,1668.0,1685.0,1703.0
"(24.0, 45.0)",,Saudi Arabia,23338.96346,83.844,31.9,1.93,,,,,...,18811.0,20077.0,21402.0,22753.0,24097.0,25459.0,27011.0,28656.0,30251.0,31938.0


In [244]:
# 'data' rows that still have no Median Age after augmentation.
age_test = (
    pd.DataFrame(new_hopkins_conf.xs('data', level='information', axis=0))
    .loc[lambda data_rows: data_rows['Median Age'].isna()]
)
age_test

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [245]:
# 'data' rows that still have no Democracy index after augmentation.
dem_test = (
    pd.DataFrame(new_hopkins_conf.xs('data', level='information', axis=0))
    .loc[lambda data_rows: data_rows['Democracy'].isna()]
)
dem_test

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [246]:
# Rows that name a country but received no Urbanization value.
check = new_hopkins_conf.loc[
    new_hopkins_conf['Urbanization'].isna()
    & new_hopkins_conf['Country_Region'].notna()
]
check

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(15.552727, 48.516388)",data,,Yemen,,,20.299999,1.95,36.7,,,,...,1.0,1.0,6.0,6.0,7.0,10.0,10.0,12.0,22.0,25.0


In [247]:
# Rows that name a country but received no GDP value.
check = new_hopkins_conf.loc[
    new_hopkins_conf['GDP'].isna()
    & new_hopkins_conf['Country_Region'].notna()
]
check

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(15.552727, 48.516388)",data,,Yemen,,,20.299999,1.95,36.7,,,,...,1.0,1.0,6.0,6.0,7.0,10.0,10.0,12.0,22.0,25.0


In [248]:
# Count the distinct GDP / Urbanization values among US rows (NaN counts as one value).
us_rows = new_hopkins_conf.loc[new_hopkins_conf[COUNTRY_REGION] == 'US']

gdp = us_rows['GDP'].nunique(dropna=False)
print('Unique GDP values for US states: {}'.format(gdp))

urb = us_rows['Urbanization'].nunique(dropna=False)
print('Unique Urbanization values for US states: {}'.format(urb))

Unique GDP values for US states: 55
Unique Urbanization values for US states: 55


In [249]:
# Compare the shape of the 'data' rows before and after augmentation: the row
# count should be unchanged and only the number of columns should grow.
check_old, check_new = (
    frame.xs('data', level='information', axis=0).shape
    for frame in (hopkins_conf, new_hopkins_conf)
)
print('Original df data rows dimensions: {}'.format(check_old))

print('New df data rows dimensions: {}'.format(check_new))


Original df data rows dimensions: (1837, 130)
New df data rows dimensions: (1837, 139)


## _Checks on the daily tests data_

In [250]:
def test_d_tests(df, tests_data):
    tests_data_dates = tests_data.index.get_level_values(level=0)
    common_dates = set(tests_data_dates).intersection(set(df.columns))
    printed = []
    dict = {}
    coords = set(df.index.get_level_values(level=0).tolist())
    for coord in coords:
        if df.loc[coord].loc['data']['Country_Region']=='US':
            state = df.loc[coord].loc['data']['Province_State']
            if not (state in printed):
                printed.append(state)
#                 print(state)
#                 print(coord)
#                 print(df.loc[coord].loc['d_tests'][common_dates].tolist())
                dict[state] = pd.DataFrame(df.loc[coord][common_dates])
    return dict

test_dict = test_d_tests(new_hopkins_conf, d_tests_us_states_df)


In [251]:
test_state = 'California'

test_dict[test_state]

Unnamed: 0_level_0,3/24/2020,3/3/2020,3/7/2020,2/29/2020,3/9/2020,3/15/2020,3/30/2020,4/8/2020,3/13/2020,4/5/2020,...,4/9/2020,3/19/2020,3/22/2020,3/25/2020,3/21/2020,3/26/2020,3/28/2020,4/4/2020,4/7/2020,3/8/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,22.0,0.0,0.0,0.0,0.0,7.0,44.0,76.0,7.0,69.0,...,76.0,14.0,15.0,24.0,15.0,25.0,34.0,59.0,76.0,0.0
avg_d_RH,84.0,68.0,89.0,78.0,61.0,91.0,81.0,85.0,87.0,97.0,...,88.0,73.0,80.0,83.0,77.0,76.0,79.0,87.0,78.0,83.0
avg_d_precip,17.78,0.0,5.84,0.0,0.0,34.8,0.0,0.25,0.0,71.88,...,0.76,0.0,0.76,0.25,1.52,2.29,0.25,1.27,0.0,0.76
avg_d_tmp,10.7,12.3,10.5,11.9,14.4,10.0,11.9,11.7,11.8,11.1,...,12.5,10.7,13.0,8.7,12.2,8.5,11.1,10.4,10.5,10.5
avg_d_wind,8.0,4.5,7.4,7.4,5.2,11.7,6.2,6.5,9.6,13.7,...,7.5,6.3,4.0,6.4,5.8,7.8,5.0,12.3,6.9,4.6
d_tests,1254.0,,9.0,,254.0,41.0,739.0,13535.0,0.0,4833.0,...,18736.0,1119.0,312.0,39222.0,1041.0,10910.0,10933.0,31900.0,12898.0,19.0
d_mob_change,22.0,-1.0,1.0,-1.0,1.0,6.0,21.0,24.0,5.0,16.0,...,26.0,18.0,14.0,22.0,16.0,23.0,18.0,18.0,24.0,0.0


In [252]:
# All test records of the chosen state, transposed so that dates run across the columns.
is_test_state = d_tests_us_states_df.index.get_level_values('state') == test_state
d_tests_us_states_df.loc[is_test_state].T


date,3/4/2020,3/5/2020,3/6/2020,3/7/2020,3/8/2020,3/9/2020,3/10/2020,3/11/2020,3/12/2020,3/13/2020,...,4/5/2020,4/6/2020,4/7/2020,4/8/2020,4/9/2020,4/10/2020,4/11/2020,4/12/2020,4/13/2020,4/14/2020
state,California,California,California,California,California,California,California,California,California,California,...,California,California,California,California,California,California,California,California,California,California
total,515.0,515.0,522.0,531.0,550.0,804.0,823.0,1073.0,1118.0,1118.0,...,131533.0,132431.0,145329.0,158864.0,177600.0,178763.0,185276.0,203528.0,204082.0,215408.0
d_tests,,0.0,7.0,9.0,19.0,254.0,19.0,250.0,45.0,0.0,...,4833.0,898.0,12898.0,13535.0,18736.0,1163.0,6513.0,18252.0,554.0,11326.0
total_tests_div_pop,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.4e-05,2e-05,2.1e-05,2.7e-05,2.8e-05,2.8e-05,...,0.003329,0.003352,0.003678,0.004021,0.004495,0.004524,0.004689,0.005151,0.005165,0.005452


In [253]:
# Number of distinct d_tests values recorded on one date (NaN counts as one value).
# Expected to be at most the number of states, because coordinates of the same
# state repeat that state's value.
test_date = '4/5/2020'
idx = pd.IndexSlice
new_hopkins_conf.loc[idx[:, 'd_tests'], test_date].nunique(dropna=False)


55

In [254]:
new_hopkins_conf[new_hopkins_conf['Country_Region']=='US']

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(13.4443, 144.7937)",data,Guam,US,35712.56214,94.780,31.4,7.96,41.4,165718.0,650.0,0.003922,...,141.0,141.0,141.0,145.0,145.0,145.0,145.0,145.0,145.0,149.0
"(18.2208, -66.5901)",data,Puerto Rico,US,31651.34815,93.578,38.2,7.96,41.4,3193694.0,5507.0,0.001724,...,1389.0,1400.0,1433.0,1539.0,1575.0,1757.0,1808.0,1843.0,1924.0,1968.0
"(18.3358, -64.8963)",data,Virgin Islands,US,35938.00000,95.721,42.2,7.96,41.4,104914.0,285.0,0.002717,...,57.0,57.0,57.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0
"(19.60121157, -155.5210167)",data,Hawaii,US,64096.00000,91.900,38.9,7.96,41.4,1415872.0,13542.0,0.009564,...,70.0,70.0,70.0,73.0,73.0,73.0,74.0,75.0,74.0,74.0
"(20.86399628, -156.56890969999995)",data,Hawaii,US,64096.00000,91.900,38.9,7.96,41.4,1415872.0,13542.0,0.009564,...,113.0,115.0,115.0,116.0,116.0,117.0,116.0,116.0,116.0,116.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(58.45031811, -134.200436)",data,Alaska,US,73205.00000,66.000,34.0,7.96,41.4,731545.0,6913.0,0.009450,...,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
"(60.24429722, -151.53888840000005)",data,Alaska,US,73205.00000,66.000,34.0,7.96,41.4,731545.0,6913.0,0.009450,...,19.0,19.0,19.0,19.0,21.0,21.0,22.0,22.0,22.0,22.0
"(61.14998174, -149.14269860000005)",data,Alaska,US,73205.00000,66.000,34.0,7.96,41.4,731545.0,6913.0,0.009450,...,171.0,175.0,179.0,179.0,185.0,185.0,187.0,189.0,190.0,190.0
"(62.31305045, -149.5741743)",data,Alaska,US,73205.00000,66.000,34.0,7.96,41.4,731545.0,6913.0,0.009450,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0


## _Mobility tests_

In [255]:
def test_d_mob_states(df, mob_data):
    mob_data_dates = mob_data.index.get_level_values(level=0)
    common_dates = set(mob_data_dates).intersection(set(df.columns))
    printed = []
    dict = {}
    coords = set(df.index.get_level_values(level=0).tolist())
    for coord in coords:
        if df.loc[coord].loc['data']['Country_Region']=='US':
            state = df.loc[coord].loc['data']['Province_State']
            if not (state in printed):
                printed.append(state)
#                 print(state)
#                 print(coord)
#                 print(df.loc[coord].loc['d_tests'][common_dates].tolist())
                dict[state] = pd.DataFrame(df.loc[coord][common_dates])
    return dict


def test_d_mob_countries(df, mob_data):
    mob_data_dates = mob_data.index.get_level_values(level=0)
    common_dates = set(mob_data_dates).intersection(set(df.columns))
    printed = []
    dict = {}
    coords = set(df.index.get_level_values(level=0).tolist())
    for coord in coords:
        country = df.loc[coord].loc['data']['Country_Region']
        if not (country in printed):
            printed.append(country)
#                 print(state)
#                 print(coord)
#                 print(df.loc[coord].loc['d_tests'][common_dates].tolist())
            dict[country] = pd.DataFrame(df.loc[coord][common_dates])
    return dict


mob_dict_states = test_d_mob_states(new_hopkins_conf, community_mobility_reports)
mob_dict_countries = test_d_mob_countries(new_hopkins_conf, community_mobility_reports)

In [256]:
# Only the value of the last expression is rendered as the cell output; the first
# two lookups are evaluated and their results discarded.
mob_dict_states['California'].loc['d_mob_change']
mob_dict_states['Missouri']
# Distinct mobility-change values of the coordinate collected for Canada.
mob_dict_countries['Canada'].loc['d_mob_change'].unique()

array([11., 14.,  0., 20.,  1., 23., 27., 10.,  5., 22., 24., 17., 21.,
       33., 25., 26., 16., 13., -1.,  4., -2.])

## _Writing results to pickle file_

In [263]:
# Serialize the augmented frame to RESULT_PATH; the file is opened in binary mode
# as pickle requires, and closed by the with-block.
with open(RESULT_PATH, 'wb') as file:
    pickle.dump(new_hopkins_conf, file)

## _Checking that pickle was written correctly and loads correctly_

In [264]:
# Read the pickle written above back into a fresh frame for the round-trip checks.
with open(RESULT_PATH, 'rb') as file:
    test = pd.read_pickle(file)


In [265]:
test.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,Median Age,Democracy,Gini Index,State Population,Total Tests,Tests \ Pop,...,4/27/2020,4/28/2020,4/29/2020,4/30/2020,5/1/2020,5/2/2020,5/3/2020,5/4/2020,5/5/2020,5/6/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-41.4545, 145.9707)",data,Tasmania,Australia,57373.68668,86.012,37.900002,9.09,34.4,,,,...,214.0,218.0,219.0,221.0,221.0,221.0,221.0,221.0,225.0,226.0
"(-41.4545, 145.9707)",avg_d_RH,,,,,,,,,,,...,79.0,87.0,95.0,91.0,90.0,62.0,66.0,74.0,88.0,86.0
"(-41.4545, 145.9707)",avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(-41.4545, 145.9707)",avg_d_tmp,,,,,,,,,,,...,11.1,11.7,11.6,9.7,8.2,10.3,7.8,9.1,8.4,11.9
"(-41.4545, 145.9707)",avg_d_wind,,,,,,,,,,,...,25.7,20.8,12.3,12.9,12.1,25.2,10.6,4.0,12.7,19.0
"(-41.4545, 145.9707)",d_tests,,,,,,,,,,,...,,,,,,,,,,
"(-41.4545, 145.9707)",d_mob_change,,,,,,,,,,,...,17.0,16.0,11.0,18.0,18.0,13.0,11.0,17.0,16.0,16.0


In [266]:
# Same distinct-value count as before, now on the reloaded frame (NaN counts as one value).
test.loc[idx[:, 'd_tests'], test_date].nunique(dropna=False)

55

In [267]:
list(test.columns)[:10]

['Province_State',
 'Country_Region',
 'GDP',
 'Urbanization',
 'Median Age',
 'Democracy',
 'Gini Index',
 'State Population',
 'Total Tests',
 'Tests \\ Pop']