## add gdp and urbanization data notebook

Adds GDP (in billion USD) and Urbanization (as a percentage of the population) columns to the given Tableau and Hopkins augmented datasets.  
Contact ShuliFinley@gmail.com for questions :)

In [4]:
import pandas as pd
import numpy as np

Import tableau and hopkins data

In [96]:
# Tableau tables (confirmed cases, deaths), read in that order.
tableau_conf_data, tableau_death_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../augmented_datasets/tableau_conf_data.csv',
        '../augmented_datasets/tableau_death_data.csv',
    )
)

# Hopkins tables (confirmed cases, deaths), read in that order.
hopkins_conf_data, hopkins_death_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../augmented_datasets/hopkins_conf.csv',
        '../augmented_datasets/hopkins_death.csv',
    )
)

# hopkins_conf_data.head()

Import gdp per country and per state data

In [20]:
# GDP lookup tables. Both files are read first, then their columns are renamed
# to the names the rest of the notebook uses.
#   country source: http://wdi.worldbank.org/table/4.2# (2018)
#   state source:   https://www.statista.com/statistics/248023/us-gross-domestic-product-gdp-by-state/
gdp_country = pd.read_csv(filepath_or_buffer='../external_datasets/MDI_GDP_1.csv')
gdp_state = pd.read_csv(filepath_or_buffer='../external_datasets/GDP_states_data.csv')

gdp_country.columns = ['Country_Region', 'GDP']
gdp_state.columns = ['Province_State', 'GDP']

Import urbanization per country and per state data

In [21]:
# Urbanization lookup tables. Both files are read first, then their columns are
# renamed to the names the rest of the notebook uses.
#   country source: http://wdi.worldbank.org/table/4.2# (2018)
#     NOTE(review): this is the same URL cited for the GDP table -- confirm it
#     is the right source for the urbanization figures.
#   state source:   US Census Bureau (the most recent data is from 2010)
urban_country = pd.read_csv(filepath_or_buffer='../external_datasets/urbanization_data.csv')
urban_state = pd.read_csv(filepath_or_buffer='../external_datasets/urbanization_states.csv')

urban_country.columns = ['Country_Region', 'Urbanization']
urban_state.columns = ['Province_State', 'Urbanization']


data preprocessing functions

In [22]:

def standardize_original_names(df):
    """Unify alternate place-name spellings in one of the original tables.

    Every cell of ``df`` that exactly equals one of the spellings on the left
    is replaced by the spelling on the right. The result of
    ``DataFrame.replace`` is returned; ``df`` itself is not edited in place.
    """
    canonical_spelling = {
        'Cape Verde': 'Cabo Verde',
        'Timor-Leste': 'East Timor',
        'Timor Leste': 'East Timor',
        'Taiwan*': 'Taiwan',
    }
    return df.replace(canonical_spelling)

# tableau_conf_data = standardize_original_names(tableau_conf_data)
# tableau_death_data = standardize_original_names(tableau_death_data)
# hopkins_conf_data = standardize_original_names(hopkins_conf_data)
# hopkins_death_data = standardize_original_names(hopkins_death_data)


In [23]:
# For each external lookup table: the name of the column it gets indexed by.
index_dict = dict(
    gdp_country='Country_Region',
    gdp_state='Province_State',
    urban_country='Country_Region',
    urban_state='Province_State',
)

    
# # setting indices for more convenient access in lambda funtion in add_gdp_urban function
# gdp_country = gdp_country.set_index(index_dict['gdp_country'])
# gdp_state = gdp_state.set_index(index_dict['gdp_state'])
# urban_country = urban_country.set_index(index_dict['urban_country'])
# urban_state = urban_state.set_index(index_dict['urban_state'])

In [24]:
def standardize_external_names(external_df):
    """Rename countries in an external lookup table to the original tables' spelling.

    Every cell of ``external_df`` that exactly equals one of the names on the
    left is replaced by the name on the right. The result of
    ``DataFrame.replace`` is returned; ``external_df`` is not edited in place.

    Both the hyphenated and the unhyphenated spelling of Timor-Leste are
    handled, matching ``standardize_original_names``, so that either one ends
    up as 'East Timor'.
    """
    # 'Cabo Verde' needs no entry: standardize_original_names converts the
    # original tables' 'Cape Verde' to 'Cabo Verde' instead.
    external_df = external_df.replace({
        'United States': 'US',
        'Iran, Islamic Rep.': 'Iran',
        'Congo, Rep.': 'Congo (Brazzaville)',
        'Congo, Dem. Rep.': 'Congo (Kinshasa)',
        'St. Lucia': 'Saint Lucia',
        'Czech Republic': 'Czechia',
        'Kyrgyz Republic': 'Kyrgyzstan',
        'Egypt, Arab Rep.': 'Egypt',
        'Syrian Arab Republic': 'Syria',
        'Brunei Darussalam': 'Brunei',
        'Slovak Republic': 'Slovakia',
        'Korea, Rep.': 'Korea, South',
        'Timor-Leste': 'East Timor',
        'Timor Leste': 'East Timor',
        'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
        })
    return external_df

# gdp_country = standardize_external_names(gdp_country)
# urban_country = standardize_external_names(urban_country)

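Main step: `add_gdp_urban` attaches GDP and urbanization values to every row, preferring state-level values and falling back to country-level values (NaN when neither is available).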

In [73]:
def add_gdp_urban(original, gdp_country, gdp_state, urban_country, urban_state):
    """Return ``original`` with ``GDP`` and ``Urbanization`` columns added.

    For each row the value comes from the state-level table when the row's
    ``Province_State`` is listed there; otherwise from the country-level table
    when the row's ``Country_Region`` is listed there; otherwise it is NaN.
    The two new columns are placed directly after ``Province_State``
    (``GDP`` first, then ``Urbanization``).

    Parameters
    ----------
    original : pandas.DataFrame
        Table to augment; must have ``Province_State`` and ``Country_Region``
        columns. Place names are normalized with
        ``standardize_original_names`` before the lookup.
    gdp_country, urban_country : pandas.DataFrame
        Country-level lookup tables holding a ``GDP`` / ``Urbanization``
        column respectively, keyed by the column named in ``index_dict``.
        Country names are normalized with ``standardize_external_names``.
    gdp_state, urban_state : pandas.DataFrame
        State-level lookup tables holding a ``GDP`` / ``Urbanization`` column
        respectively, keyed by the column named in ``index_dict``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalized names and the two extra columns.

    Raises
    ------
    KeyError
        If a required column is missing from ``original`` or a lookup table.
    ValueError
        If ``original`` has no ``Province_State`` column to insert after.
    """
    original = standardize_original_names(original)

    gdp_country = standardize_external_names(gdp_country)
    urban_country = standardize_external_names(urban_country)

    def lookup(state_table, country_table, state_key, country_key, value_column):
        """State-level value where the state is listed, else country-level, else NaN."""
        by_state = state_table.set_index(state_key)[value_column]
        by_country = country_table.set_index(country_key)[value_column]
        # Series.map needs unique labels; if a name is repeated keep its first row.
        by_state = by_state[~by_state.index.duplicated()]
        by_country = by_country[~by_country.index.duplicated()]

        state_names = original['Province_State']
        # A listed state wins even if its stored value is missing, so decide by
        # membership rather than by whether the mapped value is NaN.
        state_is_listed = state_names.isin(by_state.index)
        country_values = original['Country_Region'].map(by_country)  # NaN if unlisted
        return state_names.map(by_state).where(state_is_listed, country_values)

    augmented = original.assign(
        GDP=lookup(gdp_state, gdp_country,
                   index_dict['gdp_state'], index_dict['gdp_country'], 'GDP'),
        Urbanization=lookup(urban_state, urban_country,
                            index_dict['urban_state'], index_dict['urban_country'],
                            'Urbanization'),
    )

    # Put GDP and Urbanization right after Province_State. Building the order by
    # name (not by position) keeps this correct even if the input already had
    # one of the two columns.
    added = ['GDP', 'Urbanization']
    kept = [name for name in augmented.columns if name not in added]
    split_at = kept.index('Province_State') + 1
    return augmented[kept[:split_at] + added + kept[split_at:]]

In [88]:

# pd.set_option('display.max_columns', None)

# Add the GDP and Urbanization columns to the Tableau confirmed-cases table.
new_tableau_conf = add_gdp_urban(
    original=tableau_conf_data,
    gdp_country=gdp_country,
    gdp_state=gdp_state,
    urban_country=urban_country,
    urban_state=urban_state,
)
# new_tableau_conf.head(5)


In [89]:
# Write new_tableau_conf to disk as CSV.
new_tableau_conf.to_csv(path_or_buf='../augmented_datasets/tableau_conf_augmented_gdp_urban.csv')


In [90]:
# Add the GDP and Urbanization columns to the Tableau deaths table.
new_tableau_death = add_gdp_urban(
    original=tableau_death_data,
    gdp_country=gdp_country,
    gdp_state=gdp_state,
    urban_country=urban_country,
    urban_state=urban_state,
)


In [91]:
# Write new_tableau_death to disk as CSV.
new_tableau_death.to_csv(path_or_buf='../augmented_datasets/tableau_death_augmented_gdp_urban.csv')


In [92]:
# Add the GDP and Urbanization columns to the Hopkins confirmed-cases table.
new_hopkins_conf = add_gdp_urban(
    original=hopkins_conf_data,
    gdp_country=gdp_country,
    gdp_state=gdp_state,
    urban_country=urban_country,
    urban_state=urban_state,
)


In [93]:
# Write new_hopkins_conf to disk as CSV.
new_hopkins_conf.to_csv(path_or_buf='../augmented_datasets/hopkins_conf_augmented_gdp_urban.csv')


In [94]:
# Add the GDP and Urbanization columns to the Hopkins deaths table.
# The input must be hopkins_death_data: passing hopkins_conf_data here would
# make the "death" output a duplicate of the confirmed-cases output.
new_hopkins_death = add_gdp_urban(hopkins_death_data, gdp_country, gdp_state, urban_country, urban_state)


In [95]:
# Write new_hopkins_death to disk as CSV.
new_hopkins_death.to_csv(path_or_buf='../augmented_datasets/hopkins_death_augmented_gdp_urban.csv')
