In [1]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)

In [2]:
def us_date(x):
    month = x[5:7]
    day = x[8:11]
    year = x[0:4]
    conversion = month + "/" + day +"/"+ year
    return conversion

def removeDecimal(data):
    strData = str(data)
    decimalLocation = strData.find(".")
    if decimalLocation > -1:
        return strData[0:decimalLocation]
    else:
        return strData

def emptyNan(value):
    if (value == "nan"):
        return ""
    else:
        return value

def printColumns(df, label):
    print(label)
    print(df.columns)

In [38]:
census_regions = [
    {
        "number" : 0,
        "name" : "United States",
        "states" : ["United States"]
    },
    {
        "number" : 1,
        "name" : "Northeast",
        "states" :["Maine", "New Hampshire", "Vermont", "Massachusetts", "Connecticut", "Rhode Island", "New Jersey", "New York", "Pennsylvania"]
    },
    {
        "number" : 3,
        "name" : "South",
        "states" : ["Maryland", "Delaware", "West Virginia", "Virginia", "Kentucky", "Tennessee", "North Carolina", "South Carolina", "Georgia", "Florida", "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas", "District of Columbia", "Puerto Rico"]
    },
    {
        "number" : 2,
        "name" : "Midwest",
        "states" : ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", "Minnesota", "Wisconsin", "Illinois", "Michigan", "Indiana", "Ohio"]
    },
    {
        "number" : 4,
        "name" : "West",
        "states" : ["Washington", "Idaho", "Montana", "Wyoming", "Oregon", "California", "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]
    }
]
print("Census Regions")
for census_region in census_regions:
    pp.pprint(census_region)
configured_regions = ['South Asia',
                      'Sub-Saharan Africa']
sub_saharan_african_countries = [
    "Angola",
    "Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Congo - Brazzaville","Congo - Kinshasa","Cote d'Ivoire",
    "Democratic Republic of Congo",
    "Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau",
    "Kenya",
    "Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria",
    "Republic of the Congo","Rwanda",
    "Sao Tome and Principe","Senegal","Seychelles","Sierra Leone","Somalia","South Africa","South Sudan","Sudan","Swaziland ",
    "Tanzania","Togo",
    "Uganda",
    "Zambia","Zimbabwe"
]
south_asia_countries = [
    "Afghanistan",
    "Bangladesh",
    "Bhutan",
    "India",
    "Maldives",
    "Nepal",
    "Pakistan",
    "Sri Lanka"
]
configured_countries = sub_saharan_african_countries + south_asia_countries
configured_countries.sort()
print("Configured Countries")
print(configured_countries)

Census Regions
{'name': 'United States', 'number': 0, 'states': ['United States']}
{   'name': 'Northeast',
    'number': 1,
    'states': [   'Maine',
                  'New Hampshire',
                  'Vermont',
                  'Massachusetts',
                  'Connecticut',
                  'Rhode Island',
                  'New Jersey',
                  'New York',
                  'Pennsylvania']}
{   'name': 'South',
    'number': 3,
    'states': [   'Maryland',
                  'Delaware',
                  'West Virginia',
                  'Virginia',
                  'Kentucky',
                  'Tennessee',
                  'North Carolina',
                  'South Carolina',
                  'Georgia',
                  'Florida',
                  'Alabama',
                  'Mississippi',
                  'Arkansas',
                  'Louisiana',
                  'Oklahoma',
                  'Texas',
                  'District of Columbia',
         

Download US Data

In [39]:
us_states_census_demographics = "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/asrh/sc-est2019-agesex-civ.csv"
us_states_census_demographics_request = requests.get(us_states_census_demographics).content
us_demographics = pd.read_csv(io.StringIO(us_states_census_demographics_request.decode('utf-8')))
currentTime = datetime.now()
us_demographics["Downloaded"] = currentTime
us_demographics["Country"] = "United States"

def fixRegion(code):
    region_name = ""
    for region in census_regions:
        if region["number"] == code:
            region_name = region["name"]
            break
    if region_name == "":
        region_name = "Other"
        print(str(code) + " not found")
    return region_name

us_demographics["REGION"] = us_demographics["REGION"].apply(lambda x: fixRegion(x))

def fixSex(code):
    sex = ""
    if code == 0:
        sex = "Population 2019"
    elif code == 1:
        sex = "Male"
    elif code == 2:
        sex = "Female"
    else:
        print(str(code) + " is not a sex")
    return sex

us_demographics["SEX"] = us_demographics["SEX"].apply(lambda x: fixSex(x))

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
print(us_demographics["AGE"].sort_values(ascending = True).unique())
def getAgeRange(age):
    age_range = ""
    if age == 0:
        age_range = "< 1"
    elif age == 999:
        age_range = "Total"
    elif age < 5:
        age_range = "1-4"
    elif age < 15:
        age_range = "5-14"
    elif age < 25:
        age_range = "15-24"
    elif age < 35:
        age_range = "25-34"
    elif age < 45:
        age_range = "35-44"
    elif age < 55:
        age_range = "45-54"
    elif age < 65:
        age_range = "55-64"
    elif age < 75:
        age_range = "65-74"
    elif age < 85:
        age_range = "75-84"
    else:
        age_range = "85+"
    return age_range

us_demographics["Age Range"] = us_demographics["AGE"].apply(lambda x: getAgeRange(x))

keep_columns = ["REGION","STATE","NAME","SEX","AGE","POPEST2019_CIV","Downloaded","Country", "Age Range"]
us_demographics = us_demographics[keep_columns]
us_demographics.rename(columns = {'REGION': 'Census Region',
                                  'NAME' : 'State Name',
                                  'STATE' : 'FIPS',
                                  'POPEST2019_CIV' : 'Population 2019',
                                  'SEX' : 'Sex',
                                  'AGE' : 'Age'}, 
                       inplace = True)

us_demographics.head()

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85 999]


Unnamed: 0,Census Region,FIPS,State Name,Sex,Age,Population 2019,Downloaded,Country,Age Range
0,United States,0,United States,Population 2019,0,3783052,2020-10-11 15:01:51.347566,United States,< 1
1,United States,0,United States,Population 2019,1,3829599,2020-10-11 15:01:51.347566,United States,1-4
2,United States,0,United States,Population 2019,2,3922044,2020-10-11 15:01:51.347566,United States,1-4
3,United States,0,United States,Population 2019,3,3998665,2020-10-11 15:01:51.347566,United States,1-4
4,United States,0,United States,Population 2019,4,4043323,2020-10-11 15:01:51.347566,United States,1-4


In [65]:
us_sex = us_demographics[["Census Region","FIPS","Sex","Population 2019"]].copy()
us_sex = us_sex.pivot_table(index=["Census Region","FIPS"], 
                      columns='Sex', 
                      values='Population 2019', 
                      aggfunc='first').reset_index().rename_axis(None, axis=1)
us_sex["Pct Male"] = us_sex["Male"]/us_sex["Population 2019"]
us_sex = us_sex.sort_values(["FIPS"])
us_sex.head()

Unnamed: 0,Census Region,FIPS,Female,Male,Population 2019,Pct Male
38,United States,0,1847935,1935117,3783052,0.511523
21,South,1,27821,29080,56901,0.511063
39,West,2,4881,5097,9978,0.510824
40,West,4,40021,41908,81929,0.511516
22,South,5,17744,18611,36355,0.511924


In [74]:
us_age = us_demographics[["Census Region","FIPS","State Name","Age Range","Population 2019"]].copy()
us_age = us_age.groupby(["Census Region","FIPS","State Name","Age Range"],as_index=False)["Population 2019"].sum()
us_age = us_age.pivot_table(index=["Census Region","FIPS","State Name"], 
                      columns='Age Range', 
                      values='Population 2019', 
                      aggfunc='sum').reset_index().rename_axis(None, axis=1)
us_age["Pct < 1"] = us_age["< 1"]/us_age["Total"]
us_age["Pct 1-4"] = us_age["1-4"]/us_age["Total"]
us_age["Pct 15-24"] = us_age["15-24"]/us_age["Total"]
us_age["Pct 25-34"] = us_age["25-34"]/us_age["Total"]
us_age["Pct 35-44"] = us_age["35-44"]/us_age["Total"]
us_age["Pct 45-54"] = us_age["45-54"]/us_age["Total"]
us_age["Pct 55-64"] = us_age["55-64"]/us_age["Total"]
us_age["Pct 65-74"] = us_age["65-74"]/us_age["Total"]
us_age["Pct 75-84"] = us_age["75-84"]/us_age["Total"]
us_age["Pct 85+"] = us_age["85+"]/us_age["Total"]
us_age = us_age.drop(columns=["Census Region","State Name","Total"])
us_age = us_age.sort_values(["FIPS"])
us_age.head()

Unnamed: 0,FIPS,1-4,15-24,25-34,35-44,45-54,5-14,55-64,65-74,75-84,85+,< 1,Pct < 1,Pct 1-4,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
38,0,31587262,84500410,90964550,82860364,81633238,81988326,84888424,62966866,31939744,13209916,7566104,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195
21,1,474912,1263796,1284374,1179560,1230558,1216932,1315086,1002894,513694,183086,113802,0.011638,0.048566,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723
39,2,82204,178546,218862,184610,168958,201324,186592,122600,46214,14362,19956,0.014012,0.057718,0.125363,0.153671,0.129621,0.118631,0.131013,0.086082,0.032448,0.010084
40,4,695718,1932510,1998526,1784670,1702668,1861490,1761474,1503398,822394,291474,163858,0.011286,0.04792,0.13311,0.137657,0.122927,0.117278,0.121329,0.103553,0.056646,0.020076
22,5,304218,792448,789028,738194,722378,786426,771918,605658,322282,119824,72710,0.012068,0.050492,0.131525,0.130957,0.12252,0.119895,0.128117,0.100523,0.05349,0.019888


In [75]:
state_codes = pd.read_excel(r'C:\Users\janin\Downloads\US State Codes.xlsx')
state_codes = state_codes.drop(columns=["State Name"])
state_codes.head()

Unnamed: 0,FIPS,State Abbreviation,Status
0,0,US,0
1,1,AL,0
2,2,AK,0
3,4,AZ,0
4,5,AR,0


In [76]:
us_state_demographics = pd.merge(us_sex, us_age, how="left", on="FIPS")
us_state_demographics = pd.merge(us_state_demographics, state_codes, how="left", on="FIPS")
print(us_state_demographics.columns)
us_state_demographics = us_state_demographics.drop(columns=["FIPS", "Status"])
demographics_order = [
    'State Abbreviation', 'Census Region',
    'Female', 'Male', 'Population 2019', 'Pct Male', 
    '< 1', '1-4', '15-24', '25-34', '35-44', '45-54', '5-14', '55-64', '65-74', '75-84', '85+',
    'Pct < 1', 'Pct 1-4', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+'
]
us_state_demographics = us_state_demographics[demographics_order]
us_state_demographics = us_state_demographics.sort_values(["State Abbreviation"])
print(us_state_demographics.loc[us_state_demographics["Census Region"] == "United States"])
us_state_demographics.head()

Index(['Census Region', 'FIPS', 'Female', 'Male', 'Population 2019',
       'Pct Male', '1-4', '15-24', '25-34', '35-44', '45-54', '5-14', '55-64',
       '65-74', '75-84', '85+', '< 1', 'Pct < 1', 'Pct 1-4', 'Pct 15-24',
       'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64', 'Pct 65-74',
       'Pct 75-84', 'Pct 85+', 'State Abbreviation', 'Status'],
      dtype='object')
  State Abbreviation  Census Region   Female     Male  Population 2019  \
0                 US  United States  1847935  1935117          3783052   

   Pct Male      < 1       1-4     15-24     25-34     35-44     45-54  \
0  0.511523  7566104  31587262  84500410  90964550  82860364  81633238   

       5-14     55-64     65-74     75-84       85+   Pct < 1   Pct 1-4  \
0  81988326  84888424  62966866  31939744  13209916  0.011567  0.048291   

   Pct 15-24  Pct 25-34  Pct 35-44  Pct 45-54  Pct 55-64  Pct 65-74  \
0   0.129185   0.139067   0.126677   0.124801   0.129778   0.096264   

   Pct 75-84   Pct 85+  
0  

Unnamed: 0,State Abbreviation,Census Region,Female,Male,Population 2019,Pct Male,< 1,1-4,15-24,25-34,35-44,45-54,5-14,55-64,65-74,75-84,85+,Pct < 1,Pct 1-4,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
2,AK,West,4881,5097,9978,0.510824,19956,82204,178546,218862,184610,168958,201324,186592,122600,46214,14362,0.014012,0.057718,0.125363,0.153671,0.129621,0.118631,0.131013,0.086082,0.032448,0.010084
1,AL,South,27821,29080,56901,0.511063,113802,474912,1263796,1284374,1179560,1230558,1216932,1315086,1002894,513694,183086,0.011638,0.048566,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723
4,AR,South,17744,18611,36355,0.511924,72710,304218,792448,789028,738194,722378,786426,771918,605658,322282,119824,0.012068,0.050492,0.131525,0.130957,0.12252,0.119895,0.128117,0.100523,0.05349,0.019888
3,AZ,West,40021,41908,81929,0.511516,163858,695718,1932510,1998526,1784670,1702668,1861490,1761474,1503398,822394,291474,0.011286,0.04792,0.13311,0.137657,0.122927,0.117278,0.121329,0.103553,0.056646,0.020076
5,CA,West,225948,236641,462589,0.511558,925178,3842254,10226804,11987212,10515252,9950666,10015974,9572712,6773340,3403198,1499692,0.011754,0.048814,0.129926,0.152292,0.133591,0.126418,0.121616,0.086052,0.043236,0.019053


In [77]:
state_codes = pd.read_excel(r'C:\Users\janin\Downloads\US State Codes.xlsx')
state_codes.head()

Unnamed: 0,State Name,FIPS,State Abbreviation,Status
0,United States,0,US,0
1,Alabama,1,AL,0
2,Alaska,2,AK,0
3,Arizona,4,AZ,0
4,Arkansas,5,AR,0


In [78]:
us_states_url = "https://covidtracking.com/data/download/all-states-history.csv"
us_states_request = requests.get(us_states_url).content
states=pd.read_csv(io.StringIO(us_states_request.decode('utf-8')))
currentTime = datetime.now()
states["Downloaded"] = currentTime
states["Country"] = "United States"
printColumns(states, "Pre Rename Columns")
states = states.drop(
    columns = [
        'deathConfirmed', 'deathProbable',
        'hospitalized',
        'negativeTestsAntibody', 'negativeTestsPeopleAntibody', 'negativeTestsViral',
        'pending',
        'positiveScore', 'positiveTestsAntibody', 'positiveTestsAntigen',
        'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
        'positiveTestsViral', 'positiveCasesViral',
        'totalTestEncountersViral', 'totalTestEncountersViralIncrease',
        'totalTestsAntibody', 'totalTestsAntigen',
        'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
        'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
        'totalTestsViral', 'totalTestsViralIncrease'
    ])
states.rename(
    columns = {
        'date': 'Time', 'state' : 'State Abbreviation', 'dataQualityGrade': 'Data Quality',
        'totalTestResults' : 'Total Tests', 'totalTestResultsIncrease' : 'Tests Daily',
        'negative' : 'Total Negative', 'negativeIncrease' : 'Negative Daily',
        'positive' : 'Total Positive', 'positiveIncrease' : 'Positive Daily',
        'recovered' : 'Total Recovered',
        'death' : 'Total Deaths', 'deathIncrease' : 'Deaths Daily',
        'hospitalizedCumulative' : 'Total Hospitalized', 'hospitalizedIncrease' : 'Hospitalized Daily', 'hospitalizedCurrently' : 'Currently Hospitalized',
        'inIcuCumulative' : 'Total In ICU', 'inIcuCurrently' : 'Currently In ICU',
        'onVentilatorCumulative' : 'Total On Ventilator', 'onVentilatorCurrently' : 'Currently On Ventilator'
    }, inplace = True)
states["Time"] = states["Time"].astype(str)
states["Time"] = states["Time"].apply(lambda x: us_date(x))
states["Date"] = pd.to_datetime(states["Time"], format="%m/%d/%Y")
printColumns(states, "Post Rename Columns")
states_input = pd.merge(states, state_codes, how="left", on="State Abbreviation")
merge_order = [
    'Time', 'Date', 'State Abbreviation', 'State Name', 'Country', 'FIPS', 'Status', 'Data Quality', 
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily', 
    'Total Negative', 'Negative Daily', 'Total Positive', 'Positive Daily',    
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
    'Total In ICU', 'Currently In ICU',
    'Total On Ventilator', 'Currently On Ventilator', 
    'Downloaded' 
]
states_input = states_input[merge_order]
printColumns(states_input, "States Input Merge Columns")
states_input.head()

Pre Rename Columns
Index(['date', 'state', 'dataQualityGrade', 'death', 'deathConfirmed',
       'deathIncrease', 'deathProbable', 'hospitalized',
       'hospitalizedCumulative', 'hospitalizedCurrently',
       'hospitalizedIncrease', 'inIcuCumulative', 'inIcuCurrently', 'negative',
       'negativeIncrease', 'negativeTestsAntibody',
       'negativeTestsPeopleAntibody', 'negativeTestsViral',
       'onVentilatorCumulative', 'onVentilatorCurrently', 'pending',
       'positive', 'positiveCasesViral', 'positiveIncrease', 'positiveScore',
       'positiveTestsAntibody', 'positiveTestsAntigen',
       'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
       'positiveTestsViral', 'recovered', 'totalTestEncountersViral',
       'totalTestEncountersViralIncrease', 'totalTestResults',
       'totalTestResultsIncrease', 'totalTestsAntibody', 'totalTestsAntigen',
       'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
       'totalTestsPeopleViral', 'totalTestsPeopleViralIncre

Unnamed: 0,Time,Date,State Abbreviation,State Name,Country,FIPS,Status,Data Quality,Total Deaths,Deaths Daily,Total Recovered,Total Tests,Tests Daily,Total Negative,Negative Daily,Total Positive,Positive Daily,Total Hospitalized,Currently Hospitalized,Hospitalized Daily,Total In ICU,Currently In ICU,Total On Ventilator,Currently On Ventilator,Downloaded
0,10/11/2020,2020-10-11,AK,Alaska,United States,2,0,A,60.0,0,5789.0,502997.0,3744,492311.0,3486,10686.0,258,,57.0,0,,,,7.0,2020-10-11 16:49:53.692145
1,10/11/2020,2020-10-11,AL,Alabama,United States,1,0,A,2664.0,0,71240.0,1210007.0,5687,1064417.0,4998,165342.0,816,18179.0,812.0,0,1890.0,,1067.0,,2020-10-11 16:49:53.692145
2,10/11/2020,2020-10-11,AR,Arkansas,United States,5,0,A+,1569.0,17,83454.0,1146654.0,8938,1058352.0,8384,92833.0,613,5910.0,566.0,10,,237.0,731.0,97.0,2020-10-11 16:49:53.692145
3,10/11/2020,2020-10-11,AS,American Samoa,United States,60,1,D,0.0,0,,1616.0,0,1616.0,0,0.0,0,,,0,,,,,2020-10-11 16:49:53.692145
4,10/11/2020,2020-10-11,AZ,Arizona,United States,4,0,A+,5759.0,0,37110.0,1566165.0,10150,1345494.0,9565,225575.0,597,20253.0,630.0,24,,144.0,,79.0,2020-10-11 16:49:53.692145


In [89]:
us_summary_cols = [
    'Time', 'Date', 'Country',
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily', 'Total Negative', 'Negative Daily', 'Total Positive', 'Positive Daily',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily', 
    'Total In ICU', 'Currently In ICU', 'Total On Ventilator', 'Currently On Ventilator'
]
us_stats = states_input[us_summary_cols].groupby(['Time','Date','Country']).sum().reset_index()
us_stats["FIPS"] = 0
us_stats["State Abbreviation"] = "US"
us_stats["State Name"] = "United States" 
us_stats["Status"] = 0
us_stats["Downloaded"] = currentTime
us_stats["Data Quality"] = ""
us_stats = us_stats[merge_order]
us_stats.head()

Unnamed: 0,Time,Date,State Abbreviation,State Name,Country,FIPS,Status,Data Quality,Total Deaths,Deaths Daily,Total Recovered,Total Tests,Tests Daily,Total Negative,Negative Daily,Total Positive,Positive Daily,Total Hospitalized,Currently Hospitalized,Hospitalized Daily,Total In ICU,Currently In ICU,Total On Ventilator,Currently On Ventilator,Downloaded
0,01/22/2020,2020-01-22,US,United States,United States,0,0,,0.0,0,0.0,1.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,2020-10-11 16:49:53.692145
1,01/23/2020,2020-01-23,US,United States,United States,0,0,,0.0,0,0.0,2.0,1,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,2020-10-11 16:49:53.692145
2,01/24/2020,2020-01-24,US,United States,United States,0,0,,0.0,0,0.0,2.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,2020-10-11 16:49:53.692145
3,01/25/2020,2020-01-25,US,United States,United States,0,0,,0.0,0,0.0,2.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,2020-10-11 16:49:53.692145
4,01/26/2020,2020-01-26,US,United States,United States,0,0,,0.0,0,0.0,2.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,2020-10-11 16:49:53.692145


In [92]:
states_input_w_US = pd.concat([states_input, us_stats])
print(states_input_w_US["State Name"].sort_values(ascending = True).unique())
states_input_w_US.head()

['Alabama' 'Alaska' 'American Samoa' 'Arizona' 'Arkansas' 'California'
 'Colorado' 'Commonwealth of the Northern Mariana Islands' 'Connecticut'
 'Delaware' 'District of Columbia' 'Florida' 'Georgia' 'Guam' 'Hawaii'
 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky' 'Louisiana'
 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi'
 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey'
 'New Mexico' 'New York' 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma'
 'Oregon' 'Pennsylvania' 'Puerto Rico' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'U.S. Virgin Islands' 'United States'
 'Utah' 'Vermont' 'Virginia' 'Washington' 'West Virginia' 'Wisconsin'
 'Wyoming']


Unnamed: 0,Time,Date,State Abbreviation,State Name,Country,FIPS,Status,Data Quality,Total Deaths,Deaths Daily,Total Recovered,Total Tests,Tests Daily,Total Negative,Negative Daily,Total Positive,Positive Daily,Total Hospitalized,Currently Hospitalized,Hospitalized Daily,Total In ICU,Currently In ICU,Total On Ventilator,Currently On Ventilator,Downloaded
0,10/11/2020,2020-10-11,AK,Alaska,United States,2,0,A,60.0,0,5789.0,502997.0,3744,492311.0,3486,10686.0,258,,57.0,0,,,,7.0,2020-10-11 16:49:53.692145
1,10/11/2020,2020-10-11,AL,Alabama,United States,1,0,A,2664.0,0,71240.0,1210007.0,5687,1064417.0,4998,165342.0,816,18179.0,812.0,0,1890.0,,1067.0,,2020-10-11 16:49:53.692145
2,10/11/2020,2020-10-11,AR,Arkansas,United States,5,0,A+,1569.0,17,83454.0,1146654.0,8938,1058352.0,8384,92833.0,613,5910.0,566.0,10,,237.0,731.0,97.0,2020-10-11 16:49:53.692145
3,10/11/2020,2020-10-11,AS,American Samoa,United States,60,1,D,0.0,0,,1616.0,0,1616.0,0,0.0,0,,,0,,,,,2020-10-11 16:49:53.692145
4,10/11/2020,2020-10-11,AZ,Arizona,United States,4,0,A+,5759.0,0,37110.0,1566165.0,10150,1345494.0,9565,225575.0,597,20253.0,630.0,24,,144.0,,79.0,2020-10-11 16:49:53.692145


In [93]:
printColumns(us_state_demographics, "State Demographics Columns")
states_input_w_demographics = pd.merge(states_input_w_US, us_state_demographics, how="left", on="State Abbreviation")
printColumns(states_input_w_demographics,"State Input Columns")
states_input_order = [
    'Time', 'Date', 'FIPS', 'State Abbreviation', 'State Name', 'Status', 'Census Region', 'Country',  'Data Quality',
    'Total Tests', 'Tests Daily', 'Total Negative', 'Negative Daily', 
    'Total Positive', 'Positive Daily', 
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily', 
    'Total In ICU', 'Currently In ICU', 'Total On Ventilator', 'Currently On Ventilator', 
    'Population 2019', 'Female', 'Male', 'Pct Male',
    '< 1', '1-4', '15-24', '25-34', '35-44', '45-54', '5-14', '55-64', '65-74', '75-84', '85+', 
    'Pct < 1', 'Pct 1-4', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+' 
]
states_input_w_demographics = states_input_w_demographics[states_input_order]
print(states_input_w_demographics["Census Region"].sort_values(ascending = True).unique())
states_input_w_demographics["Census Region"] = states_input_w_demographics["Census Region"].astype(str)
regions = ['Midwest','Northeast','South','West']
states_input_w_demographics["Census Region"] = states_input_w_demographics["Census Region"].apply(lambda x: x if x in regions else "Other")
states_input_w_demographics = states_input_w_demographics.sort_values(["FIPS"])
states_input_w_demographics.head()

State Demographics Columns
Index(['State Abbreviation', 'Census Region', 'Female', 'Male',
       'Population 2019', 'Pct Male', '< 1', '1-4', '15-24', '25-34', '35-44',
       '45-54', '5-14', '55-64', '65-74', '75-84', '85+', 'Pct < 1', 'Pct 1-4',
       'Pct 15-24', 'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64',
       'Pct 65-74', 'Pct 75-84', 'Pct 85+'],
      dtype='object')
State Input Columns
Index(['Time', 'Date', 'State Abbreviation', 'State Name', 'Country', 'FIPS',
       'Status', 'Data Quality', 'Total Deaths', 'Deaths Daily',
       'Total Recovered', 'Total Tests', 'Tests Daily', 'Total Negative',
       'Negative Daily', 'Total Positive', 'Positive Daily',
       'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
       'Total In ICU', 'Currently In ICU', 'Total On Ventilator',
       'Currently On Ventilator', 'Downloaded', 'Census Region', 'Female',
       'Male', 'Population 2019', 'Pct Male', '< 1', '1-4', '15-24', '25-34',
       '35-44', '

Unnamed: 0,Time,Date,FIPS,State Abbreviation,State Name,Status,Census Region,Country,Data Quality,Total Tests,Tests Daily,Total Negative,Negative Daily,Total Positive,Positive Daily,Total Deaths,Deaths Daily,Total Recovered,Total Hospitalized,Currently Hospitalized,Hospitalized Daily,Total In ICU,Currently In ICU,Total On Ventilator,Currently On Ventilator,Population 2019,Female,Male,Pct Male,< 1,1-4,15-24,25-34,35-44,45-54,5-14,55-64,65-74,75-84,85+,Pct < 1,Pct 1-4,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
12684,10/11/2020,2020-10-11,0,US,United States,0,Other,United States,,115424481.0,943645,102250976.0,800029,7727630.0,46776,206597.0,464,3075077.0,423058.0,34028.0,999,21553.0,6583.0,2454.0,1614.0,3783052.0,1847935.0,1935117.0,0.511523,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195
12516,04/26/2020,2020-04-26,0,US,United States,0,Other,United States,,5547681.0,204025,4484632.0,179284,970648.0,27310,51383.0,1215,116529.0,112926.0,56192.0,2192,2571.0,14104.0,227.0,5121.0,3783052.0,1847935.0,1935117.0,0.511523,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195
12515,04/25/2020,2020-04-25,0,US,United States,0,Other,United States,,5343656.0,278950,4305348.0,240717,943338.0,35889,50168.0,1748,112539.0,110734.0,57381.0,2456,2516.0,14415.0,227.0,5266.0,3783052.0,1847935.0,1935117.0,0.511523,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195
12514,04/24/2020,2020-04-24,0,US,United States,0,Other,United States,,5064706.0,238383,4064631.0,202285,907449.0,34231,48420.0,1895,101296.0,108278.0,57295.0,2338,2468.0,14605.0,227.0,5187.0,3783052.0,1847935.0,1935117.0,0.511523,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195
12513,04/23/2020,2020-04-23,0,US,United States,0,Other,United States,,4826323.0,194831,3862346.0,161630,873218.0,31812,46525.0,1791,96921.0,105940.0,58989.0,2802,2428.0,14744.0,227.0,5464.0,3783052.0,1847935.0,1935117.0,0.511523,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.011567,0.048291,0.129185,0.139067,0.126677,0.124801,0.129778,0.096264,0.04883,0.020195


In [100]:
states_input_integers = [
    'Total Deaths', 'Deaths Daily',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
    'Total In ICU', 'Currently In ICU',
    'Total Negative', 'Negative Daily',
    'Total On Ventilator', 'Currently On Ventilator',
    'Total Positive', 'Positive Daily',
    'Total Recovered',
    'Total Tests', 'Tests Daily', 
    'FIPS', 'Status',
    'Female', 'Male', 'Population 2019',
    '1-4', '15-24', '25-34', '35-44', '45-54', '5-14', '55-64', '65-74', '75-84', '85+', '< 1'
]
states_input_float = [
    'Pct Male',
    'Pct < 1', 'Pct 1-4', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+'
]
states_input_cleaned = states_input_w_demographics.copy()
printColumns(states_input_cleaned, "States Input Columns")
for i in states_input_cleaned.columns:
    if i in states_input_integers:
        states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: pd.to_numeric(x,
                                                                    errors='coerce',
                                                                    downcast='integer'))
        states_input_cleaned[i] = states_input_cleaned[i].astype(str)
        states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: removeDecimal(x))
        states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: pd.to_numeric(x,
                                                                    errors='coerce',
                                                                    downcast='integer'))
    else:
        states_input_cleaned[i] = states_input_cleaned[i].astype(str)
    isPct = ("Pct" in i)
    if isPct or (i in states_input_float):
        states_input_cleaned[i] = states_input_cleaned[i].astype(str)
        states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: x[:6])
        states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: pd.to_numeric(x,
                                                                    errors='coerce'))
    states_input_cleaned[i] = states_input_cleaned[i].astype(str)
    states_input_cleaned[i] = states_input_cleaned[i].apply(lambda x: emptyNan(x))
states_input_cleaned = states_input_cleaned[states_input_order]
states_input_cleaned = states_input_cleaned.sort_values(["FIPS"])
states_input_cleaned.head()

States Input Columns
Index(['Time', 'Date', 'FIPS', 'State Abbreviation', 'State Name', 'Status',
       'Census Region', 'Country', 'Data Quality', 'Total Tests',
       'Tests Daily', 'Total Negative', 'Negative Daily', 'Total Positive',
       'Positive Daily', 'Total Deaths', 'Deaths Daily', 'Total Recovered',
       'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
       'Total In ICU', 'Currently In ICU', 'Total On Ventilator',
       'Currently On Ventilator', 'Population 2019', 'Female', 'Male',
       'Pct Male', '< 1', '1-4', '15-24', '25-34', '35-44', '45-54', '5-14',
       '55-64', '65-74', '75-84', '85+', 'Pct < 1', 'Pct 1-4', 'Pct 15-24',
       'Pct 25-34', 'Pct 35-44', 'Pct 45-54', 'Pct 55-64', 'Pct 65-74',
       'Pct 75-84', 'Pct 85+'],
      dtype='object')


Unnamed: 0,Time,Date,FIPS,State Abbreviation,State Name,Status,Census Region,Country,Data Quality,Total Tests,Tests Daily,Total Negative,Negative Daily,Total Positive,Positive Daily,Total Deaths,Deaths Daily,Total Recovered,Total Hospitalized,Currently Hospitalized,Hospitalized Daily,Total In ICU,Currently In ICU,Total On Ventilator,Currently On Ventilator,Population 2019,Female,Male,Pct Male,< 1,1-4,15-24,25-34,35-44,45-54,5-14,55-64,65-74,75-84,85+,Pct < 1,Pct 1-4,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
12684,10/11/2020,2020-10-11,0,US,United States,0,Other,United States,,115424481.0,943645,102250976.0,800029,7727630.0,46776,206597.0,464,3075077.0,423058.0,34028.0,999,21553.0,6583.0,2454.0,1614.0,3783052.0,1847935.0,1935117.0,0.5115,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.0115,0.0482,0.1291,0.139,0.1266,0.1248,0.1297,0.0962,0.0488,0.0201
12652,09/09/2020,2020-09-09,0,US,United States,0,Other,United States,,86672604.0,629649,77613114.0,553429,6329536.0,31114,182773.0,1084,2387479.0,381926.0,32562.0,2060,18322.0,6646.0,1994.0,1903.0,3783052.0,1847935.0,1935117.0,0.5115,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.0115,0.0482,0.1291,0.139,0.1266,0.1248,0.1297,0.0962,0.0488,0.0201
12682,10/09/2020,2020-10-09,0,US,United States,0,Other,United States,,113347658.0,1092407,100497078.0,950718,7623648.0,57060,205470.0,893,3039089.0,420587.0,34839.0,1802,21389.0,6713.0,2437.0,1655.0,3783052.0,1847935.0,1935117.0,0.5115,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.0115,0.0482,0.1291,0.139,0.1266,0.1248,0.1297,0.0962,0.0488,0.0201
12681,10/08/2020,2020-10-08,0,US,United States,0,Other,United States,,112255251.0,1040059,99546360.0,880813,7566588.0,55352,204577.0,986,3021252.0,418785.0,34322.0,2074,21217.0,6621.0,2427.0,1638.0,3783052.0,1847935.0,1935117.0,0.5115,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.0115,0.0482,0.1291,0.139,0.1266,0.1248,0.1297,0.0962,0.0488,0.0201
12680,10/07/2020,2020-10-07,0,US,United States,0,Other,United States,,111215192.0,862764,98665547.0,732692,7511236.0,50602,203591.0,916,2999895.0,416711.0,32105.0,2250,21112.0,6507.0,2410.0,1646.0,3783052.0,1847935.0,1935117.0,0.5115,7566104.0,31587262.0,84500410.0,90964550.0,82860364.0,81633238.0,81988326.0,84888424.0,62966866.0,31939744.0,13209916.0,0.0115,0.0482,0.1291,0.139,0.1266,0.1248,0.1297,0.0962,0.0488,0.0201


In [101]:
states_input_cleaned.to_excel(r'C:\Users\janin\Downloads\states_input.xlsx', index = False)

US Statistics

In [None]:
pgmm_us = pd.read_excel(r'C:\Users\janin\Downloads\USState-Results.xlsx')

Download Raw Country Data

In [None]:
github_url="https://github.com/dsbbfinddx/FINDCov19TrackerData/blob/master/processed/data_all.csv?raw=true"
github_request=requests.get(github_url).content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

In [None]:
print("Columns")
print(c.columns)
print("Sets")
c["set"] = c["set"].astype(str)
print(c["set"].unique())
print("Names")

Clean Data

In [None]:
print("Units")
c["unit"] = c["unit"].astype(str)
c["unit"] = c["unit"].apply(lambda x: "" if x=="unit" else x)
print(c["unit"].sort_values(ascending = True).unique())
print("Times")
c["time"] = c["time"].astype(str)
print(c["time"])

In [None]:
# Remove special characters and rename Congos
def fixCountry(country):
    if country == "Congo - Kinshasa":
        return "Democratic Republic of Congo"
    elif country == "Congo - Brazzaville":
        return "Republic of the Congo"
    elif country == "São Tomé & Príncipe":
        converted_country = "Sao Tome & Príncipe"
    elif country == "Côte d’Ivoire":
        converted_country = "Cote d'Ivoire"
    else:
        return country
c["name"] = c["name"].astype(str)
c["name"] = c["name"].apply(lambda x: "" if x=="nan" else x)
c["name"] = c["name"].apply(lambda x: fixCountry(x))
print(c["name"].sort_values(ascending = True).unique())

In [None]:
def region(country):
    if country in sub_saharan_african_countries:
        return "Sub-Saharan Africa"
    elif country in south_asia_countries:
        return "South Asia"
    else:
        return ""
c["region"] = c["name"].apply(lambda x: region(x))
print(c["region"].sort_values(ascending = True).unique())

In [None]:
# Format text date and add datetime for date
c["time"] = c["time"].apply(lambda x: us_date(x))
c["date"] = pd.to_datetime(c["time"], format="%m/%d/%Y")
minmax_dates = c.groupby(["name"]).agg({"date": [np.min,np.max]})
print(minmax_dates)
min_date = c["date"].min()
print(min_date)
c.sort_values(by=['set','name','date'], inplace=True)

In [None]:
# Calculate changing cases
c["new_cases"] = c["all_cum_cases"].diff()
c["new_cases"] = np.where(c["new_cases"].notna(),c["new_cases"],c["all_cum_cases"])
print(c[["time","date","all_cum_cases","new_cases"]])
c["new_deaths"] = c["all_cum_deaths"].diff()
c["new_deaths"] = np.where(c["new_deaths"].notna(),c["new_deaths"],c["all_cum_deaths"])
print(c[["time","date","all_cum_deaths","new_deaths"]])
c["new_tests"] = c["all_cum_tests"].diff()
c["new_tests"] = np.where(c["new_tests"].notna(),c["new_tests"],c["all_cum_tests"])
print(c[["time","date","all_cum_tests","new_tests"]])
c["new_negatives"] = ""
print(c[["time","date","new_tests","new_cases","new_negatives"]])

In [None]:
# Format numeric columns
numeric_columns = ['pop_100k',
                   'new_cases_orig','new_deaths_orig','new_tests_orig',
                   'cap_cum_cases','cap_new_cases',
                   'cap_cum_deaths','cap_new_deaths',
                   'cap_cum_tests','cap_new_tests',
                   'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
                   'all_cum_tests','all_new_tests',
                   'pos']
float_columns = ['pop_100k',
                 'cap_cum_cases','cap_new_cases','cap_cum_deaths',
                 'cap_new_deaths','cap_cum_tests','cap_new_tests']
integer_columns = ['new_cases_orig','new_deaths_orig','new_tests_orig',
                   'all_cum_cases','new_cases', 
                   'all_cum_deaths','new_deaths',
                   'all_cum_tests','all_new_tests','new_tests',
                   'pos']

c[float_columns] = c[float_columns].apply(pd.to_numeric)
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
has_data = c.all_cum_cases > 0
c = c[has_data]
c = c.where(c.notnull(), None)

In [None]:
c.head(10)

In [None]:
# Add missing columns to match Google sheet
c["state"] = ""
c["county"] = ""
c["all_cum_neg"] = "" #c["all_cum_tests"] - c["all_cum_cases"]
c["all_new_neg"] = "" #c["all_new_tests"] - c["all_new_cases"]
c["hospitalized_currently"] = ""
c["hospitalized_cum"] = ""
c["source"] = github_url
c["accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
c.head(10)

In [None]:
# Get all countries
countries_df = c.loc[c["set"]=="country"]
countries_df["name"].unique()

Build Demographics

In [None]:
country_locations = pd.read_excel(r'C:\Users\janin\Downloads\Country Geo.xlsx')
country_locations.drop(columns=["Population","Alternate"], inplace = True)
country_locations.rename(columns = {'Region': 'Population Region'}, inplace = True)
country_locations["Country"] = country_locations["Country"].astype(str)
country_locations.head()

In [None]:
country_populations = pd.read_excel(r'C:\Users\janin\Downloads\Country Populations 2020.xlsx')
country_populations.rename(columns = {'Population': 'Population 2020','Data Source':'Population Data Source'}, inplace = True)
country_populations["Country"] = country_populations["Country"].astype(str)
country_populations.head()

In [None]:
country_demographics_all = pd.merge(country_populations, country_locations, how="left", on="Country")
print(country_demographics_all.columns)
print("All Countries")
print(country_demographics_all["Country"].sort_values(ascending = True).unique())
country_demographics_all["Country"] = country_demographics_all["Country"].astype(str)
print("Configured Countries")
print(configured_countries)
is_configured_demographics = country_demographics_all.Country.isin(configured_countries)
country_demographics = country_demographics_all[is_configured_demographics].copy()
print(country_demographics["Country"])
country_demographics.head()

Get African Input Data

In [None]:
is_configured = c.name.isin(configured_countries)
configured = c[is_configured].copy()
configured.head(-10)

Write Input Data

In [None]:
script_order = ["time","region","name","state","county","pop_100k",
                "all_cum_cases","all_cum_neg",
                "hospitalized_currently","hospitalized_cum",
                "all_cum_deaths","new_deaths",
                "all_new_neg","new_cases","new_tests",
                "source","accessed"]
column_names = ["Date","Region","Country","State","County","Pop_100k",
                "Positive Total", "Negative Total",
                "Hospitalized Currently", "Hospitalized Cumulative",
                "Deaths Total","Death Daily",
                "Negative Daily","Positive Daily","Tests Daily",
                "source","accessed"]
integer_output = ["Positive Total", "Negative Total",
                "Hospitalized Currently", "Hospitalized Cumulative",
                "Deaths Total","Death Daily",
                "Negative Daily","Positive Daily","Tests Daily"]

In [None]:
configured_output = configured[script_order].copy()
configured_output.columns = column_names
configured_output[integer_output] = configured_output[integer_output].apply(lambda x: pd.to_numeric(x, 
                                                                                                    errors='coerce', 
                                                                                                    downcast='integer'))
configured_output = configured_output[configured_output.columns].astype(str)
for i in configured_output.columns:
    configured_output[i] = configured_output[i].apply(lambda x: emptyNan(x))
configured_output.head(-10)

In [None]:
# Merge input data and demographics
pd.set_option('display.max_columns', None)
merged_input = pd.merge(configured_output, country_demographics, how="left", on="Country")
print(merged_input["Region"].sort_values(ascending = True).unique())
print(merged_input.dtypes)
merged_input.head()

In [None]:
# Write input data
merged_input.to_excel(r'C:\Users\janin\Downloads\configured.xlsx', index = False)

Output New Measures

In [None]:
pgmm_ssa = pd.read_excel(r'C:\Users\janin\Downloads\SSA-Temp.xlsx')
pgmm_ssa.rename(columns = {'County': 'Country'}, inplace = True)
pgmm_ssa["Country"] = pgmm_ssa["Country"].apply(lambda x: fixCountry(x))
pgmm_ssa["Country"] = pgmm_ssa["Country"].astype(str)
print(pgmm_ssa.dtypes)
pgmm_ssa[pgmm_ssa["Country"].str.contains("Congo")].head()

In [None]:
pgmm_ssa.to_excel(r'C:\Users\janin\Downloads\ss_africa.xlsx', index = False)

In [None]:
pgmm_sa = pd.read_excel(r'C:\Users\janin\Downloads\SouthAsia excel updated 20201008.xlsx')
pgmm_sa["Country"] = pgmm_sa["Country"].apply(lambda x: fixCountry(x))
pgmm_sa["Country"] = pgmm_sa["Country"].astype(str)
print(pgmm_sa.dtypes)
pgmm_sa.head()

In [None]:
pgmm_sa.to_excel(r'C:\Users\janin\Downloads\south_asia.xlsx', index = False)

In [None]:
all_configured = pd.concat([pgmm_ssa, pgmm_sa], sort=False)
all_configured.head()

In [None]:
all_configured.to_excel(r'C:\Users\janin\Downloads\all_configured_regions.xlsx', index = False)