In [63]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)

In [64]:
def fixRegion(code):
    region_name = ""
    for region in census_regions:
        if region["number"] == code:
            region_name = region["name"]
            break
    if region_name == "":
        region_name = "Other"
        print(str(code) + " not found")
    return region_name

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
def getAgeRange(age):
    age_range = ""
    if age == 0:
        age_range = "< 1"
    elif age == 999:
        age_range = "Total"
    elif age < 5:
        age_range = "1-4"
    elif age < 15:
        age_range = "5-14"
    elif age < 25:
        age_range = "15-24"
    elif age < 35:
        age_range = "25-34"
    elif age < 45:
        age_range = "35-44"
    elif age < 55:
        age_range = "45-54"
    elif age < 65:
        age_range = "55-64"
    elif age < 75:
        age_range = "65-74"
    elif age < 85:
        age_range = "75-84"
    elif age == 85:
        age_range = "85+"
    return age_range

def fixSex(code):
    sex = ""
    if code == 0:
        sex = "Population 2019"
    elif code == 1:
        sex = "Male"
    elif code == 2:
        sex = "Female"
    else:
        print(str(code) + " is not a sex")
    return sex

def us_date(x):
    month = x[5:7]
    day = x[8:11]
    year = x[0:4]
    conversion = month + "/" + day +"/"+ year
    return conversion

def removeDecimal(data):
    strData = str(data)
    decimalLocation = strData.find(".")
    if decimalLocation > -1:
        return strData[0:decimalLocation]
    else:
        return strData

def emptyNan(value):
    if (value == "nan"):
        return ""
    else:
        return value

def printColumns(df, label):
    print(label)
    print(df.columns)

def print_column_unique(column):
    print("Column Values:")
    values = column.sort_values(ascending = True).unique()
    print(values)
    return values

def print_column_missing(column, comparison):
    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)
    missing_values = []
    for value in values:
        if not value in comparison:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Column values not in comparison:")
        print(missing_values)
    else:
        print("No missing values")
    missing_values = []
    for value in comparison:
        if not value in values:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Comparison values not in column:")
        print(missing_values)
    else:
        print("No missing values")
    return values

In [77]:
# Countries and Regions

european_countries = [
    'Albania','Andorra','Austria','Belarus','Belgium','Bosnia & Herzegovina','Bulgaria',
    'Croatia','Czech Republic','Denmark','Estonia','Finland','France',
    'Germany','Greece','Greenland','Hungary','Iceland','Ireland','Isle of Man','Italy',
    'Latvia','Liechtenstein','Lithuania','Luxembourg','Malta','Moldova','Monaco','Montenegro',
    'Netherlands','Norway','Poland','Portugal','Romania',
    'San Marino','Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland',
    'Ukraine','United Kingdom','Vatican City'
]
carribean_countries = [
    "Antigua & Barbuda","Aruba","Bahamas","Barbados","Bermuda","British Virgin Islands",
    "Cayman Islands","Cuba","Curacao","Dominica","Dominican Republic","Grenada",
    "Haiti","Jamaica","Puerto Rico","St. Kitts & Nevis","St. Lucia","St. Vincent & Grenadines",
    "Sint Maarten","Trinidad & Tobago","Turks and Caicos Islands","United States Virgin Islands"
]
central_south_america_countries = [
    'Argentina','Belize','Bolivia','Brazil','Chile','Colombia','Costa Rica',
    'Ecuador','El Salvador','Guatemala','Guyana','Honduras',
    'Mexico','Nicaragua','Panama','Paraguay','Peru','Suriname','Uruguay','Venezuela'
]
latin_american_countries = carribean_countries + central_south_america_countries
sub_saharan_african_countries = [
    "Angola","Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Côte d’Ivoire",
    "Democratic Republic of Congo","Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau","Kenya","Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria","Republic of the Congo","Rwanda",
    "São Tomé and Príncipe","Senegal","Seychelles","Sierra Leone",
    "Somalia","South Africa","South Sudan","Sudan","Swaziland",
    "Tanzania","Togo","Uganda","Zambia","Zimbabwe"
]
south_asia_countries = [
    "Afghanistan","Bangladesh","Bhutan","India","Maldives","Nepal","Pakistan","Sri Lanka"
]
central_asian_countries = [
    'Armenia','Azerbaijan','Cyprus','Faeroe Islands','Georgia','Gibraltar','Kazakhstan','Kosovo','Kyrgyzstan',
    'North Macedonia','Russia','Tajikistan','Turkey','Turkmenistan','Uzbekistan'
]
east_asian_countries = [
    "Brunei","Cambodia","China","Indonesia","Japan","Laos","Malaysia","Mongolia","Myanmar","North Korea","Philippines",
    "Singapore","South Korea","Taiwan","Thailand","Timor","Vietnam"
]
pacific_countries = [
    "Australia","Cook Islands","Fiji","French Polynesia","Guam","Kiribati",
    "Marshall Islands","Micronesia","Nauru","New Caledonia","New Zealand",
    "Northern Mariana Islands","Palau","Papua New Guinea","Samoa","Solomon Islands","Tonga","Tuvalu","Vanuatu"
]
east_asia_and_pacific_countries = east_asian_countries + pacific_countries
middle_eastern_countries = [
    "Bahrain","Iran","Iraq","Israel","Jordan","Kuwait","Lebanon","Oman","Qatar","Saudi Arabia","Syria",
    "United Arab Emirates","Yemen"
]
north_african_countries = [
    "Algeria","Djibouti","Egypt","Libya","Morocco","Tunisia"
]
middle_east_and_north_africa_countries = middle_eastern_countries + north_african_countries 
north_american_countries = ["Canada","United States"]
configured_country_lists = [
    european_countries,
    latin_american_countries,
    sub_saharan_african_countries,
    south_asia_countries,
    central_asian_countries,
    middle_east_and_north_africa_countries
]
configured_countries = []
for country_list in configured_country_lists:
    for country in country_list:
        configured_countries.append(country)
print("Configured Countries")
configured_countries.sort()
print(configured_countries)
#print(configured_countries)
all_countries = configured_countries + east_asia_and_pacific_countries + middle_east_and_north_africa_countries + north_american_countries
all_countries.sort()
print("All Countries")
print(all_countries)
print("Configured Regions")
configured_regions = [
    'Central Asia',
    'Europe',
    'Latin America',
    'South Asia',
    'Sub-Saharan Africa',
    'Middle East and North Africa'
]
print(configured_regions)
print("Country Conversions")
country_conversions = {
    "Antigua & Barbuda": ["Antigua and Barbuda"],
    "Bahamas": ["Bahamas, The"],
    "Bosnia & Herzegovina": ["Bosnia and Herzegovina"],
    "Brunei": ["Brunei Darussalam"],
    "Cabo Verde": ["Cape Verde"],
    "Côte d’Ivoire": ["Cote d'Ivoire","Cote dIvoire"],
    "Czech Republic": ["Czechia","Czech Republic (Czechia)"],
    "Democratic Republic of Congo": ["Congo - Kinshasa"],
    "Egypt": ["Egypt, Arab Rep."],
    "Faeroe Islands": ["Faroe Islands"],
    "Gambia": ["Gambia, The"],
    "Iran": ["Iran, Islamic Rep."],
    "Kyrgyzstan": ["Kyrgyz Republic"],
    "Laos": ["Lao PDR"],
    "Micronesia": ["Micronesia, Fed. Sts."],
    "Myanmar": ["Myanmar (Burma)","Burma"],
    "North Macedonia": ["Macedonia"],
    "Republic of the Congo": ["Congo - Brazzaville"],
    "Russia": ["Russian Federation"],
    "São Tomé and Príncipe": ["Sao Tome and Principe","Sao Tome & Príncipe","São Tomé & Príncipe"],
    "Sint Maarten": ["Sint Maarten (Dutch part)"],
    "Slovakia": ["Slovak Republic"],
    "St. Kitts & Nevis": ["Saint Kitts and Nevis"],
    "St. Lucia": ["Saint Lucia"],
    "St. Vincent & Grenadines": ["Saint Vincent and the Grenadines"],
    "Swaziland": ["Eswatini"],
    "Syria": ["Syrian Arab Republic"],
    "Timor": ["Timor-Leste"],
    "Trinidad & Tobago": ["Trinidad and Tobago"],
    "Vatican City": ["Holy See"],
    "Yemen": ["Yemen, Rep."],
    "" : ["nan"]
}
unincorporated_disputed_territories = [
    "American Samoa", "Anguilla","Caribbean Netherlands","Channel Islands","Curaçao",
    "Falkland Islands","French Guiana","Guadeloupe","Hong Kong"
]
print(country_conversions)
print("Countries by Region")
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries
}
print(countries_by_region)

def key_from_value(value, dictionary, default):
    return_value = default
    for key, values in dictionary.items():
        if value.strip() in values:
            return_value = key
            break
    return return_value.strip()

def region_from_country(country):
    return key_from_value(country, countries_by_region, "")

def fixCountry(value):
    return key_from_value(value, country_conversions, value)

def fixCountries(countries_column, configuredCountries):
    countries_conversion = countries_column.astype(str)
    countries_conversion = countries_conversion.apply(lambda x: fixCountry(x))
    print(conversions)
    countries = print_column_missing(countries_conversion,configuredCountries)
    return countries_conversion

def testConversion(title, test_array, conversion):
    print(title)
    no_conversions = []
    for value in test_array:
        return_value = ""
        if conversion == "country":
            return_value = fixCountry(value)
        elif conversion == "region":
            return_value = region_from_country(fixCountry(value))
        if return_value != value.strip():
            print(value.strip() + "," + return_value)
        if return_value == "":
            no_conversions.append(value)
    if len(no_conversions) > 0:
        print("Missing Conversions")
        print(no_conversions)
    print("")

Configured Countries
['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua & Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia & Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Côte d’Ivoire', 'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Faeroe Islands', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 

In [79]:
input_countries = ['Afghanistan','Albania','Algeria','Andorra','Angola','Antigua & Barbuda',
 'Argentina','Armenia','Australia','Austria','Azerbaijan','Bahamas',
 'Bahrain','Bangladesh','Barbados','Belarus','Belgium','Belize','Benin',
 'Bermuda','Bhutan','Bolivia','Bosnia & Herzegovina','Botswana','Brazil',
 'Brunei','Bulgaria','Burkina Faso','Burundi','Cambodia','Cameroon',
 'Canada','Cape Verde','Central African Republic','Chad','Chile','China',
 'Colombia','Comoros','Congo - Brazzaville','Congo - Kinshasa',
 'Costa Rica','Croatia','Cuba','Cyprus','Czechia','Côte d’Ivoire',
 'Denmark','Djibouti','Dominica','Dominican Republic','Ecuador','Egypt',
 'El Salvador','Equatorial Guinea','Eritrea','Estonia','Eswatini',
 'Ethiopia','Faroe Islands','Fiji','Finland','France','French Polynesia',
 'Gabon','Gambia','Georgia','Germany','Ghana','Greece','Greenland',
 'Grenada','Guatemala','Guinea','Guinea-Bissau','Guyana','Haiti',
 'Honduras','Hong Kong SAR China','Hungary','Iceland','India','Indonesia',
 'Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan',
 'Kazakhstan','Kenya','Kosovo','Kuwait','Kyrgyzstan','Laos','Latvia',
 'Lebanon','Lesotho','Liberia','Libya','Liechtenstein','Lithuania',
 'Luxembourg','Madagascar','Malawi','Malaysia','Maldives','Mali','Malta',
 'Mauritania','Mauritius','Mexico','Moldova','Monaco','Mongolia',
 'Montenegro','Morocco','Mozambique','Myanmar (Burma)','Namibia','Nepal',
 'Netherlands','New Caledonia','New Zealand','Nicaragua','Niger','Nigeria',
 'North Macedonia','Norway','Oman','Pakistan','Palestinian Territories',
 'Panama','Papua New Guinea','Paraguay','Peru','Philippines','Poland',
 'Portugal','Qatar','Romania','Russia','Rwanda','San Marino',
 'Saudi Arabia','Senegal','Serbia','Seychelles','Sierra Leone','Singapore',
 'Slovakia','Slovenia','Somalia','South Africa','South Korea',
 'South Sudan','Spain','Sri Lanka','St. Kitts & Nevis','St. Lucia',
 'St. Vincent & Grenadines','Sudan','Suriname','Sweden','Switzerland',
 'Syria','São Tomé & Príncipe','Taiwan','Tajikistan','Tanzania','Thailand',
 'Timor-Leste','Togo','Trinidad & Tobago','Tunisia','Turkey','Uganda',
 'Ukraine','United Arab Emirates','United Kingdom','United States',
 'Uruguay','Uzbekistan','Vatican City','Venezuela','Vietnam',
 'Western Sahara','Yemen','Zambia','Zimbabwe','nan']
 
#testConversion("Country Fixes",input_countries,"country")
3testConversion("Region Assignments",input_countries,"region")



SyntaxError: invalid syntax (<ipython-input-79-da83989bfcd0>, line 36)

In [80]:
# Download input data

github_url="https://github.com/dsbbfinddx/FINDCov19TrackerData/blob/master/processed/data_all.csv?raw=true"
github_request=requests.get(github_url).content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

print("Columns")
print(c.columns)
print("Sets")
c["set"] = c["set"].astype(str)
sets = print_column_unique(c["set"])
print("Names")
c["name"] = c["name"].astype(str)
names = print_column_unique(c["name"])
print("Units")
c["unit"] = c["unit"].astype(str)
c["unit"] = c["unit"].apply(lambda x: "" if x=="unit" else x)
units = print_column_unique(c["unit"])
print("Times")
c["time"] = c["time"].astype(str)
times = print_column_unique(c["time"])

c["name"] = c["name"].astype(str)
c["name"] = c["name"].apply(lambda x: "" if x=="nan" else x)
conversions = {}
c["name"] = c["name"].astype(str)
c["name"] = c["name"].apply(lambda x: fixCountry(x))
print(conversions)
print("Configured Countries")
print_column_missing(c["name"],configured_countries)
print("All Countries")
print_column_missing(c["name"],all_countries)

c["region"] = c["name"].apply(lambda x: region_from_country(x))
print("Regions")
c_regions = print_column_missing(c["region"],configured_regions)

# Format text date and add datetime for date
c["time"] = c["time"].apply(lambda x: us_date(x))
c["date"] = pd.to_datetime(c["time"], format="%m/%d/%Y")
minmax_dates = c.groupby(["name"]).agg({"date": [np.min,np.max]})
print(minmax_dates)
min_date = c["date"].min()
print(min_date)
c.sort_values(by=['set','name','date'], inplace=True)

# Calculate changing cases
c["new_cases"] = c["all_cum_cases"].diff()
c["new_cases"] = np.where(c["new_cases"].notna(),c["new_cases"],c["all_cum_cases"])
print(c[["time","date","all_cum_cases","new_cases"]])
c["new_deaths"] = c["all_cum_deaths"].diff()
c["new_deaths"] = np.where(c["new_deaths"].notna(),c["new_deaths"],c["all_cum_deaths"])
print(c[["time","date","all_cum_deaths","new_deaths"]])
c["new_tests"] = c["all_cum_tests"].diff()
c["new_tests"] = np.where(c["new_tests"].notna(),c["new_tests"],c["all_cum_tests"])
print(c[["time","date","all_cum_tests","new_tests"]])
c["new_negatives"] = ""
print(c[["time","date","new_tests","new_cases","new_negatives"]])

# Format numeric columns
numeric_columns = ['pop_100k',
                   'new_cases_orig','new_deaths_orig','new_tests_orig',
                   'cap_cum_cases','cap_new_cases',
                   'cap_cum_deaths','cap_new_deaths',
                   'cap_cum_tests','cap_new_tests',
                   'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
                   'all_cum_tests','all_new_tests',
                   'pos']
float_columns = ['pop_100k',
                 'cap_cum_cases','cap_new_cases','cap_cum_deaths',
                 'cap_new_deaths','cap_cum_tests','cap_new_tests']
integer_columns = ['new_cases_orig','new_deaths_orig','new_tests_orig',
                   'all_cum_cases','new_cases', 
                   'all_cum_deaths','new_deaths',
                   'all_cum_tests','all_new_tests','new_tests',
                   'pos']

c[float_columns] = c[float_columns].apply(pd.to_numeric)
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
has_data = c.all_cum_cases > 0
c = c[has_data]
c = c.where(c.notnull(), None)

# Add missing columns to match Google sheet
c["state"] = ""
c["county"] = ""
c["all_cum_neg"] = "" #c["all_cum_tests"] - c["all_cum_cases"]
c["all_new_neg"] = "" #c["all_new_tests"] - c["all_new_cases"]
c["hospitalized_currently"] = ""
c["hospitalized_cum"] = ""
c["source"] = github_url
c["accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
c.head(10)

Columns
Index(['set', 'name', 'unit', 'time', 'pop_100k', 'new_cases_orig',
       'new_deaths_orig', 'new_tests_orig', 'cap_cum_cases', 'cap_new_cases',
       'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests', 'cap_new_tests',
       'all_cum_cases', 'all_new_cases', 'all_cum_deaths', 'all_new_deaths',
       'all_cum_tests', 'all_new_tests', 'pos'],
      dtype='object')
Sets
Column Values:
['country' 'income' 'region']
Names
Column Values:
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua & Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bermuda' 'Bhutan' 'Bolivia' 'Bosnia & Herzegovina' 'Botswana' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon'
 'Canada' 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China'
 'Colombia' 'Comoros' 'Congo - Brazzaville' 'Congo - Kinshasa'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Côte d’I

['' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua & Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia & Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Côte d’Ivoire'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Faeroe Islands' 'Fiji' 'Finland' 'France'
 'French Polynesia' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Greenland' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana'
 'Haiti' 'Honduras' 'Hong Kong SAR China' 'Hungary' 'Iceland' 'India'
 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'I

[55310 rows x 5 columns]


Unnamed: 0,set,name,unit,time,pop_100k,new_cases_orig,new_deaths_orig,new_tests_orig,cap_cum_cases,cap_new_cases,cap_cum_deaths,cap_new_deaths,cap_cum_tests,cap_new_tests,all_cum_cases,all_new_cases,all_cum_deaths,all_new_deaths,all_cum_tests,all_new_tests,pos,region,date,new_cases,new_deaths,new_tests,new_negatives,state,county,all_cum_neg,all_new_neg,hospitalized_currently,hospitalized_cum,source,accessed
6484,country,Afghanistan,AF,02/24/2020,389.28,1,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-24,1,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6680,country,Afghanistan,AF,02/25/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-25,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6876,country,Afghanistan,AF,02/26/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-26,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7072,country,Afghanistan,AF,02/27/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-27,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7268,country,Afghanistan,AF,02/28/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-28,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7464,country,Afghanistan,AF,02/29/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-29,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7660,country,Afghanistan,AF,03/01/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-03-01,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7856,country,Afghanistan,AF,03/02/2020,389.28,0,0,,0.00256885,0.0,0,0,,,1,0.0,0,0,,,,South Asia,2020-03-02,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
8052,country,Afghanistan,AF,03/03/2020,389.28,0,0,59.0,0.00256885,0.0,0,0,0.151562,,1,0.0,0,0,59.0,,,South Asia,2020-03-03,0,0,59.0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
8248,country,Afghanistan,AF,03/04/2020,389.28,0,0,,0.00256885,0.0,0,0,0.151562,,1,0.0,0,0,59.0,,,South Asia,2020-03-04,0,0,0.0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020


In [81]:
# Get all countries
countries_df = c.loc[c["set"]=="country"]
countries_df["name"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia & Herzegovina', 'Botswana', 'Brazil', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia',
       'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Côte d’Ivoire',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea',
       'Guinea-Bissau', 'Guyana', 'Haiti', 'Hondura

Build Demographics

In [82]:
country_locations = pd.read_excel(r'C:\Users\janin\Downloads\Country Geo.xlsx')
print(country_locations.columns)
country_locations.drop(columns=["Population","Alternate"], inplace = True)
country_locations.rename(columns = {'Region': 'Population Region'}, inplace = True)
conversions = {}
country_locations["Country"] = country_locations["Country"].astype(str)
country_locations["Country"] = country_locations["Country"].apply(lambda x: fixCountry(x))
print(conversions)
countries = print_column_missing(country_locations["Country"],all_countries)
country_locations.head()

Index(['Region', 'Country', 'Population', 'Latitude', 'Longitude',
       'Alternate'],
      dtype='object')
{}
Column Values:
['Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua & Barbuda' 'Argentina'
 'Armenia' 'Aruba' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bolivia'
 'Bosnia & Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia'
 'Cameroon' 'Canada' 'Cayman Islands' 'Central African Republic' 'Chad'
 'Chile' 'China' 'Colombia' 'Comoros' 'Costa Rica' 'Croatia' 'Cuba'
 'Curacao' 'Cyprus' 'Czech Republic' 'Côte d’Ivoire'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Faeroe Islands' 'Fiji' 'Finland' 'France'
 'French Polynesia' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana'
 'Gibraltar' 'Greece' 'Greenland' 'Grenada' 'Gu

Unnamed: 0,Population Region,Country,Latitude,Longitude
0,Carribean,Antigua & Barbuda,17.060816,-61.796428
1,Carribean,Aruba,12.52111,-69.968338
2,Carribean,Bahamas,25.03428,-77.39628
3,Carribean,Barbados,13.193887,-59.543198
4,Carribean,Bermuda,32.321384,-64.75737


In [83]:
country_populations = pd.read_excel(r'C:\Users\janin\Downloads\Country Populations 2020.xlsx')
country_populations.rename(columns = {'Population': 'Population 2020','Data Source':'Population Data Source'}, inplace = True)
conversions = {}
country_populations["Country"] = country_populations["Country"].astype(str)
country_populations["Country"] = country_populations["Country"].apply(lambda x: fixCountry(x))
print(conversions)
print_column_missing(country_populations["Country"],all_countries)
country_populations.head()

{}
Column Values:
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Anguilla' 'Antigua & Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados'
 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia & Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia'
 'Cameroon' 'Canada' 'Caribbean Netherlands' 'Cayman Islands'
 'Central African Republic' 'Chad' 'Channel Islands' 'Chile' 'China'
 'Colombia' 'Comoros' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba'
 'Curaçao' 'Cyprus' 'Czech Republic' 'Côte d’Ivoire'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Faeroe Islands' 'Falkland Islands' 'Fiji'
 'Finland' 'France' 'French Guiana' 'French Polynesia' 'Gabon' 'Gambia'
 'Georgia' 'Germany' 'Ghana

Unnamed: 0,Rank,Country,Country Link,Year,Population 2020,Annual % Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fertility Rate,Median Age,Urban Population %,World Share %,Population Data Source
0,37,Afghanistan,Afghanistan,2020,38928346,2.33,886592,60,652860,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...
1,140,Albania,Albania,2020,2877797,-0.11,-3120,105,27400,-14000.0,1.6,36,63,0.04,Worldometer (www.Worldometers.info)Elaboration...
2,33,Algeria,Algeria,2020,43851044,1.85,797990,18,2381740,-10000.0,3.1,29,73,0.56,Worldometer (www.Worldometers.info)Elaboration...
3,210,American Samoa,American Samoa,2020,55191,-0.22,-121,276,200,,N.A.,N.A.,88,0.0,Worldometer (www.Worldometers.info)Elaboration...
4,203,Andorra,Andorra,2020,77265,0.16,123,164,470,,N.A.,N.A.,88,0.0,Worldometer (www.Worldometers.info)Elaboration...


In [97]:
country_demographics_all = pd.merge(country_populations, country_locations, how="left", on="Country")
print(country_demographics_all.columns)
print("All Countries")
demographics_countries = country_demographics_all["Country"].sort_values(ascending = True).unique()
print(demographics_countries)
country_demographics_all["Country"] = country_demographics_all["Country"].astype(str)
print("Configured Countries")
print(configured_countries)
is_configured_demographics = country_demographics_all.Country.isin(configured_countries)
country_demographics = country_demographics_all[is_configured_demographics].copy()
print(country_demographics["Country"])
country_demographics.head()

Index(['Rank', 'Country', 'Country Link', 'Year', 'Population 2020',
       'Annual % Change', 'Net Change', 'Density (P/Km²)', 'Land Area (Km²)',
       'Migrants (net)', 'Fertility Rate', 'Median Age', 'Urban Population  %',
       'World Share %', 'Population Data Source', 'Population Region',
       'Latitude', 'Longitude'],
      dtype='object')
All Countries
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Anguilla' 'Antigua & Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados'
 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia & Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia'
 'Cameroon' 'Canada' 'Caribbean Netherlands' 'Cayman Islands'
 'Central African Republic' 'Chad' 'Channel Islands' 'Chile' 'China'
 'Colombia' 'Comoros' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba'
 'Curaçao' 'Cypr

Unnamed: 0,Rank,Country,Country Link,Year,Population 2020,Annual % Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fertility Rate,Median Age,Urban Population %,World Share %,Population Data Source,Population Region,Latitude,Longitude
0,37,Afghanistan,Afghanistan,2020,38928346,2.33,886592,60,652860,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,
1,140,Albania,Albania,2020,2877797,-0.11,-3120,105,27400,-14000.0,1.6,36,63,0.04,Worldometer (www.Worldometers.info)Elaboration...,Europe,41.153332,20.168331
2,33,Algeria,Algeria,2020,43851044,1.85,797990,18,2381740,-10000.0,3.1,29,73,0.56,Worldometer (www.Worldometers.info)Elaboration...,Middle East and North Africa,28.033886,1.659626
4,203,Andorra,Andorra,2020,77265,0.16,123,164,470,,N.A.,N.A.,88,0.0,Worldometer (www.Worldometers.info)Elaboration...,Europe,42.546245,1.601554
5,44,Angola,Angola,2020,32866272,3.27,1040977,26,1246700,6413.0,5.6,17,67,0.42,Worldometer (www.Worldometers.info)Elaboration...,Sub-Saharan Africa,-11.202692,17.873887


In [85]:
is_configured = c.name.isin(configured_countries)
configured = c[is_configured].copy()
configured.head(-10)

Unnamed: 0,set,name,unit,time,pop_100k,new_cases_orig,new_deaths_orig,new_tests_orig,cap_cum_cases,cap_new_cases,cap_cum_deaths,cap_new_deaths,cap_cum_tests,cap_new_tests,all_cum_cases,all_new_cases,all_cum_deaths,all_new_deaths,all_cum_tests,all_new_tests,pos,region,date,new_cases,new_deaths,new_tests,new_negatives,state,county,all_cum_neg,all_new_neg,hospitalized_currently,hospitalized_cum,source,accessed
6484,country,Afghanistan,AF,02/24/2020,389.28,1,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-24,1,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6680,country,Afghanistan,AF,02/25/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-25,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6876,country,Afghanistan,AF,02/26/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-26,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7072,country,Afghanistan,AF,02/27/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-27,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7268,country,Afghanistan,AF,02/28/2020,389.28,0,0,,0.00256885,0.000366978,0,0,,,1,0.142857,0,0,,,,South Asia,2020-02-28,0,0,,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52471,country,Zimbabwe,ZW,10/09/2020,148.63,43,0,,53.7846,0.130718,1.54074,0.00096116,425.116,,7994,19.4286,229,0.142857,63185,,,Sub-Saharan Africa,2020-10-09,43,0,0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
52673,country,Zimbabwe,ZW,10/10/2020,148.63,16,1,,53.8922,0.120145,1.54747,0.00192232,425.116,,8010,17.8571,230,0.285714,63185,,,Sub-Saharan Africa,2020-10-10,16,1,0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
52875,country,Zimbabwe,ZW,10/11/2020,148.63,1,0,,53.8989,0.118223,1.54747,0.00192232,425.116,,8011,17.5714,230,0.285714,63185,,,Sub-Saharan Africa,2020-10-11,1,0,0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
53077,country,Zimbabwe,ZW,10/12/2020,148.63,10,0,,53.9662,0.118223,1.54747,0.00192232,425.116,,8021,17.5714,230,0.285714,63185,,,Sub-Saharan Africa,2020-10-12,10,0,0,,,,,,,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020


In [87]:
script_order = ["time","region","name","state","county","pop_100k",
                "all_cum_cases","all_cum_neg",
                "hospitalized_currently","hospitalized_cum",
                "all_cum_deaths","new_deaths",
                "all_new_neg","new_cases","new_tests",
                "source","accessed"]
column_names = ["Date","Region","Country","State","County","Pop_100k",
                "Positive Total", "Negative Total",
                "Hospitalized Currently", "Hospitalized Cumulative",
                "Deaths Total","Death Daily",
                "Negative Daily","Positive Daily","Tests Daily",
                "source","accessed"]
integer_output = ["Positive Total", "Negative Total",
                "Hospitalized Currently", "Hospitalized Cumulative",
                "Deaths Total","Death Daily",
                "Negative Daily","Positive Daily","Tests Daily"]

configured_output = configured[script_order].copy()
configured_output.columns = column_names
configured_output[integer_output] = configured_output[integer_output].apply(lambda x: pd.to_numeric(x, 
                                                                                                    errors='coerce', 
                                                                                                    downcast='integer'))
configured_output = configured_output[configured_output.columns].astype(str)
for i in configured_output.columns:
    configured_output[i] = configured_output[i].apply(lambda x: emptyNan(x))
configured_output.head(-10)

Unnamed: 0,Date,Region,Country,State,County,Pop_100k,Positive Total,Negative Total,Hospitalized Currently,Hospitalized Cumulative,Deaths Total,Death Daily,Negative Daily,Positive Daily,Tests Daily,source,accessed
6484,02/24/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,1,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6680,02/25/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
6876,02/26/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7072,02/27/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
7268,02/28/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52471,10/09/2020,Sub-Saharan Africa,Zimbabwe,,,148.63,7994,,,,229,0,,43,0.0,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
52673,10/10/2020,Sub-Saharan Africa,Zimbabwe,,,148.63,8010,,,,230,1,,16,0.0,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
52875,10/11/2020,Sub-Saharan Africa,Zimbabwe,,,148.63,8011,,,,230,0,,1,0.0,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020
53077,10/12/2020,Sub-Saharan Africa,Zimbabwe,,,148.63,8021,,,,230,0,,10,0.0,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020


In [88]:
# Merge input data and demographics
merged_input = pd.merge(configured_output, country_demographics, how="left", on="Country")
print_column_unique(merged_input["Region"])
print(merged_input.dtypes)
merged_input.head()

Column Values:
['Central Asia' 'Europe' 'Latin America' 'Middle East and North Africa'
 'South Asia' 'Sub-Saharan Africa']
Date                        object
Region                      object
Country                     object
State                       object
County                      object
Pop_100k                    object
Positive Total              object
Negative Total              object
Hospitalized Currently      object
Hospitalized Cumulative     object
Deaths Total                object
Death Daily                 object
Negative Daily              object
Positive Daily              object
Tests Daily                 object
source                      object
accessed                    object
Rank                       float64
Country Link                object
Year                       float64
Population 2020            float64
Annual % Change            float64
Net Change                 float64
Density (P/Km²)            float64
Land Area (Km²)            float64
Mi

Unnamed: 0,Date,Region,Country,State,County,Pop_100k,Positive Total,Negative Total,Hospitalized Currently,Hospitalized Cumulative,Deaths Total,Death Daily,Negative Daily,Positive Daily,Tests Daily,source,accessed,Rank,Country Link,Year,Population 2020,Annual % Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fertility Rate,Median Age,Urban Population %,World Share %,Population Data Source,Population Region,Latitude,Longitude
0,02/24/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,1,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020,37.0,Afghanistan,2020.0,38928346.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,
1,02/25/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020,37.0,Afghanistan,2020.0,38928346.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,
2,02/26/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020,37.0,Afghanistan,2020.0,38928346.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,
3,02/27/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020,37.0,Afghanistan,2020.0,38928346.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,
4,02/28/2020,South Asia,Afghanistan,,,389.28,1,,,,0,0,,0,,https://github.com/dsbbfinddx/FINDCov19Tracker...,10/24/2020,37.0,Afghanistan,2020.0,38928346.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18,25,0.5,Worldometer (www.Worldometers.info)Elaboration...,,,


In [89]:
# Write input data
merged_input.to_excel(r'C:\Users\janin\Downloads\configured.xlsx', index = False)

Output New Measures

In [90]:
pgmm_ssa = pd.read_excel(r'C:\Users\janin\Downloads\SSA-Temp.xlsx')
pgmm_ssa.rename(columns = {'County': 'Country'}, inplace = True)
fixCountries(pgmm_ssa["Country"],sub_saharan_african_countries)
conversions = {}
pgmm_ssa["Country"] = pgmm_ssa["Country"].astype(str)
pgmm_ssa["Country"] = pgmm_ssa["Country"].apply(lambda x: fixCountry(x))
print(conversions)
print_column_missing(pgmm_ssa["Country"],sub_saharan_african_countries)
print(pgmm_ssa.dtypes)
pgmm_ssa.to_excel(r'C:\Users\janin\Downloads\ss_africa.xlsx', index = False)
pgmm_ssa[pgmm_ssa["Country"].str.contains("Congo")].head()

{}
Column Values:
['Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cabo Verde'
 'Cameroon' 'Central African Republic' 'Chad' 'Comoros' 'Côte d’Ivoire'
 'Democratic Republic of Congo' 'Equatorial Guinea' 'Eritrea' 'Ethiopia'
 'Gabon' 'Gambia' 'Ghana' 'Guinea' 'Guinea-Bissau' 'Kenya' 'Lesotho'
 'Liberia' 'Madagascar' 'Malawi' 'Mali' 'Mauritania' 'Mauritius'
 'Mozambique' 'Namibia' 'Niger' 'Nigeria' 'Republic of the Congo' 'Rwanda'
 'Senegal' 'Seychelles' 'Sierra Leone' 'Somalia' 'South Africa'
 'South Sudan' 'Sudan' 'Swaziland' 'São Tomé and Príncipe' 'Togo' 'Uganda'
 'Zambia' 'Zimbabwe']
Comparison:
['Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Côte d’Ivoire', 'Democratic Republic of Congo', 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Mozambique', 'Nam

Unnamed: 0,Date,Country,New COVID Cases,Cumulative COVID Cases,7 Day Moving Average New Cases,Rate of Infection,New Deaths,Cumulative Deaths,7 Day Moving Average of Death Rate,Rate of Deaths,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence
22,2020-09-08,Republic of the Congo,0,4891,130.285714,0.0,12,102,5.142857,0.223027,2.421439,0.0,-0.698288,12.186655,0.0
23,2020-09-15,Republic of the Congo,0,4934,6.142857,0.0,0,88,,0.0,0.114169,0.0,0.682357,0.57459,12.965255
24,2020-09-08,Democratic Republic of Congo,59,10292,26.857143,0.06798,0,260,0.142857,0.0,0.030945,0.008559,0.013333,1.817308,3.028069
25,2020-09-15,Democratic Republic of Congo,11,10401,15.571429,0.012674,3,267,1.0,0.003457,0.017941,-0.007901,-0.004115,2.097922,2.672662


In [91]:
pgmm_sa = pd.read_excel(r'C:\Users\janin\Downloads\SouthAsia excel updated 20201008.xlsx')
conversions = {}
pgmm_sa["Country"] = pgmm_sa["Country"].astype(str)
pgmm_sa["Country"] = pgmm_sa["Country"].apply(lambda x: fixCountry(x))
print(conversions)
print_column_missing(pgmm_sa["Country"],south_asia_countries)
print(pgmm_sa.dtypes)
pgmm_sa.to_excel(r'C:\Users\janin\Downloads\south_asia.xlsx', index = False)
pgmm_sa.head()

{}
Column Values:
['Afghanistan' 'Bangladesh' 'Bhutan' 'India' 'Maldives' 'Nepal' 'Pakistan'
 'Sri Lanka']
Comparison:
['Afghanistan', 'Bangladesh', 'Bhutan', 'India', 'Maldives', 'Nepal', 'Pakistan', 'Sri Lanka']
No missing values
No missing values
Country                                       object
Date                                  datetime64[ns]
New COVID Cases                                int64
Cumulative COVID Cases                         int64
7 Day Moving Average New Cases               float64
Rate of Infection                            float64
New Deaths                                     int64
Cumulative Deaths                              int64
7 Day Moving Average of Death Rate           float64
Rate of Deaths                               float64
Speed                                        float64
Acceleration                                 float64
Jerk                                         float64
7 Day Persistence                            float64
dtype: o

Unnamed: 0,Country,Date,New COVID Cases,Cumulative COVID Cases,7 Day Moving Average New Cases,Rate of Infection,New Deaths,Cumulative Deaths,7 Day Moving Average of Death Rate,Rate of Deaths,Speed,Acceleration,Jerk,7 Day Persistence
0,Afghanistan,2020-09-22,22,39096,40.142857,0.057831,1,1445,2.714286,0.002629,0.105523,-0.007886,0.001878,1.09
1,Bangladesh,2020-09-22,1557,352178,1588.857143,0.954944,28,5007,29.285714,0.017173,0.974483,-0.014632,-0.005082,1.17
2,Bhutan,2020-09-22,0,261,2.142857,0.0,0,0,0.0,0.0,0.280812,-0.018721,-0.018721,1.1
3,India,2020-09-22,0,5562663,77472.0,0.0,0,88935,981.285714,0.0,5.669716,-0.942224,-0.850995,1.67
4,Maldives,2020-09-22,48,9818,70.0,9.040348,0,34,0.142857,0.0,13.183841,-0.995515,-0.349775,2.46


In [92]:
pgmm_la = pd.read_excel(r'C:\Users\janin\Downloads\LatinAmerica.xlsx')
conversions = {}
pgmm_la["Country"] = pgmm_la["Country"].astype(str)
pgmm_la["Country"] = pgmm_la["Country"].apply(lambda x: fixCountry(x))
print(conversions)
print_column_missing(pgmm_la["Country"],latin_american_countries)
print(pgmm_la.dtypes)
pgmm_la.to_excel(r'C:\Users\janin\Downloads\Latin_America_cleaned.xlsx', index = False)
pgmm_la.head()

{}
Column Values:
['Antigua & Barbuda' 'Argentina' 'Bahamas' 'Barbados' 'Belize' 'Bolivia'
 'Brazil' 'Chile' 'Colombia' 'Costa Rica' 'Cuba' 'Dominica'
 'Dominican Republic' 'Ecuador' 'El Salvador' 'Grenada' 'Guatemala'
 'Guyana' 'Haiti' 'Honduras' 'Jamaica' 'Mexico' 'Panama' 'Paraguay' 'Peru'
 'St. Kitts & Nevis' 'St. Lucia' 'St. Vincent & Grenadines' 'Suriname'
 'Trinidad & Tobago' 'Uruguay' 'Venezuela']
Comparison:
['Antigua & Barbuda', 'Aruba', 'Bahamas', 'Barbados', 'Bermuda', 'British Virgin Islands', 'Cayman Islands', 'Cuba', 'Curacao', 'Dominica', 'Dominican Republic', 'Grenada', 'Haiti', 'Jamaica', 'Puerto Rico', 'St. Kitts & Nevis', 'St. Lucia', 'St. Vincent & Grenadines', 'Sint Maarten', 'Trinidad & Tobago', 'Turks and Caicos Islands', 'United States Virgin Islands', 'Argentina', 'Belize', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Costa Rica', 'Ecuador', 'El Salvador', 'Guatemala', 'Guyana', 'Honduras', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Suriname', 'Urugua

Unnamed: 0,Date,Country,New Cases,Cumulative Cases,7 Day Moving Average,Infection Rate,Deaths,Cumulative Deaths,7 Day Moving Average.1,Death Rate,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence
0,2020-10-06,Antigua & Barbuda,0,107,0.857143,0.0,0,3,0.0,0.0,0.882579,-1.268826e-16,1.268826e-16,-0.430702,0.065819
1,2020-10-13,Antigua & Barbuda,0,111,0.571429,0.0,0,3,0.0,0.0,0.588386,0.0,-6.344132000000001e-17,-0.436498,0.197754
2,2020-10-06,Argentina,14740,824468,12551.285714,32.800228,359,21827,758.285714,0.798866,27.929785,0.4014992,0.5811089,0.094288,23.480881
3,2020-10-13,Argentina,13305,917035,13223.857143,29.60699,386,24572,392.142857,0.858948,29.426427,-0.4561768,0.0899638,0.140674,24.457158
4,2020-10-06,Bahamas,107,4559,93.714286,27.472386,4,100,1.285714,1.027005,24.061262,1.540508,-0.03667875,-0.004373,13.749764


In [93]:
pgmm_ca = pd.read_excel(r'C:\Users\janin\Downloads\CentralAsia-Results.xlsx')
pgmm_ca["Country"] = pgmm_ca["Country"].astype(str)
pgmm_ca["Country"] = pgmm_ca["Country"].apply(lambda x: "Central Asia" if x == "Region" else x)
print_column_missing(pgmm_ca["Country"],central_asian_countries)
print(pgmm_ca.dtypes)
pgmm_ca.to_excel(r'C:\Users\janin\Downloads\Central_Asia_cleaned.xlsx', index = False)
pgmm_ca.head()

Column Values:
['Armenia' 'Azerbaijan' 'Central Asia' 'Cyprus' 'Georgia' 'Kazakhstan'
 'Kosovo' 'Kyrgyzstan' 'North Macedonia' 'Russia' 'Tajikistan' 'Turkey'
 'Uzbekistan']
Comparison:
['Armenia', 'Azerbaijan', 'Cyprus', 'Faeroe Islands', 'Georgia', 'Gibraltar', 'Kazakhstan', 'Kosovo', 'Kyrgyzstan', 'North Macedonia', 'Russia', 'Tajikistan', 'Turkey', 'Turkmenistan', 'Uzbekistan']
Column values not in comparison:
['Central Asia']
Comparison values not in column:
['Faeroe Islands', 'Gibraltar', 'Turkmenistan']
Date                        int64
Country                    object
New Cases                   int64
Cumulative Cases            int64
7 Day Moving Average      float64
Infection Rate            float64
Deaths                      int64
Cumulative Deaths           int64
7 Day Moving Average.1    float64
Death Rate                float64
Speed                     float64
Acceleration              float64
Jerk                      float64
1 Day Persistence         float64
7 Day Per

Unnamed: 0,Date,Country,New Cases,Cumulative Cases,7 Day Moving Average,Infection Rate,Deaths,Cumulative Deaths,7 Day Moving Average.1,Death Rate,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence
0,44110,Armenia,406,53083,454.571429,13.726739,6,990,4.571429,0.202858,15.368924,0.381567,0.347757,16.115213,-0.554776
1,44117,Armenia,745,57566,640.428571,25.188227,6,1032,6.0,0.202858,21.652698,1.637356,0.724494,21.521573,-0.790196
2,44110,Azerbaijan,143,40931,116.0,1.426673,2,600,1.428571,0.019953,1.157301,0.121146,0.037056,1.11413,-0.0543
3,44117,Azerbaijan,277,42381,207.142857,2.763556,3,612,1.714286,0.02993,2.06661,0.190983,0.155352,2.016774,-0.059503
4,44110,Cyprus,29,1876,19.0,2.41954,1,23,0.142857,0.083432,1.585216,-0.011919,-0.083432,1.717325,-0.076602


In [106]:
pgmm_eu = pd.read_excel(r'C:\Users\janin\Downloads\Europe-Results-Updated.xlsx')
pgmm_eu["Country"] = pgmm_eu["Country"].astype(str)
pgmm_eu["Country"] = pgmm_eu["Country"].apply(lambda x: "Europe" if x == "Region" else x)
print_column_missing(pgmm_eu["Country"],european_countries)
print(pgmm_eu.dtypes)
pgmm_eu.to_excel(r'C:\Users\janin\Downloads\Europe_cleaned.xlsx', index = False)
pgmm_eu.head()

Column Values:
['Austria' 'Belarus' 'Belgium' 'Bulgaria' 'Croatia' 'Czech Republic'
 'Denmark' 'Estonia' 'Europe' 'Finland' 'France' 'Germany' 'Greece'
 'Hungary' 'Iceland' 'Ireland' 'Italy' 'Latvia' 'Lithuania' 'Luxembourg'
 'Malta' 'Netherlands' 'Norway' 'Poland' 'Portugal' 'Romania' 'Serbia'
 'Slovakia' 'Slovenia' 'Spain' 'Sweden' 'Switzerland' 'Ukraine'
 'United Kingdom']
Comparison:
['Albania', 'Andorra', 'Austria', 'Belarus', 'Belgium', 'Bosnia & Herzegovina', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Greenland', 'Hungary', 'Iceland', 'Ireland', 'Isle of Man', 'Italy', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom', 'Vatican City']
Column values not in comparison:
['Europe']
Comparison values not in co

Unnamed: 0,Date,Country,New Cases,Cumulative Cases,7 Day Moving Average,Infection Rate,Deaths,Cumulative Deaths,7 Day Moving Average.1,Death Rate,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence
0,2020-10-07,Austria,549,50435,832.571429,6.184475,22,840,6.285714,0.24783,9.378902,-0.278406,-1.644687,3.368949,7.970965
1,2020-10-14,Austria,1171,57762,1046.714286,13.191294,10,879,5.571429,0.11265,11.791218,1.000974,2.063101,3.764173,9.863012
2,2020-10-07,Belarus,394,81090,404.285714,4.161889,6,868,5.714286,0.063379,4.270538,0.120722,0.024144,1.447662,3.421389
3,2020-10-14,Belarus,526,84524,490.571429,5.556227,5,906,5.428571,0.052816,5.181989,0.199191,0.104123,1.738248,4.49097
4,2020-10-07,Belgium,5686,146382,3434.285714,49.512128,18,10108,13.285714,0.156739,29.904818,3.368646,0.299795,9.257136,16.67917


In [101]:
pgmm_me = pd.read_excel(r'C:\Users\janin\Downloads\Middle East Output.xlsx')
pgmm_me["Country"] = pgmm_me["Country"].astype(str)
pgmm_me["Country"] = pgmm_me["Country"].apply(lambda x: "Middle East and North Africa" if x == "Region" else x)
print_column_missing(pgmm_me["Country"],middle_east_and_north_africa_countries)
print_column_missing(pgmm_me["Country"],demographics_countries)
print(pgmm_me.dtypes)
pgmm_me.to_excel(r'C:\Users\janin\Downloads\Middle_East_North_Africa_cleaned.xlsx', index = False)
pgmm_me.head()

Column Values:
['Algeria' 'Bahrain' 'Djibouti' 'Egypt' 'Iran' 'Iraq' 'Israel' 'Jordan'
 'Lebanon' 'Libya' 'Middle East and North Africa' 'Morocco' 'Oman' 'Qatar'
 'Saudi Arabia' 'Tunisia' 'United Arab Emirates']
Comparison:
['Bahrain', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Oman', 'Qatar', 'Saudi Arabia', 'Syria', 'United Arab Emirates', 'Yemen', 'Algeria', 'Djibouti', 'Egypt', 'Libya', 'Morocco', 'Tunisia']
Column values not in comparison:
['Middle East and North Africa']
Comparison values not in column:
['Kuwait', 'Syria', 'Yemen']
Column Values:
['Algeria' 'Bahrain' 'Djibouti' 'Egypt' 'Iran' 'Iraq' 'Israel' 'Jordan'
 'Lebanon' 'Libya' 'Middle East and North Africa' 'Morocco' 'Oman' 'Qatar'
 'Saudi Arabia' 'Tunisia' 'United Arab Emirates']
Comparison:
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Anguilla' 'Antigua & Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados'
 'Be

Unnamed: 0,Date,Country,New Cases,Cumulative Cases,7 Day Moving Average,Infection Rate,Deaths,Cumulative Deaths,7 Day Moving Average.1,Death Rate,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence
0,2020-10-11,Algeria,132,53072,133.714286,0.306598,6,1801,5.857143,0.013936,0.31058,-0.002986,0.000995,0.055256,0.202084
1,2020-10-18,Algeria,199,54402,190.0,0.46222,10,1856,7.857143,0.023227,0.441316,0.022232,-0.000664,0.07385,0.176942
2,2020-10-11,Bahrain,327,75614,421.714286,19.924785,2,275,2.142857,0.121864,25.695923,-0.217615,0.478752,4.566401,16.370049
3,2020-10-18,Bahrain,331,77902,326.857143,20.168514,7,300,3.571429,0.426524,19.916081,0.034818,1.610347,3.503413,14.639317
4,2020-10-11,Djibouti,0,5423,0.571429,0.0,0,61,0.0,0.0,0.058695,-0.014674,0.0,0.012929,0.083598


In [None]:
pgmm_ep = pd.read_excel(r'C:\Users\janin\Downloads\East Asia and Pacific output.xlsx')
pgmm_ep["Country"] = pgmm_ep["Country"].astype(str)
pgmm_ep["Country"] = pgmm_ep["Country"].apply(lambda x: "East Aisa and Pacific" if x == "Region" else x)
print_column_missing(pgmm_ep["Country"],east_asia_and_pacific_countries)
print_column_missing(pgmm_ep["Country"],demographics_countries)
print(pgmm_ep.dtypes)
pgmm_ep.to_excel(r'C:\Users\janin\Downloads\East_Asia_Pacific_cleaned.xlsx', index = False)
pgmm_ep.head()

In [107]:
all_configured = pd.concat([pgmm_ssa, pgmm_sa], sort=False)
all_configured = pd.concat([all_configured, pgmm_la], sort=False)
all_configured = pd.concat([all_configured, pgmm_ca], sort=False)
all_configured = pd.concat([all_configured, pgmm_eu], sort=False)
all_configured = pd.concat([all_configured, pgmm_me], sort=False)
all_configured = pd.concat([all_configured, pgmm_pe], sort=False)
print_column_missing(all_configured["Country"],configured_countries)
all_configured.head()

Column Values:
['Afghanistan' 'Algeria' 'Angola' 'Antigua & Barbuda' 'Argentina'
 'Armenia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Botswana' 'Brazil' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde'
 'Cameroon' 'Central African Republic' 'Central Asia' 'Chad' 'Chile'
 'Colombia' 'Comoros' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus'
 'Czech Republic' 'Côte d’Ivoire' 'Democratic Republic of Congo' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Europe'
 'Finland' 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Iran' 'Iraq' 'Ireland' 'Israel'
 'Italy' 'Jamaica' 'Jordan' 'Kazakhstan' 'Kenya' 'Kosovo' 'Kyrgyzstan'
 'Latvia' 'Lebanon' 'Lesotho' 'Liberia' 'Libya' 'Lithuania' 'Luxembourg'
 'Madagascar' 'M

Unnamed: 0,Date,Country,New COVID Cases,Cumulative COVID Cases,7 Day Moving Average New Cases,Rate of Infection,New Deaths,Cumulative Deaths,7 Day Moving Average of Death Rate,Rate of Deaths,Speed,Acceleration,Jerk,1 Day Persistence,7 Day Persistence,New Cases,Cumulative Cases,7 Day Moving Average,Infection Rate,Deaths,7 Day Moving Average.1,Death Rate
0,2020-09-08 00:00:00,Angola,52.0,3033.0,43.428571,0.163392,4.0,124,2.142857,0.012569,0.136459,-0.010324,-0.00404,4.369557,6.340465,,,,,,,
1,2020-09-15 00:00:00,Angola,130.0,3569.0,76.571429,0.40848,3.0,139,2.142857,0.009426,0.240599,0.035013,0.019302,6.120053,4.321752,,,,,,,
2,2020-09-08 00:00:00,Benin,0.0,2213.0,9.714286,0.0,0.0,40,0.0,0.0,0.082316,0.0,0.0,0.908654,0.426489,,,,,,,
3,2020-09-15 00:00:00,Benin,0.0,2267.0,7.714286,0.0,0.0,40,0.0,0.0,0.065369,0.0,0.0,0.721578,0.966708,,,,,,,
4,2020-09-08 00:00:00,Botswana,0.0,2126.0,57.428571,0.0,0.0,9,0.428571,0.0,2.492887,-0.56431,-1.333261,6.587742,2.303039,,,,,,,


In [108]:
all_configured.to_excel(r'C:\Users\janin\Downloads\all_configured_regions.xlsx', index = False)