In [1]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta, date
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)

now = datetime.now()
current_date = now.strftime('%Y-%m-%d')
print(current_date)
file_date = "2021-01-25"
file_end = " -Results- " + current_date +" .csv"
output_file_end = " -Results- " + file_date +" .csv"
statistics_file_end = " - GMM - " + file_date + " .txt"
print(file_end)

#Folders and Files

# Folders
repositoryFolder = "D:/Repositories/Global-COVID-Surveillance/data/"
localDownloadFolder = "C:/Users/janin/Downloads/"
demographicsFolder = repositoryFolder + "raw/demographics/"
configuredFolder = repositoryFolder + "configured/"
cleanedFolder = repositoryFolder + "cleaned/"
regionsFolder = repositoryFolder + "raw/regions/"
locationsFolder = repositoryFolder + "raw/locations/"

rFolder = localDownloadFolder
dataFolder = repositoryFolder

# Population Input File
all_populations_file = cleanedFolder + "all_populations.xlsx"
us_codes = demographicsFolder + "US State Codes.xlsx"

# Location Input
locations_file = demographicsFolder + "Country Geo.xlsx"

#Sources
population_source_url ="https://www.worldometers.info/world-population/population-by-country/"
canada_population_source = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000901" # Statistics Canada Quarterly Population
us_population_source = "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/asrh/sc-est2019-agesex-civ.csv"

github_url="https://github.com/dsbbfinddx/FINDCov19TrackerData/blob/master/processed/data_all.csv?raw=true"
owid_data = R"https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.xlsx?raw=true"
country_codes_coordinates = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries_codes_and_coordinates.csv"
countries_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries.geo.json"
us_states_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/us-states.geo.json"

kaggle_locations = "https://www.kaggle.com/paultimothymooney/latitude-and-longitude-for-every-country-and-state"

canada_source_csv = "https://opendata.arcgis.com/datasets/3afa9ce11b8842cb889714611e6f3076_0.csv"
us_source_csv = "https://covidtracking.com/data/download/all-states-history.csv"

# Countries and Regions
european_countries = [
    'Albania','Andorra','Austria','Belarus','Belgium','Bosnia & Herzegovina','Bulgaria',
    'Croatia','Czech Republic','Denmark','Estonia','Finland','France',
    'Germany','Greece','Greenland','Hungary','Iceland','Ireland','Isle of Man','Italy',
    'Latvia','Liechtenstein','Lithuania','Luxembourg','Malta','Moldova','Monaco','Montenegro',
    'Netherlands','Norway','Poland','Portugal','Romania',
    'San Marino','Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland',
    'Ukraine','United Kingdom','Vatican City'
]
carribean_countries = [
    "Antigua & Barbuda","Aruba","Bahamas","Barbados","Bermuda","British Virgin Islands",
    "Cayman Islands","Cuba","Curacao","Dominica","Dominican Republic","Grenada",
    "Haiti","Jamaica","Puerto Rico",
    "St. Barthelemy","St. Kitts & Nevis","St. Lucia","St. Vincent & Grenadines",
    "Sint Maarten","Trinidad & Tobago","Turks and Caicos Islands","United States Virgin Islands"
]
central_south_america_countries = [
    'Argentina','Belize','Bolivia','Brazil','Chile','Colombia','Costa Rica',
    'Ecuador','El Salvador','Guatemala','Guyana','Honduras',
    'Mexico','Nicaragua','Panama','Paraguay','Peru','Suriname','Uruguay','Venezuela'
]
latin_american_countries = carribean_countries + central_south_america_countries
sub_saharan_african_countries = [
    "Angola","Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Côte d’Ivoire",
    "Democratic Republic of Congo","Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau","Kenya","Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria","Republic of the Congo","Rwanda",
    "São Tomé and Príncipe","Senegal","Seychelles","Sierra Leone",
    "Somalia","South Africa","South Sudan","Sudan","Swaziland",
    "Tanzania","Togo","Uganda","Zambia","Zimbabwe"
]
south_asia_countries = [
    "Afghanistan","Bangladesh","Bhutan","India","Maldives","Nepal","Pakistan","Sri Lanka"
]
central_asian_countries = [
    'Armenia','Azerbaijan','Cyprus','Faeroe Islands','Georgia','Gibraltar','Kazakhstan','Kosovo','Kyrgyzstan',
    'North Macedonia','Russia','Tajikistan','Turkey','Turkmenistan','Uzbekistan'
]
east_asian_countries = [
    "Brunei","Cambodia","China","Indonesia","Japan","Laos","Malaysia","Mongolia","Myanmar","Niue","North Korea","Philippines",
    "Singapore","South Korea","Taiwan","Thailand","Timor","Vietnam"
]
pacific_countries = [
    "Australia","Cook Islands","Fiji","French Polynesia","Guam","Kiribati",
    "Marshall Islands","Micronesia","Nauru","New Caledonia","New Zealand",
    "Northern Mariana Islands","Palau","Papua New Guinea","Samoa","Solomon Islands","Tonga","Tuvalu","Vanuatu"
]
east_asia_and_pacific_countries = east_asian_countries + pacific_countries
middle_eastern_countries = [
    "Bahrain","Iran","Iraq","Israel","Jordan","Kuwait","Lebanon","Oman","Qatar",
    "Saudi Arabia","Syria","State of Palestine",
    "United Arab Emirates","Yemen"
]
north_african_countries = [
    "Algeria","Djibouti","Egypt","Libya","Morocco","Tunisia","Western Sahara"
]
middle_east_and_north_africa_countries = middle_eastern_countries + north_african_countries 
north_american_countries = ["Canada","United States"]
configured_country_lists = [
    european_countries,
    latin_american_countries,
    sub_saharan_african_countries,
    south_asia_countries,
    central_asian_countries,
    middle_east_and_north_africa_countries,
    east_asia_and_pacific_countries
]
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries
}

configured_countries = []
for country_list in configured_country_lists:
    for country in country_list:
        configured_countries.append(country)
configured_countries.sort()
all_countries = configured_countries +  north_american_countries
all_countries.sort()

configured_regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa'
]

regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa',
    'Territory'
]

unincorporated_disputed_territories = [
    "American Samoa", "Anguilla","Caribbean Netherlands","Channel Islands","Curaçao",
    "Falkland Islands","French Guiana","Guadeloupe","Hong Kong","International"
]

countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries,
    'Territory': unincorporated_disputed_territories
}

census_regions = {
    0: {"name" : "United States",
        "states" : ["United States"]},
    1: {"name" : "Northeast",
        "states" :["Connecticut", "Maine", "New Hampshire", "Vermont", "Massachusetts", 
                   "Rhode Island", "New Jersey", "New York", "Pennsylvania"]},
    3: {"name" : "South",
        "states" : ["Maryland", "Delaware", "West Virginia", "Virginia", "Kentucky", 
                    "Tennessee", "North Carolina", "South Carolina", "Georgia", "Florida", 
                    "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas", 
                    "District of Columbia", "Puerto Rico"]},
    2: {"name" : "Midwest",
        "states" : ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", 
                    "Minnesota", "Wisconsin", "Illinois", "Michigan", "Indiana", "Ohio"]},
    4: {"name" : "West",
        "states" : ["Washington", "Idaho", "Montana", "Wyoming", "Oregon", "California", "Nevada", 
                    "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]}
}

us_states = [
    'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','District of Columbia',
    'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
    'Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
    'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota',
    'Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
    'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'
]
canada_provinces = [
    'Alberta','British Columbia','Manitoba','New Brunswick','Newfoundland and Labrador','Northwest Territories',
    'Nova Scotia','Ontario','Prince Edward Island','Quebec','Saskatchewan','Yukon'
]
states_and_provinces = us_states + canada_provinces

country_conversions = {
    "Antigua & Barbuda": ["Antigua and Barbuda"],
    "Bahamas": ["Bahamas, The"],
    "Bosnia & Herzegovina": ["Bosnia and Herzegovina"],
    "Brunei": ["Brunei Darussalam"],
    "Cabo Verde": ["Cape Verde"],
    "Côte d’Ivoire": ["Cote d'Ivoire","Cote dIvoire"],
    "Czech Republic": ["Czechia","Czech Republic (Czechia)"],
    "Democratic Republic of Congo": ["Congo - Kinshasa"],
    "Egypt": ["Egypt, Arab Rep."],
    "Faeroe Islands": ["Faroe Islands"],
    "Gambia": ["Gambia, The"],
    "Hong Kong": ["Hong Kong SAR China"],
    "Iran": ["Iran, Islamic Rep."],
    "Kyrgyzstan": ["Kyrgyz Republic"],
    "Laos": ["Lao PDR"],
    "Micronesia": ["Micronesia, Fed. Sts.","Micronesia (country)"],
    "Myanmar": ["Myanmar (Burma)","Burma"],
    "North Macedonia": ["Macedonia"],
    "State of Palestine": ["Palestinian Territories","Palestine"],
    "Republic of the Congo": ["Congo - Brazzaville","Congo"],
    "Russia": ["Russian Federation"],
    "São Tomé and Príncipe": ["Sao Tome and Principe","Sao Tome & Príncipe","São Tomé & Príncipe"],
    "Sint Maarten": ["Sint Maarten (Dutch part)"],
    "Slovakia": ["Slovak Republic"],
    "St. Kitts & Nevis": ["Saint Kitts and Nevis"],
    "St. Lucia": ["Saint Lucia"],
    "St. Vincent & Grenadines": ["Saint Vincent and the Grenadines"],
    "Swaziland": ["Eswatini"],
    "Syria": ["Syrian Arab Republic"],
    "Timor": ["Timor-Leste"],
    "Trinidad & Tobago": ["Trinidad and Tobago"],
    "Vatican City": ["Holy See","Vatican"],
    "Yemen": ["Yemen, Rep."],
    "" : ["nan"]
}
us_territories =['American Samoa','Commonwealth of the Northern Mariana Islands','Guam','Puerto Rico','U.S. Virgin Islands']
territories = {
    "American Samoa":{"Region":"North America","Country":"United States"}, 
    "Anguilla":{"Region":"Europe","Country":"United Kingdom"},
    "Caribbean Netherlands":{"Region":"Europe","Country":"Netherlands"},
    "Channel Islands":{"Region":"Europe","Country":"Channel Islands"},
    "Curaçao":{"Region":"Europe","Country":"Netherlands"},
    "Falkland Islands":{"Region":"Europe","Country":"United Kingdom"},
    "French Guiana":{"Region":"Europe","Country":"France"},
    "Guadeloupe":{"Region":"Europe","Country":"France"},
    "Hong Kong":{"Region":"East Asia and Pacific","Country":"China"},
    "International":{"Region":"World","Country":""},
    "Macao":{"Region":"East Asia and Pacific","Country":"China"},
    "Martinique":{"Region":"Europe","Country":"France"},
    "Mayotte":{"Region":"Europe","Country":"France"},
    "Montserrat":{"Region":"Europe","Country":"United Kingdom"},
    "Réunion":{"Region":"Europe","Country":"France"},
    "St. Helena":{"Region":"Europe","Country":"United Kingdom"},
    "St. Martin":{"Region":"Europe","Country":"France"},
    "St. Pierre & Miquelon":{"Region":"Europe","Country":"France"},
    "Tokelau":{"Region":"East Asia and Pacific","Country":"New Zealand"},
    "Turks and Caicos":{"Region":"Europe","Country":"United Kingdom"},
    "U.S. Virgin Islands":{"Region":"North America","Country":"United States"},
    "Wallis & Futuna":{"Region":"Europe","Country":"France"},
    "World":{"Region":"World","Country":""}
}


2021-01-30
 -Results- 2021-01-30 .csv


In [2]:
def titleCase(words):
    if len(words) > 3:
        titlecased = ""
        wordsArray = words.lower().split(" ")
        for word in wordsArray:
            if len(titlecased) > 0 :
                titlecased = titlecased + " "
            if word == "and":
                titlecased = titlecased + "and"
            else:
                titlecased = titlecased + word.capitalize()
        return titlecased
    else:
        return words.upper()

def fixRegion(code):
    region_name = ""
    for region in census_regions:
        if region["number"] == code:
            region_name = region["name"]
            break
    if region_name == "":
        region_name = "Other"
        print(str(code) + " not found")
    return region_name

def checkRegions(regionColumn, countryColumn):
    fixed = []
    for i in range(0,len(regionColumn)):
        region = regionColumn[i]
        country = countryColumn[i]
        if (not (region in regions)) and (not (country in fixed)):
            fixed.append(country)
            print(f"{country} = {region}")

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
def getAgeRange(age):
    age_range = ""
    if age == 0:
        age_range = "< 1"
    elif age == 999:
        age_range = "Total"
    elif age < 5:
        age_range = "1-4"
    elif age < 15:
        age_range = "5-14"
    elif age < 25:
        age_range = "15-24"
    elif age < 35:
        age_range = "25-34"
    elif age < 45:
        age_range = "35-44"
    elif age < 55:
        age_range = "45-54"
    elif age < 65:
        age_range = "55-64"
    elif age < 75:
        age_range = "65-74"
    elif age < 85:
        age_range = "75-84"
    elif age == 85:
        age_range = "85+"
    return age_range

def fixSex(code):
    sex = ""
    if code == 0:
        sex = "Population 2019"
    elif code == 1:
        sex = "Male"
    elif code == 2:
        sex = "Female"
    else:
        print(str(code) + " is not a sex")
    return sex

def us_date(x):
    month = x[5:7]
    day = x[8:11]
    year = x[0:4]
    conversion = month + "/" + day +"/"+ year
    return conversion

def removeDecimal(data):
    strData = str(data)
    decimalLocation = strData.find(".")
    if decimalLocation > -1:
        return strData[0:decimalLocation]
    else:
        return strData

def emptyNan(value):
    if (value == "nan"):
        return ""
    else:
        return value

def printColumns(df, label):
    print(label)
    print(df.columns)

def print_column_unique(column):
    print("Column Values:")
    values = list(set(sorted(column,key=lambda v: (isinstance(v, str), v))))
    print(values)
    return values

def print_column_missing(column, comparison):
    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)
    missing_values = []
    for value in values:
        if not value in comparison:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Column values not in comparison:")
        print(missing_values)
    else:
        print("No missing values")
    missing_values = []
    for value in comparison:
        if not value in values:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Comparison values not in column:")
        print(missing_values)
    else:
        print("No missing values")
    return values

def division(a,b):
    if not (b == 0):
        return a/b
    else:
        return np.nan
    
def key_from_value(value, dictionary, default):
    return_value = default
    for key, values in dictionary.items():
        if value.strip() in values:
            return_value = key
            break
    return return_value.strip()

def region_from_country(country):
    return key_from_value(country, countries_by_region, "")

def fixTerritoryRegion(territory):
    characteristics = territories[territory]
    return characteristics["Region"]

def fixCountry(value):
    return key_from_value(value, country_conversions, value)

def fixCountries(countries_column, configuredCountries):
    countries_conversion = countries_column.astype(str)
    countries_conversion = countries_conversion.apply(lambda x: fixCountry(x))
    print(conversions)
    countries = print_column_missing(countries_conversion,configuredCountries)
    return countries_conversion

def checkCountries(column):
    fixed = []
    for value in column:
        fixedValue = fixCountry(value)
        if (not (fixedValue == value)) and (not (value in fixed)):
            fixed.append(value)
            print(f"{value} => {fixedValue}")
    if len(fixed) == 0:
        print("No countries need to be fixed.")

def testConversion(title, test_array, conversion):
    print(title)
    no_conversions = []
    for value in test_array:
        return_value = ""
        if conversion == "country":
            return_value = fixCountry(value)
        elif conversion == "region":
            return_value = region_from_country(fixCountry(value))
        if return_value != value.strip():
            print(value.strip() + "," + return_value)
        if return_value == "":
            no_conversions.append(value)
    if len(no_conversions) > 0:
        print("Missing Conversions")
        print(no_conversions)
    print("")
    
def division(a,b):
    if not (b == 0):
        return a/b
    else:
        return np.nan
    
def fixProvince(value):
    province_map = {
        'BC': 'British Columbia',
        'NL': 'Newfoundland and Labrador',
        'NWT': 'Northwest Territories',
        'PEI': 'Prince Edward Island',
        'Repatriated': 'Repatriated Canada',
        'Repatriated Cdn': 'Repatriated Canada'
    }
    value = titleCase(value)
    if value in province_map.keys():
        new_province = province_map[value]
        return new_province
    else:
        return value

def censusRegionByState(state):
    for key in census_regions.keys():
        if (state in census_regions[key]["states"]):
            return census_regions[key]["name"]
        
def fixUSRegion(code):
    region = census_regions[code]
    region_name = region["name"]
    return region_name

def convertDateToExcel(dayString) :
    temp = datetime(1899, 12, 30)
    day = datetime.strptime(dayString, '%m/%d/%Y')
    delta = day - temp
    return float(delta.days) + (float(delta.seconds) / 86400)

def firstDay(dayString):
    dt = datetime.strptime(dayString, '%m/%d/%Y')
    firstDate = dt - timedelta(days=dt.weekday())
    return firstDate.strftime('%m/%d/%Y')

In [3]:
all_populations = pd.read_excel(cleanedFolder + "all_populations.xlsx")
population_groups = ["Level","Region","Census Region","Country","State/Province"]
for group in population_groups:
    all_populations[group] = all_populations[group].astype(str)
    all_populations[group] = all_populations[group].apply(lambda x: "" if x=="nan" else x)
print(all_populations.columns)
all_populations.head()

Index([                  'Level',                  'Region',
                       'Country',           'Census Region',
                'State/Province',              'Population',
               'Population 100K',      'Country Population',
       'Country Population 100K',       'Region Population',
       ...
                       'Pct 1-4',                'Pct 5-14',
                     'Pct 15-24',               'Pct 25-34',
                     'Pct 35-44',               'Pct 45-54',
                     'Pct 55-64',               'Pct 65-74',
                     'Pct 75-84',                 'Pct 85+'],
      dtype='object', length=131)


Unnamed: 0,Level,Region,Country,Census Region,State/Province,Population,Population 100K,Country Population,Country Population 100K,Region Population,Region Population 100K,World Population,World Population 100K,Country Share,Region Share,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
0,Country,Central Asia,Armenia,,,2963243,29.63243,,,326887719,3268.87719,7796609105,77966.09105,,0.009065,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Country,Central Asia,Azerbaijan,,,10139177,101.39177,,,326887719,3268.87719,7796609105,77966.09105,,0.031017,0.0013,0.13,56.0,0.91,91459.0,1200.0,123.0,82658.0,2.1,32.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Country,Central Asia,Cyprus,,,1207359,12.07359,,,326887719,3268.87719,7796609105,77966.09105,,0.003693,0.000155,0.02,67.0,0.73,8784.0,5000.0,131.0,9240.0,1.3,37.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Country,Central Asia,Faeroe Islands,,,48863,0.48863,,,326887719,3268.87719,7796609105,77966.09105,,0.000149,6e-06,0.0,43.0,0.38,185.0,,35.0,1396.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Country,Central Asia,Georgia,,,3989167,39.89167,,,326887719,3268.87719,7796609105,77966.09105,,0.012203,0.000512,0.05,58.0,-0.19,-7598.0,-10000.0,57.0,69490.0,2.1,38.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Global input data owid
print(owid_data)
o = pd.io.excel.read_excel(owid_data)
print(o.columns)

o = o.rename(columns={
    "location":"Country",
    "population": "Owid Population",
    "new_tests":"Owid Tests",
    "total_tests": "Owid Total Tests",
    "new_cases": "Owid Cases",
    "new_deaths": "Owid Deaths"
}).reset_index()

o["date"] = o["date"].astype(str)
o["Year"] = o["date"].apply(lambda x: x[0:4])
o["Month"] = o["date"].apply(lambda x: x[5:7])
o["Day"] = o["date"].apply(lambda x: x[8:10])
o["Date"] = o.apply(lambda x: x["Month"] + "/" + x["Day"] + "/" + x["Year"],axis=1)
o["Time"] = o.date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

o["Country"] = o["Country"].astype(str)
print_column_unique(o["Country"])
o["Country"] = o["Country"].apply(lambda x: "" if x=="nan" else x)
o["Country"] = o["Country"].apply(lambda x: fixCountry(x))
checkCountries(o["Country"])

min_date = o["Time"].min()
max_date = o["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))

o_integer_columns = ["Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o[o_integer_columns] = o[o_integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
o_columns = ["Country", "Time", "Date", "Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o = o[o_columns]

o.head()
#o_notnull = o.loc[pd.notnull(o["Owid Tests"])].copy().reset_index().drop(columns=["index"])
#o_notnull.head()

https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.xlsx?raw=true
Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vacc

Unnamed: 0,Country,Time,Date,Owid Population,Owid Tests,Owid Cases,Owid Deaths
0,Afghanistan,2020-02-24,02/24/2020,38928341.0,,1.0,
1,Afghanistan,2020-02-25,02/25/2020,38928341.0,,0.0,
2,Afghanistan,2020-02-26,02/26/2020,38928341.0,,0.0,
3,Afghanistan,2020-02-27,02/27/2020,38928341.0,,0.0,
4,Afghanistan,2020-02-28,02/28/2020,38928341.0,,0.0,


In [5]:
date_check = o.groupby(o["Country"])["Time"].max()
date_check.head(200)

Country
Afghanistan   2021-01-29
Albania       2021-01-29
Algeria       2021-01-29
Andorra       2021-01-29
Angola        2021-01-29
                 ...    
Vietnam       2021-01-29
World         2021-01-29
Yemen         2021-01-29
Zambia        2021-01-29
Zimbabwe      2021-01-29
Name: Time, Length: 193, dtype: datetime64[ns]

In [6]:
# Global input data
github_request=requests.get(github_url).content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

print("c  original columns")
print(c.columns)

print("Sets")
c["set"] = c["set"].astype(str)
sets = print_column_unique(c["set"])
c = c[c["set"] == "country"].copy().rename(columns={
    "name":"Country",
    "unit":"Abbreviation",
    "time":"Time"
}).reset_index()
c = c.drop(columns=["index","set","pop_100k"])

print("Countries")
c["Country"] = c["Country"].astype(str)
c["Country"] = c["Country"].apply(lambda x: "" if x=="nan" else x)
c["Country"] = c["Country"].apply(lambda x: fixCountry(x))
checkCountries(c["Country"])

# Format text date and add datetime for date
c["Time"] = c["Time"].astype(str)
c["Time"] = c["Time"].apply(lambda x: us_date(x))
c["Date"] = c["Time"]
c["Time"] = c.apply(lambda x: pd.to_datetime(x["Date"], format="%m/%d/%Y"), axis=1)
minmax_dates = c.groupby(["Country"]).agg({"Date": [np.min,np.max]})
min_date = c["Time"].min()
max_date = c["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))
c["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)

o_not_international = o.loc[o["Country"]!="International"].copy()
c = o_not_international.merge(c,how="left",on=["Country","Date"])

c["Region"] = c["Country"].apply(lambda x: region_from_country(x))
c["Level"] = c.apply(lambda x: "Territory" if x["Region"] == "" else "World" if x["Region"] == "World" else "Country",axis=1)
c["Region"] = c.apply(lambda x: fixTerritoryRegion(x["Country"]) if x["Region"] == "" else x["Region"],axis=1)
c["State/Province"] = c.apply(lambda x: x["Country"] if x["Level"]=="Territory" else "",axis=1)
c["Country"] = c.apply(lambda x: "" if x["Level"]=="Territory" else x["Country"],axis=1)

# Format numeric columns
numeric_columns = [
    'new_cases_orig','new_deaths_orig','new_tests_orig',
    'cap_cum_cases','cap_new_cases',
    'cap_cum_deaths','cap_new_deaths',
    'cap_cum_tests','cap_new_tests',
    'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
    'all_cum_tests','all_new_tests',
    'pos'
]
float_columns = [
    'cap_cum_cases','cap_new_cases','cap_cum_deaths',
    'cap_new_deaths','cap_cum_tests','cap_new_tests'
]
integer_columns = [
    'new_cases_orig', 'Owid Cases', 'all_cum_cases',
    'new_deaths_orig', 'Owid Deaths', 'all_cum_deaths',
    'new_tests_orig', 'Owid Tests', 'all_cum_tests', 'all_new_tests'
]
c[float_columns] = c[float_columns].apply(pd.to_numeric)
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
c = c.drop(columns=['Time_y'])
c = c.rename(columns ={
    "Time_x":"Time"
})
c["Tests Daily"] = c.apply(
    lambda x: x["Owid Tests"] if (pd.isna(x["new_tests_orig"]) and pd.notna(x["Owid Tests"])) else x["new_tests_orig"],
    axis=1
)
c["Cases Daily"] = c.apply(
    lambda x: x["Owid Cases"] if (pd.isna(x["new_cases_orig"]) and pd.notna(x["Owid Cases"])) else x["new_cases_orig"],
    axis=1
)
c["Deaths Daily"] = c.apply(
    lambda x: x["Owid Deaths"] if (pd.isna(x["new_deaths_orig"]) and pd.notna(x["Owid Deaths"])) else x["new_deaths_orig"],
    axis=1
)
print("c columns after Tests Daily")
print(c.columns)

c_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation',
    'Time', 'Date', 'Accessed',
    'Tests Daily', 'Cases Daily', 'Deaths Daily'
]
c = c[c_order]

c = c.where(c.notnull(), None)

c = c.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"])

c_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 
    'Tests Daily', 'Cases Daily', 'Deaths Daily',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
c = c.sort_values(by=['Region','Country','Time']).reset_index()
print(c.columns)
c = c[c_data_order]

pop_check = c.loc[pd.isnull(c["Population"])]
print_column_unique(pop_check["Country"])

c.head()

c  original columns
Index(['set', 'name', 'unit', 'time', 'cum_tests_orig', 'new_tests_orig',
       'pop_100k', 'new_cases_orig', 'new_deaths_orig', 'cap_cum_cases',
       'cap_new_cases', 'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests',
       'cap_new_tests', 'all_cum_cases', 'all_new_cases', 'all_cum_deaths',
       'all_new_deaths', 'all_cum_tests', 'all_new_tests', 'pos'],
      dtype='object')
Sets
Column Values:
['income', 'region', 'country']
Countries
No countries need to be fixed.
Min: 2020-01-18 00:00:00
Max: 2021-01-29 00:00:00
c columns after Tests Daily
Index(['Country', 'Time', 'Date', 'Owid Population', 'Owid Tests',
       'Owid Cases', 'Owid Deaths', 'Abbreviation', 'cum_tests_orig',
       'new_tests_orig', 'new_cases_orig', 'new_deaths_orig', 'cap_cum_cases',
       'cap_new_cases', 'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests',
       'cap_new_tests', 'all_cum_cases', 'all_new_cases', 'all_cum_deaths',
       'all_new_deaths', 'all_cum_tests', 'all_new_

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Tests Daily,Cases Daily,Deaths Daily,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,Central Asia,Armenia,,AM,2020-03-01,03/01/2020,,1,0,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
1,Country,Central Asia,Armenia,,AM,2020-03-02,03/02/2020,,0,0,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
2,Country,Central Asia,Armenia,,AM,2020-03-03,03/03/2020,,0,0,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
3,Country,Central Asia,Armenia,,AM,2020-03-04,03/04/2020,,0,0,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
4,Country,Central Asia,Armenia,,AM,2020-03-05,03/05/2020,,0,0,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021


In [7]:
date_check = c.groupby(c["Country"])["Time"].max()
date_check.head(200)

Country
              2021-01-29
Afghanistan   2021-01-29
Albania       2021-01-29
Algeria       2021-01-29
Andorra       2021-01-29
                 ...    
Venezuela     2021-01-29
Vietnam       2021-01-29
Yemen         2021-01-29
Zambia        2021-01-29
Zimbabwe      2021-01-29
Name: Time, Length: 192, dtype: datetime64[ns]

In [8]:
# Canada raw data
canada_source_request = requests.get(canada_source_csv).content
canada_df = pd.read_csv(io.StringIO(canada_source_request.decode('utf-8')))
currentTime = datetime.now()

print("Original Canada Columns")
print(canada_df.columns)
canada_df.rename(columns = {
    'Province': 'State/Province',
    'SummaryDate': 'Time',
    'TotalCases': 'Total Cases','DailyTotals': 'Cases Daily',
    'TotalRecovered' : 'Total Recovered','DailyRecovered': 'Recovered Daily',
    'TotalDeaths': 'Total Deaths','DailyDeaths': 'Deaths Daily',
    'TotalTested': 'Total Tests','DailyTested': 'Tests Daily',
    'TotalActive': 'Total Active','DailyActive': 'Active Daily',
    'TotalHospitalized': 'Total Hospitalized','DailyHospitalized': 'Hospitalized Daily',
    'TotalICU': 'Total ICU', 'DailyICU': 'ICU Daily'
}, inplace = True)
print("Renamed Canada Columns")
print(canada_df.columns)

canada_df.drop(columns=["OBJECTID"], inplace = True)
canada_df["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
canada_df["Country"] = "Canada"
canada_df["Region"] = "North America"
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: fixProvince(x))
canada_df["Date"] = canada_df["Time"].apply(lambda x: us_date(x).replace(" ",""))
canada_df["Time"] = canada_df["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
canada_df["Level"] = canada_df["State/Province"].apply(lambda x: "Country" if x == "Canada" else "State/Province")
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: "" if x=="Canada" else x)
string_columns = ["State/Province","Abbreviation","Country","Region"]
canada_df = canada_df.sort_values(by=["Level","Country","State/Province","Date"])
canada_df = canada_df.reset_index()
canada_df = canada_df.drop(columns=["index"])
canada_df = canada_df.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"]).reset_index()
canada_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date',
    'Cases Daily', 'Total Cases',
    'Tests Daily', 'Total Tests', 
    'Deaths Daily', 'Total Deaths', 
    'Recovered Daily', 'Total Recovered', 
    'Active Daily', 'Total Active',
    'Hospitalized Daily', 'Total Hospitalized', 
    'ICU Daily', 'Total ICU', 
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
canada_df = canada_df[canada_order].copy()
canada_min = canada_df["Time"].min()
print("Min Date: " + str(canada_min))
canada_max = canada_df["Time"].max()
print("Max Date: " + str(canada_max))
print_column_unique(canada_df["State/Province"])
canada_df.head(75)

Original Canada Columns
Index(['OBJECTID', 'Province', 'Abbreviation', 'DailyTotals', 'SummaryDate',
       'TotalCases', 'TotalRecovered', 'DailyRecovered', 'TotalDeaths',
       'DailyDeaths', 'TotalTested', 'DailyTested', 'TotalActive',
       'DailyActive', 'TotalHospitalized', 'DailyHospitalized', 'TotalICU',
       'DailyICU', 'TotalVaccinated', 'DailyVaccinated'],
      dtype='object')
Renamed Canada Columns
Index(['OBJECTID', 'State/Province', 'Abbreviation', 'Cases Daily', 'Time',
       'Total Cases', 'Total Recovered', 'Recovered Daily', 'Total Deaths',
       'Deaths Daily', 'Total Tests', 'Tests Daily', 'Total Active',
       'Active Daily', 'Total Hospitalized', 'Hospitalized Daily', 'Total ICU',
       'ICU Daily', 'TotalVaccinated', 'DailyVaccinated'],
      dtype='object')
Min Date: 2020-01-25 00:00:00
Max Date: 2021-01-29 00:00:00
Column Values:
['', 'Northwest Territories', 'Alberta', 'Repatriated Canada', 'Nunavut', 'New Brunswick', 'Quebec', 'Manitoba', 'Nova Scoti

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Cases Daily,Total Cases,Tests Daily,Total Tests,Deaths Daily,Total Deaths,Recovered Daily,Total Recovered,Active Daily,Total Active,Hospitalized Daily,Total Hospitalized,ICU Daily,Total ICU,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,North America,Canada,,CA,2021-01-01,01/01/2021,1302,582704,163563,16051201,0,15605,6,489816,-4.0,73598,0.0,4067.0,0.0,794.0,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
1,Country,North America,Canada,,CA,2021-01-02,01/02/2021,7583,590287,218050,16269251,109,15714,5264,495080,1311.0,74909,-192.0,3875.0,-9.0,785.0,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
2,Country,North America,Canada,,CA,2021-01-03,01/03/2021,11372,601659,61889,16331140,151,15865,9891,504971,931.0,75840,42.0,3917.0,24.0,809.0,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
3,Country,North America,Canada,,CA,2021-01-04,01/04/2021,9765,611424,-39110,16292030,209,16074,12912,517883,5095.0,80935,251.0,4168.0,-2.0,807.0,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
4,Country,North America,Canada,,CA,2021-01-05,01/05/2021,7222,618646,219952,16511982,160,16234,5681,523564,1434.0,82369,193.0,4361.0,23.0,830.0,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Country,North America,Canada,,CA,2020-03-06,03/06/2020,8,54,0,0,0,0,0,8,8.0,46,,,,,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
71,Country,North America,Canada,,CA,2020-03-07,03/07/2020,6,60,0,0,0,0,0,8,6.0,52,,,,,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
72,Country,North America,Canada,,CA,2020-03-08,03/08/2020,7,67,0,0,1,1,0,8,7.0,58,,,,,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
73,Country,North America,Canada,,CA,2020-03-09,03/09/2020,19,86,0,0,0,1,1,9,18.0,76,,,,,38005238.0,380.05238,,,,365057840.0,3650.5784,0.104107,7.796609e+09,77966.09105,0.004875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021


In [9]:
us_state_codes = pd.read_excel(us_codes)
us_state_codes["Census Region"] = us_state_codes["State Name"].apply(lambda x: censusRegionByState(x))
us_state_codes.rename(columns = {
    'State Name':'State/Province',
    "State Abbreviation": "Abbreviation"
}, inplace = True)
us_state_codes.head(70)

us_states_url = "https://covidtracking.com/data/download/all-states-history.csv"
us_states_request = requests.get(us_states_url).content
states=pd.read_csv(io.StringIO(us_states_request.decode('utf-8')))
currentTime = datetime.now()
printColumns(states, "Original US Columns")
states["Accessed"] = currentTime
states["Country"] = "United States"
states = states.drop(
    columns = [
        'deathConfirmed', 'deathProbable',
        'hospitalized',
        'negativeTestsAntibody', 'negativeTestsPeopleAntibody', 'negativeTestsViral',
        'positiveScore', 'positiveTestsAntibody', 'positiveTestsAntigen',
        'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
        'positiveTestsViral', 'positiveCasesViral',
        'totalTestEncountersViral', 'totalTestEncountersViralIncrease',
        'totalTestsAntibody', 'totalTestsAntigen',
        'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
        'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
        'totalTestsViral', 'totalTestsViralIncrease'
    ])
states.rename(
    columns = {
        'date': 'Time', 'state' : 'Abbreviation', 'dataQualityGrade': 'Data Quality',
        'totalTestResults' : 'Total Tests', 'totalTestResultsIncrease' : 'Tests Daily',
        'negative' : 'Total Negative', 'negativeIncrease' : 'Negative Daily',
        'positive' : 'Total Cases', 'positiveIncrease' : 'Cases Daily',
        'recovered' : 'Total Recovered',
        'death' : 'Total Deaths', 'deathIncrease' : 'Deaths Daily',
        'hospitalizedCumulative' : 'Total Hospitalized', 'hospitalizedIncrease' : 'Hospitalized Daily', 'hospitalizedCurrently' : 'Currently Hospitalized',
        'inIcuCumulative' : 'Total In ICU', 'inIcuCurrently' : 'Currently In ICU',
        'onVentilatorCumulative' : 'Total On Ventilator', 'onVentilatorCurrently' : 'Currently On Ventilator'
    }, inplace = True)
states["Date"] = states["Time"].astype(str)
states["Date"] = states["Date"].apply(lambda x: us_date(x))
states["Time"] = pd.to_datetime(states["Date"], format="%m/%d/%Y")
#printColumns(states, "Post Rename Columns")
states_input = pd.merge(states, us_state_codes, how="left", on="Abbreviation")
merge_order = [
    'Time', 'Date', 'Abbreviation', 'State/Province', 'Country', 'FIPS', 'Status', 'Data Quality', 
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily', 
    'Total Negative', 'Negative Daily', 'Total Cases', 'Cases Daily',    
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
    'Total In ICU', 'Currently In ICU',
    'Total On Ventilator', 'Currently On Ventilator', 
    'Accessed' 
]
states_input = states_input[merge_order]
#printColumns(states_input, "States Input Merge Columns")
states_input["Region"] = "North America"
states_input["Level"] = states_input.apply(
    lambda x:
    "Territory" if x["State/Province"] in us_territories else 
    "State/Province",
    axis=1
)

print(states_input.columns)
us_min = states_input["Time"].min()
print("Min Date: " + str(us_min))
us_max = states_input["Time"].max()
print("Max Date: " + str(us_max))

us_columns = [
    'Level', 'Region','Country','State/Province','Abbreviation','FIPS',
    'Time', 'Date', 'Status', 'Data Quality', 
    'Cases Daily', 'Total Cases', 'Negative Daily', 'Total Negative',
    'Deaths Daily', 'Total Deaths', 'Total Recovered', 
    'Tests Daily', 'Total Tests', 
    'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized', 
    'Currently In ICU', 'Total In ICU', 'Currently On Ventilator',  'Total On Ventilator',
    'Accessed'
]
states_input = states_input[us_columns]
states_input = states_input.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"]).reset_index()
states_input = states_input.drop(columns=["index"])

print("US States")
print_column_unique(states_input["State/Province"])

states_input.head()

Original US Columns
Index(['date', 'state', 'dataQualityGrade', 'death', 'deathConfirmed',
       'deathIncrease', 'deathProbable', 'hospitalized',
       'hospitalizedCumulative', 'hospitalizedCurrently',
       'hospitalizedIncrease', 'inIcuCumulative', 'inIcuCurrently', 'negative',
       'negativeIncrease', 'negativeTestsAntibody',
       'negativeTestsPeopleAntibody', 'negativeTestsViral',
       'onVentilatorCumulative', 'onVentilatorCurrently', 'positive',
       'positiveCasesViral', 'positiveIncrease', 'positiveScore',
       'positiveTestsAntibody', 'positiveTestsAntigen',
       'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
       'positiveTestsViral', 'recovered', 'totalTestEncountersViral',
       'totalTestEncountersViralIncrease', 'totalTestResults',
       'totalTestResultsIncrease', 'totalTestsAntibody', 'totalTestsAntigen',
       'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
       'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
    

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,FIPS,Time,Date,Status,Data Quality,Cases Daily,Total Cases,Negative Daily,Total Negative,Deaths Daily,Total Deaths,Total Recovered,Tests Daily,Total Tests,Hospitalized Daily,Total Hospitalized,Currently Hospitalized,Currently In ICU,Total In ICU,Currently On Ventilator,Total On Ventilator,Accessed,Census Region,Population,Population 100K,Country Population,Country Population 100K,Region Population,Region Population 100K,World Population,World Population 100K,Country Share,Region Share,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
0,State/Province,North America,United States,Alaska,AK,2,2021-01-30,01/30/2021,0,A,135,52470.0,0,,0,262.0,,6119,1495885.0,3,1205.0,39.0,,,10.0,,2021-01-30 21:57:03.742534,West,712114.0,7.12114,327052602.0,3270.52602,365057840.0,3650.5784,7796609000.0,77966.09105,0.002177,0.001951,9.1e-05,,,,,,,,,,9978.0,10012.0,10186.0,10509.0,10395.0,10414.0,10303.0,10286.0,10436.0,10157.0,9976.0,10016.0,9887.0,9509.0,9678.0,9488.0,9410.0,9343.0,8518.0,7525.0,8088.0,8617.0,9132.0,9252.0,9900.0,10318.0,10693.0,11456.0,11576.0,11552.0,10946.0,10809.0,10460.0,10822.0,10799.0,10303.0,10452.0,9962.0,9667.0,9685.0,8865.0,8589.0,8533.0,7954.0,8295.0,7827.0,7962.0,8230.0,8696.0,9086.0,8428.0,8197.0,8330.0,8595.0,9128.0,9426.0,9493.0,9636.0,9731.0,9894.0,9373.0,9168.0,9161.0,8614.0,8800.0,8210.0,7734.0,7220.0,6655.0,6442.0,5978.0,5621.0,5482.0,4013.0,3945.0,3665.0,3459.0,2950.0,2677.0,2327.0,1971.0,1784.0,1586.0,1411.0,1277.0,7181.0,41102.0,100662.0,89273.0,109431.0,92305.0,84479.0,93296.0,61300.0,23107.0,0.014012,0.057718,0.141357,0.125363,0.153671,0.129621,0.118631,0.131013,0.086082,0.032448,0.010084
1,State/Province,North America,United States,Alabama,AL,1,2021-01-30,01/30/2021,0,A,0,455582.0,0,1767953.0,0,7566.0,242143.0,0,2127308.0,376,41859.0,1879.0,,2539.0,,1452.0,2021-01-30 21:57:03.742534,South,4889347.0,48.89347,327052602.0,3270.52602,365057840.0,3650.5784,7796609000.0,77966.09105,0.01495,0.013393,0.000627,,,,,,,,,,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723
2,State/Province,North America,United States,Arkansas,AR,5,2021-01-30,01/30/2021,0,A+,1824,294387.0,11145,2185289.0,7,4838.0,271911.0,12494,2420194.0,47,13599.0,911.0,280.0,,146.0,1421.0,2021-01-30 21:57:03.742534,South,3012542.0,30.12542,327052602.0,3270.52602,365057840.0,3650.5784,7796609000.0,77966.09105,0.009211,0.008252,0.000386,,,,,,,,,,36355.0,37006.0,37572.0,38610.0,38921.0,38404.0,37924.0,38827.0,38633.0,38959.0,38941.0,40404.0,41015.0,40146.0,39960.0,39598.0,39485.0,39395.0,38933.0,39714.0,40206.0,40211.0,40323.0,39367.0,38992.0,39539.0,39518.0,40912.0,42271.0,41927.0,39361.0,38289.0,37446.0,37354.0,37897.0,37550.0,38010.0,38198.0,38328.0,39332.0,36427.0,36037.0,35410.0,34319.0,35486.0,34449.0,34938.0,36485.0,37966.0,38136.0,35481.0,34716.0,34766.0,35572.0,38680.0,39707.0,39698.0,39360.0,39525.0,39697.0,38668.0,38092.0,37865.0,36917.0,36430.0,35478.0,34176.0,32682.0,31736.0,30888.0,30427.0,30017.0,31554.0,22864.0,23007.0,22169.0,22217.0,19362.0,17669.0,16670.0,14936.0,13764.0,12330.0,11253.0,10771.0,59912.0,152109.0,393213.0,396224.0,394514.0,369097.0,361189.0,385959.0,302829.0,161141.0,0.012068,0.050492,0.130525,0.131525,0.130957,0.12252,0.119895,0.128117,0.100523,0.05349,0.019888
3,Territory,North America,United States,American Samoa,AS,60,2021-01-30,01/30/2021,1,,0,0.0,0,2140.0,0,0.0,,0,2140.0,0,,,,,,,2021-01-30 21:57:03.742534,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,State/Province,North America,United States,Arizona,AZ,4,2021-01-30,01/30/2021,0,A+,5119,753379.0,12170,2734463.0,76,13098.0,102678.0,54791,6643932.0,435,52006.0,3828.0,984.0,,645.0,,2021-01-30 21:57:03.742534,West,7259090.0,72.5909,327052602.0,3270.52602,365057840.0,3650.5784,7796609000.0,77966.09105,0.022195,0.019885,0.000931,,,,,,,,,,81929.0,83065.0,85726.0,88192.0,90876.0,90858.0,90405.0,90319.0,91313.0,90797.0,92572.0,96605.0,96951.0,95849.0,95076.0,94742.0,91832.0,93119.0,97724.0,99334.0,95779.0,96720.0,97447.0,99492.0,100066.0,101835.0,102053.0,105016.0,106612.0,105885.0,99473.0,96012.0,93463.0,94419.0,94495.0,91292.0,93193.0,93126.0,92593.0,94337.0,87480.0,85369.0,85480.0,83038.0,86427.0,83705.0,82925.0,85342.0,90167.0,90646.0,85428.0,82772.0,81434.0,81904.0,87011.0,88220.0,88617.0,88864.0,90069.0,90909.0,88041.0,87973.0,87664.0,85106.0,85274.0,83374.0,81125.0,78896.0,77728.0,77376.0,76474.0,76787.0,81299.0,59741.0,58899.0,57521.0,58174.0,50088.0,45100.0,41784.0,38116.0,35128.0,31252.0,28331.0,25703.0,145737.0,347859.0,930745.0,966255.0,999263.0,892335.0,851334.0,880737.0,751699.0,411197.0,0.011286,0.04792,0.128218,0.13311,0.137657,0.122927,0.117278,0.121329,0.103553,0.056646,0.020076


us_summary_cols = [
    'Time', 'Date', 'Country',
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily', 'Total Negative', 'Negative Daily', 'Total Cases', 'Cases Daily',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily', 
    'Total In ICU', 'Currently In ICU', 'Total On Ventilator', 'Currently On Ventilator'
]
us_stats = states_input[us_summary_cols].groupby(['Time','Date','Country']).sum().reset_index()
us_stats["FIPS"] = 0
us_stats["Abbreviation"] = "US"
us_stats["State/Province"] = "" 
us_stats["Status"] = 0
us_stats["Accessed"] = currentTime
us_stats["Data Quality"] = ""
us_stats = us_stats[merge_order]
us_stats.head()

us_stats["Level"] = "Country"
us_stats["Region"] = "North America"

us_stats = pd.concat([states_input,us_stats])
us_stats = us_stats[us_columns]
us_stats["Level"] = us_stats.apply(lambda x: 
                                    "Country" if x["Region"] == "Country" else 
                                    "Territory" if x["State/Province"] in us_territories else 
                                    "State/Province",axis=1)
us_stats = us_stats.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"]).reset_index()
us_stats = us_stats.drop(columns=["index"])
us_min = us_stats["Time"].min()
print("Min Date: " + (us_min))
us_max = us_stats["Time"].max()
print("Max Date: " + (us_max))
us_stats.head()

In [10]:
not_cn = c.loc[(c["Country"]!="Canada")].copy()
all_data = pd.concat([states_input,canada_df,not_cn],sort=False)
print_column_unique(all_data["State/Province"])
all_data = all_data.sort_values(by=['Region', 'Country', 'State/Province','Time']).reset_index()
all_data = all_data.drop(columns=["index"])
base_columns = [
    'Level', 'Region','Country','Census Region','State/Province','Abbreviation','FIPS',
    'Time', 'Date', 'Status', 'Data Quality', 
    'Cases Daily', 'Deaths Daily','Tests Daily',
    'Negative Daily', 'Total Negative',
    'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized',
    'ICU Daily', 'Total ICU',
    'Currently In ICU', 'Total In ICU',
    'Currently On Ventilator',  'Total On Ventilator',
    'Recovered Daily', 'Total Recovered',
    'Active Daily', 'Total Active',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
all_data = all_data[base_columns]
all_min = all_data["Time"].min()
print("Min Date: " + str(all_min))
all_max = all_data["Time"].max()
print("Max Date: " + str(all_max))
all_data.head()

grouping_cols = ["Region","Country","State/Province"]
base_cols = ["Cases","Tests","Deaths"]
calc_cols = []
for col in base_cols:
    all_data[col + " Daily"] = all_data[col + " Daily"].fillna(0).astype(int)
    all_data[col + " Daily Rate"] = all_data[col + " Daily"]/all_data["Population 100K"]
    all_data["Total " + col] = all_data.groupby(grouping_cols)[col + " Daily"].cumsum().reset_index(drop=True)
    all_data[col + " Daily 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily"].rolling(7,min_periods=7).mean().reset_index(drop=True)
    all_data[col + " Daily Rate 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily Rate"].rolling(7,min_periods=7)[col + " Daily Rate"].mean().reset_index(drop=True)
    all_data["Total " + col + " Rate"] = all_data["Total " + col]/all_data["Population 100K"]
    base_order = [
        col + ' Daily', col + ' Daily 7D Rolling', "Total " + col, 
        col + ' Daily Rate', col + ' Daily Rate 7D Rolling', "Total " + col + " Rate"
    ]
    calc_cols = calc_cols + base_order

all_data["Speed Daily"] = all_data["Cases Daily 7D Rolling"]/all_data["Population 100K"]
all_data["Positivity 7D Rolling"] = all_data["Cases Daily 7D Rolling"]/all_data["Tests Daily 7D Rolling"]
all_data["Positivity 7D Rolling"] = all_data["Positivity 7D Rolling"].apply(lambda x: np.nan if x == np.inf else x)
all_data["Acceleration Daily"] = all_data.groupby(grouping_cols, as_index=False)["Speed Daily"].diff().reset_index(drop=True)
all_data["Jerk Daily"] = all_data.groupby(grouping_cols, as_index=False)["Acceleration Daily"].diff().reset_index(drop=True)
all_data["Jounce Daily"] = all_data.groupby(grouping_cols, as_index=False)["Jerk Daily"].diff().reset_index(drop=True)
all_data["MM-DD-YYYY"] = all_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%m-%d-%Y'))
all_data["MM-DD-YYYY"] = all_data["MM-DD-YYYY"].astype(str)
all_data["DD-MM-YYYY"] = all_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%d-%m-%Y'))
all_data["DD-MM-YYYY"] = all_data["DD-MM-YYYY"].astype(str)
all_data["Week"] = all_data["Date"].apply(
    lambda x: 
    str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[0]) + 
    " W" + str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[1])
)
all_data["First Day of Week"] = all_data["Date"].apply(
    lambda x: datetime.strptime(x, '%m/%d/%Y') - timedelta(days=datetime.strptime(x, '%m/%d/%Y').weekday())
)
all_data["Last Day of Week"] = all_data["First Day of Week"].apply(lambda x: x + timedelta(days=6))
def shortDate(day):
    shortMonth = datetime.strftime(day,'%m').lstrip('0')
    shortDay  = datetime.strftime(day,'%d').lstrip('0')
    shortYear = datetime.strftime(day,'%y')
    return shortMonth + "/" + shortDay + "/" + shortYear
all_data["Week Date Range"] = all_data.apply(
    lambda x: shortDate(x["First Day of Week"]) + " - " + shortDate(x["Last Day of Week"]),
    axis=1
)
all_data["Day Count"] = all_data.groupby(["Region","Country","State/Province","Date"]).cumcount()

header_cols = [
    'Level', 'Region','Country','Census Region','State/Province','Abbreviation','FIPS',
    'Time', 'Date', 'Day Count', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Status', 'Data Quality'
]
calc_cols = calc_cols + ["Positivity 7D Rolling", "Speed Daily", "Acceleration Daily", "Jerk Daily"]
other_cols = [
    'Negative Daily', 'Total Negative',
    'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized',
    'ICU Daily', 'Total ICU',
    'Currently In ICU', 'Total In ICU',
    'Currently On Ventilator',  'Total On Ventilator',
    'Recovered Daily', 'Total Recovered',
    'Active Daily', 'Total Active',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44',
    'Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
all_cols = header_cols + calc_cols + other_cols
print(all_cols)
all_data = all_data[all_cols].copy()
last_monday = date(year=2021,month=1,day=18)
print("Last Monday: ")
print(last_monday)
all_data['Time'] = all_data['Time'].astype(str)
all_min = all_data["Time"].min()
print("Min Date: " + str(all_min))
all_max = all_data["Time"].max()
print("Max Date: " + str(all_max))

Column Values:
['', 'Alberta', 'Illinois', 'Nunavut', 'Ontario', 'Texas', 'West Virginia', 'American Samoa', 'New Brunswick', 'Missouri', 'Pennsylvania', 'South Carolina', 'Alabama', 'U.S. Virgin Islands', 'Washington', 'British Columbia', 'Florida', 'Guam', 'Arkansas', 'Nebraska', 'South Dakota', 'Quebec', 'California', 'Georgia', 'Minnesota', 'New Hampshire', 'North Carolina', 'Saskatchewan', 'Virginia', 'Oregon', 'Newfoundland and Labrador', 'Puerto Rico', 'Alaska', 'Delaware', 'Maine', 'Montana', 'Northwest Territories', 'Commonwealth of the Northern Mariana Islands', 'Wisconsin', 'Manitoba', 'Nova Scotia', 'Oklahoma', 'New Mexico', 'World', 'Kentucky', 'Michigan', 'Nevada', 'New York', 'Prince Edward Island', 'Wyoming', 'Mississippi', 'Yukon', 'Colorado', 'Repatriated Canada', 'Hawaii', 'Rhode Island', 'Louisiana', 'District of Columbia', 'Utah', 'Vermont', 'Ohio', 'New Jersey', 'Idaho', 'Arizona', 'Massachusetts', 'Maryland', 'Tennessee', 'Connecticut', 'Indiana', 'Kansas', 'Nort

In [11]:
excess_days = all_data.loc[all_data["Day Count"]>0].copy()
excess_days.head()

Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,Day Count,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Data Quality,Cases Daily,Cases Daily 7D Rolling,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Total Deaths Rate,Positivity 7D Rolling,Speed Daily,Acceleration Daily,Jerk Daily,Negative Daily,Total Negative,Hospitalized Daily,Total Hospitalized,Currently Hospitalized,ICU Daily,Total ICU,Currently In ICU,Total In ICU,Currently On Ventilator,Total On Ventilator,Recovered Daily,Total Recovered,Active Daily,Total Active,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed


In [12]:
input_df = all_data.loc[all_data["Day Count"]==0].copy().reset_index()
input_df.drop(columns=["Day Count","index","MM-DD-YYYY","DD-MM-YYYY"],inplace=True)
input_df["State/Province"] = input_df["State/Province"].astype(str)
input_df["State/Province"] = input_df["State/Province"].apply(lambda x: "" if x == "nan" else x)
input_df["Census Region"] = input_df["Census Region"].astype(str)
input_df["Census Region"] = input_df["Census Region"].apply(lambda x: "" if x == "nan" else x)
input_df["Status"] = input_df["Status"].astype(str)
input_df["Status"] = input_df["Status"].apply(lambda x: "" if x == "nan" else x)
input_df["Data Quality"] = input_df["Data Quality"].astype(str)
input_df["Data Quality"] = input_df["Data Quality"].apply(lambda x: "" if x == "nan" else x)
input_df.head()

Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Data Quality,Cases Daily,Cases Daily 7D Rolling,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Total Deaths Rate,Positivity 7D Rolling,Speed Daily,Acceleration Daily,Jerk Daily,Negative Daily,Total Negative,Hospitalized Daily,Total Hospitalized,Currently Hospitalized,ICU Daily,Total ICU,Currently In ICU,Total In ICU,Currently On Ventilator,Total On Ventilator,Recovered Daily,Total Recovered,Active Daily,Total Active,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,Central Asia,Armenia,,,AM,,2020-03-01,03/01/2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,,1,,1,0.033747,,0.033747,0,,0,0.0,,0.0,0,,0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
1,Country,Central Asia,Armenia,,,AM,,2020-03-02,03/02/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,,0,,1,0.0,,0.033747,0,,0,0.0,,0.0,0,,0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
2,Country,Central Asia,Armenia,,,AM,,2020-03-03,03/03/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,,0,,1,0.0,,0.033747,0,,0,0.0,,0.0,0,,0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
3,Country,Central Asia,Armenia,,,AM,,2020-03-04,03/04/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,,0,,1,0.0,,0.033747,0,,0,0.0,,0.0,0,,0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021
4,Country,Central Asia,Armenia,,,AM,,2020-03-05,03/05/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,,0,,1,0.0,,0.033747,0,,0,0.0,,0.0,0,,0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1/30/2021


In [13]:
date_check = input_df.groupby(all_data["Country"])["Time","Week"].max()
date_check.head()

Unnamed: 0_level_0,Time,Week
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
,2021-01-29,2021 W4
Afghanistan,2021-01-29,2021 W4
Albania,2021-01-29,2021 W4
Algeria,2021-01-29,2021 W4
Andorra,2021-01-29,2021 W4


In [14]:
# Read starter

pgmm = pd.read_excel(dataFolder + "Starter.xlsx")
print(pgmm.columns)
pgmm.head()

# Read R results

pgmm_files = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'United States',
    'Canada',
    'South Asia',
    'Sub-Saharan Africa'
]

def readOutput(r,pgmm_read_file):
    df = pd.read_csv(pgmm_read_file)
    df = df.drop(["Unnamed: 0"], axis=1)
    df = df[df["V1"] != 0]
    if ((r == "Canada") or (r == "United States")):
        df["Region"] = "North America"
        df.rename(columns = {"V2": "State/Province"}, inplace=True)
        df["Country"] = r
        df["Level"] = df["State/Province"].apply(lambda x: "Country" if (x == "Region") else "State/Province")
        df["State/Province"] = df["State/Province"].apply(lambda x: "" if (x == "Region") else x)
    else:
        df["Region"] = r
        df.rename(columns = {"V2": "Country"}, inplace=True)
        df["Level"] = df["Country"].apply(lambda x: "Region" if (x == "Region") else "Country")
        df["Country"] = df["Country"].apply(lambda x: "" if (x == "Region") else x)
        df["State/Province"] = ""
    df.rename(columns = {
        "V1": 'Last Day of Week Excel Date',
        "V3": 'Cases Daily Last Day of Week',
        "V4": 'Total Cases Last Day of Week',
        "V5": 'Cases 7D Moving Average',
        "V6": 'Cases Last Day of Week Rate 100K',
        "V7": 'Deaths Daily Last Day of Week',
        "V8": 'Total Deaths Last Day of Week',
        "V9": 'Deaths 7D Moving Average',
        "V10": 'Deaths Last Day of Week Rate 100K',
        "V11": 'Speed', #Cases Last Day of Week Rate 100K 7D Moving Average
        "V12": 'Acceleration', #Average Daily Change in Speed
        "V13": 'Jerk', #Average Daily Change in Acceleration
        "V14": '1-Day Persistence', 
        "V15": '7-Day Persistence'
    }, inplace=True)
    return df
    

for r in pgmm_files:
    file_stem = cleanedFolder + r
    pgmm_read_file = file_stem + output_file_end
    statistics_read_file = file_stem + statistics_file_end
    output_df = readOutput(r,pgmm_read_file)
    pgmm = pd.concat([pgmm, output_df], ignore_index=True, sort=False)

# Create Time Variables
pgmm["Last Day of Week Excel Date"] = pgmm["Last Day of Week Excel Date"].astype(int)
pgmm["Time"] = pgmm["Last Day of Week Excel Date"].apply(lambda x: datetime.fromordinal(datetime(1900, 1, 1).toordinal() + x - 2))
pgmm["Date"] = pgmm["Time"].apply(lambda x: x.strftime('%m/%d/%Y'))
pgmm["Week"] = pgmm["Date"].apply(lambda x: 
                                  str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[0]) + 
                                  " W" +
                                  str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[1])
                                 )
pgmm["Output"] = True
pgmm_order = [
    'Region', 'Country', 'State/Province', 
    'Last Day of Week Excel Date', 'Date', 'Week',
    'Speed', 'Acceleration', 'Jerk', '7-Day Persistence', '1-Day Persistence',
    'Cases Daily Last Day of Week', 'Total Cases Last Day of Week',
    'Cases 7D Moving Average', 'Cases Last Day of Week Rate 100K',
    'Deaths Daily Last Day of Week', 'Total Deaths Last Day of Week',
    'Deaths 7D Moving Average', 'Deaths Last Day of Week Rate 100K', 'Output'
]
pgmm = pgmm[pgmm_order]
pgmm.head(8)

Index(['Level', 'Region', 'Country', 'State/Province', 'Excel Date'], dtype='object')


Unnamed: 0,Region,Country,State/Province,Last Day of Week Excel Date,Date,Week,Speed,Acceleration,Jerk,7-Day Persistence,1-Day Persistence,Cases Daily Last Day of Week,Total Cases Last Day of Week,Cases 7D Moving Average,Cases Last Day of Week Rate 100K,Deaths Daily Last Day of Week,Total Deaths Last Day of Week,Deaths 7D Moving Average,Deaths Last Day of Week Rate 100K,Output
0,Central Asia,Armenia,,44178,12/13/2020,2020 W50,30.796375,0.168734,1.480039,4.195461,3.219543,1013.0,148325.0,912.571429,34.185519,24.0,2486.0,22.857143,0.809923,True
1,Central Asia,Armenia,,44185,12/20/2020,2020 W51,26.515351,-1.740371,-1.378798,3.738402,2.97021,652.0,153825.0,785.714286,22.00292,14.0,2630.0,20.571429,0.472455,True
2,Central Asia,Armenia,,44192,12/27/2020,2020 W52,19.32728,-0.805102,0.805102,3.218724,2.116293,485.0,157834.0,572.714286,16.367203,16.0,2768.0,19.714286,0.539949,True
3,Central Asia,Armenia,,44199,01/03/2021,2020 W53,10.572394,-1.234169,1.301663,2.346157,1.241093,229.0,160027.0,313.285714,7.72802,14.0,2850.0,11.714286,0.472455,True
4,Central Asia,Armenia,,44206,01/10/2021,2021 W1,10.143327,0.520665,-1.017225,1.386753,1.657435,337.0,162131.0,300.571429,11.372675,7.0,2929.0,11.285714,0.236228,True
5,Central Asia,Armenia,,44213,01/17/2021,2021 W2,11.835489,0.067494,0.626726,2.180954,5.397121,351.0,164586.0,350.714286,11.84513,5.0,2992.0,9.0,0.168734,True
6,Central Asia,Armenia,,44220,01/24/2021,2021 W3,6.990411,-1.079898,-0.766535,2.544792,3.701262,127.0,166036.0,207.142857,4.285845,5.0,3039.0,6.714286,0.168734,True
7,Central Asia,Azerbaijan,,44178,12/13/2020,2020 W50,41.134643,0.133851,0.211344,4.871768,4.309957,4451.0,175874.0,4170.714286,43.899027,39.0,1922.0,41.428571,0.384647,True


In [15]:
date_check = pgmm.groupby(pgmm["Country"])["Week"].max()
date_check.head()

Country
               2021 W3
Afghanistan    2021 W3
Albania        2021 W3
Algeria        2021 W3
Andorra        2021 W3
Name: Week, dtype: object

In [16]:
all_df = input_df.merge(pgmm,how="left",on=['Region', 'Country', 'State/Province', 'Date', 'Week'])
all_df["Output"] = all_df["Output"].apply(lambda x: x if x==True else False)
all_df["Level"] = all_df.apply(
    lambda x: "Country" if ((x["Country"]=="United States") & (x["State/Province"]=="")) else x["Level"],
    axis=1
)
all_order = [
    'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
    'Time','Date','Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range', 'Last Day of Week Excel Date', 'Accessed',
    'Output', 'Status', 'Data Quality',
    'Speed', 'Speed Daily', 'Acceleration', 'Acceleration Daily', 'Jerk', 'Jerk Daily', '7-Day Persistence', '1-Day Persistence',
    'Cases Daily', 'Cases Daily 7D Rolling', 'Cases 7D Moving Average', 'Cases Daily Last Day of Week', 
    'Cases Daily Rate', 'Cases Daily Rate 7D Rolling', 'Cases Last Day of Week Rate 100K',
    'Total Cases', 'Total Cases Rate', 'Total Cases Last Day of Week',
    'Deaths Daily', 'Deaths Daily 7D Rolling', 'Deaths 7D Moving Average', 'Deaths Daily Last Day of Week', 
    'Deaths Daily Rate', 'Deaths Daily Rate 7D Rolling', 'Deaths Last Day of Week Rate 100K', 
    'Total Deaths', 'Total Deaths Rate', 'Total Deaths Last Day of Week',
    'Tests Daily', 'Tests Daily 7D Rolling',
    'Tests Daily Rate', 'Tests Daily Rate 7D Rolling',
    'Total Tests', 'Total Tests Rate',
    'Positivity 7D Rolling', 
    'Active Daily', 'Total Active', 
    'Negative Daily', 'Total Negative', 
    'Recovered Daily', 'Total Recovered', 
    'Hospitalized Daily', 'Currently Hospitalized', 'Total Hospitalized', 
    'ICU Daily', 'Total ICU', 
    'Currently In ICU', 'Total In ICU', 
    'Currently On Ventilator', 'Total On Ventilator', 
    "Population","Population 100K",
    "Country Population","Country Population 100K", "Country Share",
    "Region Population","Region Population 100K", "Region Share",
    "World Population", "World Population 100K", "World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+'
]
print_column_missing(all_df.columns,all_order)
all_df = all_df[all_order]

all_out_file = cleanedFolder + "all_combined.xlsx"
print(all_out_file)
all_df.to_excel(all_out_file, index=False)
all_df.head(14)

Column Values:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 'Total On Ventilator', '15-24', 'Deaths Daily 7D Rolling', 'Currently On Ventilator', 'Pct 65-74', 'World Population 100K', 'Total Deaths Last Day of Week', 'Tests Daily 7D Rolling', '1-Day Persistence', 'Pct < 1', 'Active Daily', 'World Population', 'Census Region', '45-54', '< 1', '5-14', 'World Share', 'Negative Daily', 'Accessed', 'Output', 'Pct 15-24', 'Positivity 7D Rolling', 'Hospitalized Daily', '65-74', 'Fertility Rate', 'Pct 5-14', 'Density (P/Km²)', 'Net Change', 'Deaths Daily Rate', 'Pct 75-84', 'Pct 1-4', 'Deaths Daily Last Day of Week', '1-4', 'Total Tests', 'Recovered Daily', 'Country Population', 'Total Negative', 'Abbreviation', 'Wo

Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,Week,First Day of Week,Last Day of Week,Week Date Range,Last Day of Week Excel Date,Accessed,Output,Status,Data Quality,Speed,Speed Daily,Acceleration,Acceleration Daily,Jerk,Jerk Daily,7-Day Persistence,1-Day Persistence,Cases Daily,Cases Daily 7D Rolling,Cases 7D Moving Average,Cases Daily Last Day of Week,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Last Day of Week Rate 100K,Total Cases,Total Cases Rate,Total Cases Last Day of Week,Deaths Daily,Deaths Daily 7D Rolling,Deaths 7D Moving Average,Deaths Daily Last Day of Week,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Last Day of Week Rate 100K,Total Deaths,Total Deaths Rate,Total Deaths Last Day of Week,Tests Daily,Tests Daily 7D Rolling,Tests Daily Rate,Tests Daily Rate 7D Rolling,Total Tests,Total Tests Rate,Positivity 7D Rolling,Active Daily,Total Active,Negative Daily,Total Negative,Recovered Daily,Total Recovered,Hospitalized Daily,Currently Hospitalized,Total Hospitalized,ICU Daily,Total ICU,Currently In ICU,Total In ICU,Currently On Ventilator,Total On Ventilator,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
0,Country,Central Asia,Armenia,,,AM,,2020-03-01,03/01/2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,1/30/2021,False,,,,,,,,,,,1,,,,0.033747,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Country,Central Asia,Armenia,,,AM,,2020-03-02,03/02/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,,,,,,,,0,,,,0.0,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Country,Central Asia,Armenia,,,AM,,2020-03-03,03/03/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,,,,,,,,0,,,,0.0,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Country,Central Asia,Armenia,,,AM,,2020-03-04,03/04/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,,,,,,,,0,,,,0.0,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Country,Central Asia,Armenia,,,AM,,2020-03-05,03/05/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,,,,,,,,0,,,,0.0,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Country,Central Asia,Armenia,,,AM,,2020-03-06,03/06/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,,,,,,,,0,,,,0.0,,,1,0.033747,,0,,,,0.0,,,0,0.0,,0,,0.0,,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Country,Central Asia,Armenia,,,AM,,2020-03-07,03/07/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,0.004821,,,,,,,0,0.142857,,,0.0,0.004821,,1,0.033747,,0,0.0,,,0.0,0.0,,0,0.0,,0,0.0,0.0,0.0,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,Country,Central Asia,Armenia,,,AM,,2020-03-08,03/08/2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,1/30/2021,False,,,,0.0,,-0.004821,,,,,0,0.0,,,0.0,0.0,,1,0.033747,,0,0.0,,,0.0,0.0,,0,0.0,,0,0.0,0.0,0.0,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,Country,Central Asia,Armenia,,,AM,,2020-03-09,03/09/2020,2020 W11,2020-03-09,2020-03-15,3/9/20 - 3/15/20,,1/30/2021,False,,,,0.0,,0.0,,0.004821,,,0,0.0,,,0.0,0.0,,1,0.033747,,0,0.0,,,0.0,0.0,,0,0.0,,0,0.0,0.0,0.0,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,Country,Central Asia,Armenia,,,AM,,2020-03-10,03/10/2020,2020 W11,2020-03-09,2020-03-15,3/9/20 - 3/15/20,,1/30/2021,False,,,,0.0,,0.0,,0.0,,,0,0.0,,,0.0,0.0,,1,0.033747,,0,0.0,,,0.0,0.0,,0,0.0,,0,0.0,0.0,0.0,0,0.0,,,,,,,,,,,,,,,,,2963243.0,29.63243,,,,326887719.0,3268.87719,0.009065,7796609000.0,77966.09105,0.00038,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
date_check = all_df.groupby(all_df["Country"])["Time","Week"].max()
date_check.head()

Unnamed: 0_level_0,Time,Week
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
,2021-01-29,2021 W4
Afghanistan,2021-01-29,2021 W4
Albania,2021-01-29,2021 W4
Algeria,2021-01-29,2021 W4
Andorra,2021-01-29,2021 W4
