In [15]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta, date
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format','{:.4f}'.format)

In [16]:
# R Days
today = datetime.today()
weekday = today.weekday()
if weekday >= 5:
    saturdayDaysAgo = weekday - 5
else:
    saturdayDaysAgo = weekday + 2
saturday = today - timedelta(days=saturdayDaysAgo)
last_saturday = saturday.strftime('%Y-%m-%d')
print("Saturday: " + last_saturday)
sunday = saturday + timedelta(days=1)
last_sunday = sunday.strftime('%Y-%m-%d')
print("Sunday: " + last_sunday)

Saturday: 2021-07-03
Sunday: 2021-07-04


In [17]:
#Folders and Files

# Folders
repositoryFolder = "C:/Users/jwg4880/Documents/GitHub/Global-COVID-Surveillance/data/"
demographicsFolder = repositoryFolder + "raw/demographics/"
configuredFolder = repositoryFolder + "configured/"
cleanedFolder = repositoryFolder + "cleaned/"
regionsFolder = repositoryFolder + "raw/regions/"
locationsFolder = repositoryFolder + "raw/locations/"

# Population Input File
all_populations_file = cleanedFolder + "all_populations.xlsx"
us_codes = demographicsFolder + "US State Codes.xlsx"

# Location Input
locations_file = demographicsFolder + "Country Geo.xlsx"

#Sources
population_source_url ="https://www.worldometers.info/world-population/population-by-country/"
canada_population_source = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000901" # Statistics Canada Quarterly Population
us_population_source = "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/asrh/sc-est2019-agesex-civ.csv"

github_url="https://github.com/dsbbfinddx/FINDCov19TrackerData/blob/master/processed/data_all.csv?raw=true"
owid_data = R"https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.xlsx?raw=true"
country_codes_coordinates = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries_codes_and_coordinates.csv"
countries_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries.geo.json"
us_states_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/us-states.geo.json"

kaggle_locations = "https://www.kaggle.com/paultimothymooney/latitude-and-longitude-for-every-country-and-state"

canada_source_csv = "https://opendata.arcgis.com/datasets/3afa9ce11b8842cb889714611e6f3076_0.csv"
cdc_cases_deaths_csv = "https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD&bom=true&format=true"
us_tests_csv = "https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/testing_data/time_series_covid19_US.csv?raw=true"

# Countries and Regions

european_countries = [
    'Albania','Andorra','Austria','Belarus','Belgium','Bosnia & Herzegovina','Bulgaria',
    'Croatia','Czech Republic','Denmark','Estonia','Finland','France',
    'Germany','Greece','Greenland','Hungary','Iceland','Ireland','Isle of Man','Italy',
    'Latvia','Liechtenstein','Lithuania','Luxembourg','Malta','Moldova','Monaco','Montenegro',
    'Netherlands','Norway','Poland','Portugal','Romania',
    'San Marino','Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland',
    'Ukraine','United Kingdom','Vatican City'
]

north_american_countries = ["Canada","United States"]
carribean_countries = [
    "Antigua & Barbuda","Aruba","Bahamas","Barbados","Bermuda","British Virgin Islands",
    "Cayman Islands","Cuba","Curacao","Dominica","Dominican Republic","Grenada",
    "Haiti","Jamaica","Puerto Rico",
    "St. Barthelemy","St. Kitts & Nevis","St. Lucia","St. Vincent & Grenadines",
    "Sint Maarten","Trinidad & Tobago","Turks and Caicos Islands","United States Virgin Islands"
]
central_south_america_countries = [
    'Argentina','Belize','Bolivia','Brazil','Chile','Colombia','Costa Rica',
    'Ecuador','El Salvador','Guatemala','Guyana','Honduras',
    'Mexico','Nicaragua','Panama','Paraguay','Peru','Suriname','Uruguay','Venezuela'
]
latin_american_countries = carribean_countries + central_south_america_countries
americas_countries = latin_american_countries + north_american_countries

north_african_countries = [
    "Algeria","Djibouti","Egypt","Libya","Morocco","Tunisia","Western Sahara"
]
sub_saharan_african_countries = [
    "Angola","Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Côte d’Ivoire",
    "Democratic Republic of Congo","Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau","Kenya","Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria","Republic of the Congo","Rwanda",
    "São Tomé and Príncipe","Senegal","Seychelles","Sierra Leone",
    "Somalia","South Africa","South Sudan","Sudan","Swaziland",
    "Tanzania","Togo","Uganda","Zambia","Zimbabwe"
]
african_countries = north_african_countries + sub_saharan_african_countries
south_asia_countries = [
    "Afghanistan","Bangladesh","Bhutan","India","Maldives","Nepal","Pakistan","Sri Lanka"
]

middle_eastern_countries = [
    "Bahrain","Iran","Iraq","Israel","Jordan","Kuwait","Lebanon","Oman","Qatar",
    "Saudi Arabia","Syria","State of Palestine",
    "United Arab Emirates","Yemen"
]
middle_east_and_north_africa_countries = middle_eastern_countries + north_african_countries

central_asian_countries = [
    'Armenia','Azerbaijan','Cyprus','Faeroe Islands','Georgia','Gibraltar','Kazakhstan','Kosovo','Kyrgyzstan',
    'North Macedonia','Russia','Tajikistan','Turkey','Turkmenistan','Uzbekistan'
]
east_mediterranean_countries = middle_eastern_countries + central_asian_countries

east_asian_countries = [
    "Brunei","Cambodia","China","Indonesia","Japan","Laos","Malaysia","Mongolia","Myanmar","Niue","North Korea","Philippines",
    "Singapore","South Korea","Taiwan","Thailand","Timor","Vietnam"
]
pacific_countries = [
    "Australia","Cook Islands","Fiji","French Polynesia","Guam","Kiribati",
    "Marshall Islands","Micronesia","Nauru","New Caledonia","New Zealand",
    "Northern Mariana Islands","Palau","Papua New Guinea","Samoa","Solomon Islands","Tonga","Tuvalu","Vanuatu"
]
east_asia_and_pacific_countries = east_asian_countries + pacific_countries

lancet_regions = {
    "Africa": african_countries,
    "Americas": americas_countries,
    "Europe": european_countries,
    "Eastern Mediterranean": east_mediterranean_countries,
    "Southeast Asia": east_asian_countries,
    "Western Pacific": pacific_countries
}

In [18]:
#pp.pprint(lancet_regions)

In [19]:
configured_country_lists = [
    european_countries,
    latin_american_countries,
    sub_saharan_african_countries,
    south_asia_countries,
    central_asian_countries,
    middle_east_and_north_africa_countries,
    east_asia_and_pacific_countries
]
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries
}

configured_countries = []
for country_list in configured_country_lists:
    for country in country_list:
        configured_countries.append(country)
configured_countries.sort()
all_countries = configured_countries +  north_american_countries
all_countries.sort()

configured_regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa'
]

regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa',
    'Territory'
]

unincorporated_disputed_territories = [
    "American Samoa", "Anguilla","Caribbean Netherlands","Channel Islands","Curaçao",
    "Falkland Islands","French Guiana","Guadeloupe","Hong Kong","International"
]

countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries,
    'Territory': unincorporated_disputed_territories
}

census_regions = {
    0: {"name" : "United States",
        "states" : ["United States"]},
    1: {"name" : "Northeast",
        "states" :["Connecticut", "Maine", "New Hampshire", "Vermont", "Massachusetts", 
                   "Rhode Island", "New Jersey", "New York", "Pennsylvania"]},
    3: {"name" : "South",
        "states" : ["Maryland", "Delaware", "West Virginia", "Virginia", "Kentucky", 
                    "Tennessee", "North Carolina", "South Carolina", "Georgia", "Florida", 
                    "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas", 
                    "District of Columbia", "Puerto Rico"]},
    2: {"name" : "Midwest",
        "states" : ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", 
                    "Minnesota", "Wisconsin", "Illinois", "Michigan", "Indiana", "Ohio"]},
    4: {"name" : "West",
        "states" : ["Washington", "Idaho", "Montana", "Wyoming", "Oregon", "California", "Nevada", 
                    "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]}
}

us_states = [
    'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','District of Columbia',
    'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
    'Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
    'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota',
    'Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
    'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'
]
canada_provinces = [
    'Alberta','British Columbia','Manitoba','New Brunswick','Newfoundland and Labrador','Northwest Territories',
    'Nova Scotia','Ontario','Prince Edward Island','Quebec','Saskatchewan','Yukon'
]
states_and_provinces = us_states + canada_provinces

country_conversions = {
    "Antigua & Barbuda": ["Antigua and Barbuda"],
    "Bahamas": ["Bahamas, The"],
    "Bosnia & Herzegovina": ["Bosnia and Herzegovina"],
    "Brunei": ["Brunei Darussalam"],
    "Cabo Verde": ["Cape Verde"],
    "Caribbean Netherlands": ["Bonaire Sint Eustatius and Saba"],
    "Côte d’Ivoire": ["Cote d'Ivoire","Cote dIvoire"],
    "Czech Republic": ["Czechia","Czech Republic (Czechia)"],
    "Democratic Republic of Congo": ["Congo - Kinshasa"],
    "Egypt": ["Egypt, Arab Rep."],
    "Faeroe Islands": ["Faroe Islands"],
    "Gambia": ["Gambia, The"],
    "Hong Kong": ["Hong Kong SAR China"],
    "Iran": ["Iran, Islamic Rep."],
    "Kyrgyzstan": ["Kyrgyz Republic"],
    "Laos": ["Lao PDR"],
    "Micronesia": ["Micronesia, Fed. Sts.","Micronesia (country)"],
    "Myanmar": ["Myanmar (Burma)","Burma"],
    "North Macedonia": ["Macedonia"],
    "Pitcairn Island": ["Pitcairn"],
    "Republic of the Congo": ["Congo - Brazzaville","Congo"],
    "Russia": ["Russian Federation"],
    "São Tomé and Príncipe": ["Sao Tome and Principe","Sao Tome & Príncipe","São Tomé & Príncipe"],
    "Sint Maarten": ["Sint Maarten (Dutch part)"],
    "Slovakia": ["Slovak Republic"],
    "St. Helena":["Saint Helena"],
    "St. Kitts & Nevis": ["Saint Kitts and Nevis"],
    "St. Lucia": ["Saint Lucia"],
    "St. Vincent & Grenadines": ["Saint Vincent and the Grenadines"],
    "State of Palestine": ["Palestinian Territories","Palestine"],
    "Swaziland": ["Eswatini"],
    "Syria": ["Syrian Arab Republic"],
    "Timor": ["Timor-Leste"],
    "Trinidad & Tobago": ["Trinidad and Tobago"],
    "Vatican City": ["Holy See","Vatican"],
    "Wallis & Futuna": ["Wallis and Futuna"],
    "Yemen": ["Yemen, Rep."],
    "" : ["nan"]
}
us_territories =['American Samoa','Commonwealth of the Northern Mariana Islands','Guam','Puerto Rico','U.S. Virgin Islands']
territories = {
    "American Samoa":{"Region":"North America","Country":"United States"}, 
    "Anguilla":{"Region":"Europe","Country":"United Kingdom"},
    "Caribbean Netherlands":{"Region":"Europe","Country":"Netherlands"},
    "Channel Islands":{"Region":"Europe","Country":"Channel Islands"},
    "Curaçao":{"Region":"Europe","Country":"Netherlands"},
    "Falkland Islands":{"Region":"Europe","Country":"United Kingdom"},
    "French Guiana":{"Region":"Europe","Country":"France"},
    "Guadeloupe":{"Region":"Europe","Country":"France"},
    "Guernsey":{"Region":"Europe","Country":"United Kingdom"},
    "Hong Kong":{"Region":"East Asia and Pacific","Country":"China"},
    "Jersey":{"Region":"Europe","Country":"United Kingdom"},
    "International":{"Region":"World","Country":""},
    "Macao":{"Region":"East Asia and Pacific","Country":"China"},
    "Martinique":{"Region":"Europe","Country":"France"},
    "Mayotte":{"Region":"Europe","Country":"France"},
    "Montserrat":{"Region":"Europe","Country":"United Kingdom"},
    "Northern Cyprus":{"Region":"Europe","Country":"Turkey"},
    "Pitcairn Island":{"Region":"Europe","Country":"United Kingdom"},
    "Réunion":{"Region":"Europe","Country":"France"},
    "St. Helena":{"Region":"Europe","Country":"United Kingdom"},
    "St. Martin":{"Region":"Europe","Country":"France"},
    "St. Pierre & Miquelon":{"Region":"Europe","Country":"France"},
    "Tokelau":{"Region":"East Asia and Pacific","Country":"New Zealand"},
    "Turks and Caicos":{"Region":"Europe","Country":"United Kingdom"},
    "U.S. Virgin Islands":{"Region":"North America","Country":"United States"},
    "Wallis & Futuna":{"Region":"Europe","Country":"France"},
    "World":{"Region":"World","Country":""}
}

In [20]:
def titleCase(words):
    if len(words) > 3:
        titlecased = ""
        wordsArray = words.lower().split(" ")
        for word in wordsArray:
            if len(titlecased) > 0 :
                titlecased = titlecased + " "
            if word == "and":
                titlecased = titlecased + "and"
            else:
                titlecased = titlecased + word.capitalize()
        return titlecased
    else:
        return words.upper()

def fixRegion(code):
    region_name = ""
    for region in census_regions:
        if region["number"] == code:
            region_name = region["name"]
            break
    if region_name == "":
        region_name = "Other"
        print(str(code) + " not found")
    return region_name

def checkRegions(regionColumn, countryColumn):
    fixed = []
    for i in range(0,len(regionColumn)):
        region = regionColumn[i]
        country = countryColumn[i]
        if (not (region in regions)) and (not (country in fixed)):
            fixed.append(country)
            print(f"{country} = {region}")

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
def getAgeRange(age):
    age_range = ""
    if age == 0:
        age_range = "< 1"
    elif age == 999:
        age_range = "Total"
    elif age < 5:
        age_range = "1-4"
    elif age < 15:
        age_range = "5-14"
    elif age < 25:
        age_range = "15-24"
    elif age < 35:
        age_range = "25-34"
    elif age < 45:
        age_range = "35-44"
    elif age < 55:
        age_range = "45-54"
    elif age < 65:
        age_range = "55-64"
    elif age < 75:
        age_range = "65-74"
    elif age < 85:
        age_range = "75-84"
    elif age == 85:
        age_range = "85+"
    return age_range

def fixSex(code):
    sex = ""
    if code == 0:
        sex = "Population 2019"
    elif code == 1:
        sex = "Male"
    elif code == 2:
        sex = "Female"
    else:
        print(str(code) + " is not a sex")
    return sex

def us_date(x):
    month = x[5:7]
    day = x[8:11]
    year = x[0:4]
    conversion = month + "/" + day +"/"+ year
    return conversion

def removeDecimal(data):
    strData = str(data)
    decimalLocation = strData.find(".")
    if decimalLocation > -1:
        return strData[0:decimalLocation]
    else:
        return strData

def emptyNan(value):
    if (value == "nan"):
        return ""
    else:
        return value

def printColumns(df, label):
    print(label)
    print(df.columns)

def print_column_unique(column):
    print("Column Values:")
    values = list(set(sorted(column,key=lambda v: (isinstance(v, str), v))))
    # values = column.sort_values(ascending = True).unique()
    print(values)
    return values

def print_column_missing(column, comparison):
    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)
    missing_values = []
    for value in values:
        if not value in comparison:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Column values not in comparison:")
        print(missing_values)
    else:
        print("No missing values")
    missing_values = []
    for value in comparison:
        if not value in values:
            missing_values.append(value)
    if len(missing_values) > 0:
        print("Comparison values not in column:")
        print(missing_values)
    else:
        print("No missing values")
    return values

def division(a,b):
    if not (b == 0):
        return a/b
    else:
        return np.nan
    
def key_from_value(value, dictionary, default):
    return_value = default
    for key, values in dictionary.items():
        if value.strip() in values:
            return_value = key
            break
    return return_value.strip()

def region_from_country(country):
    return key_from_value(country, countries_by_region, "")

def fixTerritoryRegion(territory):
    characteristics = territories[territory]
    return characteristics["Region"]

def fixCountry(value):
    return key_from_value(value, country_conversions, value)

def fixCountries(countries_column, configuredCountries):
    countries_conversion = countries_column.astype(str)
    countries_conversion = countries_conversion.apply(lambda x: fixCountry(x))
    print(conversions)
    countries = print_column_missing(countries_conversion,configuredCountries)
    return countries_conversion

def checkCountries(column):
    fixed = []
    for value in column:
        fixedValue = fixCountry(value)
        if (not (fixedValue == value)) and (not (value in fixed)):
            fixed.append(value)
            print(f"{value} => {fixedValue}")
    if len(fixed) == 0:
        print("No countries need to be fixed.")

def testConversion(title, test_array, conversion):
    print(title)
    no_conversions = []
    for value in test_array:
        return_value = ""
        if conversion == "country":
            return_value = fixCountry(value)
        elif conversion == "region":
            return_value = region_from_country(fixCountry(value))
        if return_value != value.strip():
            print(value.strip() + "," + return_value)
        if return_value == "":
            no_conversions.append(value)
    if len(no_conversions) > 0:
        print("Missing Conversions")
        print(no_conversions)
    print("")
    
def division(a,b):
    if not (b == 0):
        return a/b
    else:
        return np.nan
    
def fixProvince(value):
    province_map = {
        'BC': 'British Columbia',
        'NL': 'Newfoundland and Labrador',
        'NWT': 'Northwest Territories',
        'PEI': 'Prince Edward Island',
        'Repatriated': 'Repatriated Canada',
        'Repatriated Cdn': 'Repatriated Canada'
    }
    value = titleCase(value)
    if value in province_map.keys():
        new_province = province_map[value]
        return new_province
    else:
        return value

def censusRegionByState(state):
    for key in census_regions.keys():
        if (state in census_regions[key]["states"]):
            return census_regions[key]["name"]
        
def fixUSRegion(code):
    region = census_regions[code]
    region_name = region["name"]
    return region_name

def convertDateToExcel(dayString) :
    temp = datetime(1899, 12, 30)
    day = datetime.strptime(dayString, '%m/%d/%Y')
    delta = day - temp
    return float(delta.days) + (float(delta.seconds) / 86400)

def firstDay(dayString):
    dt = datetime.strptime(dayString, '%m/%d/%Y')
    firstDate = dt - timedelta(days=dt.weekday())
    return firstDate.strftime('%m/%d/%Y')

def lancet_region(country):
    region = ""
    for search_region in lancet_regions:
        if country in lancet_regions[search_region]:
            region = search_region
            break
    return region

In [21]:
all_populations = pd.read_excel(cleanedFolder + "all_populations.xlsx")
population_groups = ["Level","Region","Census Region","Country","State/Province"]
for group in population_groups:
    all_populations[group] = all_populations[group].astype(str)
    all_populations[group] = all_populations[group].apply(lambda x: "" if x=="nan" else x)
print(all_populations.columns)
all_populations.head()

Index([                  'Level',                  'Region',
                       'Country',           'Census Region',
                'State/Province',              'Population',
               'Population 100K',      'Country Population',
       'Country Population 100K',       'Region Population',
       ...
                       'Pct 1-4',                'Pct 5-14',
                     'Pct 15-24',               'Pct 25-34',
                     'Pct 35-44',               'Pct 45-54',
                     'Pct 55-64',               'Pct 65-74',
                     'Pct 75-84',                 'Pct 85+'],
      dtype='object', length=131)


Unnamed: 0,Level,Region,Country,Census Region,State/Province,Population,Population 100K,Country Population,Country Population 100K,Region Population,Region Population 100K,World Population,World Population 100K,Country Share,Region Share,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+
0,Country,Central Asia,Armenia,,,2963243,29.6324,,,326887719,3268.8772,7796609105,77966.0911,,0.0091,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Country,Central Asia,Azerbaijan,,,10139177,101.3918,,,326887719,3268.8772,7796609105,77966.0911,,0.031,0.0013,0.13,56.0,0.91,91459.0,1200.0,123.0,82658.0,2.1,32.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Country,Central Asia,Cyprus,,,1207359,12.0736,,,326887719,3268.8772,7796609105,77966.0911,,0.0037,0.0002,0.02,67.0,0.73,8784.0,5000.0,131.0,9240.0,1.3,37.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Country,Central Asia,Faeroe Islands,,,48863,0.4886,,,326887719,3268.8772,7796609105,77966.0911,,0.0001,0.0,0.0,43.0,0.38,185.0,,35.0,1396.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Country,Central Asia,Georgia,,,3989167,39.8917,,,326887719,3268.8772,7796609105,77966.0911,,0.0122,0.0005,0.05,58.0,-0.19,-7598.0,-10000.0,57.0,69490.0,2.1,38.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [22]:
# Global input data owid
print(owid_data)
o = pd.io.excel.read_excel(owid_data)
print(o.columns)

o = o.rename(columns={
    "location":"Country",
    "population": "Owid Population",
    "new_tests":"Owid Tests",
    "total_tests": "Owid Total Tests",
    "new_cases": "Owid Cases",
    "new_deaths": "Owid Deaths"
}).reset_index()

o["date"] = o["date"].astype(str)
o["Year"] = o["date"].apply(lambda x: x[0:4])
o["Month"] = o["date"].apply(lambda x: x[5:7])
o["Day"] = o["date"].apply(lambda x: x[8:10])
o["Date"] = o.apply(lambda x: x["Month"] + "/" + x["Day"] + "/" + x["Year"],axis=1)
o["Time"] = o.date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

o["Country"] = o["Country"].astype(str)
print_column_unique(o["Country"])
o["Country"] = o["Country"].apply(lambda x: "" if x=="nan" else x)
o["Country"] = o["Country"].apply(lambda x: fixCountry(x))
checkCountries(o["Country"])

min_date = o["Time"].min()
max_date = o["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))

o_integer_columns = ["Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o[o_integer_columns] = o[o_integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
o_columns = ["Country", "Time", "Date", "Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o = o[o_columns]

o.head()
#o_notnull = o.loc[pd.notnull(o["Owid Tests"])].copy().reset_index().drop(columns=["index"])
#o_notnull.head()

https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.xlsx?raw=true
Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vacc

Unnamed: 0,Country,Time,Date,Owid Population,Owid Tests,Owid Cases,Owid Deaths
0,Afghanistan,2020-02-24,02/24/2020,38928341.0,,1.0,
1,Afghanistan,2020-02-25,02/25/2020,38928341.0,,0.0,
2,Afghanistan,2020-02-26,02/26/2020,38928341.0,,0.0,
3,Afghanistan,2020-02-27,02/27/2020,38928341.0,,0.0,
4,Afghanistan,2020-02-28,02/28/2020,38928341.0,,0.0,


In [23]:
date_check = o.groupby(o["Country"])["Time"].max()
date_check.head(200)

Country
Afghanistan   2021-07-03
Africa        2021-07-03
Albania       2021-07-03
Algeria       2021-07-03
Andorra       2021-07-03
                 ...    
Suriname      2021-07-03
Swaziland     2021-07-03
Sweden        2021-07-03
Switzerland   2021-07-03
Syria         2021-07-03
Name: Time, Length: 200, dtype: datetime64[ns]

In [24]:
# Global input data
github_request=requests.get(github_url).content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

print("c  original columns")
print(c.columns)

print("Sets")
c["set"] = c["set"].astype(str)
sets = print_column_unique(c["set"])
c = c[c["set"] == "country"].copy().rename(columns={
    "name":"Country",
    "unit":"Abbreviation",
    "time":"Time"
}).reset_index()
c = c.drop(columns=["index","set","pop_100k"])

print("Countries")
c["Country"] = c["Country"].astype(str)
c["Country"] = c["Country"].apply(lambda x: "" if x=="nan" else x)
c["Country"] = c["Country"].apply(lambda x: fixCountry(x))
checkCountries(c["Country"])

# Format text date and add datetime for date
c["Time"] = c["Time"].astype(str)
c["Time"] = c["Time"].apply(lambda x: us_date(x))
c["Date"] = c["Time"]
c["Time"] = c.apply(lambda x: pd.to_datetime(x["Date"], format="%m/%d/%Y"), axis=1)
minmax_dates = c.groupby(["Country"]).agg({"Date": [np.min,np.max]})
min_date = c["Time"].min()
max_date = c["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))
c["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)

o_not_international = o.loc[
    (o["Country"]!="International") & 
    (o["Country"]!="Africa") & 
    (o["Country"]!="Asia") &
    (o["Country"]!="Europe") & 
    (o["Country"]!="European Union") & 
    (o["Country"]!="North America") & 
    (o["Country"]!="Oceania") & 
    (o["Country"]!="South America")
].copy()
c = o_not_international.merge(c,how="left",on=["Country","Date"])

incomes = ["High income","Upper middle income","Lower middle income","Low income"]
c["Region"] = c["Country"].apply(lambda x: region_from_country(x))
c["Level"] = c.apply(lambda x: "Territory" if x["Region"] == "" 
                     else "World" if ((x["Region"] == "World") or (x["Country"] in incomes))  
                     else "Country",axis=1)
c["Region"] = c.apply(lambda x: "World" if (x["Country"] in incomes) 
                      else fixTerritoryRegion(x["Country"]) if x["Region"] == "" 
                      else x["Region"],axis=1)
c["State/Province"] = c.apply(lambda x: x["Country"] if x["Level"]=="Territory" else "",axis=1)
c["Country"] = c.apply(lambda x: "" if x["Level"]=="Territory" else x["Country"],axis=1)

# Format numeric columns
numeric_columns = [
    'new_cases_orig','new_deaths_orig','new_tests_orig',
    'cap_cum_cases','cap_new_cases',
    'cap_cum_deaths','cap_new_deaths',
    'cap_cum_tests','cap_new_tests',
    'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
    'all_cum_tests','all_new_tests',
    'pos'
]
float_columns = [
    'cap_cum_cases','cap_new_cases','cap_cum_deaths',
    'cap_new_deaths','cap_cum_tests','cap_new_tests'
]
integer_columns = [
    'new_cases_orig', 'Owid Cases', 'all_cum_cases',
    'new_deaths_orig', 'Owid Deaths', 'all_cum_deaths',
    'new_tests_orig', 'Owid Tests', 'all_cum_tests', 'all_new_tests'
]
c[float_columns] = c[float_columns].apply(pd.to_numeric)
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
c = c.drop(columns=['Time_y'])
c = c.rename(columns ={
    "Time_x":"Time"
})
c["Tests Daily"] = c.apply(
    lambda x: x["Owid Tests"] if (pd.isna(x["new_tests_orig"]) and pd.notna(x["Owid Tests"])) else x["new_tests_orig"],
    axis=1
)
c["Cases Daily"] = c.apply(
    lambda x: x["Owid Cases"] if (pd.isna(x["new_cases_orig"]) and pd.notna(x["Owid Cases"])) else x["new_cases_orig"],
    axis=1
)
c["Deaths Daily"] = c.apply(
    lambda x: x["Owid Deaths"] if (pd.isna(x["new_deaths_orig"]) and pd.notna(x["Owid Deaths"])) else x["new_deaths_orig"],
    axis=1
)
print("c columns after Tests Daily")
print(c.columns)

c_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation',
    'Time', 'Date', 'Accessed',
    'Tests Daily', 'Cases Daily', 'Deaths Daily'
]
c = c[c_order]

c = c.where(c.notnull(), None)

c = c.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"])

c_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 
    'Tests Daily', 'Cases Daily', 'Deaths Daily',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
c = c.sort_values(by=['Region','Country','Time']).reset_index()
print(c.columns)
c = c[c_data_order]

pop_check = c.loc[pd.isnull(c["Population"])]
print_column_unique(pop_check["Country"])

c.head()

c  original columns
Index(['set', 'name', 'unit', 'time', 'cum_tests_orig', 'new_tests_orig',
       'pop_100k', 'new_cases_orig', 'new_deaths_orig', 'cap_cum_cases',
       'cap_new_cases', 'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests',
       'cap_new_tests', 'all_cum_cases', 'all_new_cases', 'all_cum_deaths',
       'all_new_deaths', 'all_cum_tests', 'all_new_tests', 'pos'],
      dtype='object')
Sets
Column Values:
['country', 'income', 'region']
Countries
No countries need to be fixed.
Min: 2020-01-18 00:00:00
Max: 2021-07-04 00:00:00
c columns after Tests Daily
Index(['Country', 'Time', 'Date', 'Owid Population', 'Owid Tests',
       'Owid Cases', 'Owid Deaths', 'Abbreviation', 'cum_tests_orig',
       'new_tests_orig', 'new_cases_orig', 'new_deaths_orig', 'cap_cum_cases',
       'cap_new_cases', 'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests',
       'cap_new_tests', 'all_cum_cases', 'all_new_cases', 'all_cum_deaths',
       'all_new_deaths', 'all_cum_tests', 'all_new_

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Tests Daily,Cases Daily,Deaths Daily,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,Central Asia,Armenia,,AM,2020-03-01,03/01/2020,,1.0,0.0,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
1,Country,Central Asia,Armenia,,AM,2020-03-02,03/02/2020,,0.0,0.0,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
2,Country,Central Asia,Armenia,,AM,2020-03-03,03/03/2020,,0.0,0.0,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
3,Country,Central Asia,Armenia,,AM,2020-03-04,03/04/2020,,0.0,0.0,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
4,Country,Central Asia,Armenia,,AM,2020-03-05,03/05/2020,,0.0,0.0,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021


In [25]:
date_check = c.groupby(c["Country"])["Time"].max()
date_check.head(200)

Country
                           2021-07-03
Afghanistan                2021-07-03
Albania                    2021-07-03
Algeria                    2021-07-03
Andorra                    2021-07-03
                              ...    
Tunisia                    2021-07-03
Turkey                     2021-07-03
Turkmenistan               2021-04-04
Turks and Caicos Islands   2021-06-18
Tuvalu                     2021-06-15
Name: Time, Length: 200, dtype: datetime64[ns]

In [26]:
# Canada raw data
canada_source_request = requests.get(canada_source_csv).content
canada_df = pd.read_csv(io.StringIO(canada_source_request.decode('utf-8')))
currentTime = datetime.now()

print("Original Canada Columns")
print(canada_df.columns)
canada_df.rename(columns = {
    'Province': 'State/Province',
    'SummaryDate': 'Time',
    'TotalCases': 'Total Cases','DailyTotals': 'Cases Daily',
    'TotalRecovered' : 'Total Recovered','DailyRecovered': 'Recovered Daily',
    'TotalDeaths': 'Total Deaths','DailyDeaths': 'Deaths Daily',
    'TotalTested': 'Total Tests','DailyTested': 'Tests Daily',
    'TotalActive': 'Total Active','DailyActive': 'Active Daily',
    'TotalHospitalized': 'Total Hospitalized','DailyHospitalized': 'Hospitalized Daily',
    'TotalICU': 'Total ICU', 'DailyICU': 'ICU Daily'
}, inplace = True)
print("Renamed Canada Columns")
print(canada_df.columns)

canada_df.drop(columns=["OBJECTID"], inplace = True)
canada_df["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
canada_df["Country"] = "Canada"
canada_df["Region"] = "North America"
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: fixProvince(x))
canada_df["Date"] = canada_df["Time"].apply(lambda x: us_date(x).replace(" ",""))
canada_df["Time"] = canada_df["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
canada_df["Level"] = canada_df["State/Province"].apply(lambda x: "Country" if x == "Canada" else "State/Province")
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: "" if x=="Canada" else x)
string_columns = ["State/Province","Abbreviation","Country","Region"]
canada_df = canada_df.sort_values(by=["Level","Country","State/Province","Date"])
canada_df = canada_df.reset_index()
canada_df = canada_df.drop(columns=["index"])
canada_df = canada_df.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"]).reset_index()
canada_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date',
    'Cases Daily', 'Total Cases',
    'Tests Daily', 'Total Tests', 
    'Deaths Daily', 'Total Deaths', 
    'Recovered Daily', 'Total Recovered', 
    'Active Daily', 'Total Active',
    'Hospitalized Daily', 'Total Hospitalized', 
    'ICU Daily', 'Total ICU', 
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
canada_df = canada_df[canada_order].copy()
canada_min = canada_df["Time"].min()
print("Min Date: " + str(canada_min))
canada_max = canada_df["Time"].max()
print("Max Date: " + str(canada_max))
print_column_unique(canada_df["State/Province"])
canada_df.head(75)

Original Canada Columns
Index(['OBJECTID', 'Province', 'Abbreviation', 'DailyTotals', 'SummaryDate',
       'TotalCases', 'TotalRecovered', 'DailyRecovered', 'TotalDeaths',
       'DailyDeaths', 'TotalTested', 'DailyTested', 'TotalActive',
       'DailyActive', 'TotalHospitalized', 'DailyHospitalized', 'TotalICU',
       'DailyICU', 'TotalVaccinated', 'DailyVaccinated', 'TotalDose1',
       'DailyDose1', 'TotalDose2', 'DailyDose2'],
      dtype='object')
Renamed Canada Columns
Index(['OBJECTID', 'State/Province', 'Abbreviation', 'Cases Daily', 'Time',
       'Total Cases', 'Total Recovered', 'Recovered Daily', 'Total Deaths',
       'Deaths Daily', 'Total Tests', 'Tests Daily', 'Total Active',
       'Active Daily', 'Total Hospitalized', 'Hospitalized Daily', 'Total ICU',
       'ICU Daily', 'TotalVaccinated', 'DailyVaccinated', 'TotalDose1',
       'DailyDose1', 'TotalDose2', 'DailyDose2'],
      dtype='object')
Min Date: 2020-01-25 00:00:00
Max Date: 2021-07-03 00:00:00
Column Values

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Cases Daily,Total Cases,Tests Daily,Total Tests,Deaths Daily,Total Deaths,Recovered Daily,Total Recovered,Active Daily,Total Active,Hospitalized Daily,Total Hospitalized,ICU Daily,Total ICU,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,North America,Canada,,CA,2021-01-01,01/01/2021,1302,582704,163563,16051201,0,15605,6,489816,-4.0000,73598,0.0000,4067.0000,0.0000,794.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
1,Country,North America,Canada,,CA,2021-01-02,01/02/2021,7583,590287,218050,16269251,109,15714,5264,495080,1311.0000,74909,-192.0000,3875.0000,-9.0000,785.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
2,Country,North America,Canada,,CA,2021-01-03,01/03/2021,11372,601659,61889,16331140,151,15865,9891,504971,931.0000,75840,42.0000,3917.0000,24.0000,809.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
3,Country,North America,Canada,,CA,2021-01-04,01/04/2021,9765,611424,-39110,16292030,209,16074,12912,517883,5095.0000,80935,251.0000,4168.0000,-2.0000,807.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
4,Country,North America,Canada,,CA,2021-01-05,01/05/2021,7222,618646,219952,16511982,160,16234,5681,523564,1434.0000,82369,193.0000,4361.0000,23.0000,830.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Country,North America,Canada,,CA,2020-02-17,02/17/2020,0,8,0,0,0,0,0,1,0.0000,7,,,,,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
71,Country,North America,Canada,,CA,2021-02-17,02/17/2021,2605,834187,79950,23780997,38,21435,3553,779766,-987.0000,32906,-25.0000,2362.0000,0.0000,592.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
72,Country,North America,Canada,,CA,2020-02-18,02/18/2020,0,8,0,0,0,0,0,1,0.0000,7,,,,,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
73,Country,North America,Canada,,CA,2021-02-18,02/18/2021,3314,837501,117157,23898154,63,21498,3650,783416,-379.0000,32527,-5.0000,2357.0000,-34.0000,558.0000,38005238.0000,380.0524,,,,365057840.0000,3650.5784,0.1041,7796609105.0000,77966.0911,0.0049,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021


In [27]:
us_state_codes = pd.read_excel(us_codes)
us_state_codes["Census Region"] = us_state_codes["State Name"].apply(lambda x: censusRegionByState(x))
us_state_codes.rename(columns = {
    'State Name':'State/Province',
    "State Abbreviation": "Abbreviation"
}, inplace = True)
us_state_codes.head(70)

Unnamed: 0,State/Province,FIPS,Abbreviation,Status,Census Region
0,United States,0,US,0,United States
1,Alabama,1,AL,0,South
2,Alaska,2,AK,0,West
3,Arizona,4,AZ,0,West
4,Arkansas,5,AR,0,South
...,...,...,...,...,...
65,Baker Island,81,,4,
66,Howland Island,84,,4,
67,Jarvis Island,86,,4,
68,Kingman Reef,89,,4,


In [28]:
us_tests_request = requests.get(us_tests_csv).content
us_tests = pd.read_csv(io.StringIO(us_tests_request.decode('utf-8')))
print("Original Tests:")
print(us_tests.columns)
us_tests["Time"] = us_tests["date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
us_tests.rename(
    columns = {
        'date': 'Date', 
        'state' : 'Abbreviation',
        'tests_viral_total' : 'Total Tests'
    }, inplace = True)
us_tests["Total Tests"] = us_tests["Total Tests"].fillna(us_tests['tests_combined_total'])
us_tests = us_tests.sort_values(by=['Abbreviation','Time']).reset_index().drop(columns=['index'])
us_tests["Tests Daily"] = us_tests.groupby(['Abbreviation'], as_index=False)["Total Tests"].diff().reset_index(drop=True).fillna(0)
us_tests.drop(columns=[
    'Date',
    'cases_conf_probable', 
    'tests_viral_positive', 'tests_viral_negative', 
    'people_viral_positive', 'people_viral_total', 
    'encounters_viral_total','tests_combined_total'
], inplace = True)
print("Renamed Columns")
print(us_tests.columns)
us_tests_order = ['Abbreviation', 'Time', 'Total Tests', 'Tests Daily']
us_tests = us_tests[us_tests_order]
print_column_unique(us_tests["Abbreviation"])
us_tests_min = us_tests["Time"].min()
print("Min Date: " + str(us_tests_min))
us_tests_max = us_tests["Time"].max()
print("Max Date: " + str(us_tests_max))
us_tests.head()

Original Tests:
Index(['date', 'state', 'cases_conf_probable', 'cases_confirmed',
       'cases_probable', 'tests_viral_positive', 'tests_viral_negative',
       'tests_viral_total', 'tests_antigen_positive', 'tests_antigen_total',
       'people_viral_positive', 'people_viral_total',
       'people_antigen_positive', 'people_antigen_total',
       'encounters_viral_total', 'tests_combined_total'],
      dtype='object')
Renamed Columns
Index(['Abbreviation', 'cases_confirmed', 'cases_probable', 'Total Tests',
       'tests_antigen_positive', 'tests_antigen_total',
       'people_antigen_positive', 'people_antigen_total', 'Time',
       'Tests Daily'],
      dtype='object')
Column Values:
['MA', 'NJ', 'WY', 'AS', 'HI', 'DE', 'MN', 'NV', 'IA', 'LA', 'KS', 'ID', 'MI', 'UT', 'MT', 'DC', 'KY', 'NY', 'WV', 'MD', 'ME', 'GA', 'OH', 'IL', 'IN', 'CA', 'WI', 'WA', 'TX', 'MP', 'GU', 'NH', 'OR', 'SC', 'FL', 'PA', 'ND', 'AR', 'NE', 'NC', 'AL', 'AK', 'PR', 'TN', 'CT', 'CO', 'VA', 'OK', 'MO', 'SD', 'A

Unnamed: 0,Abbreviation,Time,Total Tests,Tests Daily
0,AK,2020-03-06,8.0,0.0
1,AK,2020-03-07,12.0,4.0
2,AK,2020-03-08,14.0,2.0
3,AK,2020-03-09,23.0,9.0
4,AK,2020-03-10,23.0,0.0


In [29]:
cdc_cases_deaths_request=requests.get(cdc_cases_deaths_csv).content
cdc_cases_deaths=pd.read_csv(io.StringIO(cdc_cases_deaths_request.decode('utf-8')))
print("Original Columns:")
print(cdc_cases_deaths.columns)

currentTime = datetime.now()
cdc_cases_deaths["Accessed"] = currentTime
cdc_cases_deaths["Time"] = cdc_cases_deaths["submission_date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
cdc_cases_deaths.drop(columns=[
    'created_at',
    'conf_cases', 'prob_cases', 'pnew_case',
    'conf_death', 'prob_death', 'pnew_death', 
    'consent_cases', 'consent_deaths'
], inplace = True)
cdc_cases_deaths.rename(
    columns = {
        'submission_date': 'Date', 'state' : 'Abbreviation',
        'tot_cases' : 'Total Cases', 'new_case' : 'Cases Daily',
        'tot_death' : 'Total Deaths', 'new_death' : 'Deaths Daily'
    }, inplace = True)
cdc_cases_deaths["Country"] = "United States"
cdc_cases_deaths["Region"] = "North America"

cdc_cases_deaths = cdc_cases_deaths.merge(us_tests, how="left", on=["Abbreviation", "Time"])
print("US Tests Columns:")
print(cdc_cases_deaths.columns)
print_column_unique(cdc_cases_deaths["Abbreviation"])
cdc_cases_deaths["Tests Daily"] = cdc_cases_deaths["Tests Daily"].fillna(0).astype('int64')
cdc_cases_deaths["Total Tests"] = cdc_cases_deaths["Total Tests"].fillna(0).astype('int64')
cdc_cases_deaths["Cases Daily"] = cdc_cases_deaths["Cases Daily"].fillna(0).apply(lambda x: int(float(x.replace(",","")))).astype('int64')
cdc_cases_deaths["Total Cases"] = cdc_cases_deaths["Total Cases"].fillna(0).apply(lambda x: int(float(x.replace(",","")))).astype('int64')
cdc_cases_deaths["Deaths Daily"] = cdc_cases_deaths["Deaths Daily"].fillna(0).apply(lambda x: int(float(x.replace(",","")))).astype('int64')
cdc_cases_deaths["Total Deaths"] = cdc_cases_deaths["Total Deaths"].fillna(0).apply(lambda x: int(float(x.replace(",","")))).astype('int64')
headers = ['Region', 'Country', 'Abbreviation', 'Time', 'Date','Accessed']
cols = ['Cases Daily', 'Total Cases', 'Deaths Daily',  'Total Deaths', 'Tests Daily', 'Total Tests']
all_cols = headers + cols
cdc_cases_deaths = cdc_cases_deaths[all_cols]
cdc_cases_deaths = cdc_cases_deaths.sort_values(by=['Time','Abbreviation']).reset_index().drop(columns=['index'])
cdc_cases_deaths.head()

ny = cdc_cases_deaths.loc[cdc_cases_deaths["Abbreviation"].str.contains("NY")].copy()
ny["Abbreviation"] = "NY"
ny_sum = ny.groupby(headers)[cols].sum().reset_index()
ny_sum.head()

cdc_cases_deaths = cdc_cases_deaths.loc[~cdc_cases_deaths["Abbreviation"].str.contains("NY")].copy()
cdc_cases_deaths = pd.concat([cdc_cases_deaths,ny_sum])
cdc_cases_deaths = cdc_cases_deaths.sort_values(by=['Abbreviation', 'Time']).reset_index().drop(columns=['index'])
cdc_cases_deaths.head()

Original Columns:
Index(['submission_date', 'state', 'tot_cases', 'conf_cases', 'prob_cases',
       'new_case', 'pnew_case', 'tot_death', 'conf_death', 'prob_death',
       'new_death', 'pnew_death', 'created_at', 'consent_cases',
       'consent_deaths'],
      dtype='object')
US Tests Columns:
Index(['Date', 'Abbreviation', 'Total Cases', 'Cases Daily', 'Total Deaths',
       'Deaths Daily', 'Accessed', 'Time', 'Country', 'Region', 'Total Tests',
       'Tests Daily'],
      dtype='object')
Column Values:
['MA', 'NJ', 'WY', 'AS', 'HI', 'DE', 'MN', 'NV', 'IA', 'LA', 'KS', 'ID', 'MI', 'UT', 'MT', 'DC', 'KY', 'NY', 'WV', 'MD', 'ME', 'GA', 'OH', 'IL', 'IN', 'CA', 'WI', 'WA', 'FSM', 'TX', 'MP', 'GU', 'NH', 'OR', 'PW', 'RMI', 'FL', 'SC', 'PA', 'ND', 'AR', 'NE', 'NC', 'AL', 'AK', 'PR', 'TN', 'CT', 'CO', 'NYC', 'OK', 'VA', 'MO', 'SD', 'AZ', 'NM', 'RI', 'VI', 'MS', 'VT']


Unnamed: 0,Region,Country,Abbreviation,Time,Date,Accessed,Cases Daily,Total Cases,Deaths Daily,Total Deaths,Tests Daily,Total Tests
0,North America,United States,AK,2020-01-22,01/22/2020,2021-07-04 20:44:00.168809,0,0,0,0,0,0
1,North America,United States,AK,2020-01-23,01/23/2020,2021-07-04 20:44:00.168809,0,0,0,0,0,0
2,North America,United States,AK,2020-01-24,01/24/2020,2021-07-04 20:44:00.168809,0,0,0,0,0,0
3,North America,United States,AK,2020-01-25,01/25/2020,2021-07-04 20:44:00.168809,0,0,0,0,0,0
4,North America,United States,AK,2020-01-26,01/26/2020,2021-07-04 20:44:00.168809,0,0,0,0,0,0


In [30]:
cdc_cases_deaths = cdc_cases_deaths.merge(us_state_codes, how="left", on="Abbreviation")
print("State Codes Columns:")
print(cdc_cases_deaths.columns)

cdc_cases_deaths["Level"] = cdc_cases_deaths.apply(
    lambda x: "Territory" if x["State/Province"] in us_territories else "State/Province", axis=1)

cdc_cases_deaths = cdc_cases_deaths.merge(
    all_populations,how="left",on=["Level", "Region", "Country", 'Census Region', "State/Province"]
).reset_index().drop(columns=["index"])
print("Population Columns:")
print(cdc_cases_deaths.columns)

cdc_cases_deaths = cdc_cases_deaths.sort_values(by=['State/Province','Time']).reset_index().drop(columns=['index'])
cdc_cases_deaths_order = [
    'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS', 'Status',
    'Time', 'Date', 
    'Cases Daily', 'Total Cases', 
    'Deaths Daily', 'Total Deaths',
    'Tests Daily', 'Total Tests',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
cdc_cases_deaths = cdc_cases_deaths[cdc_cases_deaths_order]
print("Ordered Columns:")
print(cdc_cases_deaths.columns)
cdc_cases_deaths["State/Province"] = cdc_cases_deaths["State/Province"].astype(str).apply(lambda x: "" if x=="nan" else x)
cdc_cases_deaths["Abbreviation"] = cdc_cases_deaths["Abbreviation"].astype(str).apply(lambda x: "" if x=="nan" else x)

us_territories = {
    "AS": {"name": "American Samoa", "level": "Territory"},
    "DC": {"name": "District of Columbia", "level": "Territory"},
    "FSM": {"name": "Federated States of Micronesia", "level": "Territory"},
    "GU": {"name": "Guam", "level": "Territory"},
    "MP": {"name": "Northern Mariana Islands", "level": "Territory"},
    "NYC": {"name": "New York City", "level": "Territory"},
    "PR": {"name": "Puerto Rico", "level": "Territory"},
    "PW": {"name": "Palau", "level": "Territory"},
    "RMI": {"name": "Republic of Marshall Islands", "level": "Territory"},
    "VI": {"name": "Virgin Islands", "level": "Territory"}
}
def getTerritoryName(abbreviation):
    territory = us_territories[abbreviation]
    territory_name = territory["name"]
    return territory_name
cdc_cases_deaths["State/Province"] = cdc_cases_deaths.apply(
    lambda x: getTerritoryName(x["Abbreviation"]) if x["Abbreviation"] in  us_territories else x["State/Province"],
    axis=1
)
cdc_cases_deaths["Level"] = cdc_cases_deaths.apply(
    lambda x: "Territory" if x["Abbreviation"] in  us_territories else "State/Province",
    axis=1
)

print("US States")
print_column_unique(cdc_cases_deaths["State/Province"])
print_column_unique(cdc_cases_deaths["Abbreviation"])
us_min = cdc_cases_deaths["Time"].min()
print("US Min Date: " + str(us_min))
us_max = cdc_cases_deaths["Time"].max()
print("US Max Date: " + str(us_max))

cdc_cases_deaths.head(75)

State Codes Columns:
Index(['Region', 'Country', 'Abbreviation', 'Time', 'Date', 'Accessed',
       'Cases Daily', 'Total Cases', 'Deaths Daily', 'Total Deaths',
       'Tests Daily', 'Total Tests', 'State/Province', 'FIPS', 'Status',
       'Census Region'],
      dtype='object')
Population Columns:
Index([      'Region',      'Country', 'Abbreviation',         'Time',
               'Date',     'Accessed',  'Cases Daily',  'Total Cases',
       'Deaths Daily', 'Total Deaths',
       ...
            'Pct 1-4',     'Pct 5-14',    'Pct 15-24',    'Pct 25-34',
          'Pct 35-44',    'Pct 45-54',    'Pct 55-64',    'Pct 65-74',
          'Pct 75-84',      'Pct 85+'],
      dtype='object', length=143)
Ordered Columns:
Index([         'Level',         'Region',        'Country',  'Census Region',
       'State/Province',   'Abbreviation',           'FIPS',         'Status',
                 'Time',           'Date',
       ...
             'Pct 5-14',      'Pct 15-24',      'Pct 25-34', 

Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Status,Time,Date,Cases Daily,Total Cases,Deaths Daily,Total Deaths,Tests Daily,Total Tests,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-01-22,01/22/2020,33,33,0,0,0,0,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
1,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-01-23,01/23/2020,1,34,0,0,0,0,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
2,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-01-24,01/24/2020,0,34,0,0,0,0,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
3,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-01-25,01/25/2020,3,37,0,0,0,0,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
4,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-01-26,01/26/2020,0,37,0,0,0,0,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-04-01,04/01/2020,206,3420,7,56,495,7774,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
71,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-04-02,04/02/2020,235,3655,3,59,962,8736,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
72,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-04-03,04/03/2020,217,3872,8,67,883,9619,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809
73,State/Province,North America,United States,South,Alabama,AL,1.0000,0.0000,2020-04-04,04/04/2020,147,4019,4,71,1234,10853,4889347.0000,48.8935,327052602.0000,3270.5260,0.0149,365057840.0000,3650.5784,0.0134,7796609105.0000,77966.0911,0.0006,,,,,,,,,,56901.0000,58290.0000,59073.0000,59799.0000,60294.0000,59568.0000,58599.0000,59537.0000,60023.0000,60241.0000,60897.0000,63083.0000,62906.0000,61883.0000,61729.0000,61740.0000,61799.0000,61924.0000,62938.0000,64125.0000,63587.0000,64201.0000,63943.0000,63719.0000,63922.0000,65079.0000,65208.0000,67027.0000,69478.0000,68758.0000,64852.0000,61469.0000,59980.0000,59615.0000,60721.0000,58941.0000,59921.0000,60346.0000,60696.0000,62200.0000,58159.0000,57993.0000,57852.0000,55498.0000,58174.0000,57008.0000,58838.0000,61959.0000,65460.0000,64750.0000,60738.0000,59494.0000,59786.0000,61321.0000,65925.0000,66906.0000,66695.0000,67073.0000,67308.0000,68221.0000,65605.0000,65211.0000,65365.0000,63117.0000,62042.0000,59584.0000,56766.0000,54694.0000,52697.0000,51707.0000,50567.0000,49884.0000,51612.0000,37091.0000,36845.0000,35441.0000,36173.0000,30575.0000,27572.0000,26053.0000,23977.0000,22580.0000,19594.0000,18222.0000,16660.0000,91543.0000,237456.0000,608466.0000,631898.0000,642187.0000,589780.0000,615279.0000,657543.0000,501447.0000,256847.0000,0.0116,0.0486,0.1244,0.1292,0.1313,0.1206,0.1258,0.1345,0.1026,0.0525,0.0187,2021-07-04 20:44:00.168809


In [31]:
not_cn = c.loc[(c["Country"]!="Canada")].copy()
all_data = pd.concat([cdc_cases_deaths,canada_df,not_cn],sort=False)
print_column_unique(all_data["State/Province"])
all_data = all_data.sort_values(by=['Region', 'Country', 'State/Province','Time']).reset_index().drop(columns=["index"])
base_columns = [
    'Level', 'Region','Country','Census Region','State/Province','Abbreviation','FIPS',
    'Time', 'Date', 'Status', # 'Data Quality', 
    'Cases Daily', 'Deaths Daily','Tests Daily',
    'Recovered Daily', 'Total Recovered',
    'Active Daily', 'Total Active',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
all_data = all_data[base_columns]
all_min = all_data["Time"].min()
print("Min Date: " + str(all_min))
all_max = all_data["Time"].max()
print("Max Date: " + str(all_max))
all_data.head()

grouping_cols = ["Region","Country","State/Province"]
base_cols = ["Cases","Tests","Deaths"]
calc_cols = []
for col in base_cols:
    all_data[col + " Daily"] = all_data[col + " Daily"].fillna(0).apply(lambda x: int(float(str(x).replace(",",""))))
    all_data[col + " Weekly"] = all_data[col + " Daily"].rolling(7,min_periods=7).sum().reset_index(drop=True)
    all_data[col + " Daily Rate"] = all_data[col + " Daily"]/all_data["Population 100K"]
    all_data[col + " Weekly Rate"] = all_data[col + " Weekly"]/all_data["Population 100K"]
    all_data["Total " + col] = all_data.groupby(grouping_cols)[col + " Daily"].cumsum().reset_index(drop=True)
    all_data[col + " Daily 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily"].rolling(7,min_periods=7).mean().reset_index(drop=True)
    all_data[col + " Daily Rate 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily Rate"].rolling(7,min_periods=7).mean().reset_index(drop=True)
    all_data["Total " + col + " Rate"] = all_data["Total " + col]/all_data["Population 100K"]
    base_order = [
        col + ' Daily', col + ' Daily 7D Rolling', col + " Weekly", "Total " + col, 
        col + ' Daily Rate', col + ' Daily Rate 7D Rolling', col + " Weekly Rate", "Total " + col + " Rate"
    ]
    calc_cols = calc_cols + base_order

all_data["Speed Daily"] = all_data["Cases Daily 7D Rolling"]/all_data["Population 100K"]
all_data["Speed Weekly"] = all_data["Cases Weekly"]/all_data["Population 100K"]
all_data["Positivity 7D Rolling"] = all_data["Cases Daily 7D Rolling"]/all_data["Tests Daily 7D Rolling"]
all_data["Positivity 7D Rolling"] = all_data["Positivity 7D Rolling"].apply(lambda x: np.nan if x == np.inf else x)
all_data["Positivity Weekly"] = all_data["Cases Weekly"]/all_data["Tests Weekly"].apply(lambda x: np.nan if x == np.inf else x)
all_data["Acceleration Daily"] = all_data.groupby(grouping_cols, as_index=False)["Speed Daily"].diff().reset_index(drop=True)
all_data["Acceleration Weekly"] = all_data.groupby(grouping_cols, as_index=False)["Speed Weekly"].diff(7).reset_index(drop=True)
all_data["Jerk Daily"] = all_data.groupby(grouping_cols, as_index=False)["Acceleration Daily"].diff().reset_index(drop=True)
all_data["Jerk Weekly"] = all_data.groupby(grouping_cols, as_index=False)["Acceleration Weekly"].diff(7).reset_index(drop=True)
all_data["Jounce Daily"] = all_data.groupby(grouping_cols, as_index=False)["Jerk Daily"].diff().reset_index(drop=True)
all_data["MM-DD-YYYY"] = all_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%m-%d-%Y'))
all_data["MM-DD-YYYY"] = all_data["MM-DD-YYYY"].astype(str)
all_data["DD-MM-YYYY"] = all_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%d-%m-%Y'))
all_data["DD-MM-YYYY"] = all_data["DD-MM-YYYY"].astype(str)
all_data["Week"] = all_data["Date"].apply(
    lambda x: 
    str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[0]) + 
    " W" + str(datetime.strptime(x, '%m/%d/%Y').isocalendar()[1])
)
all_data["First Day of Week"] = all_data["Date"].apply(
    lambda x: datetime.strptime(x, '%m/%d/%Y') - timedelta(days=datetime.strptime(x, '%m/%d/%Y').weekday())
)
all_data["Last Day of Week"] = all_data["First Day of Week"].apply(lambda x: x + timedelta(days=6))
def shortDate(day):
    shortMonth = datetime.strftime(day,'%m').lstrip('0')
    shortDay  = datetime.strftime(day,'%d').lstrip('0')
    shortYear = datetime.strftime(day,'%y')
    return shortMonth + "/" + shortDay + "/" + shortYear
all_data["Week Date Range"] = all_data.apply(
    lambda x: shortDate(x["First Day of Week"]) + " - " + shortDate(x["Last Day of Week"]),
    axis=1
)
all_data["Day Count"] = all_data.groupby(["Region","Country","State/Province","Date"]).cumcount()
all_data["Census Region"] = all_data["Census Region"].astype(str)
all_data["Census Region"] = all_data["Census Region"].apply(lambda x: "" if x=="nan" else x)
all_data["Lancet Region"] = all_data["Country"].apply(lambda x: lancet_region(x))

header_cols = [
    'Level', 'Region', 'Lancet Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
    'Time', 'Date', 'Day Count', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Status' # , 'Data Quality'
]
calc_cols = calc_cols + [
    "Positivity 7D Rolling", "Positivity Weekly", 
    "Speed Daily", "Speed Weekly",
    "Acceleration Daily","Acceleration Weekly", 
    "Jerk Daily", "Jerk Weekly"
]
other_cols = [
    'Recovered Daily', 'Total Recovered',
    'Active Daily', 'Total Active',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44',
    'Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
all_cols = header_cols + calc_cols + other_cols
print(all_cols)
all_data = all_data[all_cols].copy()
last_monday = date(year=2021,month=1,day=18)
print("Last Monday: ")
print(last_monday)
all_data['Time'] = all_data['Time'].astype(str)
all_min = all_data["Time"].min()
print("Min Date: " + str(all_min))
all_max = all_data["Time"].max()
print("Max Date: " + str(all_max))
all_data.head()

Column Values:
['', 'Newfoundland and Labrador', 'Oregon', 'Republic of Marshall Islands', 'District of Columbia', 'Macao', 'West Virginia', 'New Brunswick', 'Georgia', 'Alberta', 'Iowa', 'Ohio', 'Connecticut', 'British Columbia', 'Maine', 'Quebec', 'Alaska', 'Texas', 'St. Helena', 'Colorado', 'Virginia', 'Jersey', 'Tennessee', 'Vermont', 'Puerto Rico', 'Alabama', 'Guam', 'New York', 'Pennsylvania', 'Northern Mariana Islands', 'Arkansas', 'Kentucky', 'New Mexico', 'North Dakota', 'Wisconsin', 'Northern Cyprus', 'Kansas', 'Idaho', 'Ontario', 'Mississippi', 'Montana', 'Oklahoma', 'Michigan', 'Guernsey', 'Wyoming', 'Minnesota', 'Nevada', 'Louisiana', 'South Dakota', 'California', 'Maryland', 'American Samoa', 'Arizona', 'Illinois', 'Massachusetts', 'South Carolina', 'Delaware', 'Pitcairn Island', 'Indiana', 'Repatriated Canada', 'Federated States of Micronesia', 'World', 'North Carolina', 'Missouri', 'Palau', 'Yukon', 'Hawaii', 'Utah', 'Saskatchewan', 'Montserrat', 'Northwest Territories'

Unnamed: 0,Level,Region,Lancet Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,Day Count,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Cases Daily,Cases Daily 7D Rolling,Cases Weekly,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Weekly Rate,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Weekly,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Weekly Rate,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Weekly,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Weekly Rate,Total Deaths Rate,Positivity 7D Rolling,Positivity Weekly,Speed Daily,Speed Weekly,Acceleration Daily,Acceleration Weekly,Jerk Daily,Jerk Weekly,Recovered Daily,Total Recovered,Active Daily,Total Active,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-01,03/01/2020,0,03-01-2020,01-03-2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,1,,,1,0.0337,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
1,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-02,03/02/2020,0,03-02-2020,02-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
2,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-03,03/03/2020,0,03-03-2020,03-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
3,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-04,03/04/2020,0,03-04-2020,04-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
4,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-05,03/05/2020,0,03-05-2020,05-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021


In [32]:
excess_days = all_data.loc[all_data["Day Count"]>0].copy()
excess_days.head(100)

Unnamed: 0,Level,Region,Lancet Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,Day Count,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Cases Daily,Cases Daily 7D Rolling,Cases Weekly,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Weekly Rate,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Weekly,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Weekly Rate,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Weekly,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Weekly Rate,Total Deaths Rate,Positivity 7D Rolling,Positivity Weekly,Speed Daily,Speed Weekly,Acceleration Daily,Acceleration Weekly,Jerk Daily,Jerk Weekly,Recovered Daily,Total Recovered,Active Daily,Total Active,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed


In [33]:
all_data = all_data.loc[all_data["Day Count"]==0].copy().reset_index().drop(columns=["Day Count","index"])
all_data.to_excel(cleanedFolder + "all_raw_input.xlsx", index=False)
all_data.to_csv(cleanedFolder + "all_raw_input.csv", index=False)
all_data.head(8)

Unnamed: 0,Level,Region,Lancet Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Cases Daily,Cases Daily 7D Rolling,Cases Weekly,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Weekly Rate,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Weekly,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Weekly Rate,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Weekly,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Weekly Rate,Total Deaths Rate,Positivity 7D Rolling,Positivity Weekly,Speed Daily,Speed Weekly,Acceleration Daily,Acceleration Weekly,Jerk Daily,Jerk Weekly,Recovered Daily,Total Recovered,Active Daily,Total Active,Population,Population 100K,Country Population,Country Population 100K,Country Share,Region Population,Region Population 100K,Region Share,World Population,World Population 100K,World Share,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²),Fertility Rate,Median Age,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Accessed
0,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-01,03/01/2020,03-01-2020,01-03-2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,1,,,1,0.0337,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
1,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-02,03/02/2020,03-02-2020,02-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
2,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-03,03/03/2020,03-03-2020,03-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
3,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-04,03/04/2020,03-04-2020,04-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
4,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-05,03/05/2020,03-05-2020,05-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
5,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-06,03/06/2020,03-06-2020,06-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
6,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-07,03/07/2020,03-07-2020,07-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,0.1429,1.0,1,0.0,0.0048,0.0337,0.0337,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,,inf,0.0048,0.0337,,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021
7,Country,Central Asia,Eastern Mediterranean,Armenia,,,AM,,2020-03-08,03/08/2020,03-08-2020,08-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,0,0.0,0.0,1,0.0,0.0,0.0,0.0337,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,,,0.0,0.0,-0.0048,,,,,,,,2963243.0,29.6324,,,,326887719.0,3268.8772,0.0091,7796609105.0,77966.0911,0.0004,0.04,63.0,0.19,5512.0,-4998.0,104.0,28470.0,1.8,35.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7/4/2021


In [34]:
date_check = all_data.groupby(all_data["Country"])[["Time","Week"]].max()
date_check = date_check.loc[date_check["Time"]<last_saturday].copy()
date_check.reset_index(inplace=True)
print(date_check.columns)
bad_countries = date_check["Country"].to_list()
print("Countries with incomplete data")
print(bad_countries)
date_check.head()

Index(['Country', 'Time', 'Week'], dtype='object')
Countries with incomplete data
['Anguilla', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Caribbean Netherlands', 'Cayman Islands', 'Cook Islands', 'Faeroe Islands', 'Falkland Islands', 'French Polynesia', 'Gibraltar', 'Greenland', 'Nauru', 'New Caledonia', 'Sint Maarten', 'Tonga', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu']


Unnamed: 0,Country,Time,Week
0,Anguilla,2021-06-25,2021 W9
1,Aruba,2021-07-02,2021 W26
2,Bermuda,2021-06-29,2021 W9
3,British Virgin Islands,2021-06-25,2021 W25
4,Caribbean Netherlands,2021-04-09,2021 W14


In [35]:
min_header_cols = [
    'Level', 'Region','Country','Census Region','State/Province','Abbreviation','FIPS',
    'Time', 'Date', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Status','Lancet Region'
]
min_cols = min_header_cols + calc_cols + ["Population","Population 100K","Accessed"]
print(min_cols)
min_data = all_data[min_cols].copy()
print_column_unique(min_data["State/Province"])
min_data = min_data[~min_data['Time'].isnull()]
min_data = min_data[(min_data['Time'] < last_sunday) & ~min_data["Country"].isin(bad_countries)]
min_data.head()

['Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS', 'Time', 'Date', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range', 'Status', 'Lancet Region', 'Cases Daily', 'Cases Daily 7D Rolling', 'Cases Weekly', 'Total Cases', 'Cases Daily Rate', 'Cases Daily Rate 7D Rolling', 'Cases Weekly Rate', 'Total Cases Rate', 'Tests Daily', 'Tests Daily 7D Rolling', 'Tests Weekly', 'Total Tests', 'Tests Daily Rate', 'Tests Daily Rate 7D Rolling', 'Tests Weekly Rate', 'Total Tests Rate', 'Deaths Daily', 'Deaths Daily 7D Rolling', 'Deaths Weekly', 'Total Deaths', 'Deaths Daily Rate', 'Deaths Daily Rate 7D Rolling', 'Deaths Weekly Rate', 'Total Deaths Rate', 'Positivity 7D Rolling', 'Positivity Weekly', 'Speed Daily', 'Speed Weekly', 'Acceleration Daily', 'Acceleration Weekly', 'Jerk Daily', 'Jerk Weekly', 'Population', 'Population 100K', 'Accessed']
Column Values:
['', 'Newfoundland and Labrador', 'Oregon', 'Republic of Mar

Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Lancet Region,Cases Daily,Cases Daily 7D Rolling,Cases Weekly,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Weekly Rate,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Weekly,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Weekly Rate,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Weekly,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Weekly Rate,Total Deaths Rate,Positivity 7D Rolling,Positivity Weekly,Speed Daily,Speed Weekly,Acceleration Daily,Acceleration Weekly,Jerk Daily,Jerk Weekly,Population,Population 100K,Accessed
0,Country,Central Asia,Armenia,,,AM,,2020-03-01,03/01/2020,03-01-2020,01-03-2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,Eastern Mediterranean,1,,,1,0.0337,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
1,Country,Central Asia,Armenia,,,AM,,2020-03-02,03/02/2020,03-02-2020,02-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
2,Country,Central Asia,Armenia,,,AM,,2020-03-03,03/03/2020,03-03-2020,03-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
3,Country,Central Asia,Armenia,,,AM,,2020-03-04,03/04/2020,03-04-2020,04-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
4,Country,Central Asia,Armenia,,,AM,,2020-03-05,03/05/2020,03-05-2020,05-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021


In [36]:
print("Max date:" + str(min_data["Time"].max()))
min_xlsx_file = cleanedFolder + "daily_raw_input.xlsx"
min_data.to_excel(min_xlsx_file, index=False)
print("Min Excel file output")
min_csv_file = cleanedFolder + "daily_raw_input.csv"
min_data.to_csv(min_csv_file, index=False, sep=',', float_format='%.6f')
print("Min CSV file output")
min_data.head(8)

Max date:2021-07-03
Min Excel file output
Min CSV file output


Unnamed: 0,Level,Region,Country,Census Region,State/Province,Abbreviation,FIPS,Time,Date,MM-DD-YYYY,DD-MM-YYYY,Week,First Day of Week,Last Day of Week,Week Date Range,Status,Lancet Region,Cases Daily,Cases Daily 7D Rolling,Cases Weekly,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Weekly Rate,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Weekly,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Weekly Rate,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Weekly,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Weekly Rate,Total Deaths Rate,Positivity 7D Rolling,Positivity Weekly,Speed Daily,Speed Weekly,Acceleration Daily,Acceleration Weekly,Jerk Daily,Jerk Weekly,Population,Population 100K,Accessed
0,Country,Central Asia,Armenia,,,AM,,2020-03-01,03/01/2020,03-01-2020,01-03-2020,2020 W9,2020-02-24,2020-03-01,2/24/20 - 3/1/20,,Eastern Mediterranean,1,,,1,0.0337,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
1,Country,Central Asia,Armenia,,,AM,,2020-03-02,03/02/2020,03-02-2020,02-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
2,Country,Central Asia,Armenia,,,AM,,2020-03-03,03/03/2020,03-03-2020,03-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
3,Country,Central Asia,Armenia,,,AM,,2020-03-04,03/04/2020,03-04-2020,04-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
4,Country,Central Asia,Armenia,,,AM,,2020-03-05,03/05/2020,03-05-2020,05-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
5,Country,Central Asia,Armenia,,,AM,,2020-03-06,03/06/2020,03-06-2020,06-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,,,1,0.0,,,0.0337,0,,,0,0.0,,,0.0,0,,,0,0.0,,,0.0,,,,,,,,,2963243.0,29.6324,7/4/2021
6,Country,Central Asia,Armenia,,,AM,,2020-03-07,03/07/2020,03-07-2020,07-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,0.1429,1.0,1,0.0,0.0048,0.0337,0.0337,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,,inf,0.0048,0.0337,,,,,2963243.0,29.6324,7/4/2021
7,Country,Central Asia,Armenia,,,AM,,2020-03-08,03/08/2020,03-08-2020,08-03-2020,2020 W10,2020-03-02,2020-03-08,3/2/20 - 3/8/20,,Eastern Mediterranean,0,0.0,0.0,1,0.0,0.0,0.0,0.0337,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,,,0.0,0.0,-0.0048,,,,2963243.0,29.6324,7/4/2021


In [37]:
date_check = min_data.groupby(min_data["Country"])[["Time","Week"]].max()
date_check.head()

Unnamed: 0_level_0,Time,Week
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
,2021-07-03,2021 W9
Afghanistan,2021-07-03,2021 W9
Albania,2021-07-03,2021 W9
Algeria,2021-07-03,2021 W9
Andorra,2021-07-03,2021 W9


In [38]:
print(bad_countries)
us_check = date_check.reset_index()
print("US States Min/Max")
print(us_min)
print(us_max)
us_check.loc[us_check["Country"]=="United States"]

['Anguilla', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Caribbean Netherlands', 'Cayman Islands', 'Cook Islands', 'Faeroe Islands', 'Falkland Islands', 'French Polynesia', 'Gibraltar', 'Greenland', 'Nauru', 'New Caledonia', 'Sint Maarten', 'Tonga', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu']
US States Min/Max
2020-01-22 00:00:00
2021-07-02 00:00:00


Unnamed: 0,Country,Time,Week
185,United States,2021-07-03,2021 W9
