In [None]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta, date
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)

# Folders and Files

# Folders
# Absolute, machine-specific locations. Every data folder below is derived
# from repositoryFolder by string concatenation, so it must end with "/".
repositoryFolder = "D:/Repositories/Global-COVID-Surveillance/data/"
localDownloadFolder = "C:/Users/janin/Downloads/"
demographicsFolder = repositoryFolder + "raw/demographics/"
configuredFolder = repositoryFolder + "configured/"
cleanedFolder = repositoryFolder + "cleaned/"
regionsFolder = repositoryFolder + "raw/regions/"
locationsFolder = repositoryFolder + "raw/locations/"

# Population Input File
all_populations_file = cleanedFolder + "all_populations.xlsx"
# Workbook of US state names/abbreviations (read with pd.read_excel in the US cell).
us_codes = demographicsFolder + "US State Codes.xlsx"

# Location Input
locations_file = demographicsFolder + "Country Geo.xlsx"

# Sources
# Reference URLs for the population inputs.
population_source_url ="https://www.worldometers.info/world-population/population-by-country/"
canada_population_source = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000901" # Statistics Canada Quarterly Population
us_population_source = "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/asrh/sc-est2019-agesex-civ.csv"

# FIND tracker CSV (downloaded with requests) and the OWID workbook (read with read_excel).
github_url="https://github.com/dsbbfinddx/FINDCov19TrackerData/blob/master/processed/data_all.csv?raw=true"
owid_data = R"https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.xlsx?raw=true"
country_codes_coordinates = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries_codes_and_coordinates.csv"
countries_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/countries.geo.json"
us_states_geo = "https://raw.githubusercontent.com/dsbbfinddx/FINDCov19TrackerData/master/raw/us-states.geo.json"

kaggle_locations = "https://www.kaggle.com/paultimothymooney/latitude-and-longitude-for-every-country-and-state"

# Province-level (Canada) and state-level (US) daily history CSVs.
canada_source_csv = "https://opendata.arcgis.com/datasets/3afa9ce11b8842cb889714611e6f3076_0.csv"
us_source_csv = "https://covidtracking.com/data/download/all-states-history.csv"

# Countries and Regions
# Each list holds the configured spelling of the country names that belong to
# one reporting region. Alternate spellings found in the source data are mapped
# onto these names through country_conversions.
european_countries = [
    'Albania','Andorra','Austria','Belarus','Belgium','Bosnia & Herzegovina','Bulgaria',
    'Croatia','Czech Republic','Denmark','Estonia','Finland','France',
    'Germany','Greece','Greenland','Hungary','Iceland','Ireland','Isle of Man','Italy',
    'Latvia','Liechtenstein','Lithuania','Luxembourg','Malta','Moldova','Monaco','Montenegro',
    'Netherlands','Norway','Poland','Portugal','Romania',
    'San Marino','Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland',
    'Ukraine','United Kingdom','Vatican City'
]
# NOTE: the variable name is spelled "carribean"; it is kept because other code refers to it.
# NOTE(review): "Curacao" is unaccented here, while the territories tables use "Curaçao" -
# confirm which spelling the source data produces.
carribean_countries = [
    "Antigua & Barbuda","Aruba","Bahamas","Barbados","Bermuda","British Virgin Islands",
    "Cayman Islands","Cuba","Curacao","Dominica","Dominican Republic","Grenada",
    "Haiti","Jamaica","Puerto Rico",
    "St. Barthelemy","St. Kitts & Nevis","St. Lucia","St. Vincent & Grenadines",
    "Sint Maarten","Trinidad & Tobago","Turks and Caicos Islands","United States Virgin Islands"
]
central_south_america_countries = [
    'Argentina','Belize','Bolivia','Brazil','Chile','Colombia','Costa Rica',
    'Ecuador','El Salvador','Guatemala','Guyana','Honduras',
    'Mexico','Nicaragua','Panama','Paraguay','Peru','Suriname','Uruguay','Venezuela'
]
# Latin America = Caribbean + Central/South America.
latin_american_countries = carribean_countries + central_south_america_countries
sub_saharan_african_countries = [
    "Angola","Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Côte d’Ivoire",
    "Democratic Republic of Congo","Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau","Kenya","Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria","Republic of the Congo","Rwanda",
    "São Tomé and Príncipe","Senegal","Seychelles","Sierra Leone",
    "Somalia","South Africa","South Sudan","Sudan","Swaziland",
    "Tanzania","Togo","Uganda","Zambia","Zimbabwe"
]
south_asia_countries = [
    "Afghanistan","Bangladesh","Bhutan","India","Maldives","Nepal","Pakistan","Sri Lanka"
]
central_asian_countries = [
    'Armenia','Azerbaijan','Cyprus','Faeroe Islands','Georgia','Gibraltar','Kazakhstan','Kosovo','Kyrgyzstan',
    'North Macedonia','Russia','Tajikistan','Turkey','Turkmenistan','Uzbekistan'
]
east_asian_countries = [
    "Brunei","Cambodia","China","Indonesia","Japan","Laos","Malaysia","Mongolia","Myanmar","Niue","North Korea","Philippines",
    "Singapore","South Korea","Taiwan","Thailand","Timor","Vietnam"
]
pacific_countries = [
    "Australia","Cook Islands","Fiji","French Polynesia","Guam","Kiribati",
    "Marshall Islands","Micronesia","Nauru","New Caledonia","New Zealand",
    "Northern Mariana Islands","Palau","Papua New Guinea","Samoa","Solomon Islands","Tonga","Tuvalu","Vanuatu"
]
# East Asia and Pacific = East Asia + Pacific.
east_asia_and_pacific_countries = east_asian_countries + pacific_countries
middle_eastern_countries = [
    "Bahrain","Iran","Iraq","Israel","Jordan","Kuwait","Lebanon","Oman","Qatar",
    "Saudi Arabia","Syria","State of Palestine",
    "United Arab Emirates","Yemen"
]
north_african_countries = [
    "Algeria","Djibouti","Egypt","Libya","Morocco","Tunisia","Western Sahara"
]
# Middle East and North Africa = Middle East + North Africa.
middle_east_and_north_africa_countries = middle_eastern_countries + north_african_countries 
north_american_countries = ["Canada","United States"]
# Region lists that are flattened into configured_countries below.
# North America is not part of this list; it is appended separately when
# all_countries is built.
configured_country_lists = [
    european_countries,
    latin_american_countries,
    sub_saharan_african_countries,
    south_asia_countries,
    central_asian_countries,
    middle_east_and_north_africa_countries,
    east_asia_and_pacific_countries
]
# Region name -> list of member countries.
# NOTE: this name is assigned again further down in this cell with an extra
# 'Territory' entry, and nothing in between reads it, so the later assignment
# is the one in effect once the cell has run.
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries
}

# Flatten the per-region lists into one alphabetical list of configured
# countries, then build the full list that also contains North America.
configured_countries = sorted(
    name
    for region_members in configured_country_lists
    for name in region_members
)
all_countries = sorted(configured_countries + north_american_countries)

# Region names that have a configured country list.
configured_regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa'
]

# Same names as configured_regions plus 'Territory'; checkRegions validates
# region values against this list.
regions = [
    'Central Asia',
    'East Asia and Pacific',
    'Europe',
    'Latin America',
    'Middle East and North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa',
    'Territory'
]

# Names that are grouped under the 'Territory' key of countries_by_region.
# NOTE(review): the territories dict defined later in this cell has more
# entries than this list (e.g. "Macao", "Martinique") - confirm whether the
# two are meant to stay in sync.
unincorporated_disputed_territories = [
    "American Samoa", "Anguilla","Caribbean Netherlands","Channel Islands","Curaçao",
    "Falkland Islands","French Guiana","Guadeloupe","Hong Kong","International"
]

# Region name -> member names. This assignment replaces the earlier
# countries_by_region in this cell and adds the 'Territory' entry;
# region_from_country searches this mapping.
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries,
    'Territory': unincorporated_disputed_territories
}

# Census region code -> {"name": region name, "states": member state names}.
# The keys are ints and the entries carry no "number" field; lookups go
# through the key (see fixUSRegion) or scan the "states" lists
# (see censusRegionByState).
# Code 0 is the national total. The literal lists code 3 before code 2, which
# only affects iteration order. Puerto Rico is listed under "South".
census_regions = {
    0: {"name" : "United States",
        "states" : ["United States"]},
    1: {"name" : "Northeast",
        "states" :["Connecticut", "Maine", "New Hampshire", "Vermont", "Massachusetts", 
                   "Rhode Island", "New Jersey", "New York", "Pennsylvania"]},
    3: {"name" : "South",
        "states" : ["Maryland", "Delaware", "West Virginia", "Virginia", "Kentucky", 
                    "Tennessee", "North Carolina", "South Carolina", "Georgia", "Florida", 
                    "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas", 
                    "District of Columbia", "Puerto Rico"]},
    2: {"name" : "Midwest",
        "states" : ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", 
                    "Minnesota", "Wisconsin", "Illinois", "Michigan", "Indiana", "Ohio"]},
    4: {"name" : "West",
        "states" : ["Washington", "Idaho", "Montana", "Wyoming", "Oregon", "California", "Nevada", 
                    "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]}
}

# The 50 US states plus the District of Columbia, in alphabetical order.
us_states = [
    'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','District of Columbia',
    'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
    'Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
    'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota',
    'Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
    'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'
]
# Canadian provinces and territories, spelled as fixProvince returns them.
# NOTE(review): Nunavut is not in this list - confirm that is intentional.
canada_provinces = [
    'Alberta','British Columbia','Manitoba','New Brunswick','Newfoundland and Labrador','Northwest Territories',
    'Nova Scotia','Ontario','Prince Edward Island','Quebec','Saskatchewan','Yukon'
]
# Combined list of sub-national units for both countries.
states_and_provinces = us_states + canada_provinces

# Configured country name -> alternate spellings that should be converted to it.
# fixCountry looks an incoming name up among the value lists and returns the
# matching key; names that match no list are returned unchanged.
# The final entry turns the literal text "nan" (what astype(str) produces for
# missing values) into an empty string.
country_conversions = {
    "Antigua & Barbuda": ["Antigua and Barbuda"],
    "Bahamas": ["Bahamas, The"],
    "Bosnia & Herzegovina": ["Bosnia and Herzegovina"],
    "Brunei": ["Brunei Darussalam"],
    "Cabo Verde": ["Cape Verde"],
    "Côte d’Ivoire": ["Cote d'Ivoire","Cote dIvoire"],
    "Czech Republic": ["Czechia","Czech Republic (Czechia)"],
    "Democratic Republic of Congo": ["Congo - Kinshasa"],
    "Egypt": ["Egypt, Arab Rep."],
    "Faeroe Islands": ["Faroe Islands"],
    "Gambia": ["Gambia, The"],
    "Hong Kong": ["Hong Kong SAR China"],
    "Iran": ["Iran, Islamic Rep."],
    "Kyrgyzstan": ["Kyrgyz Republic"],
    "Laos": ["Lao PDR"],
    "Micronesia": ["Micronesia, Fed. Sts.","Micronesia (country)"],
    "Myanmar": ["Myanmar (Burma)","Burma"],
    "North Macedonia": ["Macedonia"],
    "State of Palestine": ["Palestinian Territories","Palestine"],
    "Republic of the Congo": ["Congo - Brazzaville","Congo"],
    "Russia": ["Russian Federation"],
    "São Tomé and Príncipe": ["Sao Tome and Principe","Sao Tome & Príncipe","São Tomé & Príncipe"],
    "Sint Maarten": ["Sint Maarten (Dutch part)"],
    "Slovakia": ["Slovak Republic"],
    "St. Kitts & Nevis": ["Saint Kitts and Nevis"],
    "St. Lucia": ["Saint Lucia"],
    "St. Vincent & Grenadines": ["Saint Vincent and the Grenadines"],
    "Swaziland": ["Eswatini"],
    "Syria": ["Syrian Arab Republic"],
    "Timor": ["Timor-Leste"],
    "Trinidad & Tobago": ["Trinidad and Tobago"],
    "Vatican City": ["Holy See","Vatican"],
    "Yemen": ["Yemen, Rep."],
    "" : ["nan"]
}
# State/Province names that the US cell labels with Level "Territory".
us_territories =['American Samoa','Commonwealth of the Northern Mariana Islands','Guam','Puerto Rico','U.S. Virgin Islands']
# Territory name -> the region and parent country it is reported under.
# fixTerritoryRegion indexes this dict directly, so a name that is missing
# here raises KeyError.
# NOTE(review): some spellings differ from the Caribbean country list
# ("Turks and Caicos" vs "Turks and Caicos Islands", "U.S. Virgin Islands" vs
# "United States Virgin Islands") - confirm which one the data uses.
territories = {
    "American Samoa":{"Region":"North America","Country":"United States"}, 
    "Anguilla":{"Region":"Europe","Country":"United Kingdom"},
    "Caribbean Netherlands":{"Region":"Europe","Country":"Netherlands"},
    "Channel Islands":{"Region":"Europe","Country":"Channel Islands"},
    "Curaçao":{"Region":"Europe","Country":"Netherlands"},
    "Falkland Islands":{"Region":"Europe","Country":"United Kingdom"},
    "French Guiana":{"Region":"Europe","Country":"France"},
    "Guadeloupe":{"Region":"Europe","Country":"France"},
    "Hong Kong":{"Region":"East Asia and Pacific","Country":"China"},
    "International":{"Region":"World","Country":""},
    "Macao":{"Region":"East Asia and Pacific","Country":"China"},
    "Martinique":{"Region":"Europe","Country":"France"},
    "Mayotte":{"Region":"Europe","Country":"France"},
    "Montserrat":{"Region":"Europe","Country":"United Kingdom"},
    "Réunion":{"Region":"Europe","Country":"France"},
    "St. Helena":{"Region":"Europe","Country":"United Kingdom"},
    "St. Martin":{"Region":"Europe","Country":"France"},
    "St. Pierre & Miquelon":{"Region":"Europe","Country":"France"},
    "Tokelau":{"Region":"East Asia and Pacific","Country":"New Zealand"},
    "Turks and Caicos":{"Region":"Europe","Country":"United Kingdom"},
    "U.S. Virgin Islands":{"Region":"North America","Country":"United States"},
    "Wallis & Futuna":{"Region":"Europe","Country":"France"},
    "World":{"Region":"World","Country":""}
}


In [None]:
def titleCase(words):
    """Normalise the capitalisation of a name or abbreviation.

    Strings of three characters or fewer are treated as abbreviations and
    returned fully upper-cased. Longer strings are lower-cased, split on
    single spaces, and each word is capitalised except the word "and".

    Args:
        words: Text to re-case.

    Returns:
        The re-cased string.
    """
    # Short inputs such as "bc" or "pei" are abbreviations.
    if len(words) <= 3:
        return words.upper()

    result = ""
    for token in words.lower().split(" "):
        piece = "and" if token == "and" else token.capitalize()
        # A separator is only inserted once some text has been emitted.
        result = f"{result} {piece}" if len(result) > 0 else piece
    return result

def fixRegion(code, regions=None):
    """Translate a census region code into the region's name.

    The previous implementation iterated the region table and subscripted
    each item with ``["number"]``. The table is a dict keyed by the integer
    code, so iteration yields ints and the subscript raised TypeError.
    The code is now looked up as a key.

    Args:
        code: Region code, i.e. a key of the region table.
        regions: Optional mapping of code -> ``{"name": ...}``. Defaults to
            the module-level ``census_regions`` table.

    Returns:
        The region name, or "Other" when the code is not in the table (a
        "not found" message is printed in that case).
    """
    if regions is None:
        regions = census_regions
    try:
        region_name = regions[code]["name"]
    except (KeyError, TypeError):
        # Unknown (or unhashable) code: fall through to the "Other" branch.
        region_name = ""
    if region_name == "":
        print(str(code) + " not found")
        return "Other"
    return region_name

def checkRegions(regionColumn, countryColumn):
    """Print every country whose region is not one of the known regions.

    The two columns are walked in parallel by position. The previous
    version used ``column[i]`` with ``i`` from ``range(len(...))``; on a
    pandas Series that is a label lookup, which fails once the index is no
    longer 0..n-1 (for example after filtering rows).

    Each offending country is printed once, as ``"<country> = <region>"``.

    Args:
        regionColumn: Iterable of region names.
        countryColumn: Iterable of country names, aligned with regionColumn.

    Returns:
        None.
    """
    reported = set()
    for region, country in zip(regionColumn, countryColumn):
        # Skip valid regions and countries that were already reported.
        if region in regions or country in reported:
            continue
        reported.add(country)
        print(f"{country} = {region}")

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
def getAgeRange(age):
    """Return the age-range label for a single-year age code.

    Special codes: 0 maps to "< 1" and 999 maps to "Total". Other values are
    placed in the first range whose upper bound they fall below; exactly 85
    maps to "85+". Any value not covered returns an empty string.

    Args:
        age: Numeric age code.

    Returns:
        The range label, or "" when no range applies.
    """
    if age == 0:
        return "< 1"
    if age == 999:
        return "Total"

    # (exclusive upper bound, label), checked in ascending order.
    upper_bounds = (
        (5, "1-4"),
        (15, "5-14"),
        (25, "15-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
        (75, "65-74"),
        (85, "75-84"),
    )
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return "85+" if age == 85 else ""

def fixSex(code):
    """Translate a numeric sex code into its label.

    Code 0 maps to "Population 2019", 1 to "Male" and 2 to "Female".
    Any other code prints a message and yields an empty string.

    Args:
        code: Numeric sex code.

    Returns:
        The label for the code, or "" when the code is not recognised.
    """
    labels = ((0, "Population 2019"), (1, "Male"), (2, "Female"))
    for known_code, label in labels:
        if code == known_code:
            return label
    print(str(code) + " is not a sex")
    return ""

def us_date(x):
    """Reformat a year-first date string as MM/DD/YYYY.

    The input is read by position: characters 0-3 are the year, 5-6 the
    month and 8-9 the day, so any single-character separator works
    ("2020-03-15", "2020/03/15 00:00:00+00", ...). Anything after the day
    is ignored.

    The day was previously sliced as ``x[8:11]``, which took three
    characters and leaked whatever followed the day (for example a trailing
    space) into the result.

    Args:
        x: Date string that starts with a YYYY?MM?DD pattern.

    Returns:
        The date formatted as "MM/DD/YYYY".
    """
    year = x[0:4]
    month = x[5:7]
    day = x[8:10]
    return month + "/" + day + "/" + year

def removeDecimal(data):
    """Return the text of ``data`` up to, but not including, its first ".".

    Args:
        data: Any value; it is converted with ``str`` first.

    Returns:
        The string form of ``data`` truncated at the first decimal point, or
        the whole string when it contains none.
    """
    whole_part, _, _ = str(data).partition(".")
    return whole_part

def emptyNan(value):
    """Replace the literal text "nan" with an empty string.

    Args:
        value: Value to check.

    Returns:
        "" when ``value`` equals "nan", otherwise ``value`` unchanged.
    """
    return "" if value == "nan" else value

def printColumns(df, label):
    """Print a label followed by the columns of a DataFrame.

    Args:
        df: Object exposing a ``columns`` attribute.
        label: Heading printed on the line before the columns.
    """
    print(label, df.columns, sep="\n")

def print_column_unique(column):
    """Print and return the distinct values of a column in ascending order.

    Args:
        column: pandas Series.

    Returns:
        Array of the unique values, taken from the ascending-sorted column.
    """
    print("Column Values:")
    ordered = column.sort_values(ascending = True)
    values = ordered.unique()
    print(values)
    return values

def print_column_missing(column, comparison):
    """Report the differences between a column's values and a reference list.

    Prints the column's unique values, the comparison list, then the values
    present in the column but not in the comparison, and finally the values
    present in the comparison but not in the column. Each of the two checks
    prints "No missing values" when nothing differs.

    Args:
        column: pandas Series to inspect.
        comparison: Collection of expected values.

    Returns:
        The column's unique values, as returned by ``print_column_unique``.
    """
    def report(heading, items):
        # Shared output format for both directions of the comparison.
        if len(items) > 0:
            print(heading)
            print(items)
        else:
            print("No missing values")

    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)
    report(
        "Column values not in comparison:",
        [value for value in values if value not in comparison],
    )
    report(
        "Comparison values not in column:",
        [value for value in comparison if value not in values],
    )
    return values

def division(a,b):
    """Divide ``a`` by ``b``, returning NaN instead of dividing by zero.

    Args:
        a: Numerator.
        b: Denominator.

    Returns:
        ``a / b``, or ``np.nan`` when ``b`` equals 0.
    """
    return np.nan if b == 0 else a/b
    
def key_from_value(value, dictionary, default):
    """Find the first key whose value collection contains ``value``.

    ``value`` is stripped of surrounding whitespace before each membership
    test, and the result (the matching key or the default) is stripped as well.

    Args:
        value: String to look for.
        dictionary: Mapping of key -> collection of candidate values.
        default: String returned when no collection contains ``value``.

    Returns:
        The stripped matching key, or the stripped ``default``.
    """
    matching_keys = (
        key
        for key, candidates in dictionary.items()
        if value.strip() in candidates
    )
    return next(matching_keys, default).strip()

def region_from_country(country):
    """Return the region whose country list contains ``country``.

    Args:
        country: Country name to look up in ``countries_by_region``.

    Returns:
        The region name, or "" when the country is in no region list.
    """
    return key_from_value(value=country, dictionary=countries_by_region, default="")

def fixTerritoryRegion(territory):
    """Return the region recorded for a territory.

    Args:
        territory: Key of the ``territories`` table.

    Returns:
        The territory's "Region" value.

    Raises:
        KeyError: If ``territory`` is not in ``territories``.
    """
    return territories[territory]["Region"]

def fixCountry(value):
    """Convert an alternate country spelling to the configured spelling.

    Args:
        value: Country name as found in the source data.

    Returns:
        The key of ``country_conversions`` whose list contains ``value``, or
        ``value`` itself (stripped) when no conversion applies.
    """
    return key_from_value(value=value, dictionary=country_conversions, default=value)

def fixCountries(countries_column, configuredCountries):
    """Convert a column of country names to the configured spellings.

    Every value is cast to ``str`` and passed through ``fixCountry``. The
    converted column is then compared with ``configuredCountries`` by
    ``print_column_missing`` so that unmatched names show up in the output.

    The previous version also executed ``print(conversions)``, but no such
    name exists, so the function raised NameError before producing any
    report. That statement and an unused local were removed.

    Args:
        countries_column: pandas Series of raw country names.
        configuredCountries: Collection of expected country names.

    Returns:
        The converted Series.
    """
    countries_conversion = countries_column.astype(str).apply(fixCountry)
    # Prints the unique converted values and both directions of the diff.
    print_column_missing(countries_conversion, configuredCountries)
    return countries_conversion

def checkCountries(column):
    """Print the country names in ``column`` that ``fixCountry`` would change.

    Each distinct name that differs from its converted form is printed once
    as ``"<name> => <converted>"``. When nothing would change, a single
    confirmation line is printed instead.

    Args:
        column: Iterable of country names.
    """
    reported = []
    for original in column:
        converted = fixCountry(original)
        # Nothing to report for unchanged names or names already listed.
        if converted == original or original in reported:
            continue
        reported.append(original)
        print(f"{original} => {converted}")
    if not reported:
        print("No countries need to be fixed.")

def testConversion(title, test_array, conversion):
    print(title)
    no_conversions = []
    for value in test_array:
        return_value = ""
        if conversion == "country":
            return_value = fixCountry(value)
        elif conversion == "region":
            return_value = region_from_country(fixCountry(value))
        if return_value != value.strip():
            print(value.strip() + "," + return_value)
        if return_value == "":
            no_conversions.append(value)
    if len(no_conversions) > 0:
        print("Missing Conversions")
        print(no_conversions)
    print("")
    
def division(a,b):
    """Return ``a / b``, or NaN when ``b`` is zero.

    Args:
        a: Numerator.
        b: Denominator.

    Returns:
        The quotient, or ``np.nan`` if ``b`` equals 0.
    """
    # Guard against ZeroDivisionError.
    if b == 0:
        return np.nan
    return a/b
    
def fixProvince(value):
    """Return the full province name for a raw province value.

    The value is first re-cased with ``titleCase`` and then expanded when it
    is one of the known abbreviations or "Repatriated" variants.

    Args:
        value: Province name or abbreviation from the source data.

    Returns:
        The expanded name, or the re-cased value when no expansion applies.
    """
    expansions = {
        'BC': 'British Columbia',
        'NL': 'Newfoundland and Labrador',
        'NWT': 'Northwest Territories',
        'PEI': 'Prince Edward Island',
        'Repatriated': 'Repatriated Canada',
        'Repatriated Cdn': 'Repatriated Canada'
    }
    normalised = titleCase(value)
    return expansions.get(normalised, normalised)

def censusRegionByState(state):
    """Return the name of the census region that lists ``state``.

    Args:
        state: State name to look for in the "states" lists of
            ``census_regions``.

    Returns:
        The region name, or None when no region lists the state.
    """
    for entry in census_regions.values():
        if state in entry["states"]:
            return entry["name"]
    return None
        
def fixUSRegion(code):
    """Return the census region name for a region code.

    Args:
        code: Key of ``census_regions``.

    Returns:
        The region's "name" value.

    Raises:
        KeyError: If ``code`` is not a key of ``census_regions``.
    """
    return census_regions[code]["name"]

def convertDateToExcel(dayString) :
    """Convert an MM/DD/YYYY date string to a serial day number.

    The number is the count of days between 1899-12-30 and the given date,
    returned as a float.

    Args:
        dayString: Date formatted as '%m/%d/%Y'.

    Returns:
        Days elapsed since 1899-12-30, as a float.
    """
    epoch = datetime(1899, 12, 30)
    elapsed = datetime.strptime(dayString, '%m/%d/%Y') - epoch
    return elapsed.total_seconds() / 86400

def firstDay(dayString):
    """Return the Monday of the week that contains the given date.

    Args:
        dayString: Date formatted as '%m/%d/%Y'.

    Returns:
        The Monday on or before that date, formatted as '%m/%d/%Y'.
    """
    given = datetime.strptime(dayString, '%m/%d/%Y')
    # weekday() is 0 for Monday, so it is the number of days to step back.
    monday = given - timedelta(days=given.weekday())
    return monday.strftime('%m/%d/%Y')

In [None]:
# Load the cleaned population table. The grouping columns serve as merge keys
# in the later cells, so they are forced to text and the "nan" strings that
# astype(str) produces for missing values are blanked out.
all_populations = pd.read_excel(cleanedFolder + "all_populations.xlsx")
population_groups = ["Level","Region","Census Region","Country","State/Province"]
for group in population_groups:
    all_populations[group] = (
        all_populations[group]
        .astype(str)
        .apply(lambda x: "" if x=="nan" else x)
    )
print(all_populations.columns)
all_populations.head()

In [None]:
# Global input data owid
# Reads the OWID workbook straight from its URL and reduces it to one row per
# country and day with the population, tests, cases and deaths columns.
print(owid_data)
o = pd.io.excel.read_excel(owid_data)
print(o.columns)

# Rename the OWID columns to this notebook's names; the "Owid" prefix keeps
# them distinct from the FIND columns they are merged with later.
# reset_index() adds an "index" column, which the column selection at the end
# of this cell drops again.
o = o.rename(columns={
    "location":"Country",
    "population": "Owid Population",
    "new_tests":"Owid Tests",
    "total_tests": "Owid Total Tests",
    "new_cases": "Owid Cases",
    "new_deaths": "Owid Deaths"
}).reset_index()

# Build a text "Date" (MM/DD/YYYY) and a datetime "Time" from the text date
# (parsed as %Y-%m-%d below).
o["date"] = o["date"].astype(str)
o["Year"] = o["date"].apply(lambda x: x[0:4])
o["Month"] = o["date"].apply(lambda x: x[5:7])
o["Day"] = o["date"].apply(lambda x: x[8:10])
o["Date"] = o.apply(lambda x: x["Month"] + "/" + x["Day"] + "/" + x["Year"],axis=1)
o["Time"] = o.date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

# Normalise country names to the configured spellings.
o["Country"] = o["Country"].astype(str)
print_column_unique(o["Country"])
o["Country"] = o["Country"].apply(lambda x: "" if x=="nan" else x)
o["Country"] = o["Country"].apply(lambda x: fixCountry(x))
# NOTE: this runs after the names were already converted, so it only reports
# names that fixCountry would change a second time.
checkCountries(o["Country"])

min_date = o["Time"].min()
max_date = o["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))

# Coerce the count columns to numbers; values that cannot be parsed become NaN.
o_integer_columns = ["Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o[o_integer_columns] = o[o_integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
# Keep only the columns used downstream.
o_columns = ["Country", "Time", "Date", "Owid Population", "Owid Tests", "Owid Cases", "Owid Deaths"]
o = o[o_columns]

o.head()
#o_notnull = o.loc[pd.notnull(o["Owid Tests"])].copy().reset_index().drop(columns=["index"])
#o_notnull.head()

In [None]:
# Most recent OWID observation per country.
date_check = o.groupby("Country")["Time"].max()
date_check.head(200)

In [None]:
# Global input data
# Downloads the FIND tracker CSV, keeps its country-level rows, merges them
# onto the OWID rows prepared above, derives the daily series and attaches
# the population columns.
github_request=requests.get(github_url).content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

print("c  original columns")
print(c.columns)

# Keep only the rows whose "set" is "country" and rename to the notebook's
# column names.
print("Sets")
c["set"] = c["set"].astype(str)
sets = print_column_unique(c["set"])
c = c[c["set"] == "country"].copy().rename(columns={
    "name":"Country",
    "unit":"Abbreviation",
    "time":"Time"
}).reset_index()
c = c.drop(columns=["index","set","pop_100k"])

# Normalise country names to the configured spellings.
print("Countries")
c["Country"] = c["Country"].astype(str)
c["Country"] = c["Country"].apply(lambda x: "" if x=="nan" else x)
c["Country"] = c["Country"].apply(lambda x: fixCountry(x))
checkCountries(c["Country"])

# Format text date and add datetime for date
c["Time"] = c["Time"].astype(str)
c["Time"] = c["Time"].apply(lambda x: us_date(x))
c["Date"] = c["Time"]
c["Time"] = c.apply(lambda x: pd.to_datetime(x["Date"], format="%m/%d/%Y"), axis=1)
# NOTE: "Date" is text here, so this min/max is a string comparison.
# minmax_dates is not referenced again in the visible part of the notebook.
minmax_dates = c.groupby(["Country"]).agg({"Date": [np.min,np.max]})
min_date = c["Time"].min()
max_date = c["Time"].max()
print("Min: " + str(min_date))
print("Max: " + str(max_date))
c["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)

# Left-merge the FIND rows onto the OWID rows (minus "International"), so the
# result has one row per OWID country/day. Both frames have a "Time" column,
# which pandas suffixes as Time_x (OWID) and Time_y (FIND).
o_not_international = o.loc[o["Country"]!="International"].copy()
c = o_not_international.merge(c,how="left",on=["Country","Date"])

# Assign region and level. Names with no region are treated as territories:
# their region comes from the territories table (fixTerritoryRegion raises
# KeyError for a name missing from it), their name moves to "State/Province"
# and "Country" is blanked.
c["Region"] = c["Country"].apply(lambda x: region_from_country(x))
c["Level"] = c.apply(lambda x: "Territory" if x["Region"] == "" else "World" if x["Region"] == "World" else "Country",axis=1)
c["Region"] = c.apply(lambda x: fixTerritoryRegion(x["Country"]) if x["Region"] == "" else x["Region"],axis=1)
c["State/Province"] = c.apply(lambda x: x["Country"] if x["Level"]=="Territory" else "",axis=1)
c["Country"] = c.apply(lambda x: "" if x["Level"]=="Territory" else x["Country"],axis=1)

# Format numeric columns
# NOTE: numeric_columns is defined but not used in the visible code.
numeric_columns = [
    'new_cases_orig','new_deaths_orig','new_tests_orig',
    'cap_cum_cases','cap_new_cases',
    'cap_cum_deaths','cap_new_deaths',
    'cap_cum_tests','cap_new_tests',
    'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
    'all_cum_tests','all_new_tests',
    'pos'
]
float_columns = [
    'cap_cum_cases','cap_new_cases','cap_cum_deaths',
    'cap_new_deaths','cap_cum_tests','cap_new_tests'
]
integer_columns = [
    'new_cases_orig', 'Owid Cases', 'all_cum_cases',
    'new_deaths_orig', 'Owid Deaths', 'all_cum_deaths',
    'new_tests_orig', 'Owid Tests', 'all_cum_tests', 'all_new_tests'
]
c[float_columns] = c[float_columns].apply(pd.to_numeric)
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
# Keep the OWID timestamp as "Time" and discard the FIND one.
c = c.drop(columns=['Time_y'])
c = c.rename(columns ={
    "Time_x":"Time"
})
# Daily series: use the FIND value, falling back to the OWID value only when
# the FIND value is missing and the OWID value is present.
c["Tests Daily"] = c.apply(
    lambda x: x["Owid Tests"] if (pd.isna(x["new_tests_orig"]) and pd.notna(x["Owid Tests"])) else x["new_tests_orig"],
    axis=1
)
c["Cases Daily"] = c.apply(
    lambda x: x["Owid Cases"] if (pd.isna(x["new_cases_orig"]) and pd.notna(x["Owid Cases"])) else x["new_cases_orig"],
    axis=1
)
c["Deaths Daily"] = c.apply(
    lambda x: x["Owid Deaths"] if (pd.isna(x["new_deaths_orig"]) and pd.notna(x["Owid Deaths"])) else x["new_deaths_orig"],
    axis=1
)
print("c columns after Tests Daily")
print(c.columns)

# Reduce to the identifying columns and the three daily series.
c_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation',
    'Time', 'Date', 'Accessed',
    'Tests Daily', 'Cases Daily', 'Deaths Daily'
]
#    'new_tests_orig', 'Owid Tests', 'Tests Daily', 'cum_tests_orig', 'cap_new_tests', 'cap_cum_tests', 'all_new_tests', 
#    'new_cases_orig', 'Owid Cases', 'Cases Daily', 'cap_new_cases', 'cap_cum_cases', 'all_new_cases', 'all_cum_cases', 'all_cum_tests', 
#    'new_deaths_orig', 'Owid Deaths', 'Deaths Daily', 'cap_new_deaths', 'cap_cum_deaths', 'all_new_deaths', 'all_cum_deaths',
#    'pos'
#]
c = c[c_order]

#has_data = c.all_cum_cases > 0
#c = c[has_data]
# Replace null cells with None.
c = c.where(c.notnull(), None)

#c.rename(columns = {
#    'cum_tests_orig': "Total Tests",
#    'pop_100k': "Original Population 100K",
#    'cap_cum_cases': "Total Cases Per Capita",
#    'cap_new_cases': "Cases Daily per Capita (7 day rolling average)",
#    'cap_cum_deaths': "Total Deaths Per Capita",
#    'cap_new_deaths': "Death Daily Per capita (7 day rolling average)",
#    'cap_cum_tests': "Total Tests Per Capita (7 day rolling average)",
#    'cap_new_tests': "Tests Daily Per Capita (7 day rolling average)",
#    'all_cum_cases': "Total Cases",
#    'all_new_cases': "Cases Daily (7 day rolling average)",
#    'all_cum_deaths': "Total Deaths",
#    'all_new_deaths': "Death Daily (7 day rolling average)",
#    'all_cum_tests': "Total Tests (7 day rolling average)",
#    'all_new_tests': "Tests Daily (7 day rolling average)",
#    'pos': "Positivity Rate (7 day rolling average)"
#},inplace=True)
# Attach the population columns; rows with no matching population row keep
# nulls and are listed by the pop_check at the end of the cell.
c = c.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"])

# Final column order. The integer entries (1..84) are column labels taken
# from the population table - presumably single-year age columns; confirm
# against all_populations.xlsx.
c_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 
    'Tests Daily', 'Cases Daily', 'Deaths Daily',
#    'Cases Daily', 'Cases Daily (7 day rolling average)', 'Cases Daily per Capita (7 day rolling average)',
#    'Total Cases', 'Total Cases Per Capita', 
#    'Tests Daily', 'Tests Daily (7 day rolling average)', 'Tests Daily Per Capita (7 day rolling average)',
#    'Total Tests', 'Total Tests (7 day rolling average)', 'Total Tests Per Capita (7 day rolling average)',
#    'Positivity Rate (7 day rolling average)',  
#    'Deaths Daily', 'Death Daily (7 day rolling average)', 'Death Daily Per capita (7 day rolling average)',
#    'Total Deaths', 'Total Deaths Per Capita',
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
# reset_index() adds an "index" column; the selection below drops it again.
c = c.sort_values(by=['Region','Country','Time']).reset_index()
print(c.columns)
c = c[c_data_order]

# List the countries that found no population row in the merge.
pop_check = c.loc[pd.isnull(c["Population"])]
print_column_unique(pop_check["Country"])

c.head()

In [None]:
# Most recent observation per country in the merged global data.
date_check = c.groupby("Country")["Time"].max()
date_check.head(200)

In [None]:
# Canada raw data
# Downloads the province-level CSV, renames its columns to the notebook's
# names, derives Level/Date/Time and attaches the population columns.
canada_source_request = requests.get(canada_source_csv).content
canada_df = pd.read_csv(io.StringIO(canada_source_request.decode('utf-8')))
currentTime = datetime.now()

print("Original Canada Columns")
print(canada_df.columns)
canada_df.rename(columns = {
    'Province': 'State/Province',
    'SummaryDate': 'Time',
    'TotalCases': 'Total Cases','DailyTotals': 'Cases Daily',
    'TotalRecovered' : 'Total Recovered','DailyRecovered': 'Recovered Daily',
    'TotalDeaths': 'Total Deaths','DailyDeaths': 'Deaths Daily',
    'TotalTested': 'Total Tests','DailyTested': 'Tests Daily',
    'TotalActive': 'Total Active','DailyActive': 'Active Daily',
    'TotalHospitalized': 'Total Hospitalized','DailyHospitalized': 'Hospitalized Daily',
    'TotalICU': 'Total ICU', 'DailyICU': 'ICU Daily'
}, inplace = True)
print("Renamed Canada Columns")
print(canada_df.columns)

canada_df.drop(columns=["OBJECTID"], inplace = True)
canada_df["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
canada_df["Country"] = "Canada"
canada_df["Region"] = "North America"
# Expand province abbreviations to full names.
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: fixProvince(x))
# Text date as MM/DD/YYYY (any space left in the formatted value is removed),
# then the matching datetime.
canada_df["Date"] = canada_df["Time"].apply(lambda x: us_date(x).replace(" ",""))
canada_df["Time"] = canada_df["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
# Rows named "Canada" are the national rows: they get Level "Country" and a
# blank State/Province; all other rows are Level "State/Province".
canada_df["Level"] = canada_df["State/Province"].apply(lambda x: "Country" if x == "Canada" else "State/Province")
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: "" if x=="Canada" else x)
# NOTE: string_columns is defined but not used in the visible code.
string_columns = ["State/Province","Abbreviation","Country","Region"]
# NOTE: "Date" is MM/DD/YYYY text, so the last sort key orders rows as text.
canada_df = canada_df.sort_values(by=["Level","Country","State/Province","Date"])
canada_df = canada_df.reset_index()
canada_df = canada_df.drop(columns=["index"])
# Attach the population columns; the "index" column added by reset_index() is
# dropped by the column selection below.
canada_df = canada_df.merge(all_populations,how="left",on=["Level","Region","Country","State/Province"]).reset_index()
# Final column order. 'Abbreviation' is not created in this cell, so it has
# to come from the source CSV or from the population table - TODO confirm.
canada_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date',
    'Cases Daily', 'Total Cases',
    'Tests Daily', 'Total Tests', 
    'Deaths Daily', 'Total Deaths', 
    'Recovered Daily', 'Total Recovered', 
    'Active Daily', 'Total Active',
    'Hospitalized Daily', 'Total Hospitalized', 
    'ICU Daily', 'Total ICU', 
    'Population', 'Population 100K',
    'Country Population', 'Country Population 100K',"Country Share",
    'Region Population', 'Region Population 100K',"Region Share",
    'World Population', 'World Population 100K',"World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)', 'Density (P/Km²)',
    'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    '< 1', 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, '85+',
    '1-4','5-14','15-24','25-34','35-44','45-54','55-64','65-74','75-84',
    'Pct < 1','Pct 1-4','Pct 5-14','Pct 15-24','Pct 25-34','Pct 35-44','Pct 45-54','Pct 55-64','Pct 65-74','Pct 75-84','Pct 85+',
    'Accessed'
]
canada_df = canada_df[canada_order].copy()
canada_min = canada_df["Time"].min()
print("Min Date: " + str(canada_min))
canada_max = canada_df["Time"].max()
print("Max Date: " + str(canada_max))
print_column_unique(canada_df["State/Province"])
canada_df.head(75)

In [None]:
us_state_codes = pd.read_excel(us_codes)
us_state_codes["Census Region"] = us_state_codes["State Name"].apply(lambda x: censusRegionByState(x))
us_state_codes.rename(columns = {
    'State Name':'State/Province',
    "State Abbreviation": "Abbreviation"
}, inplace = True)
us_state_codes.head(70)

us_states_url = "https://covidtracking.com/data/download/all-states-history.csv"
us_states_request = requests.get(us_states_url).content
states=pd.read_csv(io.StringIO(us_states_request.decode('utf-8')))
currentTime = datetime.now()
printColumns(states, "Original US Columns")
states["Accessed"] = currentTime
states["Country"] = "United States"
states = states.drop(
    columns = [
        'deathConfirmed', 'deathProbable',
        'hospitalized',
        'negativeTestsAntibody', 'negativeTestsPeopleAntibody', 'negativeTestsViral',
        'positiveScore', 'positiveTestsAntibody', 'positiveTestsAntigen',
        'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
        'positiveTestsViral', 'positiveCasesViral',
        'totalTestEncountersViral', 'totalTestEncountersViralIncrease',
        'totalTestsAntibody', 'totalTestsAntigen',
        'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
        'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
        'totalTestsViral', 'totalTestsViralIncrease'
    ])
states.rename(
    columns = {
        'date': 'Time', 'state' : 'Abbreviation', 'dataQualityGrade': 'Data Quality',
        'totalTestResults' : 'Total Tests', 'totalTestResultsIncrease' : 'Tests Daily',
        'negative' : 'Total Negative', 'negativeIncrease' : 'Negative Daily',
        'positive' : 'Total Cases', 'positiveIncrease' : 'Cases Daily',
        'recovered' : 'Total Recovered',
        'death' : 'Total Deaths', 'deathIncrease' : 'Deaths Daily',
        'hospitalizedCumulative' : 'Total Hospitalized', 'hospitalizedIncrease' : 'Hospitalized Daily', 'hospitalizedCurrently' : 'Currently Hospitalized',
        'inIcuCumulative' : 'Total In ICU', 'inIcuCurrently' : 'Currently In ICU',
        'onVentilatorCumulative' : 'Total On Ventilator', 'onVentilatorCurrently' : 'Currently On Ventilator'
    }, inplace = True)
states["Date"] = states["Time"].astype(str)
states["Date"] = states["Date"].apply(lambda x: us_date(x))
states["Time"] = pd.to_datetime(states["Date"], format="%m/%d/%Y")
#printColumns(states, "Post Rename Columns")
# Attach the state reference table on "Abbreviation"; the left join keeps
# every downloaded row even when no reference row matches.
# NOTE(review): 'State/Province', 'FIPS' and 'Status' are presumably supplied
# by us_state_codes - confirm against that table.
states_input = pd.merge(states, us_state_codes, how="left", on="Abbreviation")
merge_order = [
    'Time', 'Date', 'Abbreviation', 'State/Province', 'Country', 'FIPS', 'Status', 'Data Quality',
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily',
    'Total Negative', 'Negative Daily', 'Total Cases', 'Cases Daily',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
    'Total In ICU', 'Currently In ICU',
    'Total On Ventilator', 'Currently On Ventilator',
    'Accessed'
]
# Take an explicit copy so the column assignments below write to an
# independent frame instead of triggering pandas' SettingWithCopyWarning.
states_input = states_input[merge_order].copy()
#printColumns(states_input, "States Input Merge Columns")
states_input["Region"] = "North America"
# Only the state name decides the level, so map that one column instead of
# building a full row object for every record.
states_input["Level"] = states_input["State/Province"].apply(
    lambda name: "Territory" if name in us_territories else "State/Province"
)

# Report the columns and the date span of the state-level data.
print(states_input.columns)
us_min = states_input["Time"].min()
print(f"Min Date: {us_min!s}")
us_max = states_input["Time"].max()
print(f"Max Date: {us_max!s}")

# Final column order for the US frames: identifiers, dates, then metrics.
us_columns = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'FIPS',
    'Time', 'Date', 'Status', 'Data Quality',
    'Cases Daily', 'Total Cases', 'Negative Daily', 'Total Negative',
    'Deaths Daily', 'Total Deaths', 'Total Recovered',
    'Tests Daily', 'Total Tests',
    'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized',
    'Currently In ICU', 'Total In ICU', 'Currently On Ventilator', 'Total On Ventilator',
    'Accessed',
]
# Reorder, then attach the population columns for each location; the left
# join keeps rows that have no population match.
population_keys = ["Level", "Region", "Country", "State/Province"]
states_input = states_input[us_columns]
states_input = states_input.merge(
    all_populations, how="left", on=population_keys
).reset_index(drop=True)

print("US States")
print_column_unique(states_input["State/Province"])

states_input.head()

# Build national (country-level) rows by summing every state row per day,
# then append them to the state-level rows.
us_summary_cols = [
    'Time', 'Date', 'Country',
    'Total Deaths', 'Deaths Daily', 'Total Recovered',
    'Total Tests', 'Tests Daily', 'Total Negative', 'Negative Daily', 'Total Cases', 'Cases Daily',
    'Total Hospitalized', 'Currently Hospitalized', 'Hospitalized Daily',
    'Total In ICU', 'Currently In ICU', 'Total On Ventilator', 'Currently On Ventilator'
]
us_stats = states_input[us_summary_cols].groupby(['Time','Date','Country']).sum().reset_index()
# Placeholder identifiers so the national rows share the state-row schema.
us_stats["FIPS"] = 0
us_stats["Abbreviation"] = "US"
us_stats["State/Province"] = ""
us_stats["Status"] = 0
us_stats["Accessed"] = currentTime
us_stats["Data Quality"] = ""
us_stats = us_stats[merge_order]

us_stats["Level"] = "Country"
us_stats["Region"] = "North America"

# ignore_index gives the combined frame unique row labels; without it the
# state and national rows would share labels 0..n-1.
us_stats = pd.concat([states_input, us_stats], ignore_index=True)
us_stats = us_stats[us_columns]
# Re-derive the level of every row. National rows are recognised by their
# existing "Level" value: "Region" is always "North America" in this frame,
# so testing Region against "Country" could never match and would relabel
# the national rows as "State/Province".
us_stats["Level"] = us_stats.apply(
    lambda x:
    "Country" if x["Level"] == "Country" else
    "Territory" if x["State/Province"] in us_territories else
    "State/Province",
    axis=1
)
us_stats = us_stats.merge(
    all_populations, how="left", on=["Level","Region","Country","State/Province"]
).reset_index(drop=True)
# min()/max() of "Time" are timestamps, so convert explicitly before
# concatenating with the label text.
us_min = us_stats["Time"].min()
print("Min Date: " + str(us_min))
us_max = us_stats["Time"].max()
print("Max Date: " + str(us_max))
us_stats.head()

In [None]:
# Stack the US state rows, the Canadian rows and every non-Canadian row of
# `c` into one frame.
not_cn = c[c["Country"] != "Canada"].copy()
all_data = pd.concat([states_input, canada_df, not_cn], sort=False)
print_column_unique(all_data["State/Province"])

# Order each location's rows chronologically and renumber the index.
location_time_order = ['Region', 'Country', 'State/Province', 'Time']
all_data = all_data.sort_values(by=location_time_order).reset_index(drop=True)

# Columns kept from the combined frame, in output order. The single-year age
# columns are labelled with the integers 1..84 between '< 1' and '85+'.
base_columns = (
    [
        'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
        'Time', 'Date', 'Status', 'Data Quality',
        'Cases Daily', 'Deaths Daily', 'Tests Daily',
        'Negative Daily', 'Total Negative',
        'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized',
        'ICU Daily', 'Total ICU',
        'Currently In ICU', 'Total In ICU',
        'Currently On Ventilator', 'Total On Ventilator',
        'Recovered Daily', 'Total Recovered',
        'Active Daily', 'Total Active',
        'Population', 'Population 100K',
        'Country Population', 'Country Population 100K', "Country Share",
        'Region Population', 'Region Population 100K', "Region Share",
        'World Population', 'World Population 100K', "World Share",
        'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change',
        'Migrants (net)', 'Density (P/Km²)',
        'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    ]
    + ['< 1'] + list(range(1, 85)) + ['85+']
    + [
        '1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84',
        'Pct < 1', 'Pct 1-4', 'Pct 5-14', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44',
        'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+',
        'Accessed',
    ]
)
all_data = all_data[base_columns]

# Report the date span of the combined data.
all_min = all_data["Time"].min()
print(f"Min Date: {all_min!s}")
all_max = all_data["Time"].max()
print(f"Max Date: {all_max!s}")
all_data.head()

# Derive, for each base metric, the daily / cumulative / 7-row rolling columns
# and their per-"Population 100K" rates, and record the new column names.

# Keys that identify one location's series; every grouped calculation below
# uses them.
grouping_cols = ["Region","Country","State/Province"]
# Metrics that get the full set of derived columns.
base_cols = ["Cases","Tests","Deaths"]
# Names of all derived columns, accumulated in output order across metrics.
calc_cols = []
for col in base_cols:
    # Missing daily values are treated as zero and stored as integers.
    all_data[col + " Daily"] = all_data[col + " Daily"].fillna(0).astype(int)
    # Daily value divided by "Population 100K"
    # (presumably population in units of 100,000 - TODO confirm).
    all_data[col + " Daily Rate"] = all_data[col + " Daily"]/all_data["Population 100K"]
    # Running total of the daily values within each location, accumulated in
    # the frame's current row order.
    all_data["Total " + col] = all_data.groupby(grouping_cols)[col + " Daily"].cumsum().reset_index(drop=True)
    # Mean over a window of 7 rows within each location; min_periods=7 leaves
    # the result empty until a location has 7 rows.
    # NOTE(review): reset_index(drop=True) discards the grouped result's index,
    # so the values are assigned by position. That looks correct only if
    # all_data is already ordered by grouping_cols with a 0..n-1 index and no
    # rows are dropped by the groupby (e.g. missing keys) - confirm.
    all_data[col + " Daily 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily"].rolling(7,min_periods=7).mean().reset_index(drop=True)
    # Same 7-row window applied to the daily rate.
    # NOTE(review): the shape returned by groupby(..., as_index=False).rolling()
    # has differed between pandas versions - verify against the installed one.
    all_data[col + " Daily Rate 7D Rolling"] = all_data.groupby(grouping_cols, as_index=False)[col + " Daily Rate"].rolling(7,min_periods=7)[col + " Daily Rate"].mean().reset_index(drop=True)
    # Cumulative total divided by "Population 100K".
    all_data["Total " + col + " Rate"] = all_data["Total " + col]/all_data["Population 100K"]
    # Output order of this metric's columns: counts first, then rates.
    base_order = [
        col + ' Daily', col + ' Daily 7D Rolling', "Total " + col, 
        col + ' Daily Rate', col + ' Daily Rate 7D Rolling', "Total " + col + " Rate"
    ]
    calc_cols = calc_cols + base_order

# "Speed" is the 7-row rolling case count divided by "Population 100K".
all_data["Speed Daily"] = all_data["Cases Daily 7D Rolling"]/all_data["Population 100K"]
# Positivity = rolling cases / rolling tests. A zero test average divides to
# +inf, or to -inf when the case average is negative (downward corrections);
# both are meaningless ratios, so blank them out.
all_data["Positivity 7D Rolling"] = (
    all_data["Cases Daily 7D Rolling"]/all_data["Tests Daily 7D Rolling"]
).replace([np.inf, -np.inf], np.nan)
# Successive row-to-row differences of speed within each location.
all_data["Acceleration Daily"] = all_data.groupby(grouping_cols, as_index=False)["Speed Daily"].diff().reset_index(drop=True)
all_data["Jerk Daily"] = all_data.groupby(grouping_cols, as_index=False)["Acceleration Daily"].diff().reset_index(drop=True)
all_data["Jounce Daily"] = all_data.groupby(grouping_cols, as_index=False)["Jerk Daily"].diff().reset_index(drop=True)

# Parse each "Date" string (month/day/year) exactly once and derive every
# calendar column from the parsed value instead of re-parsing per column.
_parsed_dates = all_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
all_data["MM-DD-YYYY"] = _parsed_dates.apply(lambda d: d.strftime('%m-%d-%Y')).astype(str)
all_data["DD-MM-YYYY"] = _parsed_dates.apply(lambda d: d.strftime('%d-%m-%Y')).astype(str)
# ISO year and ISO week number, e.g. "2021 W3".
all_data["Week"] = _parsed_dates.apply(
    lambda d: "{} W{}".format(*d.isocalendar()[:2])
)
# weekday() is 0 on Monday, so subtracting it lands on the Monday of the week;
# the week then runs through the following Sunday.
all_data["First Day of Week"] = _parsed_dates.apply(
    lambda d: d - timedelta(days=d.weekday())
)
all_data["Last Day of Week"] = all_data["First Day of Week"].apply(lambda x: x + timedelta(days=6))
def shortDate(day):
    """Format *day* as month/day/two-digit-year without zero padding on month or day.

    For example, 5 October 2020 becomes "10/5/20".
    """
    # Render all three fields in one pass, then trim the padding from the
    # month and day only; the two-digit year keeps its leading zero.
    month_part, day_part, year_part = datetime.strftime(day, '%m %d %y').split(' ')
    return "/".join((month_part.lstrip('0'), day_part.lstrip('0'), year_part))
# Human-readable span of each row's week, e.g. "1/18/21 - 1/24/21".
all_data["Week Date Range"] = all_data.apply(
    lambda row: " - ".join(
        (shortDate(row["First Day of Week"]), shortDate(row["Last Day of Week"]))
    ),
    axis=1
)
# Position of each row among the rows sharing its location and date:
# 0 for the first, 1 and up for any repeats.
day_keys = ["Region", "Country", "State/Province", "Date"]
all_data["Day Count"] = all_data.groupby(day_keys).cumcount()

# Identifier and calendar columns that lead the output.
header_cols = [
    'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
    'Time', 'Date', 'Day Count', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week',
    'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Status', 'Data Quality',
]
# Append the cross-metric columns to the per-metric derived columns.
calc_cols = [*calc_cols, "Positivity 7D Rolling", "Speed Daily", "Acceleration Daily", "Jerk Daily"]
# Remaining pass-through columns; the single-year age columns are labelled
# with the integers 1..84 between '< 1' and '85+'.
other_cols = (
    [
        'Negative Daily', 'Total Negative',
        'Hospitalized Daily', 'Total Hospitalized', 'Currently Hospitalized',
        'ICU Daily', 'Total ICU',
        'Currently In ICU', 'Total In ICU',
        'Currently On Ventilator', 'Total On Ventilator',
        'Recovered Daily', 'Total Recovered',
        'Active Daily', 'Total Active',
        'Population', 'Population 100K',
        'Country Population', 'Country Population 100K', "Country Share",
        'Region Population', 'Region Population 100K', "Region Share",
        'World Population', 'World Population 100K', "World Share",
        'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change',
        'Migrants (net)', 'Density (P/Km²)',
        'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    ]
    + ['< 1'] + list(range(1, 85)) + ['85+']
    + [
        '1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84',
        'Pct < 1', 'Pct 1-4', 'Pct 5-14', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44',
        'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+',
        'Accessed',
    ]
)
all_cols = header_cols + calc_cols + other_cols
print(all_cols)
all_data = all_data[all_cols].copy()

# Fixed reference date; it is only printed in this cell.
last_monday = date(2021, 1, 18)
print("Last Monday: ")
print(last_monday)

# From here on "Time" is stored as text rather than as a datetime.
all_data['Time'] = all_data['Time'].astype(str)
all_min = all_data["Time"].min()
print(f"Min Date: {all_min!s}")
all_max = all_data["Time"].max()
print(f"Max Date: {all_max!s}")

In [None]:
# Rows that repeat a location/date already seen (Day Count above zero),
# kept aside for inspection.
is_repeat_row = all_data["Day Count"] > 0
excess_days = all_data[is_repeat_row].copy()
excess_days.head()

In [None]:
# Keep only the first row for each location/date, then write the full data
# set out as Excel and CSV.
all_data = all_data.loc[all_data["Day Count"]==0].copy().reset_index(drop=True)
# drop() returns a new frame, so its result must be assigned; otherwise the
# helper column would still be written to the output files. Resetting with
# drop=True above avoids creating the stray "index" column in the first place.
all_data = all_data.drop(columns=["Day Count"])
all_data.to_excel(cleanedFolder + "all_raw_input.xlsx", index=False)
all_data.to_csv(cleanedFolder + "all_raw_input.csv", index=False)
all_data.head(8)

In [None]:
# Largest "Time" and "Week" value per country, as a quick check of how
# current each country's data is. Several columns must be selected with a
# list; the bare-tuple form is rejected by current pandas.
# NOTE(review): "Week" is text such as "2021 W3", so its max is alphabetical
# and may not be the latest week - read "Time" for the actual latest date.
date_check = all_data.groupby("Country")[["Time", "Week"]].max()
date_check.head()

In [None]:
# Reduced export: identifiers, calendar columns, derived metrics and
# population only.
min_header_cols = [
    'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
    'Time', 'Date', 'MM-DD-YYYY', 'DD-MM-YYYY', 'Week',
    'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Status', 'Data Quality',
]
min_cols = [*min_header_cols, *calc_cols, "Population", "Population 100K", "Accessed"]
print(min_cols)
min_data = all_data[min_cols]
print_column_unique(min_data["State/Province"])

# Keep rows whose "Time" sorts before the fixed cutoff; "Time" is compared
# against a string literal, so the comparison is textual.
before_cutoff = min_data['Time'] < '2021-01-25'
min_data = min_data.loc[before_cutoff]
print("Max date:" + min_data["Time"].max())

# Write the reduced data set as Excel, then as CSV.
min_xlsx_file = cleanedFolder + "daily_raw_input.xlsx"
min_data.to_excel(min_xlsx_file, index=False)
print("Min Excel file output")
min_csv_file = cleanedFolder + "daily_raw_input.csv"
min_data.to_csv(min_csv_file, index=False)
print("Min CSV file output")
min_data.head(8)

In [None]:
# Largest "Time" and "Week" value per country in the reduced export, to
# confirm the cutoff was applied. Several columns must be selected with a
# list; the bare-tuple form is rejected by current pandas.
# NOTE(review): "Week" is text such as "2021 W3", so its max is alphabetical
# and may not be the latest week - read "Time" for the actual latest date.
date_check = min_data.groupby("Country")[["Time", "Week"]].max()
date_check.head()