In [25]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)
pd.set_option('display.max_columns', None)

In [26]:
# Folders and Files
# NOTE(review): these are absolute, machine-specific locations; every path
# below is derived from repositoryFolder, so that is the single value to
# change when running on another machine.

# Folders
repositoryFolder = "D:/Repositories/Global-COVID-Surveillance/data/"
localDownloadFolder = "C:/Users/janin/Downloads/"
demographicsFolder = f"{repositoryFolder}raw/demographics/"
configuredFolder = f"{repositoryFolder}configured/"
cleanedFolder = f"{repositoryFolder}cleaned/"
regionsFolder = f"{repositoryFolder}raw/regions/"
locationsFolder = f"{repositoryFolder}raw/locations/"

# Population input workbooks (read by the population cells)
global_population_input_file = f"{demographicsFolder}Country Populations 2020.xlsx"
us_codes = f"{demographicsFolder}US State Codes.xlsx"

# Population workbooks: the Canada file is read, the US file is written
canada_population_file = f"{demographicsFolder}Canada Population.xlsx"
us_population_file = f"{demographicsFolder}US Population.xlsx"
all_populations_file = f"{cleanedFolder}Populations_cleaned.xlsx"

# Location input
locations_file = f"{demographicsFolder}Country Geo.xlsx"

# Per-region result workbooks (the "R" files)
south_africa_r = f"{regionsFolder}SSA-Temp.xlsx"
south_asia_r = f"{regionsFolder}SouthAsia excel updated 20201008.xlsx"
latin_america_r = f"{regionsFolder}LatinAmerica.xlsx"
central_asia_r = f"{regionsFolder}CentralAsia-Results.xlsx"
europe_r = f"{regionsFolder}Europe-Results-Updated.xlsx"
middle_east_r = f"{regionsFolder}Middle East Output.xlsx"
east_asia_pacific_r = f"{regionsFolder}East Asia and Pacific output.xlsx"
canada_r = f"{regionsFolder}_Canada Ouput.xlsx"
us_r = f"{regionsFolder}USState-Results.xlsx"

In [27]:
# Sources

# Population sources
population_source_url = "https://www.worldometers.info/world-population/population-by-country/"
canada_population_source = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000901"  # Statistics Canada Quarterly Population
us_population_source = "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/asrh/sc-est2019-agesex-civ.csv"

# FIND tracker repository: processed case data plus raw geography files
_find_repository = "dsbbfinddx/FINDCov19TrackerData"
_find_raw_files = f"https://raw.githubusercontent.com/{_find_repository}/master/raw/"
github_url = f"https://github.com/{_find_repository}/blob/master/processed/data_all.csv?raw=true"
country_codes_coordinates = _find_raw_files + "countries_codes_and_coordinates.csv"
countries_geo = _find_raw_files + "countries.geo.json"
us_states_geo = _find_raw_files + "us-states.geo.json"

# Latitude/longitude reference dataset
kaggle_locations = "https://www.kaggle.com/paultimothymooney/latitude-and-longitude-for-every-country-and-state"

# Canada and US case-data downloads
canada_source_csv = "https://opendata.arcgis.com/datasets/3afa9ce11b8842cb889714611e6f3076_0.csv"
us_source_csv = "https://covidtracking.com/data/download/all-states-history.csv"

In [28]:
# Countries and Regions

# Country names grouped by surveillance region. The names here are the
# canonical spellings; alternate spellings are mapped onto them through
# country_conversions / fixCountry.
european_countries = [
    'Albania','Andorra','Austria','Belarus','Belgium','Bosnia & Herzegovina','Bulgaria',
    'Croatia','Czech Republic','Denmark','Estonia','Finland','France',
    'Germany','Greece','Greenland','Hungary','Iceland','Ireland','Isle of Man','Italy',
    'Latvia','Liechtenstein','Lithuania','Luxembourg','Malta','Moldova','Monaco','Montenegro',
    'Netherlands','Norway','Poland','Portugal','Romania',
    'San Marino','Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland',
    'Ukraine','United Kingdom','Vatican City'
]

north_american_countries = ["Canada","United States"]

# Latin America is the Caribbean list plus the Central/South America list.
# NOTE(review): "Curacao" (unaccented) is listed here while "Curaçao"
# (accented) is listed under unincorporated_disputed_territories below, so
# the two spellings are different strings and resolve to different regions.
carribean_countries = [
    "Antigua & Barbuda","Aruba","Bahamas","Barbados","Bermuda","British Virgin Islands",
    "Cayman Islands","Cuba","Curacao","Dominica","Dominican Republic","Grenada",
    "Haiti","Jamaica","Puerto Rico","St. Kitts & Nevis","St. Lucia","St. Vincent & Grenadines",
    "Sint Maarten","Trinidad & Tobago","Turks and Caicos Islands","United States Virgin Islands"
]
central_south_america_countries = [
    'Argentina','Belize','Bolivia','Brazil','Chile','Colombia','Costa Rica',
    'Ecuador','El Salvador','Guatemala','Guyana','Honduras',
    'Mexico','Nicaragua','Panama','Paraguay','Peru','Suriname','Uruguay','Venezuela'
]
latin_american_countries = carribean_countries + central_south_america_countries
american_countries = north_american_countries + latin_american_countries

south_asia_countries = [
    "Afghanistan","Bangladesh","Bhutan","India","Maldives","Nepal","Pakistan","Sri Lanka"
]
central_asian_countries = [
    'Armenia','Azerbaijan','Cyprus','Faeroe Islands','Georgia','Gibraltar','Kazakhstan','Kosovo','Kyrgyzstan',
    'North Macedonia','Russia','Tajikistan','Turkey','Turkmenistan','Uzbekistan'
]
# East Asia and Pacific is the East Asia list plus the Pacific list.
east_asian_countries = [
    "Brunei","Cambodia","China","Indonesia","Japan","Laos","Malaysia","Mongolia","Myanmar","Niue","North Korea","Philippines",
    "Singapore","South Korea","Taiwan","Thailand","Timor","Vietnam"
]
pacific_countries = [
    "Australia","Cook Islands","Fiji","French Polynesia","Guam","Kiribati",
    "Marshall Islands","Micronesia","Nauru","New Caledonia","New Zealand",
    "Northern Mariana Islands","Palau","Papua New Guinea","Samoa","Solomon Islands","Tonga","Tuvalu","Vanuatu"
]
east_asia_and_pacific_countries = east_asian_countries + pacific_countries

# Middle East and North Africa is the sum of the next two lists.
middle_eastern_countries = [
    "Bahrain","Iran","Iraq","Israel","Jordan","Kuwait","Lebanon","Oman","Qatar","Saudi Arabia","Syria",
    "United Arab Emirates","Yemen"
]
north_african_countries = [
    "Algeria","Djibouti","Egypt","Libya","Morocco","Tunisia","Western Sahara"
]
middle_east_and_north_africa_countries = middle_eastern_countries + north_african_countries

sub_saharan_african_countries = [
    "Angola","Benin","Botswana","Burkina Faso","Burundi",
    "Cabo Verde","Cameroon","Central African Republic","Chad","Comoros","Côte d’Ivoire",
    "Democratic Republic of Congo","Equatorial Guinea","Eritrea","Ethiopia",
    "Gabon","Gambia","Ghana","Guinea","Guinea-Bissau","Kenya","Lesotho","Liberia",
    "Madagascar","Malawi","Mali","Mauritania","Mauritius","Mozambique",
    "Namibia","Niger","Nigeria","Republic of the Congo","Rwanda",
    "São Tomé and Príncipe","Senegal","Seychelles","Sierra Leone",
    "Somalia","South Africa","South Sudan","Sudan","Swaziland",
    "Tanzania","Togo","Uganda","Zambia","Zimbabwe"
]

# Places mapped to the 'Territory' region in countries_by_region; they are
# not part of country_lists and therefore not part of all_countries.
unincorporated_disputed_territories = [
    "American Samoa", "Anguilla","Caribbean Netherlands","Channel Islands","Curaçao",
    "Falkland Islands","French Guiana","Guadeloupe","Hong Kong"
]

# Region lists whose members count as countries. The territories list is
# deliberately absent here, so its entries never appear in all_countries.
country_lists = [
    central_asian_countries,
    east_asia_and_pacific_countries,
    european_countries,
    latin_american_countries,
    middle_east_and_north_africa_countries,
    north_american_countries,
    sub_saharan_african_countries,
    south_asia_countries,
]

# Flatten the region lists into one alphabetically sorted list of names.
all_countries = []
for country_list in country_lists:
    all_countries.extend(country_list)
all_countries.sort()

# Region name -> member countries. key_from_value walks this dict in
# insertion order and returns the first region containing a name, so the
# order of the entries is kept as it was.
countries_by_region = {
    'Central Asia': central_asian_countries,
    'Europe': european_countries,
    'Latin America': latin_american_countries,
    'South Asia': south_asia_countries,
    'Sub-Saharan Africa': sub_saharan_african_countries,
    'Middle East and North Africa': middle_east_and_north_africa_countries,
    'East Asia and Pacific': east_asia_and_pacific_countries,
    'North America': north_american_countries,
    'Territory': unincorporated_disputed_territories,
}

# Alphabetical list of every region name (including 'Territory').
regions = sorted(countries_by_region)

# Canonical country name -> list of alternate spellings seen in the source
# data. fixCountry performs the reverse lookup: a value found in one of the
# lists is replaced by that list's key; anything else is left unchanged.
# NOTE(review): "State of Palestine" is a key here but is not a member of
# any of the region lists above — confirm which region it should belong to.
country_conversions = {
    "Antigua & Barbuda": ["Antigua and Barbuda"],
    "Bahamas": ["Bahamas, The"],
    "Bosnia & Herzegovina": ["Bosnia and Herzegovina"],
    "Brunei": ["Brunei Darussalam"],
    "Cabo Verde": ["Cape Verde"],
    "Côte d’Ivoire": ["Cote d'Ivoire","Cote dIvoire"],
    "Czech Republic": ["Czechia","Czech Republic (Czechia)"],
    "Democratic Republic of Congo": ["Congo - Kinshasa"],
    "Egypt": ["Egypt, Arab Rep."],
    "Faeroe Islands": ["Faroe Islands"],
    "Gambia": ["Gambia, The"],
    "Hong Kong" : ["Hong Kong SAR China"],
    "Iran": ["Iran, Islamic Rep."],
    "Kyrgyzstan": ["Kyrgyz Republic"],
    "Laos": ["Lao PDR"],
    "Micronesia": ["Micronesia, Fed. Sts."],
    "Myanmar": ["Myanmar (Burma)","Burma"],
    "North Macedonia": ["Macedonia"],
    "State of Palestine": ["Palestinian Territories"],
    "Republic of the Congo": ["Congo - Brazzaville"],
    "Russia": ["Russian Federation"],
    "São Tomé and Príncipe": ["Sao Tome and Principe","Sao Tome & Príncipe","São Tomé & Príncipe"],
    "Sint Maarten": ["Sint Maarten (Dutch part)"],
    "Slovakia": ["Slovak Republic"],
    "St. Kitts & Nevis": ["Saint Kitts and Nevis"],
    "St. Lucia": ["Saint Lucia"],
    "St. Vincent & Grenadines": ["Saint Vincent and the Grenadines"],
    "Swaziland": ["Eswatini"],
    "Syria": ["Syrian Arab Republic"],
    "Timor": ["Timor-Leste"],
    "Trinidad & Tobago": ["Trinidad and Tobago"],
    "Vatican City": ["Holy See"],
    "Yemen": ["Yemen, Rep."],
    # The string "nan" (a missing value after astype(str)) becomes an empty name.
    "" : ["nan"]
}

# Census region code -> region name and member states.
# Code 0 is the national total row; the entry order (0, 1, 3, 2, 4) is kept
# as-is because censusRegionByState iterates this dict in order.
census_regions = {
    0: {
        "name": "United States",
        "states": ["United States"],
    },
    1: {
        "name": "Northeast",
        "states": [
            "Connecticut", "Maine", "New Hampshire", "Vermont", "Massachusetts",
            "Rhode Island", "New Jersey", "New York", "Pennsylvania",
        ],
    },
    3: {
        "name": "South",
        "states": [
            "Maryland", "Delaware", "West Virginia", "Virginia", "Kentucky",
            "Tennessee", "North Carolina", "South Carolina", "Georgia", "Florida",
            "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas",
            "District of Columbia", "Puerto Rico",
        ],
    },
    2: {
        "name": "Midwest",
        "states": [
            "North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa",
            "Minnesota", "Wisconsin", "Illinois", "Michigan", "Indiana", "Ohio",
        ],
    },
    4: {
        "name": "West",
        "states": [
            "Washington", "Idaho", "Montana", "Wyoming", "Oregon", "California", "Nevada",
            "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii",
        ],
    },
}

# The 50 US states plus the District of Columbia.
us_states = [
    'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','District of Columbia',
    'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
    'Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
    'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota',
    'Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
    'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'
]
# Canada's ten provinces and three territories. 'Nunavut' was missing from
# this list, which made fixLevel classify Nunavut rows as "Country" instead
# of "State/Province".
canada_provinces = [
    'Alberta','British Columbia','Manitoba','New Brunswick','Newfoundland and Labrador','Northwest Territories',
    'Nova Scotia','Nunavut','Ontario','Prince Edward Island','Quebec','Saskatchewan','Yukon'
]
# Every sub-national unit that fixLevel treats as "State/Province".
states_and_provinces = us_states + canada_provinces

In [29]:
#Functions
def titleCase(words):
    """Return `words` in title case, keeping the word "and" lower-case.

    Strings of three characters or fewer are treated as abbreviations and
    returned fully upper-cased instead. Words are split on single spaces.
    """
    if len(words) <= 3:
        return words.upper()

    result = ""
    for token in words.lower().split(" "):
        piece = "and" if token == "and" else token.capitalize()
        # A separator is only added once something has been written, so
        # leading empty tokens do not produce leading spaces.
        separator = " " if result else ""
        result = result + separator + piece
    return result

def fixCensusRegion(code):
    """Return the census region name for a numeric region code.

    `census_regions` is a dict keyed by region code, so the code is looked
    up directly. (The previous loop iterated the dict's integer keys and
    subscripted them with ["number"], which raised TypeError on every call.)

    Returns "Other" and prints a notice when the code is not a known key.
    """
    region = census_regions.get(code)
    if region is None:
        print(str(code) + " not found")
        return "Other"
    return region["name"]

# CDC Standard age ranges 0-17, 18-29, 30-49, and 50-64
# CDC COVID Reporting Age Ranges https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/index.htm
def getAgeRange(age):
    """Return the reporting age-range label for a single year of age.

    Special values: 0 maps to "< 1" and 999 maps to "Total". Ages of 85 and
    above map to "85+". Negative ages and values that compare false to
    everything (such as NaN) return an empty string.
    """
    if age == 0:
        return "< 1"
    if age == 999:
        return "Total"
    if age < 0:
        # Previously a negative age fell into the "1-4" bucket.
        return ""

    # (exclusive upper bound, label), checked in ascending order.
    brackets = (
        (5, "1-4"),
        (15, "5-14"),
        (25, "15-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
        (75, "65-74"),
        (85, "75-84"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label

    # Previously only exactly 85 was labelled; any higher age returned "".
    if age >= 85:
        return "85+"
    return ""

def fixSex(code):
    """Translate a numeric sex code into its column label.

    0 is the both-sexes total ("Population 2019"), 1 is "Male" and 2 is
    "Female". Any other code prints a notice and returns an empty string.
    """
    if code == 0:
        return "Population 2019"
    if code == 1:
        return "Male"
    if code == 2:
        return "Female"
    print(str(code) + " is not a sex")
    return ""

def us_date(x):
    """Convert a string starting with YYYY-MM-DD into MM/DD/YYYY.

    Only the first ten characters are read, so a trailing time component
    (for example "2020-10-08T00:00:00") is ignored. The day slice used to be
    x[8:11], which pulled the character after the day into the result.
    """
    year = x[0:4]
    month = x[5:7]
    day = x[8:10]
    return month + "/" + day + "/" + year

def removeDecimal(data):
    """Return str(data) truncated at the first "." (no rounding).

    When the text contains no ".", the whole string is returned.
    """
    whole_part, _, _ = str(data).partition(".")
    return whole_part

def emptyNan(value):
    """Return "" when `value` is the string "nan"; otherwise return it unchanged."""
    return "" if value == "nan" else value

def printColumns(df, label):
    """Print `label` on one line, then the `columns` attribute of `df`."""
    for line in (label, df.columns):
        print(line)

def print_column_unique(column):
    """Print and return the distinct values of `column` in ascending order.

    `column` must provide `sort_values` and the result of that must provide
    `unique` (as a pandas Series does).
    """
    print("Column Values:")
    ordered = column.sort_values(ascending=True)
    values = ordered.unique()
    print(values)
    return values

def print_column_missing(column, comparison):
    """Report the differences between a column's values and a reference list.

    Prints the column's unique values (via print_column_unique), the
    comparison collection, the column values absent from `comparison`, and
    the comparison values absent from the column. Each of the two checks
    prints "No missing values" when nothing is missing.

    Returns the unique column values.
    """
    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)

    # Direction 1: values present in the column but unknown to the reference.
    not_in_comparison = [value for value in values if value not in comparison]
    if not_in_comparison:
        print("Column values not in comparison:")
        print(not_in_comparison)
    else:
        print("No missing values")

    # Direction 2: reference values that never occur in the column.
    not_in_column = [value for value in comparison if value not in values]
    if not_in_column:
        print("Comparison values not in column:")
        print(not_in_column)
    else:
        print("No missing values")

    return values

def key_from_value(value, dictionary, default):
    """Reverse lookup: return the first key whose collection contains `value`.

    `value` is stripped of surrounding whitespace before each membership
    test. Keys are tried in the dictionary's iteration order. When no
    collection contains the value, `default` is used. The result is always
    returned stripped, so `default` must be a string.
    """
    matched_key = next(
        (key for key, candidates in dictionary.items() if value.strip() in candidates),
        default,
    )
    return matched_key.strip()

def fixRegion(country):
    """Return the region in `countries_by_region` that lists `country`.

    Names not found in any region list fall back to "Territory".
    """
    fallback_region = "Territory"
    return key_from_value(country, countries_by_region, fallback_region)

def checkRegions(regionColumn, countryColumn):
    """Print each country whose region is not one of the known `regions`.

    The two arguments are read in parallel with `[i]` for i in
    0..len(regionColumn)-1. Each offending country is printed once, as
    "<country> = <region>".
    """
    reported = []
    for position in range(len(regionColumn)):
        region, country = regionColumn[position], countryColumn[position]
        if region in regions or country in reported:
            continue
        reported.append(country)
        print(f"{country} = {region}")

def fixCountry(value):
    """Return the canonical country name for `value`.

    Alternate spellings listed in `country_conversions` are replaced by
    their key; any other value is returned as given, stripped of
    surrounding whitespace.
    """
    canonical_name = key_from_value(value, country_conversions, value)
    return canonical_name

def checkCountries(column):
    """Print every distinct value in `column` that fixCountry would change.

    Each such value is printed once, as "<value> => <fixed value>".
    """
    reported = []
    for value in column:
        fixedValue = fixCountry(value)
        if fixedValue == value or value in reported:
            continue
        reported.append(value)
        print(f"{value} => {fixedValue}")

def fixLevel(country, state):
    """Classify a row as "State/Province", "Country", "Region" or "Territory".

    Checks are made in this order:
      * `state` is in `states_and_provinces` -> "State/Province"
      * `country` is in `all_countries`      -> "Country"
      * `country` is missing (None, NaN, "") -> "Region"
      * anything else                        -> "Territory"
    """
    if state in states_and_provinces:
        return "State/Province"
    if country in all_countries:
        return "Country"
    # `x in ["", None, np.nan]` only recognises the np.nan singleton (list
    # membership tests identity first and NaN never equals itself), so NaN
    # values read out of a DataFrame were classified as "Territory".
    # pd.isna recognises every missing-value scalar.
    if pd.isna(country) or country == "":
        return "Region"
    return "Territory"

def fixCountries(countries_column, configuredCountries):
    """Normalise a column of country names and report mismatches.

    Every value is converted to str and passed through fixCountry. The
    converted column is then compared against `configuredCountries` with
    print_column_missing, which prints the differences in both directions.

    Returns the converted column.
    """
    countries_conversion = countries_column.astype(str).apply(fixCountry)
    # `conversions` is a notebook-level dict that is only assigned in later
    # cells; looking it up defensively avoids a NameError when this function
    # is called before those cells have run. The printed output is unchanged
    # whenever the variable exists.
    print(globals().get("conversions", {}))
    print_column_missing(countries_conversion, configuredCountries)
    return countries_conversion

def testConversion(title, test_array, conversion):
    """Print how each value in `test_array` is converted.

    `conversion` selects the lookup: "country" applies fixCountry and
    "region" applies region_from_country to the fixed country name. Any
    other selector converts every value to "". A line "<value>,<result>" is
    printed for each value whose result differs from the stripped input, and
    values that convert to "" are listed under "Missing Conversions".
    """
    print(title)
    unconverted = []
    for raw_value in test_array:
        if conversion == "country":
            converted = fixCountry(raw_value)
        elif conversion == "region":
            # NOTE(review): region_from_country is not defined in the visible
            # part of this notebook (fixRegion performs the same kind of
            # lookup) — confirm it exists before using the "region" mode.
            converted = region_from_country(fixCountry(raw_value))
        else:
            converted = ""
        if converted != raw_value.strip():
            print(raw_value.strip() + "," + converted)
        if converted == "":
            unconverted.append(raw_value)
    if unconverted:
        print("Missing Conversions")
        print(unconverted)
    print("")
    
def division(a,b):
    """Return a / b, or NaN when b equals zero."""
    return np.nan if b == 0 else a/b
    
def fixProvince(value):
    """Return the full province name for an abbreviation or raw label.

    The value is first normalised with titleCase (which upper-cases strings
    of three characters or fewer), then looked up in the abbreviation table.
    Values without a table entry are returned in their normalised form.
    """
    province_map = {
        'BC': 'British Columbia',
        'NL': 'Newfoundland and Labrador',
        'NWT': 'Northwest Territories',
        'PEI': 'Prince Edward Island',
        'Repatriated': 'Repatriated Canada',
        'Repatriated Cdn': 'Repatriated Canada'
    }
    normalized = titleCase(value)
    return province_map.get(normalized, normalized)

def censusRegionByState(state):
    """Return the name of the census region that lists `state`.

    Regions are searched in the iteration order of `census_regions`.
    Returns None when no region contains the state.
    """
    return next(
        (region["name"] for region in census_regions.values() if state in region["states"]),
        None,
    )
        
def fixUSRegion(code):
    """Return the census region name for a region code.

    Raises KeyError when `code` is not a key of `census_regions`.
    """
    return census_regions[code]["name"]

def convertDateToExcel(dayString) :
    """Convert an MM/DD/YYYY string into a serial day number.

    The number is the count of days (with the seconds expressed as a
    fraction of a day) elapsed since 1899-12-30.
    """
    serial_epoch = datetime(1899, 12, 30)
    elapsed = datetime.strptime(dayString, '%m/%d/%Y') - serial_epoch
    whole_days = float(elapsed.days)
    day_fraction = float(elapsed.seconds) / 86400
    return whole_days + day_fraction

def firstDay(dayString):
    """Return the Monday of the week containing `dayString`.

    Input and output are both MM/DD/YYYY strings. A date that is already a
    Monday is returned unchanged.
    """
    date_format = '%m/%d/%Y'
    parsed = datetime.strptime(dayString, date_format)
    # weekday() is 0 for Monday, so subtracting it lands on the week's Monday.
    monday = parsed - timedelta(days=parsed.weekday())
    return monday.strftime(date_format)

In [30]:
# Global Populations
# Load the country population workbook, add a per-100K population column,
# normalise the country names and keep a fixed column order.
conversions = {}
country_populations_order = [
    'Country', 'Population', 'Population 100K', 'Fertility Rate', 'Median Age',
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)',
    'Density (P/Km²)', 'Land Area (Km²)']
country_populations = pd.read_excel(global_population_input_file).assign(**{
    "Population 100K": lambda frame: frame["Population"]/100000,
    "Population Source": population_source_url,
    "Country": lambda frame: frame["Country"].astype(str).apply(fixCountry),
})
# NOTE(review): "Population Source" is not part of country_populations_order,
# so the column added above is dropped by this selection — confirm intended.
country_populations = country_populations[country_populations_order]
country_populations.head()

Unnamed: 0,Country,Population,Population 100K,Fertility Rate,Median Age,World Share (%),Urban Population (%),Annual Change (%),Net Change,Migrants (net),Density (P/Km²),Land Area (Km²)
0,Afghanistan,38928346,389.28346,4.6,18.0,0.5,25.0,2.33,886592,-62920.0,60,652860
1,Albania,2877797,28.77797,1.6,36.0,0.04,63.0,-0.11,-3120,-14000.0,105,27400
2,Algeria,43851044,438.51044,3.1,29.0,0.56,73.0,1.85,797990,-10000.0,18,2381740
3,American Samoa,55191,0.55191,,,0.0,88.0,-0.22,-121,,276,200
4,Andorra,77265,0.77265,,,0.0,88.0,0.16,123,,164,470


In [31]:
# Canada Population
# Read the quarterly population workbook and keep only the latest quarter.
population_cn = (
    pd.read_excel(canada_population_file)
    .rename(columns={"GEO": "State/Province",
                     "VALUE": "Population",
                     "REF_DATE": "Quarter"})
    .loc[:, ["Quarter","State/Province","Population"]]
)
last_quarter = population_cn["Quarter"].max()
print("Canada Populations " + last_quarter)

is_last_quarter = population_cn["Quarter"]==last_quarter
canada_last_population = (
    population_cn.loc[is_last_quarter, ["State/Province","Population"]]
    .copy()
    .assign(**{"Population 100K": lambda frame: frame["Population"]/100000})
    .reset_index(drop=True)
    .assign(Country="Canada")
)
# The national total row carries "Canada" as its province; blank it so the
# row reads as country-level.
canada_last_population["State/Province"] = canada_last_population["State/Province"].apply(lambda x: "" if x=="Canada" else x)
canada_last_population.head(20)

Canada Populations 2020-07


Unnamed: 0,State/Province,Population,Population 100K,Country
0,,38005238,380.05238,Canada
1,Newfoundland and Labrador,522103,5.22103,Canada
2,Prince Edward Island,159625,1.59625,Canada
3,Nova Scotia,979351,9.79351,Canada
4,New Brunswick,781476,7.81476,Canada
5,Quebec,8574571,85.74571,Canada
6,Ontario,14734014,147.34014,Canada
7,Manitoba,1379263,13.79263,Canada
8,Saskatchewan,1178681,11.78681,Canada
9,Alberta,4421876,44.21876,Canada


In [32]:
# US state code table: attach the census region of each state name, then
# rename the name/abbreviation columns to the labels used in later merges.
us_state_codes = (
    pd.read_excel(us_codes)
    .assign(**{"Census Region": lambda codes: codes["State Name"].apply(censusRegionByState)})
    .rename(columns={
        'State Name':'State/Province',
        "State Abbreviation": "Abbreviation"
    })
)
us_state_codes.head(70)

Unnamed: 0,State/Province,FIPS,Abbreviation,Status,Census Region
0,United States,0,US,0,United States
1,Alabama,1,AL,0,South
2,Alaska,2,AK,0,West
3,Arizona,4,AZ,0,West
4,Arkansas,5,AR,0,South
...,...,...,...,...,...
65,Baker Island,81,,4,
66,Howland Island,84,,4,
67,Jarvis Island,86,,4,
68,Kingman Reef,89,,4,


In [33]:
# US Population
# Download the Census population estimates by state, age and sex.
# A timeout keeps the cell from hanging indefinitely, and raise_for_status
# stops here on an HTTP error instead of parsing an error page as CSV.
us_population_response = requests.get(us_population_source, timeout=60)
us_population_response.raise_for_status()
us_states_census_demographics_request = us_population_response.content
us_demographics = pd.read_csv(io.StringIO(us_states_census_demographics_request.decode('utf-8')))
currentTime = datetime.now()
us_demographics["Downloaded"] = currentTime
us_demographics["Country"] = "United States"
# Replace the numeric codes with labels and derive the reporting age range.
us_demographics["REGION"] = us_demographics["REGION"].apply(fixUSRegion)
us_demographics["SEX"] = us_demographics["SEX"].apply(fixSex)
us_demographics["Age Range"] = us_demographics["AGE"].apply(getAgeRange)
keep_columns = ["REGION","STATE","NAME","SEX","AGE","POPEST2019_CIV","Downloaded","Country", "Age Range"]
# Select and rename in one expression; renaming in place on a column slice
# can trigger pandas' SettingWithCopyWarning.
us_demographics = us_demographics[keep_columns].rename(
    columns = {'REGION': 'Census Region',
               'NAME' : 'State Name',
               'STATE' : 'FIPS',
               'POPEST2019_CIV' : 'Population 2019',
               'SEX' : 'Sex',
               'AGE' : 'Age'})

# Population by sex: start from the all-ages rows (Age == 999), then spread
# the Sex labels into columns so each state has Female / Male / total.
sex_index = ["Downloaded","Country","Census Region","State Name","FIPS"]
totals_by_sex = (
    us_demographics
    .drop(columns=["Age Range"])
    .loc[us_demographics["Age"]==999]
    .copy()
    .pivot_table(
        index=sex_index + ["Sex"],
        columns='Age',
        values = 'Population 2019',
        aggfunc='first'
    )
    .reset_index()
    .rename_axis(None, axis=1)
)
totals_by_sex["Total Population"] = totals_by_sex[999]
totals_by_sex = totals_by_sex.sort_values(["FIPS", "Sex"]).drop(columns=[999])
us_sex = (
    totals_by_sex
    .pivot_table(
        index=sex_index,
        columns='Sex',
        values = 'Total Population',
        aggfunc='first'
    )
    .reset_index()
    .rename_axis(None, axis=1)
)
print(us_sex.columns)
# "Population 2019" is the label fixSex gives to the both-sexes total.
for sex_label in ("Male", "Female"):
    us_sex["Pct " + sex_label] = us_sex[sex_label]/us_sex["Population 2019"]
us_sex = us_sex.sort_values(["FIPS"])

# Population by age: one column per single year of age, plus bracket totals
# and each bracket's share of the state's total population.
us_age = (
    us_demographics[["Census Region","FIPS","State Name","Age", "Age Range", "Population 2019"]]
    .copy()
    .pivot_table(index=["Census Region","FIPS","State Name"],
                 columns='Age',
                 values='Population 2019',
                 aggfunc='first')
    .reset_index()
    .rename_axis(None, axis=1)
)

# Bracket label -> the single years of age it covers.
age_brackets = {
    "1-4": range(1, 5),
    "5-14": range(5, 15),
    "15-24": range(15, 25),
    "25-34": range(25, 35),
    "35-44": range(35, 45),
    "45-54": range(45, 55),
    "55-64": range(55, 65),
    "65-74": range(65, 75),
    "75-84": range(75, 85),
}

us_age["Total Population"] = us_age[999]
us_age["< 1"] = us_age[0]
for bracket_label, bracket_ages in age_brackets.items():
    # Add the single-year columns left to right.
    bracket_total = us_age[bracket_ages[0]]
    for single_age in bracket_ages[1:]:
        bracket_total = bracket_total + us_age[single_age]
    us_age[bracket_label] = bracket_total
us_age["85+"] = us_age[85]

# Share of the total population for every bracket.
pct_labels = ["< 1"] + list(age_brackets) + ["85+"]
for pct_label in pct_labels:
    us_age["Pct " + pct_label] = us_age[pct_label]/us_age["Total Population"]

us_age = us_age.drop(columns=["Census Region","State Name",0,999])
# Final column order: keys, single years 1-84, 85+, bracket totals, shares.
age_order = (
    ['FIPS', 'Total Population', '< 1']
    + list(range(1, 85))
    + ['85+']
    + list(age_brackets)
    + ["Pct " + pct_label for pct_label in pct_labels]
)
us_age = us_age[age_order].sort_values(["FIPS"])

# Combine the sex and age tables, attach the state code table, and write
# the result to the US population workbook.
us_state_demographics = (
    pd.merge(us_sex, us_age, how="left", on="FIPS")
    # The region label is re-attached from us_state_codes in the next merge.
    .drop(columns=['Census Region'])
    .merge(us_state_codes, how="left", on="FIPS")
    .drop(columns=["FIPS", "Status"])
    .assign(Country="United States")
    .rename(columns={"Population 2019": "Population"})
)
us_state_demographics["Population 100K"] = us_state_demographics["Population"]/100000
us_state_demographics["Population Source"] = us_population_source

# Identification and totals first, then the age columns in the same order
# as age_order (without its two leading key columns), then the source.
demographics_order = (
    [
        'Country', 'State/Province', 'Census Region',
        'Population', 'Population 100K',
        'Female', 'Male', 'Pct Male', 'Pct Female',
    ]
    + age_order[2:]
    + ['Population Source']
)
printColumns(us_state_demographics,"US State Demographics Columns")
us_state_demographics = us_state_demographics[demographics_order].sort_values(["State/Province"])
print(us_population_file)
us_state_demographics.to_excel(us_population_file, index = False)
us_state_demographics.head()

Index(['Downloaded', 'Country', 'Census Region', 'State Name', 'FIPS',
       'Female', 'Male', 'Population 2019'],
      dtype='object')
US State Demographics Columns
Index([       'Downloaded',           'Country',        'State Name',
                  'Female',              'Male',        'Population',
                'Pct Male',        'Pct Female',  'Total Population',
                     '< 1',
       ...
               'Pct 45-54',         'Pct 55-64',         'Pct 65-74',
               'Pct 75-84',           'Pct 85+',    'State/Province',
            'Abbreviation',     'Census Region',   'Population 100K',
       'Population Source'],
      dtype='object', length=120)
D:/Repositories/Global-COVID-Surveillance/data/raw/demographics/US Population.xlsx


Unnamed: 0,Country,State/Province,Census Region,Population,Population 100K,Female,Male,Pct Male,Pct Female,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Population Source
1,United States,Alabama,South,4889347,48.89347,2531653,2357694,0.48221,0.51779,56901,58290,59073,59799,60294,59568,58599,59537,60023,60241,60897,63083,62906,61883,61729,61740,61799,61924,62938,64125,63587,64201,63943,63719,63922,65079,65208,67027,69478,68758,64852,61469,59980,59615,60721,58941,59921,60346,60696,62200,58159,57993,57852,55498,58174,57008,58838,61959,65460,64750,60738,59494,59786,61321,65925,66906,66695,67073,67308,68221,65605,65211,65365,63117,62042,59584,56766,54694,52697,51707,50567,49884,51612,37091,36845,35441,36173,30575,27572,26053,23977,22580,19594,18222,16660,91543,237456,608466,631898,642187,589780,615279,657543,501447,256847,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,https://www2.census.gov/programs-surveys/popes...
2,United States,Alaska,West,712114,7.12114,347065,365049,0.512627,0.487373,9978,10012,10186,10509,10395,10414,10303,10286,10436,10157,9976,10016,9887,9509,9678,9488,9410,9343,8518,7525,8088,8617,9132,9252,9900,10318,10693,11456,11576,11552,10946,10809,10460,10822,10799,10303,10452,9962,9667,9685,8865,8589,8533,7954,8295,7827,7962,8230,8696,9086,8428,8197,8330,8595,9128,9426,9493,9636,9731,9894,9373,9168,9161,8614,8800,8210,7734,7220,6655,6442,5978,5621,5482,4013,3945,3665,3459,2950,2677,2327,1971,1784,1586,1411,1277,7181,41102,100662,89273,109431,92305,84479,93296,61300,23107,0.014012,0.057718,0.141357,0.125363,0.153671,0.129621,0.118631,0.131013,0.086082,0.032448,0.010084,https://www2.census.gov/programs-surveys/popes...
3,United States,Arizona,West,7259090,72.5909,3658425,3600665,0.496022,0.503978,81929,83065,85726,88192,90876,90858,90405,90319,91313,90797,92572,96605,96951,95849,95076,94742,91832,93119,97724,99334,95779,96720,97447,99492,100066,101835,102053,105016,106612,105885,99473,96012,93463,94419,94495,91292,93193,93126,92593,94337,87480,85369,85480,83038,86427,83705,82925,85342,90167,90646,85428,82772,81434,81904,87011,88220,88617,88864,90069,90909,88041,87973,87664,85106,85274,83374,81125,78896,77728,77376,76474,76787,81299,59741,58899,57521,58174,50088,45100,41784,38116,35128,31252,28331,25703,145737,347859,930745,966255,999263,892335,851334,880737,751699,411197,0.011286,0.04792,0.128218,0.13311,0.137657,0.122927,0.117278,0.121329,0.103553,0.056646,0.020076,https://www2.census.gov/programs-surveys/popes...
4,United States,Arkansas,South,3012542,30.12542,1535409,1477133,0.490328,0.509672,36355,37006,37572,38610,38921,38404,37924,38827,38633,38959,38941,40404,41015,40146,39960,39598,39485,39395,38933,39714,40206,40211,40323,39367,38992,39539,39518,40912,42271,41927,39361,38289,37446,37354,37897,37550,38010,38198,38328,39332,36427,36037,35410,34319,35486,34449,34938,36485,37966,38136,35481,34716,34766,35572,38680,39707,39698,39360,39525,39697,38668,38092,37865,36917,36430,35478,34176,32682,31736,30888,30427,30017,31554,22864,23007,22169,22217,19362,17669,16670,14936,13764,12330,11253,10771,59912,152109,393213,396224,394514,369097,361189,385959,302829,161141,0.012068,0.050492,0.130525,0.131525,0.130957,0.12252,0.119895,0.128117,0.100523,0.05349,0.019888,https://www2.census.gov/programs-surveys/popes...
5,United States,California,West,39356141,393.56141,19843586,19512555,0.495794,0.504206,462589,462713,477322,485894,495198,493458,494221,493396,504330,493445,492283,511109,512662,507455,505628,503712,501846,497188,515261,501692,493088,497749,512251,533604,557011,576604,588951,613288,640318,640758,611094,595453,577581,574306,575253,556953,559625,555569,542576,549416,510476,499294,496070,486794,500853,483161,483781,493411,516222,535475,504901,490446,479620,481731,506585,509027,503106,495480,495887,505031,475913,467624,457973,440326,435989,411959,392969,371355,358803,354786,334996,325934,330822,255411,249635,233568,229072,200181,182011,172849,158877,148337,135255,124947,116502,749846,1921127,5007987,5113402,5993606,5257626,4975333,4786356,3386670,1701599,0.011754,0.048814,0.127248,0.129926,0.152292,0.133591,0.126418,0.121616,0.086052,0.043236,0.019053,https://www2.census.gov/programs-surveys/popes...


In [34]:
# Global input data
# Downloads the global CSV from `github_url`, normalises names/units/dates,
# keeps rows that already report cases, and reshapes the result into the
# country-level frame `c` whose columns follow `c_data_order`.
github_response = requests.get(github_url)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
github_response.raise_for_status()
github_request = github_response.content
c=pd.read_csv(io.StringIO(github_request.decode('utf-8')))
currentTime = datetime.now()

print("Columns")
print(c.columns)

print("Sets")
c["set"] = c["set"].astype(str)
sets = print_column_unique(c["set"])

print("Names")
c["name"] = c["name"].astype(str)
c["name"] = c["name"].apply(lambda x: "" if x=="nan" else x)
c["name"] = c["name"].apply(lambda x: fixCountry(x))
checkCountries(c["name"])
print_column_missing(c["name"],all_countries)
names = print_column_unique(c["name"])

print("Units")
c["unit"] = c["unit"].astype(str)
# astype(str) turns a missing unit into the text "nan"; blank it the same way
# names are blanked above (the previous comparison against "unit" never matched).
# NOTE(review): read_csv treats the literal text "NA" as missing by default, so a
# country whose code is "NA" presumably ends up blank here too - confirm.
c["unit"] = c["unit"].apply(lambda x: "" if x=="nan" else x)
units = print_column_unique(c["unit"])

conversions = {}
c["Region"] = c["name"].apply(lambda x: fixRegion(x))
c["Level"] = c.apply(lambda x: fixLevel(x["name"],""),axis=1)

# Format text date and add datetime for date
c["time"] = c["time"].astype(str)
c["time"] = c["time"].apply(lambda x: us_date(x))
c["Date"] = c["time"]
# Parse the whole column at once rather than one row at a time.
c["time"] = pd.to_datetime(c["Date"], format="%m/%d/%Y")
# "Date" is MM/DD/YYYY text, which orders lexicographically (01/01/2021 sorts
# before 12/31/2020), so min/max and sorting are done on the parsed datetimes.
# minmax_dates keeps its previous layout: a "Date" column level holding text.
minmax_dates = c.groupby(["name"]).agg({"time": [np.min,np.max]})
minmax_dates = minmax_dates.apply(lambda s: s.dt.strftime("%m/%d/%Y")).rename(columns={"time": "Date"}, level=0)
min_date = c["time"].min().strftime("%m/%d/%Y")
max_date = c["time"].max().strftime("%m/%d/%Y")
print("Min: " + min_date)
print("Max: " + max_date)
c.sort_values(by=['set','name','time'], inplace=True)

# Format numeric columns
numeric_columns = ['pop_100k',
                   'new_cases_orig','new_deaths_orig','new_tests_orig',
                   'cap_cum_cases','cap_new_cases',
                   'cap_cum_deaths','cap_new_deaths',
                   'cap_cum_tests','cap_new_tests',
                   'all_cum_cases','all_new_cases','all_cum_deaths','all_new_deaths',
                   'all_cum_tests','all_new_tests',
                   'pos'
                  ]
float_columns = ['pop_100k',
                 'cap_cum_cases','cap_new_cases','cap_cum_deaths',
                 'cap_new_deaths','cap_cum_tests','cap_new_tests'
                ]
integer_columns = ['new_cases_orig','new_deaths_orig','new_tests_orig',
                   'all_cum_cases','all_cum_deaths','all_cum_tests','all_new_tests'
                  ]
c[float_columns] = c[float_columns].apply(pd.to_numeric)
# Unparseable integer cells become NaN (errors='coerce') instead of raising.
c[integer_columns] = c[integer_columns].apply(lambda x: pd.to_numeric(x, errors='coerce', downcast='integer'))
# Keep only rows that already report at least one cumulative case.
has_data = c.all_cum_cases > 0
c = c[has_data]
c = c.where(c.notnull(), None)

# Add missing columns to match Google sheet
c["State/Province"] = ""
c["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)

c = c.rename({'time': "Time",
              'name': "Country",
              'unit': "Abbreviation",
              'cum_tests_orig': "Total Tests",
              'new_tests_orig': "Tests Daily",
              'pop_100k': "Population 100K",
              'new_cases_orig': "Cases Daily",
              'new_deaths_orig': "Deaths Daily",
              'cap_cum_cases': "Total Cases Per Capita",
              'cap_new_cases': "Cases Daily per Capita (7 day rolling average)",
              'cap_cum_deaths': "Total Deaths Per Capita",
              'cap_new_deaths': "Death Daily Per capita (7 day rolling average)",
              'cap_cum_tests': "Total Tests Per Capita (7 day rolling average)",
              'cap_new_tests': "Tests Daily Per Capita (7 day rolling average)",
              'all_cum_cases': "Total Cases",
              'all_new_cases': "Cases Daily (7 day rolling average)",
              'all_cum_deaths': "Total Deaths",
              'all_new_deaths': "Death Daily (7 day rolling average)",
              'all_cum_tests': "Total Tests (7 day rolling average)",
              'all_new_tests': "Tests Daily (7 day rolling average)",
              'pos': "Positivity Rate (7 day rolling average)"
             },axis=1)
c_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 'Population 100K',
    'Cases Daily', 'Cases Daily (7 day rolling average)', 'Cases Daily per Capita (7 day rolling average)',
    'Total Cases', 'Total Cases Per Capita', 
    'Tests Daily', 'Tests Daily (7 day rolling average)', 'Tests Daily Per Capita (7 day rolling average)',
    'Total Tests', 'Total Tests (7 day rolling average)', 'Total Tests Per Capita (7 day rolling average)',
    'Positivity Rate (7 day rolling average)',  
    'Deaths Daily', 'Death Daily (7 day rolling average)', 'Death Daily Per capita (7 day rolling average)',
    'Total Deaths','Total Deaths Per Capita',
    'Accessed'
]
c = c.reset_index()
# Only the rows of the "country" set are kept; the "set" column itself is
# dropped by the column selection below.
c = c.loc[c["set"]=="country"].copy()
c = c[c_data_order].copy()
c.head()

Columns
Index(['set', 'name', 'unit', 'time', 'cum_tests_orig', 'new_tests_orig',
       'pop_100k', 'new_cases_orig', 'new_deaths_orig', 'cap_cum_cases',
       'cap_new_cases', 'cap_cum_deaths', 'cap_new_deaths', 'cap_cum_tests',
       'cap_new_tests', 'all_cum_cases', 'all_new_cases', 'all_cum_deaths',
       'all_new_deaths', 'all_cum_tests', 'all_new_tests', 'pos'],
      dtype='object')
Sets
Column Values:
['country' 'income' 'region']
Names
Column Values:
['' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua & Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia & Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Côte d’Ivoire'
 'Demo

Column Values:
['AD' 'AE' 'AF' 'AG' 'AL' 'AM' 'AO' 'AR' 'AT' 'AU' 'AZ' 'Africa' 'Asia'
 'BA' 'BB' 'BD' 'BE' 'BF' 'BG' 'BH' 'BI' 'BJ' 'BM' 'BN' 'BO' 'BR' 'BS'
 'BT' 'BW' 'BY' 'BZ' 'CA' 'CD' 'CF' 'CG' 'CH' 'CI' 'CL' 'CM' 'CN' 'CO'
 'CR' 'CU' 'CV' 'CY' 'CZ' 'DE' 'DJ' 'DK' 'DM' 'DO' 'DZ' 'EC' 'EE' 'EG'
 'EH' 'ER' 'ES' 'ET' 'Europe' 'FI' 'FJ' 'FO' 'FR' 'GA' 'GB' 'GD' 'GE' 'GH'
 'GL' 'GM' 'GN' 'GQ' 'GR' 'GT' 'GW' 'GY' 'HK' 'HN' 'HR' 'HT' 'HU' 'High'
 'ID' 'IE' 'IL' 'IN' 'IQ' 'IR' 'IS' 'IT' 'JM' 'JO' 'JP' 'KE' 'KG' 'KH'
 'KM' 'KN' 'KR' 'KW' 'KZ' 'LA' 'LB' 'LC' 'LI' 'LK' 'LR' 'LS' 'LT' 'LU'
 'LV' 'LY' 'Low' 'Lower middle' 'MA' 'MC' 'MD' 'ME' 'MG' 'MK' 'ML' 'MM'
 'MN' 'MR' 'MT' 'MU' 'MV' 'MW' 'MX' 'MY' 'MZ' 'NC' 'NE' 'NG' 'NI' 'NL'
 'NO' 'NP' 'NZ' 'North America' 'OM' 'Oceania' 'PA' 'PE' 'PF' 'PG' 'PH'
 'PK' 'PL' 'PS' 'PT' 'PY' 'QA' 'RO' 'RS' 'RU' 'RW' 'SA' 'SC' 'SD' 'SE'
 'SG' 'SI' 'SK' 'SL' 'SM' 'SN' 'SO' 'SR' 'SS' 'ST' 'SV' 'SY' 'SZ'
 'South America' 'TD' 'TG' 'TH' 'TJ' 'TL' 'TN' 'TR' 'TT' '

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Population 100K,Cases Daily,Cases Daily (7 day rolling average),Cases Daily per Capita (7 day rolling average),Total Cases,Total Cases Per Capita,Tests Daily,Tests Daily (7 day rolling average),Tests Daily Per Capita (7 day rolling average),Total Tests,Total Tests (7 day rolling average),Total Tests Per Capita (7 day rolling average),Positivity Rate (7 day rolling average),Deaths Daily,Death Daily (7 day rolling average),Death Daily Per capita (7 day rolling average),Total Deaths,Total Deaths Per Capita,Accessed
0,Country,South Asia,Afghanistan,,AF,2020-02-24,02/24/2020,389.28,1,0,0,1,0.00256885,,,,,0,0,,0,0,0,0,0,12/2/2020
1,Country,South Asia,Afghanistan,,AF,2020-02-25,02/25/2020,389.28,0,0,0,1,0.00256885,,,,,0,0,,0,0,0,0,0,12/2/2020
2,Country,South Asia,Afghanistan,,AF,2020-02-26,02/26/2020,389.28,0,0,0,1,0.00256885,,,,,0,0,,0,0,0,0,0,12/2/2020
3,Country,South Asia,Afghanistan,,AF,2020-02-27,02/27/2020,389.28,0,0,0,1,0.00256885,,,,,0,0,,0,0,0,0,0,12/2/2020
4,Country,South Asia,Afghanistan,,AF,2020-02-28,02/28/2020,389.28,0,0,0,1,0.00256885,,,,,0,0,,0,0,0,0,0,12/2/2020


In [35]:
# Generate country raw data
# Identifier columns first, then the population and daily count columns.
sum_cols = ['Population 100K', 'Cases Daily', 'Tests Daily','Deaths Daily']
group_cols = ['Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date','Accessed']
raw_data_order = [*group_cols, *sum_cols]
# Work on a copy so later edits do not touch `c`.
raw_data = c.loc[:, raw_data_order].copy()
raw_data.head()

Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Accessed,Population 100K,Cases Daily,Tests Daily,Deaths Daily
0,Country,South Asia,Afghanistan,,AF,2020-02-24,02/24/2020,12/2/2020,389.28,1,,0
1,Country,South Asia,Afghanistan,,AF,2020-02-25,02/25/2020,12/2/2020,389.28,0,,0
2,Country,South Asia,Afghanistan,,AF,2020-02-26,02/26/2020,12/2/2020,389.28,0,,0
3,Country,South Asia,Afghanistan,,AF,2020-02-27,02/27/2020,12/2/2020,389.28,0,,0
4,Country,South Asia,Afghanistan,,AF,2020-02-28,02/28/2020,12/2/2020,389.28,0,,0


In [36]:
# Generate raw region data
checkRegions(raw_data["Region"], raw_data["Country"])
region_group_cols = ['Level', 'Region', 'Time', 'Date','Accessed']
# Sum the country rows per region and day, relabel the level as "Region",
# and keep the grouping columns followed by the summed columns.
raw_data_regions = (
    raw_data.groupby(region_group_cols)[sum_cols]
    .sum()
    .reset_index()
    .assign(Level="Region")
    .loc[:, region_group_cols + sum_cols]
    .copy()
)
raw_data_regions.head()

Unnamed: 0,Level,Region,Time,Date,Accessed,Population 100K,Cases Daily,Tests Daily,Deaths Daily
0,Region,Central Asia,2020-01-31,01/31/2020,12/2/2020,1459.34,2.0,0.0,0.0
1,Region,Central Asia,2020-02-01,02/01/2020,12/2/2020,1459.34,0.0,0.0,0.0
2,Region,Central Asia,2020-02-02,02/02/2020,12/2/2020,1459.34,0.0,0.0,0.0
3,Region,Central Asia,2020-02-03,02/03/2020,12/2/2020,1459.34,0.0,0.0,0.0
4,Region,Central Asia,2020-02-04,02/04/2020,12/2/2020,1459.34,0.0,0.0,0.0


In [37]:
# Canada raw data
# Downloads the Canadian CSV from `canada_source_csv`, renames its columns to
# the notebook's naming scheme, tags each row with level/region/country, and
# attaches the latest population figures from `canada_last_population`.
canada_source_response = requests.get(canada_source_csv)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
canada_source_response.raise_for_status()
canada_source_request = canada_source_response.content
canada_df = pd.read_csv(io.StringIO(canada_source_request.decode('utf-8')))
currentTime = datetime.now()

print("Original Canada Columns")
print(canada_df.columns)
canada_df.rename(columns = {
    'Province': 'State/Province',
    'SummaryDate': 'Time',
    'TotalCases': 'Cases Total','DailyTotals': 'Cases Daily',
    'TotalRecovered' : 'Recovered Total','DailyRecovered': 'Recovered Daily',
    'TotalDeaths': 'Deaths Total','DailyDeaths': 'Deaths Daily',
    'TotalTested': 'Tests Total','DailyTested': 'Tests Daily',
    'TotalActive': 'Active Total','DailyActive': 'Active Daily',
    'TotalHospitalized': 'Hospitalized Total','DailyHospitalized': 'Hospitalized Daily',
    'TotalICU': 'ICU Total', 'DailyICU': 'ICU Daily'
}, inplace = True)
print("Renamed Canada Columns")
print(canada_df.columns)

canada_df.drop(columns=["OBJECTID"], inplace = True)
canada_df["Accessed"] = str(currentTime.month) + '/' + str(currentTime.day) + '/' + str(currentTime.year)
canada_df["Country"] = "Canada"
canada_df["Region"] = "North America"
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: fixProvince(x))
canada_df["Date"] = canada_df["Time"].apply(lambda x: us_date(x).replace(" ",""))
# The national total arrives as a pseudo-province named "Canada": mark it as a
# country-level row and blank its province name.
canada_df["Level"] = canada_df["State/Province"].apply(lambda x: "Country" if x == "Canada" else "State/Province")
canada_df["State/Province"] = canada_df["State/Province"].apply(lambda x: "" if x=="Canada" else x)
string_columns = ["State/Province","Abbreviation","Country","Region"]
# Order by "Time" rather than "Date": "Date" is MM/DD/YYYY text and sorts
# lexicographically (01/01/2021 before 12/31/2020).
# NOTE(review): relies on "Time" being the year-first text seen in the source
# (e.g. "2020/01/25 12:00:00+00") - confirm the feed keeps that format.
canada_df = canada_df.sort_values(by=["Level","Country","State/Province","Time"])
canada_df = canada_df.reset_index(drop=True)
canada_df = canada_df.merge(canada_last_population, how='left', on=["Country","State/Province"])
canada_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 'Accessed',
    "Population", "Population 100K",
    'Cases Daily', 'Cases Total', 'Tests Daily', 'Tests Total', 
    'Deaths Total', 'Deaths Daily',
    'Recovered Total', 'Recovered Daily', 'Active Daily', 'Active Total',
    'Hospitalized Total', 'Hospitalized Daily', 'ICU Total', 'ICU Daily'
]
canada_df = canada_df[canada_order].copy()
canada_df.loc[canada_df["Level"]=="State/Province"].head()

Original Canada Columns
Index(['OBJECTID', 'Province', 'Abbreviation', 'DailyTotals', 'SummaryDate',
       'TotalCases', 'TotalRecovered', 'DailyRecovered', 'TotalDeaths',
       'DailyDeaths', 'TotalTested', 'DailyTested', 'TotalActive',
       'DailyActive', 'TotalHospitalized', 'DailyHospitalized', 'TotalICU',
       'DailyICU'],
      dtype='object')
Renamed Canada Columns
Index(['OBJECTID', 'State/Province', 'Abbreviation', 'Cases Daily', 'Time',
       'Cases Total', 'Recovered Total', 'Recovered Daily', 'Deaths Total',
       'Deaths Daily', 'Tests Total', 'Tests Daily', 'Active Total',
       'Active Daily', 'Hospitalized Total', 'Hospitalized Daily', 'ICU Total',
       'ICU Daily'],
      dtype='object')


Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Accessed,Population,Population 100K,Cases Daily,Cases Total,Tests Daily,Tests Total,Deaths Total,Deaths Daily,Recovered Total,Recovered Daily,Active Daily,Active Total,Hospitalized Total,Hospitalized Daily,ICU Total,ICU Daily
312,State/Province,North America,Canada,Alberta,AB,2020/01/25 12:00:00+00,01/25/2020,12/2/2020,4421876.0,44.21876,0,0,0,0,0,0,0,0,0.0,0,,,,
313,State/Province,North America,Canada,Alberta,AB,2020/01/26 12:00:00+00,01/26/2020,12/2/2020,4421876.0,44.21876,0,0,0,0,0,0,0,0,0.0,0,,,,
314,State/Province,North America,Canada,Alberta,AB,2020/01/27 12:00:00+00,01/27/2020,12/2/2020,4421876.0,44.21876,0,0,0,0,0,0,0,0,0.0,0,,,,
315,State/Province,North America,Canada,Alberta,AB,2020/01/28 12:00:00+00,01/28/2020,12/2/2020,4421876.0,44.21876,0,0,0,0,0,0,0,0,0.0,0,,,,
316,State/Province,North America,Canada,Alberta,AB,2020/01/29 12:00:00+00,01/29/2020,12/2/2020,4421876.0,44.21876,0,0,0,0,0,0,0,0,0.0,0,,,,


In [41]:
# US demographics without the constant "Country" column (computed once and
# reused for the merge below).
us_raw_demographics = us_state_demographics.drop(columns=["Country"])
us_demographic_columns = us_raw_demographics.columns.tolist()
print(len(us_demographic_columns))
# Keep every demographic column except the first and the last one.
# NOTE(review): presumably these are the "State/Province" merge key and a
# trailing source column - confirm against us_state_demographics.
all_us_demographics = us_demographic_columns[1:-1]
print(all_us_demographics)

# Combine US States and Demographics
states_input = pd.merge(states, us_state_codes, how="left", on="Abbreviation")
# "Census Region" from the state codes is dropped here; the column selected
# later comes from the demographics merge.
states_input = states_input.drop(columns=["Census Region"])
states_input = pd.merge(states_input, us_raw_demographics, how="left", on="State/Province")
print(states_input.columns.tolist())
states_input = states_input.sort_values(["State/Province","Time"])
states_input = states_input.reset_index(drop=True)

characteristics_order = [
    "Time", 'Date', 'Level', 'Region', 'Country', 'Abbreviation', 'State/Province', 'FIPS', 
    'Status', 'Data Quality', 'Accessed'
]
stats_order = [
    'Cases Daily', 'Total Cases',
    'Tests Daily', 'Total Tests', 'Negative Daily', 'Total Negative',
    'Deaths Daily', 'Total Deaths',
    'Hospitalized Daily', 'Currently Hospitalized', 'Total Hospitalized',
    'Currently In ICU', 'Total In ICU', 'Currently On Ventilator', 'Total On Ventilator', "Total Recovered"
]
# Final layout: identifying columns, then demographics, then the daily/total stats.
merge_order = characteristics_order + all_us_demographics + stats_order
states_input = states_input[merge_order].copy()
# Write next to the other local downloads, using the folder configured at the
# top of the notebook instead of repeating the absolute path.
states_input.to_excel(localDownloadFolder + "states_input.xlsx",index=False)
states_input.head()

Pre Rename Columns
Index(['date', 'state', 'dataQualityGrade', 'death', 'deathConfirmed',
       'deathIncrease', 'deathProbable', 'hospitalized',
       'hospitalizedCumulative', 'hospitalizedCurrently',
       'hospitalizedIncrease', 'inIcuCumulative', 'inIcuCurrently', 'negative',
       'negativeIncrease', 'negativeTestsAntibody',
       'negativeTestsPeopleAntibody', 'negativeTestsViral',
       'onVentilatorCumulative', 'onVentilatorCurrently', 'positive',
       'positiveCasesViral', 'positiveIncrease', 'positiveScore',
       'positiveTestsAntibody', 'positiveTestsAntigen',
       'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
       'positiveTestsViral', 'recovered', 'totalTestEncountersViral',
       'totalTestEncountersViralIncrease', 'totalTestResults',
       'totalTestResultsIncrease', 'totalTestsAntibody', 'totalTestsAntigen',
       'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
       'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
     

Unnamed: 0,Time,Date,Level,Region,Country,Abbreviation,State/Province,FIPS,Status,Data Quality,Accessed,Census Region,Population,Population 100K,Female,Male,Pct Male,Pct Female,< 1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85+,1-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,Pct < 1,Pct 1-4,Pct 5-14,Pct 15-24,Pct 25-34,Pct 35-44,Pct 45-54,Pct 55-64,Pct 65-74,Pct 75-84,Pct 85+,Cases Daily,Total Cases,Tests Daily,Total Tests,Negative Daily,Total Negative,Deaths Daily,Total Deaths,Hospitalized Daily,Currently Hospitalized,Total Hospitalized,Currently In ICU,Total In ICU,Currently On Ventilator,Total On Ventilator,Total Recovered
0,2020-03-07,03/07/2020,State/Province,North America,United States,AL,Alabama,1,0,,2020-12-02 21:42:41.746711,South,4889347.0,48.89347,2531653.0,2357694.0,0.48221,0.51779,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,0,0.0,0,0.0,0,,0,,0,,,,,,,
1,2020-03-08,03/08/2020,State/Province,North America,United States,AL,Alabama,1,0,,2020-12-02 21:42:41.746711,South,4889347.0,48.89347,2531653.0,2357694.0,0.48221,0.51779,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,0,0.0,0,0.0,0,,0,,0,,,,,,,
2,2020-03-09,03/09/2020,State/Province,North America,United States,AL,Alabama,1,0,,2020-12-02 21:42:41.746711,South,4889347.0,48.89347,2531653.0,2357694.0,0.48221,0.51779,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,0,0.0,0,0.0,0,,0,,0,,,,,,,
3,2020-03-10,03/10/2020,State/Province,North America,United States,AL,Alabama,1,0,,2020-12-02 21:42:41.746711,South,4889347.0,48.89347,2531653.0,2357694.0,0.48221,0.51779,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,0,0.0,0,0.0,0,0.0,0,,0,,,,,,,
4,2020-03-11,03/11/2020,State/Province,North America,United States,AL,Alabama,1,0,,2020-12-02 21:42:41.746711,South,4889347.0,48.89347,2531653.0,2357694.0,0.48221,0.51779,56901.0,58290.0,59073.0,59799.0,60294.0,59568.0,58599.0,59537.0,60023.0,60241.0,60897.0,63083.0,62906.0,61883.0,61729.0,61740.0,61799.0,61924.0,62938.0,64125.0,63587.0,64201.0,63943.0,63719.0,63922.0,65079.0,65208.0,67027.0,69478.0,68758.0,64852.0,61469.0,59980.0,59615.0,60721.0,58941.0,59921.0,60346.0,60696.0,62200.0,58159.0,57993.0,57852.0,55498.0,58174.0,57008.0,58838.0,61959.0,65460.0,64750.0,60738.0,59494.0,59786.0,61321.0,65925.0,66906.0,66695.0,67073.0,67308.0,68221.0,65605.0,65211.0,65365.0,63117.0,62042.0,59584.0,56766.0,54694.0,52697.0,51707.0,50567.0,49884.0,51612.0,37091.0,36845.0,35441.0,36173.0,30575.0,27572.0,26053.0,23977.0,22580.0,19594.0,18222.0,16660.0,91543.0,237456.0,608466.0,631898.0,642187.0,589780.0,615279.0,657543.0,501447.0,256847.0,0.011638,0.048566,0.124447,0.12924,0.131344,0.120626,0.125841,0.134485,0.102559,0.052532,0.018723,0,0.0,10,10.0,10,10.0,0,,0,,,,,,,


In [44]:
# Create Daily Input
# Stacks region, US state, Canadian and country rows into one daily frame,
# derives per-population rates, cumulative totals, 7- and 14-day rolling means,
# positivity and first/second/third differences of the 7-day case average, and
# writes the result to daily_raw_input.xlsx / .csv in `cleanedFolder`.

def _iso_week_label(date_text):
    """Return the ISO year and zero-padded ISO week (e.g. '202005') of an MM/DD/YYYY date."""
    iso_year, iso_week = datetime.strptime(date_text, '%m/%d/%Y').isocalendar()[:2]
    return "{}{:02d}".format(iso_year, iso_week)


def _grouped_rolling_mean(frame, keys, column, window):
    """Rolling mean of `column` within each `keys` group, aligned to `frame`'s index.

    A window yields a value only once it holds `window` observations. `transform`
    hands the result back on the original index, so the assignment does not
    depend on the row order of `frame`.
    """
    return frame.groupby(keys)[column].transform(
        lambda values: values.rolling(window, min_periods=window).mean()
    )


canada_raw_data = canada_df[raw_data_order].copy()
us_raw_data = states_input[raw_data_order].copy()
all_raw_data = pd.concat([raw_data_regions,us_raw_data,canada_raw_data,raw_data], sort=False)

all_raw_data["Level"] = all_raw_data["Level"].fillna("").astype(str)

all_raw_data["Region"] = all_raw_data["Region"].fillna("").astype(str)
# A region's population is the largest "Population 100K" found on its
# region-level rows. Selecting a single column avoids the tuple-style
# groupby[...] selection, which pandas deprecated and later removed.
region_populations = (
    all_raw_data.loc[all_raw_data["Level"]=="Region"]
    .groupby("Region", as_index=False)["Population 100K"]
    .max()
)
region_populations.rename(columns = {"Population 100K":"Region Population 100K"},inplace=True)
all_raw_data = all_raw_data.merge(region_populations,how="left",on="Region")
all_raw_data["Region Population Percent"] = all_raw_data["Population 100K"]/all_raw_data["Region Population 100K"]

all_raw_data["Country"] = all_raw_data["Country"].fillna("").astype(str)
all_raw_data["State/Province"] = all_raw_data["State/Province"].fillna("").astype(str)
all_raw_data["Abbreviation"] = all_raw_data["Abbreviation"].fillna("").astype(str)

all_raw_data["Date"] = all_raw_data["Date"].astype(str)
all_raw_data["Excel Date"] = all_raw_data["Date"].apply(lambda x: convertDateToExcel(x))
all_raw_data["Excel Date"] = all_raw_data["Excel Date"].astype(int)
# Pair the ISO week with the ISO year: '%Y%V' mixed the calendar year with the
# ISO week, so 01/01/2021 (ISO week 53 of 2020) was labelled "202153".
all_raw_data["Week"] = all_raw_data["Date"].apply(_iso_week_label)
all_raw_data["First Day of Week"] = all_raw_data["Date"].apply(lambda x: firstDay(x))

grouping_cols = ["Region","Country","State/Province"]
base_cols = ["Cases","Tests","Deaths"]
daily_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 
    'Time', 'Date', 'Excel Date', 'Week', 'First Day of Week', 'Accessed',
    'Population 100K', 'Region Population 100K', 'Region Population Percent'
]
# Rows must be in time order inside each group before cumulating or rolling.
all_raw_data = all_raw_data.sort_values(["Region","Country","State/Province",'Time']).reset_index(drop=True)

for col in base_cols:
    daily = col + " Daily"
    rate = col + " Daily Rate"
    total = "Total " + col
    all_raw_data[daily] = all_raw_data[daily].fillna(0).astype(int)
    all_raw_data[rate] = all_raw_data[daily]/all_raw_data["Population 100K"]
    all_raw_data[total] = all_raw_data.groupby(grouping_cols)[daily].cumsum()
    all_raw_data[daily + " 7D Rolling"] = _grouped_rolling_mean(all_raw_data, grouping_cols, daily, 7)
    all_raw_data[daily + " 2W Rolling"] = _grouped_rolling_mean(all_raw_data, grouping_cols, daily, 14)
    all_raw_data[rate + " 7D Rolling"] = _grouped_rolling_mean(all_raw_data, grouping_cols, rate, 7)
    all_raw_data[rate + " 2W Rolling"] = _grouped_rolling_mean(all_raw_data, grouping_cols, rate, 14)
    all_raw_data[total + " Rate"] = all_raw_data[total]/all_raw_data["Population 100K"]
    base_order = [
        col + ' Daily', col + ' Daily 7D Rolling', col + ' Daily 2W Rolling', "Total " + col,
        col + ' Daily Rate', col + ' Daily Rate 7D Rolling', col + ' Daily Rate 2W Rolling', "Total " + col + " Rate"
    ]
    daily_data_order = daily_data_order + base_order

# Positivity = average cases / average tests. A zero test average gives an
# infinite ratio (negative if the case average is negative); both signs are
# replaced with NaN.
all_raw_data["Positivity 7D Rolling"] = all_raw_data["Cases Daily 7D Rolling"]/all_raw_data["Tests Daily 7D Rolling"]
all_raw_data["Positivity 7D Rolling"] = all_raw_data["Positivity 7D Rolling"].replace([np.inf, -np.inf], np.nan)
all_raw_data["Positivity 2W Rolling"] = all_raw_data["Cases Daily 2W Rolling"]/all_raw_data["Tests Daily 2W Rolling"]
all_raw_data["Positivity 2W Rolling"] = all_raw_data["Positivity 2W Rolling"].replace([np.inf, -np.inf], np.nan)
# Successive within-group differences of the 7-day case average.
all_raw_data["Speed Daily"] = all_raw_data.groupby(grouping_cols)["Cases Daily 7D Rolling"].diff()
all_raw_data["Acceleration Daily"] = all_raw_data.groupby(grouping_cols)["Speed Daily"].diff()
all_raw_data["Jerk Daily"] = all_raw_data.groupby(grouping_cols)["Acceleration Daily"].diff()
daily_data_order = daily_data_order + ["Positivity 7D Rolling", "Positivity 2W Rolling", "Speed Daily", "Acceleration Daily", "Jerk Daily"]

print(daily_data_order)
all_raw_data = all_raw_data[daily_data_order].copy()

# Extra text renderings of the date, added after the column selection above.
all_raw_data["MM-DD-YYYY"] = all_raw_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%m-%d-%Y'))
all_raw_data["MM-DD-YYYY"] = all_raw_data["MM-DD-YYYY"].astype(str)
all_raw_data["DD-MM-YYYY"] = all_raw_data["Date"].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%d-%m-%Y'))
all_raw_data["DD-MM-YYYY"] = all_raw_data["DD-MM-YYYY"].astype(str)

all_raw_data.to_excel(cleanedFolder + "daily_raw_input.xlsx", index=False)
all_raw_data.to_csv(cleanedFolder + "daily_raw_input.csv", index=False)
all_raw_data.head()

['Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time', 'Date', 'Excel Date', 'Week', 'First Day of Week', 'Accessed', 'Population 100K', 'Region Population 100K', 'Region Population Percent', 'Cases Daily', 'Cases Daily 7D Rolling', 'Cases Daily 2W Rolling', 'Total Cases', 'Cases Daily Rate', 'Cases Daily Rate 7D Rolling', 'Cases Daily Rate 2W Rolling', 'Total Cases Rate', 'Tests Daily', 'Tests Daily 7D Rolling', 'Tests Daily 2W Rolling', 'Total Tests', 'Tests Daily Rate', 'Tests Daily Rate 7D Rolling', 'Tests Daily Rate 2W Rolling', 'Total Tests Rate', 'Deaths Daily', 'Deaths Daily 7D Rolling', 'Deaths Daily 2W Rolling', 'Total Deaths', 'Deaths Daily Rate', 'Deaths Daily Rate 7D Rolling', 'Deaths Daily Rate 2W Rolling', 'Total Deaths Rate', 'Positivity 7D Rolling', 'Positivity 2W Rolling', 'Speed Daily', 'Acceleration Daily', 'Jerk Daily']


Unnamed: 0,Level,Region,Country,State/Province,Abbreviation,Time,Date,Excel Date,Week,First Day of Week,Accessed,Population 100K,Region Population 100K,Region Population Percent,Cases Daily,Cases Daily 7D Rolling,Cases Daily 2W Rolling,Total Cases,Cases Daily Rate,Cases Daily Rate 7D Rolling,Cases Daily Rate 2W Rolling,Total Cases Rate,Tests Daily,Tests Daily 7D Rolling,Tests Daily 2W Rolling,Total Tests,Tests Daily Rate,Tests Daily Rate 7D Rolling,Tests Daily Rate 2W Rolling,Total Tests Rate,Deaths Daily,Deaths Daily 7D Rolling,Deaths Daily 2W Rolling,Total Deaths,Deaths Daily Rate,Deaths Daily Rate 7D Rolling,Deaths Daily Rate 2W Rolling,Total Deaths Rate,Positivity 7D Rolling,Positivity 2W Rolling,Speed Daily,Acceleration Daily,Jerk Daily,MM-DD-YYYY,DD-MM-YYYY
0,Region,Central Asia,,,,2020-01-31 00:00:00,01/31/2020,43861,202005,01/27/2020,12/2/2020,1459.34,3207.62,0.45496,2,,,2,0.00137048,,,0.00137048,0,,,0,0,,,0,0,,,0,0,,,0,,,,,,01-31-2020,31-01-2020
1,Region,Central Asia,,,,2020-02-01 00:00:00,02/01/2020,43862,202005,01/27/2020,12/2/2020,1459.34,3207.62,0.45496,0,,,2,0.0,,,0.00137048,0,,,0,0,,,0,0,,,0,0,,,0,,,,,,02-01-2020,01-02-2020
2,Region,Central Asia,,,,2020-02-02 00:00:00,02/02/2020,43863,202005,01/27/2020,12/2/2020,1459.34,3207.62,0.45496,0,,,2,0.0,,,0.00137048,0,,,0,0,,,0,0,,,0,0,,,0,,,,,,02-02-2020,02-02-2020
3,Region,Central Asia,,,,2020-02-03 00:00:00,02/03/2020,43864,202006,02/03/2020,12/2/2020,1459.34,3207.62,0.45496,0,,,2,0.0,,,0.00137048,0,,,0,0,,,0,0,,,0,0,,,0,,,,,,02-03-2020,03-02-2020
4,Region,Central Asia,,,,2020-02-04 00:00:00,02/04/2020,43865,202006,02/03/2020,12/2/2020,1459.34,3207.62,0.45496,0,,,2,0.0,,,0.00137048,0,,,0,0,,,0,0,,,0,0,,,0,,,,,,02-04-2020,04-02-2020


In [23]:
# Count the dated rows behind every distinct place/population combination.
population_keys = [
    "Region", "Country", "State/Province",
    "Population 100K", "Region Population 100K", "Region Population Percent",
]
populations = all_raw_data.groupby(population_keys)["Date"].count().reset_index()

populations.head()


Unnamed: 0,Region,Country,State/Province,Population 100K,Region Population 100K,Region Population Percent,Date
0,Central Asia,,,1459.34,3207.62,0.45496,26
1,Central Asia,,,1520.06,3207.62,0.47389,4
2,Central Asia,,,1651.08,3207.62,0.514737,8
3,Central Asia,,,1663.15,3207.62,0.5185,2
4,Central Asia,,,2506.54,3207.62,0.781433,2


In [None]:
# Create Weekly Input Data
#
# Stack the four daily frames (raw_data_regions, states_input, canada_df and
# raw_data) into one table, normalise the key columns and tag every row with
# the week it belongs to so it can be aggregated afterwards.


def _iso_year_week(date_text):
    """Return the ISO-8601 week of an 'mm/dd/YYYY' string as a YYYYWW integer.

    The ISO year is used together with the ISO week number. Pairing the
    calendar year with the ISO week (strftime '%Y%V') mislabels dates around
    New Year: 12/31/2019 belongs to ISO week 1 of 2020 and must become 202001,
    not 201901.
    """
    iso_year, iso_week, _ = datetime.strptime(date_text, '%m/%d/%Y').isocalendar()
    return iso_year * 100 + iso_week


canada_raw_data = canada_df[raw_data_order].copy()
us_raw_data = states_input[raw_data_order].copy()
to_concat = [raw_data_regions, us_raw_data, canada_raw_data, raw_data]
weekly_raw_data = pd.concat(
    [s.reset_index(drop=True) for s in to_concat], sort=False
).reset_index(drop=True)

# Key columns must be plain strings with no NaN.
for key_col in ["Level", "Region", "Country", "State/Province", "Abbreviation"]:
    weekly_raw_data[key_col] = weekly_raw_data[key_col].fillna("").astype(str)

# "Date" is parsed as mm/dd/YYYY text below.
weekly_raw_data["Date"] = weekly_raw_data["Date"].astype(str)
weekly_raw_data["Week"] = weekly_raw_data["Date"].apply(_iso_year_week).astype(int)
# firstDay is defined in an earlier cell.
# NOTE(review): it should return the Monday of the ISO week so that it agrees
# with "Week" - confirm against its definition.
weekly_raw_data["First Day of Week"] = weekly_raw_data["Date"].apply(firstDay)

# Missing daily counts are treated as zero so they do not poison the weekly sums.
for daily_col in ["Cases Daily", "Tests Daily", "Deaths Daily"]:
    weekly_raw_data[daily_col] = weekly_raw_data[daily_col].fillna(0)

weekly_raw_data = weekly_raw_data.sort_values(
    ['Level', 'Region', 'Country', 'State/Province', 'Abbreviation', 'Time']
).reset_index(drop=True)

# Collapse the daily rows into one row per location and week.
# NOTE: groupby drops rows whose key is NaN by default, so rows without
# 'Accessed' or 'Population 100K' do not reach the weekly table.
grouped_data_order = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation',
    'Week', 'First Day of Week', 'Accessed','Population 100K'
]
# Only the daily counts are summed; the remaining columns (dates, running
# totals, rates) are not meaningful as weekly sums and are not used below.
daily_count_cols = ['Cases Daily', 'Tests Daily', 'Deaths Daily']
weekly_grouped_data = (
    weekly_raw_data
    .groupby(grouped_data_order, as_index=False)[daily_count_cols]
    .sum()
    .reset_index(drop=True)
)
weekly_grouped_data = weekly_grouped_data.rename(columns={
    'Cases Daily': 'Cases Weekly',
    'Tests Daily': 'Tests Weekly',
    'Deaths Daily': 'Deaths Weekly'
})

# Normalise the location keys of the weekly frame itself. The daily frame has
# a different length and row order, so it must not be used as the source here.
for key_col in ["Region", "Country", "State/Province"]:
    weekly_grouped_data[key_col] = weekly_grouped_data[key_col].fillna("").astype(str)

weekly_grouped_data = weekly_grouped_data.sort_values(
    ['Region', 'Country', 'State/Province', 'Week']
).reset_index(drop=True)

# Derive per-location weekly rates, running totals and two-week rolling sums
# for each measure. grouping_cols is defined in an earlier cell.
base_cols = ["Cases","Tests","Deaths"]
for col in base_cols:
    weekly_col = col + " Weekly"
    total_col = "Weekly Total " + col
    rolling_col = col + " 2W Rolling"

    weekly_grouped_data[weekly_col] = weekly_grouped_data[weekly_col].astype(int)
    weekly_grouped_data[weekly_col + " Rate"] = weekly_grouped_data[weekly_col]/weekly_grouped_data["Population 100K"]

    per_location = weekly_grouped_data.groupby(grouping_cols)[weekly_col]

    # cumsum keeps the frame's index, so the result lines up row for row.
    weekly_grouped_data[total_col] = per_location.cumsum()
    weekly_grouped_data[total_col + " Rate"] = weekly_grouped_data[total_col]/weekly_grouped_data["Population 100K"]

    # transform also keeps the original index. Resetting the index of a
    # groupby-rolling result would reassign values by position in group
    # order, which is only right if the frame is sorted exactly that way.
    two_week_sum = per_location.transform(
        lambda weekly: weekly.rolling(2, min_periods=2).sum()
    )
    # The first week of each location has no predecessor; report it as 0.
    weekly_grouped_data[rolling_col] = two_week_sum.fillna(0).astype(int)
    weekly_grouped_data[rolling_col + " Rate"] = weekly_grouped_data[rolling_col]/weekly_grouped_data["Population 100K"]

# Test positivity = cases / tests. A week with zero tests yields +inf, or -inf
# when a negative case correction is divided by zero; both are meaningless and
# are stored as NaN. 0/0 is already NaN.
weekly_grouped_data["Positivity Weekly"] = (
    weekly_grouped_data["Cases Weekly"]/weekly_grouped_data["Tests Weekly"]
).replace([np.inf, -np.inf], np.nan)
weekly_grouped_data["Positivity 2W Rolling"] = (
    weekly_grouped_data["Cases 2W Rolling"]/weekly_grouped_data["Tests 2W Rolling"]
).replace([np.inf, -np.inf], np.nan)

# List the columns currently present before fixing the export layout.
print(weekly_grouped_data.columns)

# Export layout: identifier columns, then six metrics per measure
# (Cases, Tests, Deaths), then the two positivity ratios.
identifier_columns = [
    'Level', 'Region', 'Country', 'State/Province', 'Abbreviation',
    'Week', 'First Day of Week', 'Accessed', 'Population 100K',
]
measure_columns = [
    column
    for measure in ("Cases", "Tests", "Deaths")
    for column in (
        measure + " Weekly",
        measure + " Weekly Rate",
        "Weekly Total " + measure,
        "Weekly Total " + measure + " Rate",
        measure + " 2W Rolling",
        measure + " 2W Rolling Rate",
    )
]
positivity_columns = ['Positivity Weekly', 'Positivity 2W Rolling']
weekly_order = identifier_columns + measure_columns + positivity_columns

weekly_grouped_data = weekly_grouped_data[weekly_order]

# Write the same table as both an Excel workbook and a CSV file.
weekly_output_stem = cleanedFolder + "weekly_raw_input"
weekly_grouped_data.to_excel(weekly_output_stem + ".xlsx", index=False)
weekly_grouped_data.to_csv(weekly_output_stem + ".csv", index=False)
weekly_grouped_data.head(21)