In [None]:
# Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
import time
from datetime import datetime, timedelta
import json
import pprint
# Notebook-wide display helpers.
pp = pprint.PrettyPrinter(indent=4)
pd.set_option("display.max_columns", None)

# Folder layout: R downloads, project data, and the cleaned sub-folder.
rFolder = "C:/Users/janin/Downloads/"
dataFolder = "D:/Repositories/Global-COVID-Surveillance/data/"
cleanedFolder = f"{dataFolder}cleaned/"


# Today's date (printed for reference) and the fixed date stamp of the
# R result files that the later cells read.
now = datetime.now()
current_date = f"{now:%Y-%m-%d}"
print(current_date)
file_date = "2021-01-25"

# File-name suffixes; the spaces around the date are part of the file names.
file_end = f" -Results- {current_date} .csv"
output_file_end = f" -Results- {file_date} .csv"
statistics_file_end = f" - GMM - {file_date} .txt"
print(file_end)

def print_column_unique(column):
    """Print and return the distinct values of ``column`` in a stable order.

    Values are de-duplicated first and then sorted, with non-string values
    placed before strings so that mixed int/str labels can be ordered
    without a ``TypeError``.

    Parameters
    ----------
    column : iterable
        Any iterable of hashable values (e.g. a pandas Series or Index).

    Returns
    -------
    list
        The unique values, sorted.
    """
    print("Column Values:")
    # De-duplicate BEFORE sorting: sorting first and then passing the result
    # through set() discards the ordering again.
    values = sorted(set(column), key=lambda v: (isinstance(v, str), v))
    print(values)
    return values

def print_column_missing(column, comparison):
    """Report the differences between a column's values and a reference list.

    Prints the unique column values, the comparison list, and then two
    reports: column values that do not appear in ``comparison`` and
    comparison values that do not appear in the column.

    Both directions compare the ``str()`` form of each value, so an integer
    label such as ``1`` and the text label ``"1"`` count as the same entry
    in either report.

    Parameters
    ----------
    column : iterable
        Values to inspect (e.g. ``DataFrame.columns``).
    comparison : iterable
        Reference values expected to be present.

    Returns
    -------
    list
        The unique values of ``column`` as returned by ``print_column_unique``.
    """
    values = print_column_unique(column)
    print("Comparison:")
    print(comparison)

    # String keys for O(1) membership tests in both directions.
    comparison_keys = {str(c) for c in comparison}
    value_keys = {str(v) for v in values}

    missing_values = [v for v in values if str(v) not in comparison_keys]
    if len(missing_values) > 0:
        print("Column values not in comparison:")
        print(missing_values)
    else:
        print("No missing values")

    missing_values = [c for c in comparison if str(c) not in value_keys]
    if len(missing_values) > 0:
        print("Comparison values not in column:")
        print(missing_values)
    else:
        print("No missing values")
    return values

# Load the population reference table and turn its grouping columns into
# text, replacing the "nan" text produced by astype(str) with "".
populations = pd.read_excel(f"{cleanedFolder}all_populations.xlsx")
population_groups = ["Level", "Region", "Census Region", "Country", "State/Province"]
for group in population_groups:
    group_text = populations[group].astype(str)
    populations[group] = group_text.where(group_text != "nan", "")
populations.head()

In [None]:
# Read starter
# The starter workbook is the frame that the R results are appended to.
pgmm = pd.read_excel(f"{dataFolder}Starter.xlsx")
print(pgmm.columns)
pgmm.head()

# Read R results
# One R results file is read per name below; the name is also passed to
# readOutput as its first argument.
pgmm_files = [
    "Central Asia",
    "East Asia and Pacific",
    "Europe",
    "Latin America",
    "Middle East and North Africa",
    "United States",
    "Canada",
    "South Asia",
    "Sub-Saharan Africa",
]

def readOutput(r, pgmm_read_file):
    """Read one R results CSV and label it with the notebook's column names.

    Parameters
    ----------
    r : str
        Name of the results file's grouping. "Canada" and "United States"
        are treated as countries inside the "North America" region, with
        column V2 holding state/province names; any other value is used as
        the region name, with V2 holding country names.
    pgmm_read_file : str or file-like
        Path (or open text buffer) passed to ``pd.read_csv``. Expected to
        contain columns V1..V15; an "Unnamed: 0" row-name column is dropped
        when present.

    Returns
    -------
    pandas.DataFrame
        Rows whose V1 is non-zero, with "Region", "Country",
        "State/Province" and "Level" columns added and V1, V3..V15 renamed.
        A V2 value of "Region" marks the aggregate row: its name is blanked
        and its "Level" is set one step up ("Country" for Canada / United
        States, otherwise "Region").
    """
    df = pd.read_csv(pgmm_read_file)
    # The row-name column is only present when the CSV was written with row
    # names; tolerate its absence instead of raising KeyError.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    # Rows with V1 == 0 are discarded (presumably unfilled placeholder rows;
    # TODO confirm). .copy() makes the filtered result an independent frame
    # so the column assignments below do not trigger SettingWithCopyWarning.
    df = df[df["V1"] != 0].copy()

    if r in ("Canada", "United States"):
        df["Region"] = "North America"
        df = df.rename(columns={"V2": "State/Province"})
        df["Country"] = r
        is_total = df["State/Province"] == "Region"
        df["Level"] = is_total.map({True: "Country", False: "State/Province"})
        df["State/Province"] = df["State/Province"].where(~is_total, "")
    else:
        df["Region"] = r
        df = df.rename(columns={"V2": "Country"})
        is_total = df["Country"] == "Region"
        df["Level"] = is_total.map({True: "Region", False: "Country"})
        df["Country"] = df["Country"].where(~is_total, "")
        df["State/Province"] = ""

    return df.rename(columns={
        "V1": 'Last Day of Week Excel Date',
        "V3": 'Cases Daily Last Day of Week',
        "V4": 'Total Cases Last Day of Week',
        "V5": 'Cases 7D Moving Average',
        "V6": 'Cases Last Day of Week Rate 100K',
        "V7": 'Deaths Daily Last Day of Week',
        "V8": 'Total Deaths Last Day of Week',
        "V9": 'Deaths 7D Moving Average',
        "V10": 'Deaths Last Day of Week Rate 100K',
        "V11": 'Speed',  # Cases Last Day of Week Rate 100K 7D Moving Average
        "V12": 'Acceleration',  # Average Daily Change in Speed
        "V13": 'Jerk',  # Average Daily Change in Acceleration
        "V14": '1-Day Persistence',
        "V15": '7-Day Persistence',
    })
    

# Read every R results file first and append them to the starter frame with a
# single concat; concatenating inside the loop re-copies the growing frame on
# every iteration.
region_frames = []
for r in pgmm_files:
    file_stem = cleanedFolder + r
    pgmm_read_file = file_stem + output_file_end
    statistics_read_file = file_stem + statistics_file_end  # not read in this cell
    output_df = readOutput(r, pgmm_read_file)
    region_frames.append(output_df)
pgmm = pd.concat([pgmm] + region_frames, ignore_index=True, sort=False)

# Create Time Variables
pgmm["Last Day of Week Excel Date"] = pgmm["Last Day of Week Excel Date"].astype(int)
# Excel serial day N corresponds to 1899-12-30 + N days (the same mapping as
# ordinal(1900-01-01) + N - 2), converted here in one vectorised call.
pgmm["Time"] = pd.to_datetime(
    pgmm["Last Day of Week Excel Date"], unit="D", origin=pd.Timestamp("1899-12-30")
)
pgmm["Date"] = pgmm["Time"].dt.strftime('%m/%d/%Y')
# "<ISO year> W<ISO week>", week number not zero-padded; taken from Time
# directly instead of re-parsing the formatted Date string twice per row.
pgmm["Week"] = pgmm["Time"].apply(lambda t: "{} W{}".format(*t.isocalendar()[:2]))
pgmm["Output"] = True
pgmm_order = [
    'Region', 'Country', 'State/Province',
    'Last Day of Week Excel Date', 'Date', 'Week',
    'Speed', 'Acceleration', 'Jerk', '7-Day Persistence', '1-Day Persistence',
    'Cases Daily Last Day of Week', 'Total Cases Last Day of Week',
    'Cases 7D Moving Average', 'Cases Last Day of Week Rate 100K',
    'Deaths Daily Last Day of Week', 'Total Deaths Last Day of Week',
    'Deaths 7D Moving Average', 'Deaths Last Day of Week Rate 100K', 'Output'
]
# Keep only the listed columns, in this order ("Time" is not among them).
pgmm = pgmm[pgmm_order]
pgmm.head(8)

In [None]:
# Latest week reported for each country. The Week label is text with an
# unpadded week number, so max() on it is lexicographic ("2020 W9" sorts after
# "2020 W53"). Pick the row with the largest Excel serial date instead and
# read its Week label.
latest_rows = pgmm.groupby("Country")["Last Day of Week Excel Date"].idxmax()
date_check = pgmm.loc[latest_rows].set_index("Country")["Week"]
date_check.head()

# Country-level rows for the United States (blank State/Province).
us_check = pgmm.loc[(pgmm["Country"]=="United States") & (pgmm["State/Province"]=="")]
us_check.head()

In [None]:
# Load the raw input table and drop the two alternate date-format columns.
input_df = pd.read_excel(f"{cleanedFolder}all_raw_input.xlsx")
input_df = input_df.drop(columns=["MM-DD-YYYY", "DD-MM-YYYY"])

# Turn these label columns into text, replacing the "nan" text produced by
# astype(str) with "".
for text_column in ["State/Province", "Census Region", "Status", "Data Quality"]:
    column_text = input_df[text_column].astype(str)
    input_df[text_column] = column_text.where(column_text != "nan", "")
input_df.head()

# Country-level rows for the United States (blank State/Province).
is_us_national_input = (input_df["Country"] == "United States") & (input_df["State/Province"] == "")
us_check = input_df.loc[is_us_national_input]
us_check.head()

In [None]:
# Attach the R output columns to the raw input rows; input rows without a
# matching output row keep NaN in the output columns.
merge_keys = ['Region', 'Country', 'State/Province', 'Date', 'Week']
all_df = input_df.merge(pgmm, how="left", on=merge_keys)

# Output is True only where an output row matched; everything else becomes False.
all_df["Output"] = all_df["Output"].eq(True)

# United States rows with a blank State/Province are labelled as Country level.
is_us_national = (all_df["Country"] == "United States") & (all_df["State/Province"] == "")
all_df["Level"] = all_df["Level"].mask(is_us_national, "Country")

# Final column order for the combined workbook.
all_order = [
    # identifiers
    'Level', 'Region', 'Country', 'Census Region', 'State/Province', 'Abbreviation', 'FIPS',
    # time
    'Time', 'Date', 'Week', 'First Day of Week', 'Last Day of Week', 'Week Date Range',
    'Last Day of Week Excel Date', 'Accessed',
    # status
    'Output', 'Status', 'Data Quality',
    # dynamics
    'Speed', 'Speed Daily', 'Acceleration', 'Acceleration Daily', 'Jerk', 'Jerk Daily',
    '7-Day Persistence', '1-Day Persistence',
    # cases
    'Cases Daily', 'Cases Daily 7D Rolling', 'Cases 7D Moving Average', 'Cases Daily Last Day of Week',
    'Cases Daily Rate', 'Cases Daily Rate 7D Rolling', 'Cases Last Day of Week Rate 100K',
    'Total Cases', 'Total Cases Rate', 'Total Cases Last Day of Week',
    # deaths
    'Deaths Daily', 'Deaths Daily 7D Rolling', 'Deaths 7D Moving Average', 'Deaths Daily Last Day of Week',
    'Deaths Daily Rate', 'Deaths Daily Rate 7D Rolling', 'Deaths Last Day of Week Rate 100K',
    'Total Deaths', 'Total Deaths Rate', 'Total Deaths Last Day of Week',
    # tests
    'Tests Daily', 'Tests Daily 7D Rolling',
    'Tests Daily Rate', 'Tests Daily Rate 7D Rolling',
    'Total Tests', 'Total Tests Rate',
    'Positivity 7D Rolling',
    # other counts
    'Active Daily', 'Total Active',
    'Negative Daily', 'Total Negative',
    'Recovered Daily', 'Total Recovered',
    'Hospitalized Daily', 'Currently Hospitalized', 'Total Hospitalized',
    'ICU Daily', 'Total ICU',
    'Currently In ICU', 'Total In ICU',
    'Currently On Ventilator', 'Total On Ventilator',
    # population
    "Population", "Population 100K",
    "Country Population", "Country Population 100K", "Country Share",
    "Region Population", "Region Population 100K", "Region Share",
    "World Population", "World Population 100K", "World Share",
    'World Share (%)', 'Urban Population (%)', 'Annual Change (%)', 'Net Change', 'Migrants (net)',
    'Density (P/Km²)', 'Land Area (Km²)', 'Fertility Rate', 'Median Age',
    # single-year age columns: '< 1', the integers 1 through 84, then '85+'
    '< 1', *range(1, 85), '85+',
    # age bands
    '1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84',
    'Pct < 1', 'Pct 1-4', 'Pct 5-14', 'Pct 15-24', 'Pct 25-34', 'Pct 35-44',
    'Pct 45-54', 'Pct 55-64', 'Pct 65-74', 'Pct 75-84', 'Pct 85+',
]
# Print any mismatch between the merged columns and the expected order
# before selecting; the selection raises KeyError if a listed column is absent.
print_column_missing(all_df.columns, all_order)
all_df = all_df[all_order]

all_out_file = f"{cleanedFolder}all_combined.xlsx"
print(all_out_file)
all_df.to_excel(all_out_file, index=False)
all_df.head(14)

# Country-level United States rows that have R output attached.
us_check = all_df.loc[
    (all_df["Country"] == "United States")
    & (all_df["State/Province"] == "")
    & (all_df["Output"] == True)
]
us_check.head()

In [7]:
# Latest Time and Week per country. Multiple columns must be selected from a
# GroupBy with a list; the tuple form ["Time","Week"] was deprecated in
# pandas 1.0 and is an error in pandas 2.x.
# NOTE(review): Week is text, so its max() is lexicographic ("2020 W9" sorts
# after "2020 W53"); treat the Time column as the authoritative latest date.
date_check = all_df.groupby("Country")[["Time", "Week"]].max()
date_check.head()

Unnamed: 0_level_0,Time,Week
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,2021-01-24,2021 W3
Albania,2021-01-24,2021 W3
Algeria,2021-01-24,2021 W3
Andorra,2021-01-24,2021 W3
Angola,2021-01-24,2021 W3
