In [None]:
import numpy as np
import pandas as pd
from multiprocessing import Pool

In [None]:
### Create a parallelizing function
def parallel1(data, func, n_cores = 25, n_splits = 25):
    """Apply ``func`` to state-wise chunks of ``data`` and stack the results.

    The distinct values of ``data["State"]`` are divided into ``n_splits``
    groups with ``np.array_split``; each group selects the matching rows of
    ``data`` as one chunk. ``func`` is called once per chunk and the returned
    frames are concatenated in chunk order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "State" column.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. When a process
        pool is used it has to be picklable (e.g. a module-level function).
    n_cores : int, default 25
        Number of worker processes. With ``n_cores == 1`` the chunks are
        processed in the current process and no pool is started.
    n_splits : int, default 25
        Number of chunks the states are divided into. If there are fewer
        states than chunks, the surplus chunks are empty frames.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results.
    """
    ### Split the distinct states into n_splits groups
    states = np.asarray(data["State"].unique())
    splits = np.array_split(states, n_splits)

    ### One dataframe per group of states (boolean indexing returns a copy)
    data_split = [data[data["State"].isin(list(split))] for split in splits]

    ### Run
    if n_cores == 1:
        ### Nothing to parallelize: skip the process start-up cost
        results = [func(chunk) for chunk in data_split]
    else:
        pool = Pool(n_cores)
        try:
            results = pool.map(func, data_split)
        finally:
            ### Always release the workers, even when func raised
            pool.close()
            pool.join()

    return pd.concat(results)

In [None]:
### Define function to create new cases data
def newCases1(data):
    """Add a "New Cases" column: the row-to-row change in "Total Cases" per county.

    Rows are grouped by ("State", "County Name") and differenced in their
    existing row order, so each county's rows are expected to already be in
    chronological order. The first row of every county gets 0, which is what
    ``np.diff(values, prepend = values.iloc[0])`` produces.

    The change is computed per row and stays attached to that row, so the
    result is correct even when a county's rows are not contiguous in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "State", "County Name" and "Total Cases".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with "New Cases" added in place.
    """
    keys = ["State", "County Name"]

    ### Calculate diff in cases for each day within each county
    ### (observed = True: only key combinations present in the data form groups)
    changeInCases = data.groupby(keys, sort = False, observed = True)["Total Cases"].diff()

    ### Keep first day: the first row of every county has no previous day, so use 0
    firstDay = ~data.duplicated(subset = keys).to_numpy()
    changeInCases = changeInCases.mask(firstDay, 0)

    ### Add to data (diff upcasts integers to float, so restore the totals' dtype)
    data["New Cases"] = changeInCases.astype(data["Total Cases"].dtype).to_numpy()

    return data

In [None]:
### Define function to create new deaths data
def newDeaths1(data):
    """Add a "New Deaths" column: the row-to-row change in "Total Deaths" per county.

    Rows are grouped by ("State", "County Name") and differenced in their
    existing row order, so each county's rows are expected to already be in
    chronological order. The first row of every county gets 0, which is what
    ``np.diff(values, prepend = values.iloc[0])`` produces.

    The change is computed per row and stays attached to that row, so the
    result is correct even when a county's rows are not contiguous in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "State", "County Name" and "Total Deaths".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with "New Deaths" added in place.
    """
    keys = ["State", "County Name"]

    ### Calculate diff in deaths for each day within each county
    ### (observed = True: only key combinations present in the data form groups)
    changeInDeaths = data.groupby(keys, sort = False, observed = True)["Total Deaths"].diff()

    ### Keep first day: the first row of every county has no previous day, so use 0
    firstDay = ~data.duplicated(subset = keys).to_numpy()
    changeInDeaths = changeInDeaths.mask(firstDay, 0)

    ### Add to data (diff upcasts integers to float, so restore the totals' dtype)
    data["New Deaths"] = changeInDeaths.astype(data["Total Deaths"].dtype).to_numpy()

    return data

In [None]:
### Create a parallelizing function
def parallel2(data, func, n_cores = 25, n_splits = 25):
    """Run ``func`` over state-wise chunks of ``data`` and concatenate the output.

    The unique values of ``data["State"]`` are partitioned into ``n_splits``
    groups via ``np.array_split``. The rows belonging to each group form one
    chunk, ``func`` is applied to every chunk, and the resulting frames are
    concatenated in chunk order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "State" column.
    func : callable
        Receives a chunk (DataFrame) and returns a DataFrame. It must be
        picklable whenever worker processes are used.
    n_cores : int, default 25
        Size of the process pool. ``n_cores == 1`` runs everything in the
        calling process without creating a pool.
    n_splits : int, default 25
        How many chunks to create. Chunks beyond the number of distinct
        states are empty frames.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results stacked with ``pd.concat``.
    """
    ### Partition the distinct states into n_splits groups
    stateGroups = np.array_split(np.asarray(data["State"].unique()), n_splits)

    ### Build the list of per-group dataframes
    data_split = []
    for group in stateGroups:
        data_split.append(data[data["State"].isin(list(group))])

    if n_cores == 1:
        ### Serial path: no worker processes needed
        results = list(map(func, data_split))
    else:
        pool = Pool(n_cores)
        try:
            results = pool.map(func, data_split)
        finally:
            ### Shut the pool down whether or not func succeeded
            pool.close()
            pool.join()

    data1 = pd.concat(results)
    return data1

In [None]:
### Define function to create new cases data
def newCases2(data):
    """Add a "New Cases" column: the row-to-row change in "Total Cases" per state.

    Rows are grouped by "State" and differenced in their existing row order,
    so each state's rows are expected to already be in chronological order.
    The first row of every state gets 0, which is what
    ``np.diff(values, prepend = values.iloc[0])`` produces.

    Each change stays attached to the row it was computed for, so the result
    is correct even when a state's rows are not contiguous in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "State" and "Total Cases".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with "New Cases" added in place.
    """
    ### Calculate diff in cases for each day within each state
    changeInCases = data.groupby("State", sort = False, observed = True)["Total Cases"].diff()

    ### Keep first day: a state's first row has no previous day, so use 0
    firstDay = ~data.duplicated(subset = "State").to_numpy()
    changeInCases = changeInCases.mask(firstDay, 0)

    ### Add to data (diff upcasts integers to float, so restore the totals' dtype)
    data["New Cases"] = changeInCases.astype(data["Total Cases"].dtype).to_numpy()

    return data

In [None]:
### Define function to create new deaths data
def newDeaths2(data):
    """Add a "New Deaths" column: the row-to-row change in "Total Deaths" per state.

    Rows are grouped by "State" and differenced in their existing row order,
    so each state's rows are expected to already be in chronological order.
    The first row of every state gets 0, which is what
    ``np.diff(values, prepend = values.iloc[0])`` produces.

    Each change stays attached to the row it was computed for, so the result
    is correct even when a state's rows are not contiguous in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "State" and "Total Deaths".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with "New Deaths" added in place.
    """
    ### Calculate diff in deaths for each day within each state
    changeInDeaths = data.groupby("State", sort = False, observed = True)["Total Deaths"].diff()

    ### Keep first day: a state's first row has no previous day, so use 0
    firstDay = ~data.duplicated(subset = "State").to_numpy()
    changeInDeaths = changeInDeaths.mask(firstDay, 0)

    ### Add to data (diff upcasts integers to float, so restore the totals' dtype)
    data["New Deaths"] = changeInDeaths.astype(data["Total Deaths"].dtype).to_numpy()

    return data

In [None]:
def run_all():

    ### Number of confirmed cases by county
    !curl https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv --output data/cases.csv

    ### Number of confirmed deaths by county
    !curl https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv --output data/deaths.csv

    ### Total Cases
    cases = pd.read_csv("data/cases.csv")

    odd = "Unnamed: " + str(len(cases.columns) - 1)

    if (cases.columns[-1] == odd):
        cases = cases.drop(columns = cases.columns[-1])

    ### Total Deaths
    deaths = pd.read_csv("data/deaths.csv")

    if (cases.columns[-1] == odd):
        deaths = deaths.drop(columns = deaths.columns[-1])


    ### Total Population
    population = pd.read_csv("data/population.csv")

    #### County Data

    ### Remove Wade Hampton Area
    cases = cases.drop(list(cases[cases["County Name"] == "Wade Hampton Census Area"].index))

    ### New York City Unallocated/Probable
    cases = cases.drop(list(cases[cases["County Name"] == "New York City Unallocated/Probable"].index))

    ### Remove Grand Princess Cruise Ship
    cases = cases.drop(list(cases[cases["County Name"] == "Grand Princess Cruise Ship"].index))
    
    ### Remove Aleutians West Census Area
    cases = cases.drop(list(cases[cases["County Name"] == "Aleutians West Census Area"].index))


    #### Deaths Data
    ### Remove Wade Hampton Area
    deaths = deaths.drop(list(deaths[deaths["County Name"] == "Wade Hampton Census Area"].index))

    ### New York City Unallocated/Probable
    deaths = deaths.drop(list(deaths[deaths["County Name"] == "New York City Unallocated/Probable"].index))

    ### Remove Grand Princess Cruise Ship
    deaths = deaths.drop(list(deaths[deaths["County Name"] == "Grand Princess Cruise Ship"].index))
    
    ### Remove Aleutians West Census Area
    deaths = deaths.drop(list(deaths[deaths["County Name"] == "Aleutians West Census Area"].index))

    cases = cases.rename(columns = {"State" : "StateABV"})

    deaths = deaths.rename(columns = {"State" : "StateABV"})

    ### County FIPS
    countyFIPS = pd.read_csv("data/countyFIPS.csv")

    ### State FIPS
    stateFIPS = pd.read_csv("data/stateFIPS.csv")

    ### Drop cases county labels
    cases = cases.drop(columns = "County Name")

    ### Add County Name from countyFIPS
    cases = cases.merge(countyFIPS, how = "left")

    ### Add State names from stateFIPS
    cases = cases.merge(stateFIPS, how = "left")

    ### Drop deaths county labels
    deaths = deaths.drop(columns = "County Name")

    ### Add County Name from countyFIPS
    deaths = deaths.merge(countyFIPS, how = "left")

    ### Add State names from stateFIPS
    deaths = deaths.merge(stateFIPS, how = "left")

    ### Drop population county and state labels
    population = population.drop(columns = "County Name")

    ### Add County Name from countyFIPS
    population = population.merge(countyFIPS, how = "left")

    ### Unpivot cases data
    cases = pd.melt(cases, id_vars = ['County Name', "State", "StateABV", "countyFIPS", "stateFIPS"],
                     value_vars = cases.columns[3:-2],
                     var_name = "Date", value_name = "Cases")

    ### Unpivot death data
    deaths = pd.melt(deaths, id_vars = ['County Name', "State", "StateABV", "countyFIPS", "stateFIPS"],
                     value_vars = list(deaths.columns[3:-2]),
                     var_name = "Date", value_name = "Deaths")

    ### Merge dataframes
    cases_deaths = cases.merge(deaths, on = ["State", "StateABV", "County Name", "Date", "countyFIPS", "stateFIPS"])

    ### Merge dataframes
    cases_deaths = cases_deaths.merge(population, on = ["countyFIPS","County Name"], how = "left")

    ### Sort
    cases_deaths = cases_deaths.astype({"Date" : "datetime64"})
    cases_deaths = cases_deaths.sort_values(["State","County Name","Date"], ascending = [True, True, True])


    ### Rename population and cases
    cases_deaths = cases_deaths.rename(columns = {"Cases" : "Total Cases",
                                                  "Deaths" : "Total Deaths"})

    cases_deaths = cases_deaths.reset_index().drop(columns = "index")

    cases_deaths = cases_deaths.astype({"County Name" : "category",
                                        "State" : "category",
                                        "countyFIPS" : "str",
                                        "stateFIPS" : "str"})

    ### First six states end where DC begins
    firstSix = cases_deaths[:list(cases_deaths["countyFIPS"][cases_deaths["State"] == "DC"].index)[0]]

    ### Create a new column with the fixed FIPS codes
    firstSix.insert(2,"countyFIPS2", '0' + firstSix["countyFIPS"])

    ### Drop the old FIPS codes and rename the new FIPS codes column
    firstSix = firstSix.drop(columns = "countyFIPS")
    firstSix = firstSix.rename(columns = {"countyFIPS2" : "countyFIPS"})

    firstSixIndex = np.arange(start = 0, stop = list(cases_deaths["countyFIPS"][cases_deaths["State"] == "DC"].index)[0])
    cases_deaths = cases_deaths.drop(firstSixIndex)

    cases_deaths = pd.concat([firstSix,cases_deaths])

    cases_deaths2 = cases_deaths[cases_deaths["County Name"] != "Statewide Unallocated"]
    cases_deaths2 = cases_deaths2.reset_index()
    cases_deaths2 = cases_deaths2.drop(columns = "index")

    ### First for Alabama
    ### Aggregate data
    StateData = cases_deaths[cases_deaths['State'] == "Alabama"].groupby("Date").agg(
            TotalCases = pd.NamedAgg(column = "Total Cases", aggfunc = sum),
            TotalDeaths = pd.NamedAgg(column = "Total Deaths", aggfunc = sum),
            Population = pd.NamedAgg(column = "Population", aggfunc = sum))

    ### Make a vector of the state and its FIPS
    state = np.repeat("Alabama", len(cases_deaths["Date"].unique()))
    stateABV = np.repeat("AL", len(cases_deaths["Date"].unique()))
    statefips = np.repeat('1', len(cases_deaths["Date"].unique()))

    ### Grab dates
    date = cases_deaths["Date"].unique()

    ### Insert into State Data
    StateData.insert(0, "stateFIPS", statefips)
    StateData.insert(0, "StateABV", stateABV)
    StateData.insert(0, "State", state)
    StateData.insert(0, "Date", date)

    ### Now the rest
    for state, fipsNum, stateABV in zip(cases_deaths["State"].unique()[1:], cases_deaths["stateFIPS"].unique()[1:], 
                                        cases_deaths["StateABV"].unique()[1:]) :
        ### Aggregate data
        myStateData = cases_deaths[cases_deaths['State'] == state].groupby("Date").agg(
            TotalCases = pd.NamedAgg(column = "Total Cases", aggfunc = sum),
            TotalDeaths = pd.NamedAgg(column = "Total Deaths", aggfunc = sum),
            Population = pd.NamedAgg(column = "Population", aggfunc = sum))

        ### Make a vector of the state/fips and grab dates
        mystate = np.repeat(state, len(cases_deaths["Date"].unique()))
        mystateABV = np.repeat(stateABV, len(cases_deaths["Date"].unique()))
        mystatefips = np.repeat(fipsNum, len(cases_deaths["Date"].unique()))
        mydate = cases_deaths["Date"].unique()

        ### Insert data
        myStateData.insert(0, "stateFIPS", mystatefips)
        myStateData.insert(0, "StateABV", mystateABV)
        myStateData.insert(0, "State", state)
        myStateData.insert(0, "Date", date)

        ### Stack state datas
        StateData = pd.concat([StateData, myStateData])

    ### Reset indicies
    StateData = StateData.set_index(np.arange(0,len(StateData)))

    ### First for date
    ### Aggregate data
    USAData = StateData[StateData['Date'] == StateData["Date"].unique()[0]].groupby("Date").agg(
            TotalCases = pd.NamedAgg(column = "TotalCases", aggfunc = sum),
            TotalDeaths = pd.NamedAgg(column = "TotalDeaths", aggfunc = sum),
            Population = pd.NamedAgg(column = "Population", aggfunc = sum))

    ### Insert into usaData
    USAData.insert(0, "Date", StateData["Date"].unique()[0])
    USAData.insert(0, "Country", "United States")


    ### For the rest of dates
    for day in StateData["Date"].unique()[1:]:
        ### Aggregate data
        myUSAData = StateData[StateData['Date'] == day].groupby("Date").agg(
            TotalCases = pd.NamedAgg(column = "TotalCases", aggfunc = sum),
            TotalDeaths = pd.NamedAgg(column = "TotalDeaths", aggfunc = sum),
            Population = pd.NamedAgg(column = "Population", aggfunc = sum))

        ### Insert date into data
        myUSAData.insert(0, "Date", day)
        myUSAData.insert(0, "Country", "United States")

        ### Stack state datas
        USAData = pd.concat([USAData, myUSAData])



    ### Reset indicies
    USAData = USAData.set_index(np.arange(0,len(USAData)))

    ### Rename Columns
    StateData = StateData.rename(columns = {"TotalCases" : "Total Cases",
                                            "TotalDeaths" : "Total Deaths"})
    USAData = USAData.rename(columns = {"TotalCases" : "Total Cases",
                                            "TotalDeaths" : "Total Deaths"})

    cases_deaths2 = parallel1(cases_deaths2, newCases1)

    cases_deaths2 = parallel1(cases_deaths2, newDeaths1)

    StateData = parallel2(StateData, newCases2)

    StateData = parallel2(StateData, newDeaths2)

    ### New Cases
    USAData["New Cases"] = abs(np.diff(USAData["Total Cases"], prepend = USAData["Total Cases"].iloc[0]))

    ### New Deaths
    USAData["New Deaths"] = abs(np.diff(USAData["Total Deaths"], prepend = USAData["Total Deaths"].iloc[0]))

    ### Percent of population that have cases.
    cases_deaths2["%Cases"] = np.where(cases_deaths2["Population"] != 0,
                                       round((cases_deaths2["Total Cases"] / cases_deaths2["Population"]) * 100, 3),
                                       0)

    ### Percent of population that have died.
    cases_deaths2["%Deaths"] = np.where(cases_deaths2["Population"] != 0,
                                        round((cases_deaths2["Total Deaths"] / cases_deaths2["Population"]) * 100, 3),
                                        0)

    ### Percent of population that have cases.
    StateData["%Cases"] = np.where(StateData["Population"] != 0,
                                   round((StateData["Total Cases"] / StateData["Population"]) * 100, 3),
                                   0)

    ### Percent of population that have died.
    StateData["%Deaths"] = np.where(StateData["Population"] != 0,
                                    round((StateData["Total Deaths"] / StateData["Population"]) * 100, 3),
                                    0)

    ### Percent of population that have cases.
    USAData["%Cases"] = np.where(USAData["Population"] != 0,
                                 round((USAData["Total Cases"] / USAData["Population"]) * 100, 3),
                                 0)

    ### Percent of population that have died.
    USAData["%Deaths"] = np.where(USAData["Population"] != 0,
                                  round((USAData["Total Deaths"] / USAData["Population"]) * 100, 3),
                                  0)

    cases_deaths2["log(Total Cases)"] = round(np.log(cases_deaths2["Total Cases"]), 3)

    cases_deaths2["log(Total Deaths)"] = round(np.log(cases_deaths2["Total Deaths"]), 3)

    cases_deaths2["log(New Cases)"] = round(np.log(cases_deaths2["New Cases"]), 3)

    cases_deaths2["log(New Deaths)"] = round(np.log(cases_deaths2["New Deaths"]), 3)

    StateData["log(Total Cases)"] = round(np.log(StateData["Total Cases"]), 3)

    StateData["log(Total Deaths)"] = round(np.log(StateData["Total Deaths"]), 3)

    StateData["log(New Cases)"] = round(np.log(StateData["New Cases"]), 3)

    StateData["log(New Deaths)"] = round(np.log(StateData["New Deaths"]), 3)

    USAData["log(Total Cases)"] = round(np.log(USAData["Total Cases"]), 3)

    USAData["log(Total Deaths)"] = round(np.log(USAData["Total Deaths"]), 3)

    USAData["log(New Cases)"] = round(np.log(USAData["New Cases"]), 3)

    USAData["log(New Deaths)"] = round(np.log(USAData["New Deaths"]), 3)

    StateData = StateData.astype({"State" : "category",
                                  "stateFIPS" : "str"})

    USAData = USAData.astype({"Country" : "category"})

    CountyData = cases_deaths2

    ### Google Mobility data
    !curl https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=7d0cb7d254d29111 --output data/mobility.csv

    GoogleMobility = pd.read_csv("data/mobility.csv", dtype = "str")
    GoogleMobility = GoogleMobility.drop(columns = 'metro_area')

    ### Keep only US
    GoogleMobility = GoogleMobility[GoogleMobility["country_region_code"] == "US"]

    ### Mobility data for whole country
    GoogleUsaMobility = GoogleMobility[GoogleMobility["sub_region_1"].isnull()]

    ### Mobility data for states
    GoogleStateMobility = GoogleMobility[(GoogleMobility["sub_region_1"].isnull() != True) & (GoogleMobility["sub_region_2"].isnull())]

    ### Mobility data for counties
    GoogleCountyMobility = GoogleMobility[GoogleMobility["sub_region_2"].isnull() != True]

    ### Drop columns from usaMobility
    GoogleUsaMobility = GoogleUsaMobility.drop(columns = ["country_region_code", "sub_region_1",
                                              "sub_region_2", "iso_3166_2_code",
                                              "census_fips_code"])

    ### Drop columns from stateMobility
    GoogleStateMobility = GoogleStateMobility.drop(columns = ["country_region_code", "country_region", 
                                                  "sub_region_2", "iso_3166_2_code", 
                                                  "census_fips_code"])

    ### Drop columns from countyMobility
    GoogleCountyMobility = GoogleCountyMobility.drop(columns = ["country_region_code", "country_region",
                                                    "sub_region_1", "iso_3166_2_code"])

    ### Rename usaMobility columns
    GoogleUsaMobility = GoogleUsaMobility.rename(columns = {"country_region" : "Country",
                                                "date" : "Date",
                                                "retail_and_recreation_percent_change_from_baseline" : "%Retail/Rec Change",
                                                "grocery_and_pharmacy_percent_change_from_baseline" : "%Grocery/Pharm Change",
                                                "parks_percent_change_from_baseline" : "%Parks Change",
                                                "transit_stations_percent_change_from_baseline" : "%Transit Change",
                                                "workplaces_percent_change_from_baseline" : "%Workplace Change",
                                                "residential_percent_change_from_baseline" : "%Residential Change"})
    GoogleUsaMobility = GoogleUsaMobility.astype({"Date" : "datetime64"})


    ### Rename stateMobility columns
    GoogleStateMobility = GoogleStateMobility.rename(columns = {"sub_region_1" : "State",
                                                "date" : "Date",
                                                "retail_and_recreation_percent_change_from_baseline" : "%Retail/Rec Change",
                                                "grocery_and_pharmacy_percent_change_from_baseline" : "%Grocery/Pharm Change",
                                                "parks_percent_change_from_baseline" : "%Parks Change",
                                                "transit_stations_percent_change_from_baseline" : "%Transit Change",
                                                "workplaces_percent_change_from_baseline" : "%Workplace Change",
                                                "residential_percent_change_from_baseline" : "%Residential Change"})
    GoogleStateMobility = GoogleStateMobility.astype({"Date" : "datetime64"})


    ### Rename countyMobility columns
    GoogleCountyMobility = GoogleCountyMobility.rename(columns = {"sub_region_2" : "County Name",
                                                "census_fips_code" : "countyFIPS",
                                                "date" : "Date",
                                                "retail_and_recreation_percent_change_from_baseline" : "%Retail/Rec Change",
                                                "grocery_and_pharmacy_percent_change_from_baseline" : "%Grocery/Pharm Change",
                                                "parks_percent_change_from_baseline" : "%Parks Change",
                                                "transit_stations_percent_change_from_baseline" : "%Transit Change",
                                                "workplaces_percent_change_from_baseline" : "%Workplace Change",
                                                "residential_percent_change_from_baseline" : "%Residential Change"})
    GoogleCountyMobility = GoogleCountyMobility.astype({"Date" : "datetime64"})


    ### Re-label District of Columbia as DC
    DCindex = list(GoogleStateMobility["State"][GoogleStateMobility["State"] == "District of Columbia"].index)
    for index in DCindex:
        GoogleStateMobility["State"][index] = "DC"

    ### Go grab data
    !curl https://data.cdc.gov/api/views/9bhg-hcku/rows.csv?accessType=DOWNLOAD --output data/sexage.csv

    ### Read in data
    DeathsSexAge = pd.read_csv("data/sexage.csv")

    DeathsSexAge = DeathsSexAge.drop(columns = ["Total Deaths",
                                                "Pneumonia Deaths",
                                                "Pneumonia and COVID-19 Deaths",
                                                "Influenza Deaths", 
                                                "Pneumonia, Influenza, or COVID-19 Deaths",
                                                "Footnote"])

    ### Drop Puerto Rico, Puerto Rico Total
    PRindex = list(DeathsSexAge["State"][(DeathsSexAge["State"] == "Puerto Rico") | (DeathsSexAge["State"] == "Puerto Rico Total")].index)
    DeathsSexAge = DeathsSexAge.drop(index = PRindex)

    ### Rename DC
    DCindex = list(DeathsSexAge["State"][DeathsSexAge["State"] == "District of Columbia"].index)
    DeathsSexAge["State"][DCindex] = "DC"

    ### Go grab data
    !curl https://data.cdc.gov/api/views/pj7m-y5uh/rows.csv?accessType=DOWNLOAD --output data/race.csv

    ### Read in Data
    race = pd.read_csv("data/race.csv")

    race = race.drop(columns = "Footnote")

    ### Drop NYC.
    NYCindex = list(race["State"][race["State"] == "New York City"].index)
    race = race.drop(index = NYCindex)

    ### Rename New York<sup>5</sup> to New York.
    NYindex = list(race["State"][race["State"] == "New York<sup>5</sup>"].index)
    race["State"][NYindex] = "New York"

    ### Rename DC
    DCindex = list(race["State"][race["State"] == "District of Columbia"].index)
    race["State"][DCindex] = "DC"

    countDeaths = race[race["Indicator"] == "Count of COVID-19 deaths"]
    distDeaths = race[race["Indicator"] == "Distribution of COVID-19 deaths (%)"]
    unweightDeaths = race[race["Indicator"] == "Unweighted distribution of population (%)"]
    weightDeaths = race[race["Indicator"] == "Weighted distribution of population (%)"]

    ### Unpivot
    countDeaths = pd.melt(countDeaths, id_vars = ["Data as of","State", "Indicator"],
           value_vars = countDeaths.columns[5:],
           var_name = "Race", value_name = "Count of COVID-19 deaths")

    ### Drop Indicator
    countDeaths = countDeaths.drop(columns = "Indicator")

    ### Unpivot
    distDeaths = pd.melt(distDeaths, id_vars = ["Data as of","State", "Indicator"],
           value_vars = distDeaths.columns[5:],
           var_name = "Race", value_name = "Distribution of COVID-19 deaths (%)")

    ### Drop Indicator
    distDeaths = distDeaths.drop(columns = "Indicator")

    ### Unpivot
    unweightDeaths = pd.melt(unweightDeaths, id_vars = ["Data as of","State", "Indicator"],
           value_vars = unweightDeaths.columns[5:],
           var_name = "Race", value_name = "Unweighted distribution of population (%)")

    ### Drop Indicator
    unweightDeaths = unweightDeaths.drop(columns = "Indicator")

    ### Unpivot
    weightDeaths = pd.melt(weightDeaths, id_vars = ["Data as of","State", "Indicator"],
           value_vars = weightDeaths.columns[5:],
           var_name = "Race", value_name = "Weighted distribution of population (%)")

    ### Drop Indicator
    weightDeaths = weightDeaths.drop(columns = "Indicator")

    raceNew = countDeaths.merge(distDeaths, how = "inner", on = ["Data as of", "State", "Race"])
    raceNew = raceNew.merge(unweightDeaths, how = "inner", on = ["Data as of", "State", "Race"])
    raceNew = raceNew.merge(weightDeaths, how = "inner", on = ["Data as of", "State", "Race"])

    ### Go grab data
    !curl https://www.cdc.gov/nhsn/pdfs/covid19/covid19-NatEst.csv --output data/hospital.csv

    ### Load in data
    hospital = pd.read_csv("data/hospital.csv")

    ### Drop the Notes & state columns and the first row.
    hospital = hospital.drop(columns = ["state", "Notes"])
    hospital = hospital.drop(index = 0)
    hospital = hospital.reset_index(drop = True)

    ### Rename columns
    hospital = hospital.rename(columns = {'statename' : "State", 
                                          'collectionDate': "Date"})

    ### Convert Date into datetime
    hospital = hospital.astype({"Date" : "datetime64"})

    ### Remove Puerto Rico 
    PRindex = list(hospital["State"][hospital["State"] == "Puerto Rico"].index)
    hospital = hospital.drop(index = PRindex)

    ### Rename DC
    DCindex = list(hospital["State"][hospital["State"] == "District of Columbia"].index)
    hospital["State"][DCindex] = "DC"

    ### Go grab data
    !curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv --output data/NYTusa.csv

    ### Load in data
    NYTusa = pd.read_csv('data/NYTusa.csv')

    ### Set data as datetime and rename columns
    NYTusa = NYTusa.astype({"date" : "datetime64"})

    NYTusa = NYTusa.rename(columns = {'date' : "Date",
                                      'cases' : 'Total Cases',
                                      'deaths' : 'Total Deaths'})

    ### New Cases
    NYTusa["New Cases"] = abs(np.diff(NYTusa["Total Cases"], prepend = NYTusa["Total Cases"].iloc[0]))

    ### New Deaths
    NYTusa["New Deaths"] = abs(np.diff(NYTusa["Total Deaths"], prepend = NYTusa["Total Deaths"].iloc[0]))

    ### Add population.
    NYTusa["Population"] = np.repeat(USAData["Population"][0], len(NYTusa))

    ### Percent of population that have cases.
    NYTusa["%Cases"] = np.where(NYTusa["Population"] != 0,
                                 round((NYTusa["Total Cases"] / NYTusa["Population"]) * 100, 3),
                                 0)

    ### Percent of population that have died.
    NYTusa["%Deaths"] = np.where(NYTusa["Population"] != 0,
                                  round((NYTusa["Total Deaths"] / NYTusa["Population"]) * 100, 3),
                                  0)

    ### Logarithmic Scales
    NYTusa["log(Total Cases)"] = round(np.log(NYTusa["Total Cases"]), 3)
    NYTusa["log(Total Deaths)"] = round(np.log(NYTusa["Total Deaths"]), 3)
    NYTusa["log(New Cases)"] = round(np.log(NYTusa["New Cases"]), 3)
    NYTusa["log(New Deaths)"] = round(np.log(NYTusa["New Deaths"]), 3)

    NYTusa['Country'] = np.repeat('United States', len(NYTusa))
    NYTusa = NYTusa[list(USAData.columns)]

    ### Go grab data
    !curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv --output data/NYTstate.csv

    ### Load in data
    NYTstate = pd.read_csv('data/NYTstate.csv')

    ### Set data to datetime and set fips to string
    NYTstate = NYTstate.astype({"date" : "datetime64",
                                "fips" : 'str'})

    ### Rename columns
    NYTstate = NYTstate.rename(columns = {'date' : "Date",
                                          'state' : 'State',
                                          'fips' : 'stateFIPS',
                                          'cases' : 'Total Cases',
                                          'deaths' : 'Total Deaths'})

    ### Sort data
    NYTstate = NYTstate.sort_values(['State', 'Date'], ascending = [True, True])

    ### Rename District of Columbia to DC
    DCindex = list(NYTstate["State"][NYTstate["State"] == "District of Columbia"].index)
    for index in DCindex:
        NYTstate["State"][index] = "DC"

    ### Remove Guam, Puerto Rico, Virgin Islands, Northern Mariana Islands
    Guamindex = list(NYTstate["State"][NYTstate["State"] == "Guam"].index)
    NYTstate = NYTstate.drop(index = Guamindex)

    PRindex = list(NYTstate["State"][NYTstate["State"] == "Puerto Rico"].index)
    NYTstate = NYTstate.drop(index = PRindex)

    VIindex = list(NYTstate["State"][NYTstate["State"] == "Virgin Islands"].index)
    NYTstate = NYTstate.drop(index = VIindex)

    NMIindex = list(NYTstate["State"][NYTstate["State"] == "Northern Mariana Islands"].index)
    NYTstate = NYTstate.drop(index = NMIindex)

    NYTstate = parallel2(NYTstate, newCases2)
    NYTstate = parallel2(NYTstate, newDeaths2)

    ### Grab state population and state abbreviations
    NYTstate = NYTstate.merge(StateData[['stateFIPS', 'Population', 'StateABV', 'Date']], on = ['stateFIPS', 'Date'], how = 'left')

    ### Percent of population that have cases.
    NYTstate["%Cases"] = np.where(NYTstate["Population"] != 0,
                                 round((NYTstate["Total Cases"] / NYTstate["Population"]) * 100, 3),
                                 0)

    ### Percent of population that have died.
    NYTstate["%Deaths"] = np.where(NYTstate["Population"] != 0,
                                  round((NYTstate["Total Deaths"] / NYTstate["Population"]) * 100, 3),
                                  0)

    ### Logarithmic Scales
    NYTstate["log(Total Cases)"] = round(np.log(NYTstate["Total Cases"]), 3)
    NYTstate["log(Total Deaths)"] = round(np.log(NYTstate["Total Deaths"]), 3)
    NYTstate["log(New Cases)"] = round(np.log(NYTstate["New Cases"]), 3)
    NYTstate["log(New Deaths)"] = round(np.log(NYTstate["New Deaths"]), 3)

    NYTstate = NYTstate[list(StateData.columns)]

    ### Go grab data
    !curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv --output data/NYTcounty.csv

    ### Load in data
    NYTcounty = pd.read_csv('data/NYTcounty.csv', dtype = str)

    ### Rename columns, change data types, and sort data
    NYTcounty = NYTcounty.astype({"date" : "datetime64",
                            "cases" : "int64",
                            "deaths" : 'int64'})

    NYTcounty = NYTcounty.rename(columns = {'date' : 'Date',
                                            'county' : 'County Name',
                                            'state' : 'State',
                                            'fips' : 'countyFIPS',
                                            'cases' : "Total Cases",
                                            'deaths' : 'Total Deaths'})

    NYTcounty = NYTcounty.sort_values(by = ["State", 'County Name', 'Date'], ascending = [True, True, True])

    ### Rename District of Columbia to DC
    DCindex = list(NYTcounty["State"][NYTcounty["State"] == "District of Columbia"].index)
    for index in DCindex:
        NYTcounty["State"][index] = "DC"
        
    ### Remove Aleutians West Census Area
    NYTcounty = NYTcounty.drop(list(NYTcounty[NYTcounty["County Name"] == "Aleutians West Census Area"].index))

    ### Remove Guam and Puerto Rico
    Guamindex = list(NYTcounty["State"][NYTcounty["State"] == "Guam"].index)
    NYTcounty = NYTcounty.drop(index = Guamindex)

    PRindex = list(NYTcounty["State"][NYTcounty["State"] == "Puerto Rico"].index)
    NYTcounty = NYTcounty.drop(index = PRindex)

    VIindex = list(NYTcounty["State"][NYTcounty["State"] == "Virgin Islands"].index)
    NYTcounty = NYTcounty.drop(index = VIindex)

    NMIindex = list(NYTcounty["State"][NYTcounty["State"] == "Northern Mariana Islands"].index)
    NYTcounty = NYTcounty.drop(index = NMIindex)


    ### Split the 'New York City' rows into the 5 counties of NYC.
    nycRows = NYTcounty[NYTcounty["County Name"] == 'New York City']

    ### (County Name, countyFIPS) for each county, in the order they get appended
    nycCounties = [("New York", '36061'),
                   ("Kings", '36047'),
                   ("Queens", '36081'),
                   ("Bronx", '36005'),
                   ("Richmond", '36085')]

    nycFrames = []
    for countyName, countyFips in nycCounties:
        ### Each county gets its own copy of the NYC rows with the county name and
        ### countyFIPS swapped in and Total Cases/Total Deaths divided by 5
        countyRows = nycRows.copy()
        countyRows['County Name'] = countyName
        countyRows['countyFIPS'] = countyFips
        countyRows['Total Cases'] = countyRows['Total Cases'] / 5
        countyRows['Total Deaths'] = countyRows['Total Deaths'] / 5
        nycFrames.append(countyRows)

    ### Keep the per-county frames bound to their individual names
    NewYorkcounty, Kingscounty, Queenscounty, Bronxcounty, Richmondcounty = nycFrames

    ### Now add those counties to the data frame and restore the sort order.
    NYTcounty = pd.concat([NYTcounty] + nycFrames)
    NYTcounty = NYTcounty.sort_values(by = ["State", 'County Name', 'Date'], ascending = [True, True, True])

    ### Calculate New Cases and New Deaths
    NYTcounty = parallel1(NYTcounty, newCases1)
    NYTcounty = parallel1(NYTcounty, newDeaths1)

    ### Grab populations and state abbreviations
    ### (left merge on countyFIPS and Date, so every NYTcounty row is kept)
    NYTcounty = NYTcounty.merge(CountyData[['countyFIPS', 'Population', 'StateABV', 'Date', 'stateFIPS']], on = ['countyFIPS','Date'], how = 'left')

    ### For New York City, sum the populations of the 5 NYC counties
    ### (each lookup takes the Population of the first matching row)
    inNewYork = NYTcounty['State'] == 'New York'
    NewYorkpop = NYTcounty['Population'][inNewYork & (NYTcounty['County Name'] == 'New York')].iloc[0]
    Kingspop = NYTcounty['Population'][inNewYork & (NYTcounty['County Name'] == 'Kings')].iloc[0]
    Queenspop = NYTcounty['Population'][inNewYork & (NYTcounty['County Name'] == 'Queens')].iloc[0]
    Bronxpop = NYTcounty['Population'][inNewYork & (NYTcounty['County Name'] == 'Bronx')].iloc[0]
    Richmondpop = NYTcounty['Population'][inNewYork & (NYTcounty['County Name'] == 'Richmond')].iloc[0]

    ### Assign through .loc so the write lands on NYTcounty itself; the chained
    ### NYTcounty['Population'][mask] = ... form may only modify a temporary Series.
    NYTcounty.loc[NYTcounty['County Name'] == 'New York City', 'Population'] = NewYorkpop + Kingspop + Queenspop + Bronxpop + Richmondpop


    ### For Kansas City, sum the populations of the 4 KC counties.
    inMissouri = NYTcounty['State'] == 'Missouri'
    Casspop = NYTcounty['Population'][inMissouri & (NYTcounty['County Name'] == 'Cass')].iloc[0]
    Claypop = NYTcounty['Population'][inMissouri & (NYTcounty['County Name'] == 'Clay')].iloc[0]
    Jacksonpop = NYTcounty['Population'][inMissouri & (NYTcounty['County Name'] == 'Jackson')].iloc[0]
    Plattepop = NYTcounty['Population'][inMissouri & (NYTcounty['County Name'] == 'Platte')].iloc[0]

    NYTcounty.loc[NYTcounty['County Name'] == 'Kansas City', 'Population'] = Casspop + Claypop + Jacksonpop + Plattepop

    ### Rows with a Population of 0 get 0 for both percentages.
    hasPopulation = NYTcounty["Population"] != 0

    ### Total Cases as a percent of Population, rounded to 3 decimals.
    pctCases = ((NYTcounty["Total Cases"] / NYTcounty["Population"]) * 100).round(3)
    NYTcounty["%Cases"] = np.where(hasPopulation, pctCases, 0)

    ### Percent of population that have died.
    pctDeaths = ((NYTcounty["Total Deaths"] / NYTcounty["Population"]) * 100).round(3)
    NYTcounty["%Deaths"] = np.where(hasPopulation, pctDeaths, 0)

    ### Logarithmic Scales: natural log of each count column, rounded to 3 decimals
    for countColumn in ["Total Cases", "Total Deaths", "New Cases", "New Deaths"]:
        NYTcounty["log(" + countColumn + ")"] = np.log(NYTcounty[countColumn]).round(3)

    ### Keep only the columns that CountyData has, in CountyData's column order
    NYTcounty = NYTcounty[CountyData.columns.tolist()]

    
    
    ### Write every data frame to its CSV file (without the index), in this order.
    csvExports = [(CountyData, "data/countyData.csv"),
                  (StateData, "data/stateData.csv"),
                  (USAData, "data/usaData.csv"),
                  (DeathsSexAge, "data/demoDeaths.csv"),
                  (raceNew, "data/raceDeaths.csv"),
                  (hospital, "data/hospitalData.csv"),
                  (GoogleUsaMobility, 'data/GoogleUsaMobility.csv'),
                  (GoogleStateMobility, 'data/GoogleStateMobility.csv'),
                  (GoogleCountyMobility, 'data/GoogleCountyMobility.csv'),
                  (NYTusa, 'data/NYTusa.csv'),
                  (NYTstate, 'data/NYTstate.csv'),
                  (NYTcounty, 'data/NYTcounty.csv')]

    for frame, csvPath in csvExports:
        frame.to_csv(csvPath, index = False)