# Process raw data

## Load libraries

In [1]:
import functools

import covid_analysis.utils.paths as path
import janitor
import pandas as pd
import pandas_flavor as pf


## Define default input and output directories

In [2]:
# Directory the raw CSV files are read from in the cells below.
input_dir = path.data_raw_dir()
# Directory every processed CSV produced by this notebook is written to.
output_dir = path.data_processed_dir()

## Johns Hopkins University - Time series

### Utilities

In [3]:
@pf.register_dataframe_method
def hopkins_tidy_data(df: pd.DataFrame, out_column: str) -> pd.DataFrame:
    """Reshape a wide Johns Hopkins time-series frame into long per-country rows.

    Registered as a DataFrame method, so it is called as
    ``frame.hopkins_tidy_data(out_column=...)``.

    Args:
        df: Frame with a "Country/Region" column plus the date columns
            (those whose name matches the "*/*/*" glob).
        out_column: Name given to the summed value column in the result.

    Returns:
        A frame with the columns "country", "date" and ``out_column``: one row
        per (country, date) pair, values summed over all input rows that
        share that pair.
    """
    # Keep the country label and the date columns only, then turn every date
    # column into rows: the old header goes to "date", the cell to "value".
    long_df = df.select_columns(["Country/Region", "*/*/*"]).pivot_longer(
        index="Country/Region",
        names_to="date",
    )

    # Parse the date labels and normalise the column names.
    long_df = (
        long_df
        .to_datetime("date")
        .clean_names()
        .rename_column("country_region", "country")
    )

    # Collapse rows sharing a (country, date) pair into a single total.
    totals_df = long_df.groupby(["country", "date"]).sum().reset_index()

    return totals_df.rename_column("value", out_column)

### Read data

In [4]:
# Load every global time-series CSV, keyed by file name without extension.
# The "[!v]" in the pattern skips files whose variable part starts with "v",
# i.e. the vaccine series, which has a different layout and is read separately.
# glob() yields files in a filesystem-dependent order; sorting makes the dict
# order (and therefore the column order of the merged frame built from it)
# reproducible across machines.
hopkins_time_series = {
    file.stem: pd.read_csv(file)
    for file in sorted(input_dir.glob("time_series_covid19_[!v]*_global.csv"))
}

hopkins_time_series.keys()

dict_keys(['time_series_covid19_confirmed_global', 'time_series_covid19_deaths_global'])

### Process data

In [5]:
# Tidy each raw frame; the file stem is used as the (temporary) name of its
# value column so the columns stay distinguishable after merging.
hopkins_tidy_time_series = {
    name: frame.hopkins_tidy_data(out_column=name)
    for name, frame in hopkins_time_series.items()
}

# Join all tidy frames pairwise on the shared (date, country) keys.
hopkins_tidy_cumulative_df = functools.reduce(
    lambda left, right: left.merge(right, on=["date", "country"]),
    hopkins_tidy_time_series.values(),
)

# Shorten the file-stem column names to the keyword they contain
# (e.g. "time_series_covid19_confirmed_global" -> "confirmed").
hopkins_tidy_cumulative_df.columns = (
    hopkins_tidy_cumulative_df.columns
    .str.extract(r"(country|date|confirmed|deaths)")[0]
)

hopkins_tidy_cumulative_df.head(n=1)

Unnamed: 0,country,date,confirmed,deaths
0,Afghanistan,2020-01-22,0,0


### Save data

In [6]:
# Write the merged frame to the processed-data directory, omitting the index.
hopkins_tidy_cumulative_df.to_csv(
    output_dir.joinpath("hopkins_tidy_cumulative.csv"),
    index=False,
)

## Johns Hopkins University - Countries metadata

### Read data

In [7]:
# Location of the UID / ISO / FIPS lookup table inside the raw-data directory.
countries_metadata_filename = input_dir.joinpath(
    "UID_ISO_FIPS_LookUp_Table.csv"
)

countries_metadata_df = pd.read_csv(
    filepath_or_buffer=countries_metadata_filename
)
countries_metadata_df.head(n=1)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0


### Process data

In [8]:
# Reduce the lookup table to one population figure per country.
# NOTE(review): the table also carries Admin2 / Province_State columns; if it
# holds sub-national rows next to a country-level row for the same country,
# this sum counts that population more than once - confirm against the raw file.
countries_population_df = (
    countries_metadata_df
    .clean_names()
    .rename_column("country_region", "country")
    .select_columns(["country", "population"])
    .groupby("country", as_index=False)
    .sum()
)

countries_population_df.head(n=1)

Unnamed: 0,country,population
0,Afghanistan,38928341.0


### Save data

In [9]:
# Write the per-country population table to the processed-data directory,
# omitting the index.
countries_population_df.to_csv(
    output_dir.joinpath("countries_population.csv"),
    index=False,
)

## Johns Hopkins University - Vaccination Time Series

### Read data

In [10]:
# Location of the vaccination time series inside the raw-data directory.
vaccinations_tidy_time_series_file = input_dir.joinpath(
    "time_series_covid19_vaccine_global.csv"
)

vaccinations_tidy_time_series_df = pd.read_csv(
    filepath_or_buffer=vaccinations_tidy_time_series_file
)
vaccinations_tidy_time_series_df.head(n=1)

Unnamed: 0,Country_Region,Date,Doses_admin,People_partially_vaccinated,People_fully_vaccinated,Report_Date_String,UID,Province_State
0,Afghanistan,2021-02-22,0,0.0,0.0,2021-02-22,4.0,


### Process data

In [11]:
# Aggregate the vaccination series to one row per (country, date).
# NOTE(review): province_state is dropped before summing, so if the raw file
# contains both a country-level row and province-level rows for the same
# country and date, the totals include both - confirm against the raw file.
vaccination_country_cumulative_df = (
    vaccinations_tidy_time_series_df
    .clean_names()
    .rename_column("country_region", "country")
    .remove_columns(["report_date_string", "uid", "province_state"])
    .groupby(["country", "date"], as_index=False)
    .sum()
)

vaccination_country_cumulative_df.head(n=1)

Unnamed: 0,country,date,doses_admin,people_partially_vaccinated,people_fully_vaccinated
0,Afghanistan,2021-02-22,0,0.0,0.0


### Save data

In [12]:
# Write the per-country vaccination table to the processed-data directory,
# omitting the index.
vaccination_country_cumulative_df.to_csv(
    output_dir.joinpath("vaccination_country_cumulative.csv"),
    index=False,
)