# Process raw data

## Load libraries

In [1]:
import functools

import covid_analysis.utils.paths as path
import janitor
import pandas as pd
import pandas_flavor as pf


## Define default input and output directories

In [2]:
input_dir = path.data_raw_dir()
output_dir = path.data_processed_dir()

## Johns Hopkins University time series

### Utilities

In [3]:
@pf.register_dataframe_method
def hopkins_tidy_data(df: pd.DataFrame, out_column: str) -> pd.DataFrame:
    """Reshape a wide time-series table into one row per country and date.

    Keeps the "Country/Region" column plus every column whose name matches
    the glob "*/*/*", pivots those columns into a long "date" column, and
    sums the values for rows that share the same country and date.

    Args:
        df: Wide table with a "Country/Region" column and "*/*/*"-named
            value columns.
        out_column: Name given to the summed value column in the result.

    Returns:
        A frame with "country", "date" and `out_column` columns.
    """
    # Wide -> long: each matched column header becomes an entry in "date".
    long_df = (
        df
        .select_columns(["Country/Region", "*/*/*"])
        .pivot_longer(index="Country/Region", names_to="date")
    )

    # Parse the dates, normalise the column names, and shorten the key name.
    cleaned_df = (
        long_df
        .to_datetime("date")
        .clean_names()
        .rename_column("country_region", "country")
    )

    # Collapse rows sharing a country and date into a single total.
    totals_df = cleaned_df.groupby(["country", "date"]).sum().reset_index()

    return totals_df.rename_column("value", out_column)

### Read data

In [4]:
# Load every global time-series CSV, keyed by its file name without extension.
# The matches are sorted because `glob` yields paths in arbitrary,
# filesystem-dependent order, and this dict's insertion order decides the
# column order of the merged frame that is built from it.
hopkins_time_series = {
    file.stem: pd.read_csv(file)
    for file in sorted(input_dir.glob("time_series_covid19_*_global.csv"))
}

hopkins_time_series.keys()

dict_keys(['time_series_covid19_confirmed_global', 'time_series_covid19_deaths_global'])

### Process data

In [5]:
# Tidy every raw table; each value column is named after its source file stem.
hopkins_tidy_time_series = {
    key: df.hopkins_tidy_data(out_column=key)
    for key, df in hopkins_time_series.items()
}

# Join the per-metric tables pairwise on their shared keys. `pd.merge`
# defaults to an inner join, so only country/date pairs present in every
# table are kept.
hopkins_tidy_cumulative_df = functools.reduce(
    lambda x, y: pd.merge(x, y, on=["date", "country"]),
    hopkins_tidy_time_series.values(),
)

# Shorten "time_series_covid19_<metric>_global" to "<metric>". Stripping the
# fixed prefix/suffix (rather than matching a hard-coded list of metrics)
# keeps a usable name for any file the glob picks up; names that do not
# follow the pattern ("country", "date") pass through unchanged.
hopkins_tidy_cumulative_df.columns = (
    hopkins_tidy_cumulative_df
    .columns
    .str
    .replace(r"^time_series_covid19_(.+)_global$", r"\1", regex=True)
)

hopkins_tidy_cumulative_df.head(1)

Unnamed: 0,country,date,confirmed,deaths
0,Afghanistan,2020-01-22,0,0


### Save data

In [6]:
# Write the merged table to the processed-data directory, without the row index.
hopkins_tidy_cumulative_df.to_csv(
    output_dir.joinpath("hopkins_tidy_cumulative.csv"),
    index=False,
)