# Process raw data

## Load libraries

In [1]:
import functools

import datatable as dt
import janitor
import pandas as pd
import pandas_flavor as pf
from IPython.display import HTML, display
from sqlalchemy import create_engine, Table, Column, Integer, String, ForeignKey, Text, MetaData

import covid_analysis.utils.paths as path


## Define default input and output directory

In [2]:
# Default locations used throughout the notebook, resolved by the project's
# path helpers: raw files are read from `input_dir`, results are written to
# `output_dir`.
input_dir = path.data_raw_dir()
output_dir = path.data_processed_dir()

## Johns Hopkins University - Time series

### Utilities

In [3]:
@pf.register_dataframe_method
def hopkins_tidy_data(df: pd.DataFrame, out_column: str) -> pd.DataFrame:
    """Reshape a wide Johns Hopkins time-series table to one row per country and date.

    Keeps the ``Country/Region`` column plus every column whose name matches
    ``*/*/*`` (the date columns), melts those into a ``date`` column parsed as
    datetimes, and sums the values of all rows sharing the same country and
    date.

    Parameters
    ----------
    df : pd.DataFrame
        Wide table with a ``Country/Region`` column and one column per date.
    out_column : str
        Name given to the summed value column in the result.

    Returns
    -------
    pd.DataFrame
        Columns ``country``, ``date`` and ``out_column``.
    """
    # Wide -> long; `pivot_longer` puts the melted cells in a column named
    # "value" by default, which is renamed at the end.
    long_df = (
        df
        .select_columns(["Country/Region", "*/*/*"])
        .pivot_longer(index="Country/Region", names_to="date")
        .to_datetime("date")
        .clean_names()
        .rename_column("country_region", "country")
    )

    # Collapse rows that share country and date into a single total.
    country_totals = long_df.groupby(["country", "date"]).sum().reset_index()

    return country_totals.rename_column("value", out_column)

### Read data

In [4]:
# Load every global Johns Hopkins time series whose metric name does not start
# with "v" (the "[!v]" in the pattern), keyed by file stem.
# The matches are sorted because glob yields files in filesystem order; sorting
# keeps the dict order, and the column order of everything derived from it,
# the same on every machine.
hopkins_time_series = {
    file.stem: pd.read_csv(file)
    for file in sorted(input_dir.glob("time_series_covid19_[!v]*_global.csv"))
}

hopkins_time_series.keys()

dict_keys(['time_series_covid19_confirmed_global', 'time_series_covid19_deaths_global'])

### Process data

In [5]:
# Tidy each raw table; its value column is named after the file it came from.
hopkins_tidy_time_series = {
    name: raw_df.hopkins_tidy_data(out_column=name)
    for name, raw_df in hopkins_time_series.items()
}

# Join the tidy tables pairwise on country and date (pandas' default inner join).
hopkins_tidy_cumulative_df = functools.reduce(
    lambda left, right: left.merge(right, on=["date", "country"]),
    hopkins_tidy_time_series.values(),
)

# Replace each column name with the keyword the pattern finds in it, e.g.
# "time_series_covid19_confirmed_global" -> "confirmed".
hopkins_tidy_cumulative_df.columns = (
    hopkins_tidy_cumulative_df.columns
    .str.extract(r"(country|date|confirmed|deaths)")
    .iloc[:, 0]
)

hopkins_tidy_cumulative_df.head(1)

Unnamed: 0,country,date,confirmed,deaths
0,Afghanistan,2020-01-22,0,0


### Save data

In [6]:
# Write the tidy cumulative table to the processed-data directory, without the
# row index.
hopkins_tidy_cumulative_df.to_csv(
    output_dir.joinpath("hopkins_tidy_cumulative.csv"),
    index=False,
)

## Johns Hopkins University - Countries metadata

### Read data

In [7]:
# Location of the Johns Hopkins UID / ISO / FIPS lookup table in the raw-data
# directory.
countries_metadata_filename = input_dir.joinpath("UID_ISO_FIPS_LookUp_Table.csv")

# Load the lookup table and preview its first row.
countries_metadata_df = pd.read_csv(countries_metadata_filename)
countries_metadata_df.head(1)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0


### Process data

In [8]:
# Population per country.
#
# The lookup table carries `Admin2` and `Province_State` columns: per the
# Johns Hopkins data documentation it lists province/state and county rows
# next to the country-level row of the same `Country_Region`. Summing every
# row therefore adds sub-national populations on top of the national figure
# (double- or triple-counting countries such as the US, Canada or Australia).
# Only country-level rows, where both columns are empty, are kept before
# aggregating.
# NOTE(review): assumes every country has a country-level row; confirm against
# the raw file.
countries_population_df = (
    countries_metadata_df
    .clean_names()
    .rename_column("country_region", "country")
    .loc[lambda rows: rows["province_state"].isna() & rows["admin2"].isna()]
    .select_columns(["country", "population"])
    .groupby("country")
    .sum()
    .reset_index()
)

countries_population_df.head(1)

Unnamed: 0,country,population
0,Afghanistan,38928341.0


### Save data

In [9]:
# Write the per-country population table to the processed-data directory,
# without the row index.
countries_population_df.to_csv(
    output_dir.joinpath("countries_population.csv"),
    index=False,
)

## Johns Hopkins University - Vaccination Time Series

### Read data

In [10]:
# Location of the Johns Hopkins global vaccination time series in the raw-data
# directory.
vaccinations_tidy_time_series_file = input_dir.joinpath("time_series_covid19_vaccine_global.csv")

# Load the vaccination table and preview its first row.
vaccinations_tidy_time_series_df = pd.read_csv(vaccinations_tidy_time_series_file)
vaccinations_tidy_time_series_df.head(1)

Unnamed: 0,Country_Region,Date,Doses_admin,People_partially_vaccinated,People_fully_vaccinated,Report_Date_String,UID,Province_State
0,Afghanistan,2021-02-22,0,0.0,0.0,2021-02-22,4.0,


### Process data

In [11]:
# Vaccination figures aggregated to one row per country and date.
vaccination_country_cumulative_df = (
    vaccinations_tidy_time_series_df
    .clean_names()
    .rename_column("country_region", "country")
    # Drop the columns that are not needed for the per-country totals.
    .remove_columns(["report_date_string", "uid", "province_state"])
    # Sum the remaining value columns over rows sharing country and date.
    .groupby(["country", "date"])
    .sum()
    .reset_index()
    # Exclude the 'World' and 'US (Aggregate)' entries (presumably aggregates
    # rather than individual countries; verify against the raw file).
    .filter_on("country != 'World' and country != 'US (Aggregate)'")
)

vaccination_country_cumulative_df.head(1)

Unnamed: 0,country,date,doses_admin,people_partially_vaccinated,people_fully_vaccinated
0,Afghanistan,2021-02-22,0,0.0,0.0


### Save data

In [12]:
# Write the per-country vaccination table to the processed-data directory,
# without the row index.
vaccination_country_cumulative_df.to_csv(
    output_dir.joinpath("vaccination_country_cumulative.csv"),
    index=False,
)

## Government of Mexico data

In [13]:
# Interim-data directory from the project's path helpers; the Mexican
# government archives are unzipped into it and read back from it.
interim_dir = path.data_interim_dir()

### Unzip files

In [14]:
!unzip -q -o -d {str(interim_dir)} {str(input_dir.joinpath("diccionario_datos_covid19.zip"))} "*Catalogos.xlsx"

In [15]:
!unzip -q -o -d {str(interim_dir)} {str(input_dir.joinpath("datos_abiertos_covid19.zip"))}

### Load data

#### Catalogs

In [16]:
# Pick the catalogue workbook from the interim directory. glob yields matches
# in filesystem order, so they are sorted first: when several workbooks are
# present the lexicographically last file name is chosen on every machine.
catalogs_file = sorted(interim_dir.glob("*Catalogos.xlsx"))[-1]

In [17]:
# `sheet_name=None` loads every sheet of the workbook and returns a dict
# mapping sheet name -> DataFrame; the first row of each sheet is the header.
catalogs_dfs = pd.read_excel(catalogs_file, sheet_name=None, header=0)

catalogs_dfs.keys()

dict_keys(['Catálogo ORIGEN', 'Catálogo SECTOR', 'Catálogo SEXO', 'Catálogo TIPO_PACIENTE', 'Catálogo SI_NO', 'Catálogo NACIONALIDAD', 'Catálogo RESULTADO_LAB', 'Catálogo RESULTADO_ANTIGENO', 'Catálogo CLASIFICACION_FINAL', 'Catálogo de ENTIDADES', 'Catálogo MUNICIPIOS'])

In [18]:
# Normalise every catalogue sheet and key it by a short lowercase name.
catalogs_dfs_2 = {}
for sheet, df in catalogs_dfs.items():
    # Last word of the sheet title, e.g. "Catálogo de ENTIDADES" -> "entidades".
    sheet_name = sheet.split(" ")[-1].lower()

    # "Unnamed" columns mean the real header ended up inside the data rows.
    has_unnamed_columns = df.columns.str.contains("Unnamed").any()

    if has_unnamed_columns:
        # Row holding the real header: 1 for 'clasificacion_final', 0 otherwise.
        location_names = 1 if sheet_name == 'clasificacion_final' else 0

        # Drop incomplete rows, promote the header row to column names, then
        # remove that row from the data.
        df = (
            df
            .dropna()
            .rename(columns=df.iloc[location_names])
            .drop(location_names)
            .reset_index(drop=True)
        )

    df = df.clean_names()
    catalogs_dfs_2[sheet_name] = df

catalogs_dfs_2.keys()

dict_keys(['origen', 'sector', 'sexo', 'tipo_paciente', 'si_no', 'nacionalidad', 'resultado_lab', 'resultado_antigeno', 'clasificacion_final', 'entidades', 'municipios'])

In [19]:
# Show each catalogue's name followed by its first row.
for catalog_name, catalog_df in catalogs_dfs_2.items():
    display(catalog_name, catalog_df.head(1))

'origen'

Unnamed: 0,clave,descripcion
0,1,USMER


'sector'

Unnamed: 0,clave,descripcion
0,1,CRUZ ROJA


'sexo'

Unnamed: 0,clave,descripcion
0,1,MUJER


'tipo_paciente'

Unnamed: 0,clave,descripcion
0,1,AMBULATORIO


'si_no'

Unnamed: 0,clave,descripcion
0,1,SI


'nacionalidad'

Unnamed: 0,clave,descripcion
0,1,MEXICANA


'resultado_lab'

Unnamed: 0,clave,descripcion
0,1,POSITIVO A SARS-COV-2


'resultado_antigeno'

Unnamed: 0,clave,descripcion
0,1,POSITIVO A SARS-COV-2


'clasificacion_final'

Unnamed: 0,clave,clasificacion,descripcion
0,1,CASO DE COVID-19 CONFIRMADO POR ASOCIACIÓN CLÍ...,Confirmado por asociación aplica cuando el cas...


'entidades'

Unnamed: 0,clave_entidad,entidad_federativa,abreviatura
0,1,AGUASCALIENTES,AS


'municipios'

Unnamed: 0,clave_municipio,municipio,clave_entidad
0,1,AGUASCALIENTES,1


#### Complete COVID-19 cases

In [20]:
# Pick the cases CSV from the interim directory. glob yields matches in
# filesystem order, so they are sorted first: when several extracts are
# present the lexicographically last file name is chosen on every machine.
covid_mex_file = sorted(interim_dir.glob("*COVID19MEXICO.csv"))[-1]

### Create local database

In [21]:
# Path of the local database file, placed in the processed-data directory.
covid_mex_db = output_dir.joinpath("covid_mex.db")

In [22]:
# Read the full cases CSV with datatable's `fread`; `dt` is the alias bound by
# `import datatable as dt` in the imports cell.
# NOTE(review): the saved output of this cell is a NameError for `dt`;
# presumably the imports cell had not been re-run in that kernel. Confirm with
# Restart Kernel -> Run All.
covid_mex_df = dt.fread(covid_mex_file)
covid_mex_df.head(1)

NameError: name 'dt' is not defined

In [None]:
# Write every cleaned catalogue into the local SQLite database: one table per
# catalogue, named after its key, replacing any table of the same name.
# The sqlalchemy names used here are imported in the notebook's imports cell.
engine = create_engine(f"sqlite:///{covid_mex_db}")

with engine.connect() as conn:
    for sheet, df in catalogs_dfs_2.items():
        df.to_sql(name=sheet, con=conn, if_exists="replace", index=False)

0    Confirmado por asociación aplica cuando el cas...
1    Confirmado por dictaminación solo aplica para ...
2    Confirmado aplica cuando:\nEl caso tiene muest...
3    Inválido aplica cuando el caso no tienen asoci...
4    No realizado aplica cuando el caso no tienen a...
5    Sospechoso aplica cuando: \nEl caso no tienen ...
6    Negativo aplica cuando el caso:\n1. Se le tomo...
Name: descripcion, dtype: object
