# Verifications Post-analysis

## Lib

In [57]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## Set config and paths

In [58]:
inputs = {
    "folder": lambda hdi_index: f"PRI_NA_COORTEX_HDI_{hdi_index}_21JAN2021_31AUG2021",
    "only_events": lambda seed: f"EVENTOS_PAREADOS_{seed}.parquet",
    "only_pairs": lambda seed: f"PAREADOS_CPF_{seed}.parquet",
    "pairs_events_int": lambda event, seed: f"PAREADOS_COM_INTERVALOS_{event}_{seed}.parquet",
    "survival_int": lambda event, seed: f"SURVIVAL_CORONAVAC_D1D2_{event}_{seed}.parquet"
}

config = {
    "seed": 1,
    "hdi_index": 2,
    "event": "OBITO",
}

schema_path = os.path.join( "..", "output", "data", "SCHEMA_21JAN2021_31AUG2021.parquet")
base_path = os.path.join( "..", "output", "PAREAMENTO", "CORONAVAC", inputs["folder"](config["hdi_index"]) )
pairs_path = os.path.join( base_path, inputs["only_pairs"](config['seed']) )
events_path = os.path.join( base_path, inputs["only_events"](config['seed']) )
pairs_events_path = os.path.join( base_path, inputs["pairs_events_int"](config['event'], config['seed']) )
survival_path = os.path.join( base_path, "SURVIVAL", inputs['survival_int'](config['event'], config['seed']) )

## Load data

In [59]:
fschema_df = pd.read_parquet(schema_path)

In [60]:
pairs_df = pd.read_parquet(pairs_path)
events_df = pd.read_parquet(events_path)
pairs_events_df = pd.read_parquet(pairs_events_path)
survival_df = pd.read_parquet(survival_path)

In [61]:
events_df = events_df[events_df["TIPO"]!="NAO PAREADO"]

## Count of events

In [62]:
survival_death = pd.read_parquet(os.path.join( base_path, "SURVIVAL", inputs['survival_int']("OBITO", config['seed']) ))
survival_hosp = pd.read_parquet(os.path.join( base_path, "SURVIVAL", inputs['survival_int']("HOSPITAL", config['seed']) ))
survival_icu = pd.read_parquet(os.path.join( base_path, "SURVIVAL", inputs['survival_int']("UTI", config['seed']) ))

data = {
    "OBITO (day 100)": [0,0,0,0], "HOSPITAL (day 100)": [0,0,0,0], "UTI (day 100)": [0,0,0,0],
    "OBITO (day end)": [0,0,0,0], "HOSPITAL (day end)": [0,0,0,0], "UTI (day end)": [0,0,0,0],
}

survival, survival_names = [survival_death, survival_hosp, survival_icu], ["OBITO", "HOSPITAL", "UTI"]
for index, df in enumerate(survival):
    surv_caso_100 = df[(df["TIPO"]=="CASO") & (df[f"t - D1 {survival_names[index]}"]<=40) & (df[f"t - D1 {survival_names[index]}"]>=0)]
    surv_controle_100 = df[(df["TIPO"]=="CONTROLE") & (df[f"t - D1 {survival_names[index]}"]<=40) & (df[f"t - D1 {survival_names[index]}"]>=0)]
    surv_caso = df[(df["TIPO"]=="CASO")]
    surv_controle = df[(df["TIPO"]=="CONTROLE")]
    
    caso_total_d1 = surv_caso[f"t - D1 {survival_names[index]}"].notnull().sum()
    controle_total_d1 = surv_controle[f"t - D1 {survival_names[index]}"].notnull().sum()
    caso_total_d2 = surv_caso[f"t - D2 {survival_names[index]}"].notnull().sum()
    controle_total_d2 = surv_controle[f"t - D2 {survival_names[index]}"].notnull().sum()
    
    caso_event_d1_100 = surv_caso_100[f"E - D1 {survival_names[index]}"].value_counts().loc[True]
    caso_event_d2_100 = surv_caso_100[f"E - D2 {survival_names[index]}"].value_counts().loc[True]
    controle_event_d1_100 = surv_controle_100[f"E - D1 {survival_names[index]}"].value_counts().loc[True]
    controle_event_d2_100 = surv_controle_100[f"E - D2 {survival_names[index]}"].value_counts().loc[True]
    
    caso_event_d1 = surv_caso[f"E - D1 {survival_names[index]}"].value_counts().loc[True]
    caso_event_d2 = surv_caso[f"E - D2 {survival_names[index]}"].value_counts().loc[True]
    controle_event_d1 = surv_controle[f"E - D1 {survival_names[index]}"].value_counts().loc[True]
    controle_event_d2 = surv_controle[f"E - D2 {survival_names[index]}"].value_counts().loc[True]
    
    data[f"{survival_names[index]} (day 100)"][0] = f"{caso_event_d1_100}/{caso_total_d1}"
    data[f"{survival_names[index]} (day 100)"][1] = f"{controle_event_d1_100}/{controle_total_d1}"
    data[f"{survival_names[index]} (day 100)"][2] = f"{caso_event_d2_100}/{caso_total_d2}"
    data[f"{survival_names[index]} (day 100)"][3] = f"{controle_event_d2_100}/{controle_total_d2}"
    
    data[f"{survival_names[index]} (day end)"][0] = f"{caso_event_d1}/{caso_total_d1}"
    data[f"{survival_names[index]} (day end)"][1] = f"{controle_event_d1}/{controle_total_d1}"
    data[f"{survival_names[index]} (day end)"][2] = f"{caso_event_d2}/{caso_total_d2}"
    data[f"{survival_names[index]} (day end)"][3] = f"{controle_event_d2}/{controle_total_d2}"

data = pd.DataFrame(data)
data.index = ["CASO D1", "CONTROLE D1", "CASO D2", "CONTROLE D2"]

In [63]:
data

Unnamed: 0,OBITO (day 100),HOSPITAL (day 100),UTI (day 100),OBITO (day end),HOSPITAL (day end),UTI (day end)
CASO D1,49/144393,274/144393,61/144393,57/144393,277/144393,62/144393
CONTROLE D1,41/144393,275/144393,64/144393,42/144393,277/144393,65/144393
CASO D2,20/62658,277/62380,47/62626,22/62658,286/62380,50/62626
CONTROLE D2,130/62658,479/62380,121/62626,132/62658,500/62380,126/62626


In [58]:
158/70789

0.0022319851954399695

In [70]:
surv = survival_death
surv_caso = surv[surv["TIPO"]=="CASO"]
surv_controle = surv[surv["TIPO"]=="CONTROLE"]

In [55]:
surv_caso["E - D1 HOSPITAL"].value_counts()

False    144795
True        277
Name: E - D1 HOSPITAL, dtype: int64

In [56]:
surv_controle["E - D1 HOSPITAL"].value_counts()

False    144795
True        277
Name: E - D1 HOSPITAL, dtype: int64

In [66]:
from lifelines import KaplanMeierFitter

In [71]:
df_c = surv_caso[pd.notna(surv_caso["t - D1 OBITO"])]
kmf_caso = KaplanMeierFitter(label="caso")
kmf_caso.fit(df_c["t - D1 OBITO"], df_c["E - D1 OBITO"])

<lifelines.KaplanMeierFitter:"caso", fitted with 144393 total observations, 144336 right-censored observations>

In [50]:
df_c = surv_controle[pd.notna(surv_controle["t - D1 OBITO"])]
kmf_controle = KaplanMeierFitter(label="controle")
kmf_controle.fit(df_c["t - D1 OBITO"], df_c["E - D1 OBITO"])

<lifelines.KaplanMeierFitter:"controle", fitted with 144393 total observations, 144351 right-censored observations>

In [46]:
surv_controle[pd.isna(surv_controle["t - D1 OBITO"])]
surv_caso[pd.isna(surv_caso["t - D1 OBITO"])]

Unnamed: 0,CPF,TIPO,t - D1 OBITO,E - D1 OBITO,t - D2 OBITO,E - D2 OBITO
6624,14284405349,CASO,,False,81.0,False
6890,13593340330,CASO,,False,66.0,False
6976,19059337387,CASO,,False,84.0,False
7056,27816206372,CASO,,False,65.0,False
7070,07345879372,CASO,,False,42.0,False
...,...,...,...,...,...,...
289108,30147484391,CASO,,False,106.0,False
289546,13604651387,CASO,,False,,False
289744,72114665372,CASO,,False,60.0,False
289974,15441482320,CASO,,False,167.0,False


In [39]:
events_df[events_df["CPF"]=="30147484391"]

Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,DATA UTI,TIPO,PAR,PAREADO
236429,30147484391,2021-05-18,2021-05-17,NaT,NaT,,,CONTROLE,41932480382,True
289110,30147484391,2021-05-18,2021-05-17,NaT,NaT,,,CASO,18778070368,True


In [49]:
kmf_caso.event_table["observed"].sum()

57

In [51]:
kmf_controle.event_table["observed"].sum()

42

In [72]:
kmf_caso.event_table

Unnamed: 0_level_0,removed,observed,censored,entrance,at_risk
event_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,5897,0,5897,144393,144393
1.0,23347,0,23347,0,138496
2.0,6810,0,6810,0,115149
3.0,10769,0,10769,0,108339
4.0,5775,0,5775,0,97570
...,...,...,...,...,...
201.0,2,0,2,0,6
206.0,1,0,1,0,4
209.0,1,0,1,0,3
216.0,1,0,1,0,2


In [75]:
kmf_caso.survival_table_from_events

AttributeError: 'KaplanMeierFitter' object has no attribute 'survival_table_from_events'