# Exploring matching and survival analysis

## Lib and paths

In [13]:
import os
import json
from pprint import pprint
import pandas as pd
import numpy as np
import datetime as dt

fname_schema = "SCHEMA_21JAN2021_31AUG2021.parquet"
pares_1 = "PAREADOS_CPF_2.parquet"
eventos_1 = "EVENTOS_PAREADOS_2.parquet"
#par_evento_1 = "PAREADOS_COM_INTERVALOS_1.parquet"

vaccine = "CORONAVAC"
data_folder = os.path.join("..", "output", "data")
pareado_folder = os.path.join("..", "output", "PAREAMENTO", vaccine)

## Load data

In [14]:
fschema = pd.read_parquet(os.path.join(data_folder, fname_schema))
pareado_df = pd.read_parquet(os.path.join(pareado_folder, pares_1))
eventos_df = pd.read_parquet(os.path.join(pareado_folder, eventos_1))
#intervalo_df = pd.read_parquet(os.path.join(pareado_folder, par_evento_1))

In [9]:
print(fschema.columns)
fschema[:3]

Index(['NOME', 'CPF', 'CNS', 'DATA NASCIMENTO', 'NOME MAE', 'CEP', 'BAIRRO',
       'SEXO', 'SITUACAO VACINEJA', 'cpf LINKAGE VACINADOS',
       'id LINKAGE INTEGRASUS', 'ordem LINKAGE OBITO COVID',
       'cpf LINKAGE CARTORIOS', 'primary key LINKAGE SIVEP',
       'TESTE POSITIVO ANTES COORTE', 'POSITIVOS COLETA DATA',
       'POSITIVOS SOLICITACAO DATA', 'VACINA APLICADA', 'DATA D1', 'DATA D2',
       'DATA D3', 'DATA D4', 'ORDEM(OBITO COVID)', 'NUMERODO(OBITO COVID)',
       'DATA PRI SINTOMAS(OBITO COVID)', 'DATA OBITO',
       'DATA FALECIMENTO(CARTORIOS)', 'NUMERODO(CARTORIOS)',
       'DATA NOTIFICACAO SIVEP', 'DATA INTERNACAO', 'EVOLUCAO',
       'DATA EVOLUCAO', 'COLETA APOS OBITO', 'SOLICITACAO APOS OBITO',
       'OBITO INCONSISTENCIA COVID', 'OBITO INCONSISTENCIA CARTORIOS',
       'STATUS VACINACAO DURANTE COORTE', 'STATUS VACINACAO', 'IDADE',
       'DATA HOSPITALIZACAO', 'OBITO ANTES COORTE',
       'HOSPITALIZACAO ANTES COORTE'],
      dtype='object')


Unnamed: 0,NOME,CPF,CNS,DATA NASCIMENTO,NOME MAE,CEP,BAIRRO,SEXO,SITUACAO VACINEJA,cpf LINKAGE VACINADOS,...,COLETA APOS OBITO,SOLICITACAO APOS OBITO,OBITO INCONSISTENCIA COVID,OBITO INCONSISTENCIA CARTORIOS,STATUS VACINACAO DURANTE COORTE,STATUS VACINACAO,IDADE,DATA HOSPITALIZACAO,OBITO ANTES COORTE,HOSPITALIZACAO ANTES COORTE
0,FRANCISCA ANA MARIA DE MELO LIMA,67698255387,,1974-03-14,TEREZA ARAUJO DE MELO,60450635,NÃO INFOMRADO,F,RECEBEU AO MENOS UMA DOSE,67698255387.0,...,False,False,False,False,(D1)(D2),(D1)(D2)(D3),47,,False,False
1,WASTERLAY BARROSO PEIXOTO,58931961391,,1974-04-02,MARLUCIA BEZERRA BARROSO,60864260,CAJAZEIRAS,M,RECEBEU AO MENOS UMA DOSE,58931961391.0,...,False,False,False,False,(D1)(D2),(D1)(D2)(D3),47,,False,False
2,AFONSO DIAS BARRETO JUNIOR,52783847353,,1974-08-25,ALBERTINA DIAS BARRETO,60822345,CAMBEBA,M,RECEBEU AO MENOS UMA DOSE,,...,False,False,False,False,,,47,,False,False


In [10]:
print(pareado_df.shape)
pareado_df[:3]

(124669, 2)


Unnamed: 0,CPF CASO,CPF CONTROLE
0,13933680344,6840492362
1,16650700304,16214153334
2,10965394387,53786408300


In [11]:
print(eventos_df.shape)
eventos_df = eventos_df.set_index("CPF")
eventos_df[:3]

(258866, 9)


Unnamed: 0_level_0,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,PAREADO
CPF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13933680344,2021-01-21,2021-02-14,NaT,NaT,,CASO,6840492362,True
6840492362,2021-04-02,2021-05-16,NaT,NaT,,CONTROLE,13933680344,True
16650700304,2021-01-21,2021-02-14,NaT,NaT,,CASO,16214153334,True


In [12]:
print(intervalo_df.shape)
intervalo_df[:3]

(124668, 16)


Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,CASO D1 INTERVALO,CASO D1 CENSURADO,CASO D2 INTERVALO,CASO D2 CENSURADO,CONTROLE D1 INTERVALO,CONTROLE D1 CENSURADO,CONTROLE D2 INTERVALO,CONTROLE D2 CENSURADO
0,13933680344,2021-01-21,2021-02-14,NaT,NaT,,CASO,6840492362,24,True,47.0,True,24,True,47.0,True
2,16650700304,2021-01-21,2021-02-14,NaT,NaT,,CASO,16214153334,24,True,198.0,True,24,True,198.0,True
4,10965394387,2021-01-21,2021-02-14,NaT,NaT,,CASO,53786408300,24,True,198.0,True,24,True,198.0,True


## Define censoring conditions for pairs

In [94]:
def extract_all(pareados_df, eventos_df):
    '''
    
    '''
    casos = dict()
    controle = dict()

def extract_pair_dict(cpf_caso, cpf_controle, eventos_df):
    '''
    
    '''
    sub_caso = eventos_df.loc[cpf_caso]
    sub_controle = eventos_df.loc[cpf_controle]
    if type(sub_caso)==pd.Series:
        caso_dict = [sub_caso.to_dict()]
    else:
        caso_dict = sub_caso[sub_caso["TIPO"]=="CASO"].to_dict(orient="records")
    if type(sub_controle)==pd.Series:
        controle_dict = [sub_controle.to_dict()]
    else:
        controle_dict = sub_controle[sub_controle["TIPO"]=="CONTROLE"].to_dict(orient='records')
    return caso_dict, controle_dict

def compare_pair_survival(caso_hash, controle_hash, events_col, final_cohort):
    '''
        Description.
        
        Args:
            caso_hash:
                dictionary.
            controle_hash:
                dictionary.
            events_col:
                dictionary.
            final_cohort:
                datetime.date.
        Return:
            res:
                dictionary.
    '''
    #cpf_caso = caso_hash["CPF"]
    #cpf_controle = controle_hash["CPF"]
    # Get events of case
    caso_d1_date = caso_hash[events_col["D1"]]
    caso_d2_date = caso_hash[events_col["D2"]]
    caso_covid_date = caso_hash[events_col["OBITO COVID"]]
    caso_geral_date = caso_hash[events_col["OBITO GERAL"]]
    # Get events of control
    control_d1_date = controle_hash[events_col["D1"]]
    control_d2_date = controle_hash[events_col["D2"]]
    control_covid_date = controle_hash[events_col["OBITO COVID"]]
    control_geral_date = controle_hash[events_col["OBITO GERAL"]]
    
    #f = lambda x: x.date() if not pd.isna(x) else np.nan
    f = lambda x: x if pd.notna(x) else np.nan
    g = lambda x,y: (x-y).days if not pd.isna(x) and not pd.isna(y) else np.nan
            
    # --> D1
    start_date = caso_d1_date
    caso_diff = {
        "D1 to D2": g(f(caso_d2_date),start_date),
        "D1 to D1_CONTROL": g(f(control_d1_date),start_date),
        "D1 to COVID": g(f(caso_covid_date), start_date),
        "D1 to GERAL": g(f(caso_geral_date), start_date),
        "D1 to FIM": g(final_cohort, start_date)
    }
    control_diff = {
        "D1 to D1_CONTROL": g(f(control_d1_date),start_date),
        "D1 to COVID_CONTROL": g(f(control_covid_date),start_date),
        "D1 to GERAL_CONTROL": g(f(control_geral_date), start_date),
        "D1 to D2": g(f(caso_d2_date),start_date), # test, think
        "D1 to FIM": g(final_cohort,start_date)
    }
    
    # --> D2
    start_date = caso_d2_date
    caso_diff_d2 = {
        "D2 to D1_CONTROL": g(f(control_d1_date),start_date),
        "D2 to COVID": g(f(caso_covid_date), start_date),
        "D2 to GERAL": g(f(caso_geral_date), start_date),
        "D2 to FIM": g(final_cohort, start_date)
    }
    control_diff_d2 = {
        "D2 to D1_CONTROL": g(f(control_d1_date),start_date),
        "D2 to COVID_CONTROL": g(f(control_covid_date),start_date),
        "D2 to GERAL_CONTROL": g(f(control_geral_date), start_date),
        "D2 to FIM": g(final_cohort,start_date)
    }
    
    caso_events_d1 = [ (key, caso_diff[key]) for key in caso_diff.keys() ]
    control_events_d1 = [ (key, control_diff[key]) for key in control_diff.keys() ]
    caso_events_d2 = [ (key, caso_diff_d2[key]) for key in caso_diff_d2.keys() ]
    control_events_d2 = [ (key, control_diff_d2[key]) for key in control_diff_d2.keys() ]
    res = {
        #"CPF CASO": cpf_caso,
        #"CPF CONTROLE": cpf_controle,
        "D1": (caso_events_d1, control_events_d1),
        "D2": (caso_events_d2, control_events_d2)
    }
    return res

In [95]:
cpf_casoex, cpf_controlex = pareado_df["CPF CASO"].iat[0], pareado_df["CPF CONTROLE"].iat[0]
print(cpf_casoex, cpf_controlex)

13933680344 06840492362


In [110]:
caso_hash, controle_hash = extract_pair_dict(cpf_casoex, cpf_controlex, eventos_df)
caso_hash = caso_hash[0]
controle_hash = controle_hash[0]
pprint(caso_hash)
pprint(controle_hash)

{'DATA D1': Timestamp('2021-01-21 00:00:00'),
 'DATA D2': Timestamp('2021-02-14 00:00:00'),
 'DATA HOSPITALIZACAO': None,
 'DATA OBITO COVID': NaT,
 'DATA OBITO GERAL': NaT,
 'PAR': '06840492362',
 'PAREADO': True,
 'TIPO': 'CASO'}
{'DATA D1': Timestamp('2021-04-02 00:00:00'),
 'DATA D2': Timestamp('2021-05-16 00:00:00'),
 'DATA HOSPITALIZACAO': None,
 'DATA OBITO COVID': NaT,
 'DATA OBITO GERAL': NaT,
 'PAR': '13933680344',
 'PAREADO': True,
 'TIPO': 'CONTROLE'}


In [98]:
events_col = {
    "D1": "DATA D1",
    "D2": "DATA D2",
    "OBITO COVID": "DATA OBITO COVID",
    "OBITO GERAL": "DATA OBITO GERAL",
}
res = compare_pair_survival(caso_hash, controle_hash, events_col, dt.datetime(2021, 8, 31))
print("D1: ", res["D1"])
print("D2: ", res["D2"])

D1:  ([('D1 to D2', 24), ('D1 to D1_CONTROL', 71), ('D1 to COVID', nan), ('D1 to GERAL', nan), ('D1 to FIM', 222)], [('D1 to D1_CONTROL', 71), ('D1 to COVID_CONTROL', nan), ('D1 to GERAL_CONTROL', nan), ('D1 to D2', 24), ('D1 to FIM', 222)])
D2:  ([('D2 to D1_CONTROL', 47), ('D2 to COVID', nan), ('D2 to GERAL', nan), ('D2 to FIM', 198)], [('D2 to D1_CONTROL', 47), ('D2 to COVID_CONTROL', nan), ('D2 to GERAL_CONTROL', nan), ('D2 to FIM', 198)])


In [113]:
dt.datetime(2021,4,2)-dt.datetime(2021,2,14)

datetime.timedelta(days=47)

In [15]:
init = dt.datetime(2021, 1, 21)
final = dt.datetime(2021, 8, 31)
eventos_df[pd.notna(eventos_df["DATA HOSPITALIZACAO"])]

Unnamed: 0_level_0,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,PAREADO
CPF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
21459789334,2021-01-22,2021-03-25,NaT,NaT,[2021-02-22T00:00:00.000000],CASO,35693797368,True
06865860349,2021-01-22,2021-03-03,NaT,NaT,[2021-01-31T00:00:00.000000],CASO,61364661390,True
05901243315,2021-04-01,2021-06-06,NaT,NaT,[2021-05-03T00:00:00.000000],CONTROLE,12767360453,True
45522146320,2021-03-31,2021-05-08,NaT,NaT,[2021-02-23T00:00:00.000000],CONTROLE,10496580353,True
14397714304,2021-01-24,2021-02-21,NaT,NaT,[2021-03-04T00:00:00.000000],CASO,12024481353,True
...,...,...,...,...,...,...,...,...
08183627315,NaT,NaT,2021-03-18,NaT,[2021-03-18T00:00:00.000000],NAO PAREADO,,False
71080147349,NaT,NaT,NaT,NaT,[2021-03-31T00:00:00.000000],NAO PAREADO,,False
07322658353,NaT,NaT,NaT,2021-03-19,[2021-03-09T00:00:00.000000],NAO PAREADO,,False
11118547349,NaT,NaT,2021-03-23,NaT,[2021-03-08T00:00:00.000000],NAO PAREADO,,False


In [11]:
def f(x, init, final):
    '''
        D
    '''
    if not np.any(pd.notna(x)):
        return np.nan
    x = np.sort([xx for xx in x if pd.notna(xx)]) 
    condition = (x>=init) & (x<=final)
    if x[condition].shape[0]>0:
        return x[condition][0]
    else:
        return np.nan
    
init = dt.datetime(2021, 1, 21)
final = dt.datetime(2021, 8, 31)
eventos_df["DATA HOSPITALIZACAO 1"] = eventos_df["DATA HOSPITALIZACAO"].apply(lambda x: f(x, init, final))

In [17]:
tst = np.sort([dt.datetime(2021, 3, 1), dt.datetime(2021, 2, 1), dt.datetime(2020,9,1), dt.datetime(2021, 5, 1)])

In [25]:
cond = (tst>=init) & (tst<=final)

In [27]:
tst[cond][0]

datetime.datetime(2021, 2, 1, 0, 0)

In [12]:
eventos_df["DATA HOSPITALIZACAO 1"]

0               NaT
1               NaT
2               NaT
3               NaT
4               NaT
            ...    
258861          NaT
258862   2021-03-09
258863   2021-03-08
258864          NaT
258865   2021-03-21
Name: DATA HOSPITALIZACAO 1, Length: 258866, dtype: datetime64[ns]

In [148]:
eventos_df[:10]

Unnamed: 0_level_0,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,PAREADO
CPF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13933680344,2021-01-21,2021-02-14,NaT,NaT,,CASO,6840492362,True
6840492362,2021-04-02,2021-05-16,NaT,NaT,,CONTROLE,13933680344,True
16650700304,2021-01-21,2021-02-14,NaT,NaT,,CASO,16214153334,True
16214153334,2021-11-17,2021-12-10,NaT,NaT,,CONTROLE,16650700304,True
10965394387,2021-01-21,2021-02-14,NaT,NaT,,CASO,53786408300,True
53786408300,NaT,NaT,NaT,NaT,,CONTROLE,10965394387,True
25949160304,2021-01-21,2021-02-16,NaT,NaT,,CASO,11806591391,True
11806591391,2021-03-31,2021-05-05,NaT,NaT,,CONTROLE,25949160304,True
12138070344,2021-01-22,2021-02-19,NaT,NaT,,CASO,6102441320,True
6102441320,2021-11-26,2021-11-26,NaT,NaT,,CONTROLE,12138070344,True


In [158]:
def f(x):
    return {"CPF": x["CPF"], "DATA D1": x["DATA D1"], "DATA D2": x["DATA D2"], "DATA OBITO COVID": x["DATA OBITO COVID"],
           "DATA OBITO GERAL": x["DATA OBITO GERAL"], "DATA HOSPITALIZACAO": x["DATA HOSPITALIZACAO"], "TIPO": x["TIPO"]}

ev = eventos_df.reset_index()
ev = ev[pd.notna(ev["PAR"])]
ev = ev[(ev["PAREADO"]==True)]
ev["KEY_DICT"] = ev["CPF"]+ev["TIPO"]
ev["DICT_INFO"] = ev.apply(f, axis=1)

hashdict = dict(zip(ev["KEY_DICT"], ev["DICT_INFO"]))

In [159]:
ev["KEY_DICT"][:6]

0        13933680344CASO
1    06840492362CONTROLE
2        16650700304CASO
3    16214153334CONTROLE
4        10965394387CASO
5    53786408300CONTROLE
Name: KEY_DICT, dtype: object

In [160]:
ev_caso = ev[ev["TIPO"]=="CASO"]
ev_caso["RESULT"] = ev_caso.apply(lambda x: compare_pair_survival(x["DICT_INFO"], hashdict[x["PAR"]+"CONTROLE"], events_col, dt.datetime(2021,8,31)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_caso["RESULT"] = ev_caso.apply(lambda x: compare_pair_survival(x["DICT_INFO"], hashdict[x["PAR"]+"CONTROLE"], events_col, dt.datetime(2021,8,31)), axis=1)


In [161]:
ev_caso["RESULT"][:4]

0    {'D1': ([('D1 to D2', 24), ('D1 to D1_CONTROL'...
2    {'D1': ([('D1 to D2', 24), ('D1 to D1_CONTROL'...
4    {'D1': ([('D1 to D2', 24), ('D1 to D1_CONTROL'...
6    {'D1': ([('D1 to D2', 26), ('D1 to D1_CONTROL'...
Name: RESULT, dtype: object

In [130]:
HASHDICT["06840492362CONTROLE"]

{'CPF': '06840492362',
 'DATA D1': Timestamp('2021-04-02 00:00:00'),
 'DATA D2': Timestamp('2021-05-16 00:00:00'),
 'DATA OBITO COVID': NaT,
 'DATA OBITO GERAL': NaT,
 'DATA HOSPITALIZACAO': None,
 'TIPO': 'CONTROLE'}

In [143]:
ev_par["RESULT"] = ev_par.apply(lambda x: compare_pair_survival(x["DICT_INFO"], HASHDICT[x["PAR"]+"CONTROLE"], events_col, dt.datetime(2021, 8, 31)), axis=1)

KeyError: '13933680344CONTROLE'

In [137]:
HASHDICT["13933680344CONTROLE"]

KeyError: '13933680344CONTROLE'

In [139]:
len(HASHDICT.keys())

258866

In [145]:
ev[ev["PAR"]=="13933680344"]

Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,PAREADO,DICT_INFO,KEY_DICT
1,6840492362,2021-04-02,2021-05-16,NaT,NaT,,CONTROLE,13933680344,True,"{'CPF': '06840492362', 'DATA D1': 2021-04-02 0...",06840492362CONTROLE


In [146]:
ev[ev["CPF"]=="13933680344"]

Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,TIPO,PAR,PAREADO,DICT_INFO,KEY_DICT
0,13933680344,2021-01-21,2021-02-14,NaT,NaT,,CASO,6840492362,True,"{'CPF': '13933680344', 'DATA D1': 2021-01-21 0...",13933680344CASO
