In [9]:
# Date --> AUG 25, 2022

# **Investigation on irregular vaccine dates**

In [2]:
import sys
sys.path.append("..")

import os
import re
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm

from collections import defaultdict

In [3]:
dataf = os.path.join("..", "..", "..", "data", "PARQUET_TRANSFORMED")

base_path = os.path.join(os.environ["USERPROFILE"], "Documents")
project_folder = os.path.join(base_path, "projects", "vaccine-eff-fortaleza")
pareamento_folder = os.path.join(project_folder, "output", "PAREAMENTO")
output_data_folder = os.path.join(project_folder, "output", "data")

## **Read and format data on applied vaccines**

In [3]:
def transform_string(string):
    lst = string.replace("Insert into EXPORT_TABLE", "").split("values")
    columns = lst[0].split(",")
    columns = [ x.replace(' ','').replace("(", "").replace(")", "") for x in columns ]
    values = lst[1].replace('null', "''")
    values = re.findall(r"'([^']*)'", values[2:-1])
    return columns, values

In [4]:
col_subset = ('"cpf_usuario"', '"vacina"', '"data_aplicacao"', '"municipio"', '"tipo_atendimento"', '"nascimento"', '"dose"')
data_lines, success = [], []
with open(os.path.join(dataf, "vacinados.csv"), "r", encoding="latin") as f:
    csv_reader = csv.reader(f, delimiter=';')
    line_count = 0
    for row in csv_reader:
        try:
            col, val = transform_string(row[0])
            dict_temp = dict(zip(col, val))
            data_lines.append({k: dict_temp[k] for k in col_subset })
            success.append(True)
        except:
            transf_row = row[0]
            data_lines.append(None)
            success.append(False)
        if line_count == -10:
            break
        else:
            line_count += 1
    print(f'Processed {line_count} lines.')

Processed 6455110 lines.


In [6]:
pd.Series(success).value_counts()

True     6454981
False        129
dtype: int64

In [5]:
data_lines[:4]

[None,
 None,
 {'"cpf_usuario"': '64085678272',
  '"vacina"': 'VACINA COVID 19 COVISHIELD',
  '"data_aplicacao"': '2021-04-20',
  '"municipio"': 'MANAUS',
  '"tipo_atendimento"': 'ROTINA',
  '"nascimento"': '1978-07-27',
  '"dose"': '1ª DOSE'},
 {'"cpf_usuario"': '64085678272',
  '"vacina"': 'VACINA COVID 19 COVISHIELD',
  '"data_aplicacao"': '2021-07-08',
  '"municipio"': 'MANAUS',
  '"tipo_atendimento"': 'ROTINA',
  '"nascimento"': '1978-07-27',
  '"dose"': '2ª DOSE'}]

In [7]:
df = pd.DataFrame([ x for x in data_lines if x is not None])

In [10]:
df.to_parquet(os.path.join(dataf, "VACINAS_APLICADAS_AUG_2022.parquet"))

## **Investigation of vaccine receipt dates** 

In [3]:
df = pd.read_parquet(os.path.join(dataf, "VACINAS_APLICADAS_AUG_2022.parquet"))

In [9]:
df['"dose"'].value_counts()

1ª DOSE           2318809
2ª DOSE           2169761
REFORÇO           1317930
2º REFORÇO         402006
1º REFORÇO         194885
DOSE ÚNICA          32270
DOSE ADICIONAL      14400
3º REFORÇO           4541
DOSE                  379
Name: "dose", dtype: int64

In [182]:
def transform_vaccine_data(data):
    '''
    
    '''
    individuals = defaultdict(lambda: ([], [], []))
    for j in tqdm(range(data.shape[0])):
        cpf = data['"cpf_usuario"'].iat[j]
        dose = data['"dose"'].iat[j]
        date = data['"data_aplicacao"'].iat[j]
        vacina = data['"vacina"'].iat[j]
        
        if dose not in ["1ª DOSE", "2ª DOSE", "DOSE ÚNICA"]:
            continue
        
        individuals[cpf][0].append(dose)
        individuals[cpf][1].append(vacina)
        individuals[cpf][2].append(date)
    return individuals

def checkIfDuplicates_2(listOfElems):
    ''' Check if given list contains any duplicates '''    
    setOfElems = set()
    for elem in listOfElems:
        if elem in setOfElems:
            return True
        else:
            setOfElems.add(elem)         
    return False

def return_duplicates_cpf(individuals_dict):
    cpfs_dupl = []
    cpfs_nondupl = []
    for key in individuals_dict.keys():
        if checkIfDuplicates_2(individuals_dict[key][0]):
            cpfs_dupl.append(key)
        else:
            cpfs_nondupl.append(key)
    return cpfs_dupl, cpfs_nondupl

In [11]:
individuals = transform_vaccine_data(df)

100%|█████████████████████████████████████████████████████████████████████| 6454981/6454981 [01:55<00:00, 56125.99it/s]


In [183]:
cpfs_dupl, cpfs_nondupl = return_duplicates_cpf(individuals)

In [184]:
print(len(cpfs_dupl),len(cpfs_nondupl))

33171 2264011


In [185]:
print(cpfs_dupl[15])
print(pd.Series(individuals[cpfs_dupl[15]][0]).value_counts())

11331305349
1ª DOSE    2
2ª DOSE    1
dtype: int64


In [198]:
def classify(tupl):
    clas = None
    count_dose = pd.Series(tupl[0]).value_counts()
    if "1ª DOSE" in count_dose.index and count_dose.loc["1ª DOSE"]>1:
        clas = "D1"
        if "2ª DOSE" in count_dose.index and count_dose.loc["2ª DOSE"]>1:
                clas = "BOTH"
    elif "2ª DOSE" in count_dose.index and count_dose.loc["2ª DOSE"]>1:
        clas = "D2"
        if "1ª DOSE" in count_dose.index and count_dose.loc["1ª DOSE"]>1:
                clas = "BOTH"
    else:
        clas = "EXTRA"
    return clas

# -- Classify each individual duplicated
def classify_error_type(cpf_duplicated, individuals_dict):
    '''
    
    '''
    dupl_type = [ None for x in cpf_duplicated ]
    remove = [ False for x in  cpf_duplicated ]
    for pos, cur_cpf in enumerate(cpf_duplicated):
        info = individuals_dict[cur_cpf]
        # first: identify which type of duplication -> d1? d2? both?
        dupl_type[pos] = classify(info)
        # second: duplication identified -> get all dates and compare
        if dupl_type[pos]=="D1":
            dates_d1 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="1ª DOSE" ]
            date_d2 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="2ª DOSE" ]
            if len(date_d2)==0: continue
            for d1 in dates_d1:
                if (date_d2[0]-d1).days<=7 or d1>date_d2[0]:
                    remove[pos] = True
                    break
        elif dupl_type[pos]=="D2":
            dates_d2 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="2ª DOSE" ]
            date_d1 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="1ª DOSE" ]
            if len(date_d1)==0: continue
            for d2 in dates_d2:
                if (d2-date_d1[0]).days<=7 or date_d1[0]>d2:
                    remove[pos] = True
                    break
        elif dupl_type[pos]=="BOTH":
            dates_d2 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="2ª DOSE" ]
            dates_d1 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="1ª DOSE" ]
            for d2 in dates_d2:
                for d1 in dates_d1:
                    if (d2-d1).days<=7 or d1>d2:
                        remove[pos] = True
                        break
        else:
            pass
        #if len(info[0])<20:
        #    print(info[0], info[2], dupl_type[pos], remove[pos])
    return dupl_type, remove 

def classify_nondupl(cpf_nonduplicated, individuals_dict):
    dtype = [ "D1" for x in cpf_nonduplicated ]
    remove = [ False for x in  cpf_nonduplicated ]
    for pos, cur_cpf in tqdm(enumerate(cpf_nonduplicated)):
        info = individuals_dict[cur_cpf]
        
        dates_d2 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="2ª DOSE" ]
        dates_d1 = [ pd.to_datetime(info[2][j]) for j in range(len(info[0])) if info[0][j]=="1ª DOSE" ]
        
        for d2 in dates_d2:
            for d1 in dates_d1:
                if (d2-d1).days<=7 and (d2-d1).days>=0:
                    remove[pos] = True
                    break
                elif d1>d2:
                    remove[pos] = True
                    dtype[pos] = "BOTH"
                    break
    return dtype, remove

In [148]:
#for index, dup in enumerate(dupl):
#    if dup=="BOTH" and cpfs_dupl[index]!="":
#        print(individuals[cpfs_dupl[index]][0], individuals[cpfs_dupl[index]][2], dup, remove[index])

In [104]:
dupl, remove = classify_error_type(cpfs_dupl, individuals)

In [199]:
dtype, remove_non = classify_nondupl(cpfs_nondupl, individuals)

2264011it [06:21, 5932.67it/s]


In [200]:
temp_non = pd.DataFrame({"cpf": cpfs_nondupl, "dtype": dtype, "remove": remove_non})

In [149]:
temp = pd.DataFrame({"cpf": cpfs_dupl, "dupl type": dupl, "remove": remove})

In [276]:
cpf_remove_d1 = temp[(temp["dupl type"]=="D1") & (temp["remove"]==True)]["cpf"].reset_index()
cpf_remove_d2 = temp[(temp["dupl type"]=="D2") & (temp["remove"]==True)]["cpf"].reset_index()
cpf_remove_d1d2 = temp[(temp["dupl type"]=="BOTH") & (temp["remove"]==True)]["cpf"].reset_index()

cpf_remove_d1_1 = temp_non[(temp_non["dtype"]=="D1") & (temp_non["remove"]==True)]["cpf"].reset_index()
cpf_remove_d2_1 = temp_non[(temp_non["dtype"]=="BOTH") & (temp_non["remove"]==True)]["cpf"].reset_index()

# --> 
d1_cpfs = pd.concat([cpf_remove_d1, cpf_remove_d1d2, cpf_remove_d1_1, cpf_remove_d2, cpf_remove_d2_1])
d2_cpfs = pd.concat([cpf_remove_d2])
d1_cpfs.to_parquet(os.path.join(output_data_folder, "CPF_IRREGULAR_DATA_D1.parquet"))
d2_cpfs.to_parquet(os.path.join(output_data_folder, "CPF_IRREGULAR_DATA_D2.parquet"))

In [259]:
d1_cpfs[:5]

Unnamed: 0,index,cpf
0,0,4603769504
1,2,2635471320
2,3,62031035355
3,4,1322566305
4,5,2634749333


In [130]:
x.index("EXTRA")

1049

In [64]:
print(cpfs_dupl[1049])
individuals[cpfs_dupl[1049]]

03226906330


(['DOSE ÚNICA', 'DOSE ÚNICA', '2ª DOSE'],
 ['VACINA COVID 19 RECOMBINANTE',
  'VACINA COVID 19 RECOMBINANTE',
  'VACINA COVID 19 RECOMBINANTE'],
 ['2021-11-24', '2021-11-24', '2022-04-27'])

In [65]:
df[df['"cpf_usuario"']=="03226906330"]

Unnamed: 0,"""cpf_usuario""","""vacina""","""data_aplicacao""","""municipio""","""tipo_atendimento""","""nascimento""","""dose"""
206480,3226906330,VACINA COVID 19 RECOMBINANTE,2021-11-24,FORTALEZA,ROTINA,1990-03-20,DOSE ÚNICA
206481,3226906330,VACINA COVID 19 RECOMBINANTE,2021-11-24,FORTALEZA,CONTINGENCIA,1990-03-20,DOSE ÚNICA
206482,3226906330,VACINA COVID 19 RECOMBINANTE,2022-04-27,FORTALEZA,ROTINA,1990-03-20,2ª DOSE


In [11]:
d = pd.read_parquet(os.path.join(pareamento_folder, "CORONAVAC", "NOVO_HDI_2_21JAN2021_31AUG2021", "EVENTOS_PAREADOS_D1_DAY0_1.parquet"))

In [5]:
d["DIFF"] = d[["DATA D1", "DATA D2"]].apply(lambda x: (x["DATA D2"]-x["DATA D1"]).days, axis=1)

(372781, 10)

In [6]:
d[d["CPF"].isin(d1_cpfs["cpf"])]["DATA OBITO GERAL"].notnull().sum()

NameError: name 'd1_cpfs' is not defined

In [7]:
dd = pd.read_parquet(os.path.join(pareamento_folder, "CORONAVAC", "NOVO_D1D2REG_HDI_2_21JAN2021_31AUG2021", "EVENTOS_PAREADOS_D1_DAY0_1.parquet"))

In [8]:
dd.shape

(340246, 10)

In [9]:
dd["DIFF"] = dd[["DATA D1", "DATA D2"]].apply(lambda x: (x["DATA D2"]-x["DATA D1"]).days, axis=1)

In [10]:
dd[(dd["DIFF"]<=7) & (dd["TIPO"]!="NAO PAREADO")]

Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,DATA UTI,TIPO,PAR,PAREADO,DIFF
1103,35620714349,2021-09-25,2021-09-25,NaT,NaT,,,CONTROLE,00950775886,True,0.0
1931,27808130320,2021-07-13,2021-07-13,NaT,NaT,,,CONTROLE,38887126372,True,0.0
2263,41039645372,2021-07-13,2021-07-13,NaT,NaT,,,CONTROLE,35889870378,True,0.0
3359,77582349334,2021-07-13,2021-07-13,NaT,NaT,,,CONTROLE,22117709320,True,0.0
3607,36272957300,2021-07-13,2021-07-13,NaT,NaT,,,CONTROLE,11763540359,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...
250365,05201521312,2021-09-25,2021-09-25,NaT,NaT,,,CONTROLE,11654503304,True,0.0
250517,16383249304,2021-07-13,2021-07-13,NaT,NaT,,,CONTROLE,38583208387,True,0.0
250640,89956850853,2021-07-12,2021-07-12,NaT,NaT,,,CASO,63696974358,True,0.0
250919,07346510828,2021-09-25,2021-09-25,NaT,NaT,,,CONTROLE,11796391387,True,0.0


In [273]:
individuals["20343930315"]

(['1ª DOSE', '2ª DOSE', '2ª DOSE'],
 ['VACINA ADSORVIDA COVID 19 INATIVADA',
  'VACINA ADSORVIDA COVID 19 INATIVADA',
  'VACINA ADSORVIDA COVID 19 INATIVADA'],
 ['2021-02-09', '2021-02-14', '2021-02-14'])

In [255]:
cpfs_dupl.index("21302995391")

10030

In [256]:
dupl[10030]

'D2'

In [181]:
df_coronavac[df_coronavac["CPF"]=="00003107353"]

Unnamed: 0,CPF,DATA D1,DATA D2,DATA OBITO COVID,DATA OBITO GERAL,DATA HOSPITALIZACAO,DATA UTI,TIPO,PAR,PAREADO,DIFF
65468,3107353,2021-03-22,2021-03-22,NaT,NaT,,,CASO,9103236315,True,0
