# **linkage SISCEL**

In [42]:
from pathlib import Path
from simpledbf import Dbf5
import numpy as np
import pandas as pd
from unidecode import unidecode

**Load not structured SISCEL and create a unified and structured dataset for further processing**

In [40]:
basepath = Path.home().joinpath("Documents", "data")
cargaviralpath = basepath.joinpath("carga_viral_hiv")

folders = list(cargaviralpath.glob("*"))
cargaviral_folders = [ elem for elem in folders if 'viral' in elem.stem ]
cd4_folders = [ elem for elem in folders if 'viral' not in elem.stem ]

def open_folder(current_folder):
    '''
        load all csv files from the parsed folder and concatenate 
        them into a single file.
    '''
    list_of_files = current_folder.glob("*.csv")
    df = []
    for current_file in list_of_files:
        cur_df = pd.read_csv(current_file, delimiter=";", encoding='latin', skiprows=10)
        df.append(cur_df)
    df = pd.concat(df)
    return df

print([ elem.stem for elem in cargaviral_folders ])
print([ elem.stem for elem in cd4_folders ])

carga2020 = open_folder(cargaviral_folders[0])
carga2021 = open_folder(cargaviral_folders[1])
carga2022 = open_folder(cargaviral_folders[2])
carga2023 = open_folder(cargaviral_folders[3])
carga2024 = open_folder(cargaviral_folders[4])

cd4_2020 = open_folder(cd4_folders[0])

# -- save joined dataset (not processed)
carga_viral_df = pd.concat([carga2020, carga2021, carga2022, carga2023, carga2024])
carga_viral_df = carga_viral_df.reset_index(drop=True)
carga_viral_df["TIPO"] = [ "CV" for n in range(carga_viral_df.shape[0]) ]

cd4_2020 = cd4_2020.reset_index(drop=True)
cd4_2020["TIPO"] = [ "CD4" for n in range(cd4_2020.shape[0]) ]

siscel_df = pd.concat([carga_viral_df, cd4_2020], ignore_index=True, axis=0)
siscel_df["Idade Gestacional"] = siscel_df["Idade Gestacional"].astype(str).apply(lambda x: x.replace("nan", ""))
siscel_df.to_parquet(basepath.joinpath("sinan", "AIDS", "siscel_cv_2020_2024_cd4_2020.parquet"))

['ano de 2020 Carga viral', 'ano de 2021 carga viral', 'ano de 2022 carga viral', 'ano de 2023 Carga viral', 'ano de 2024 carga viral']
['CD 4  2020']


  df = pd.concat(df)


**further testing for processing**

In [41]:
siscel_df = pd.read_parquet(basepath.joinpath("sinan", "AIDS", "siscel_cv_2020_2024_cd4_2020.parquet"))
siscel_df.head()

Unnamed: 0,Código,Nome Civil,Nome Social,Mãe,Responsável,Sexo,Escolaridade,Raca/Cor,Data de Nascimento,Endereço,...,TIPO,Dois últimos CD4 > 350 cel.mm³?,Paciente assintomático?,Carga Viral Indetectável?,Contagem CD4,% CD4,Contagem CD8,% CD8,Média CD3,Linfócitos/CD45
0,99775520,Francisca Cláudia Oliveira de Souza,,Maria do Carmo Oliveira de Souza,,Feminino,De 8 a 11 anos,Parda,13/01/1975,AV D 531 CASA B ETAPA JOSE WALTER,...,CV,,,,,,,,,
1,99942466,Aila Pinheiro do Nascimento,,Maria Margarida Pinheiro da Silva,,Feminino,Ignorado,Parda,23/11/1982,"RUA ANTONIO ALVES COSTA, 150",...,CV,,,,,,,,,
2,147214,Alisson David Izidio dos Santos,,Ana Regina Izidio Serafim,,Masculino,De 12 e mais anos,Preta,19/06/2001,"RUA TODOS OS SANTOS, 1854",...,CV,,,,,,,,,
3,99830400,Aurilene Maria Ribeiro Silva,,Maria Joana da Conceicáo,,Feminino,De 8 a 11 anos,Parda,28/08/1976,"AV PARANA, 421",...,CV,,,,,,,,,
4,99078122,Cicero Bento da Silva,,Francisca Lino da Silva,,Masculino,Nenhuma,Parda,01/12/1973,RUA MARIA BEZERRA DE ARAUJO 192,...,CV,,,,,,,,,


In [46]:
siscel_df["Cópias"].value_counts()[:20]
    

Cópias
Não Detectado                  84073
Menor que o limite inferior    27000
20                               411
21                               403
23                               359
22                               358
26                               316
25                               315
24                               310
40                               301
29                               256
27                               254
28                               250
30                               238
31                               237
45                               218
42                               218
41                               210
44                               208
43                               200
Name: count, dtype: int64