# **Generate summary of the project's datasets**

**List of databases**:<br>
    - Vacine Já: cadastrados_vacineja_202109171512.csv.<br>
    - Vacinas aplicadas: VACINADOS2Sep2021.csv.<br>
    - GAL: testes_gal_JAN_AGO2021.csv.<br>
    - IntegraSUS: base_dados_integrasus_fortaleza_final.csv.<br>
    - Hospitalizations due to Covid-19: NONSRAGHOSPITALIZADO2020_2021.csv.<br>
    - Deaths due to Covid-19: base_dados_obitos_cevepi.csv.<br>


## **Lib**

In [58]:
import sys
sys.path.append("..")

import os
import json
import warnings
import pandas as pd
import numpy as np
import lib.utils as utils
import lib.db_utils as dutils
warnings.filterwarnings('ignore')

# data_hash() yields two lookups keyed by dataset label. Judging by how they are
# used further down in this notebook, data_paths gives each dataset's directory
# and data_names gives its file name.
data_paths, data_names = dutils.data_hash()
# Show the label -> file name mapping, pretty-printed with sorted keys.
print(json.dumps(data_names, indent=4, sort_keys=True))

{
    "HOSPITALIZACAO COVID-19": "NONSRAGHOSPITALIZADO2020_2021.csv",
    "OBITOS COVID-19": "base_dados_obitos_cevepi.csv",
    "TESTES COVID-19": "testes_gal_JAN_AGO2021.csv",
    "TESTES COVID-19 INTEGRA": "base_dados_integrasus_fortaleza_final.csv",
    "VACINACAO CADASTRO (VACINE JA)": "cadastrados_vacineja_202109171512.csv",
    "VACINACAO POR PESSOA": "vacinacao_por_pessoa_6D9M_15H3.csv",
    "VACINAS APLICADAS": "VACINADOS2Sep2021.csv"
}


## **Utility function**

In [59]:
def json_summary_data(data_fname, delimiter=",", encoding="utf-8", onecol=None):
    """Summarize a CSV file as a JSON-friendly dictionary.

    The file is scanned in groups of columns rather than all at once, so only
    a slice of the table is loaded by each ``pandas.read_csv`` call.

    Args:
        data_fname: Path of the CSV file to summarize.
        delimiter: Field separator passed to ``pandas.read_csv``.
        encoding: Text encoding passed to ``pandas.read_csv``.
        onecol: Unused; kept so existing callers keep working.

    Returns:
        A dict with the keys ``"Number of rows"`` and ``"Number of columns"``,
        plus one entry per column name holding ``"Number of nulls"`` (counted
        over the whole file) and ``"Sample of values"`` (up to six values
        drawn with a fixed seed from the first 100 rows).
    """
    interval = 30    # number of columns loaded per pass over the file
    sample_size = 6  # number of example values reported per column

    # A small head of the file is enough to learn the column names and to
    # draw the example values from.
    sample_df = pd.read_csv(data_fname, delimiter=delimiter, encoding=encoding, nrows=100)
    all_columns = list(sample_df.columns)

    df_rows = 0
    null_tables = []
    # range() never produces an empty trailing slice, so a column count that is
    # an exact multiple of `interval` does not trigger a read with no columns.
    for start in range(0, len(all_columns), interval):
        chunk = pd.read_csv(
            data_fname,
            delimiter=delimiter,
            encoding=encoding,
            usecols=all_columns[start:start + interval],
        )
        # Each pass reads every row, so the row count comes for free here
        # instead of requiring a dedicated extra read of the file.
        df_rows = chunk.shape[0]
        null_tables.append(chunk.isnull().sum())
    null_counts = pd.concat(null_tables) if null_tables else pd.Series(dtype="int64")

    # Files shorter than `sample_size` rows would make Series.sample raise.
    n_samples = min(sample_size, len(sample_df))

    resume_dict = {"Number of rows": df_rows, "Number of columns": sample_df.shape[1]}
    for col in all_columns:
        resume_dict[col] = {
            # Plain int so the value serializes without a custom encoder.
            "Number of nulls": int(null_counts.loc[col]),
            "Sample of values": sample_df[col].sample(n=n_samples, random_state=1).tolist(),
        }
    return resume_dict

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy booleans, integers and floating-point scalars become ``bool``,
        ``int`` and ``float``; arrays become (nested) lists. Anything else is
        deferred to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is not an np.integer subclass, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

## **VACINE JÁ**

In [61]:
# Summarize the Vacine Já registration file (semicolon-delimited) and save the
# summary as JSON.
vacineja_key = "VACINACAO CADASTRO (VACINE JA)"
vacineja_csv = os.path.join(data_paths[vacineja_key], data_names[vacineja_key])
vacineja_info = json_summary_data(vacineja_csv, delimiter=";")

with open("../output/vacineja_info.json", mode="w") as f:
    json.dump(obj=vacineja_info, fp=f, cls=NpEncoder)

In [64]:
vacineja_info

{'Number of rows': 2272651,
 'Number of columns': 7,
 'nome': {'Number of nulls': 0,
  'Sample of values': ['YASMIN MARTINS ARAGAO',
   'MIKAEL VITOR LOPES SALES',
   'ANA JULIA MENEZES DA SILVA',
   'SIBELLY VERONICA OLIVEIRA BANDEIRA',
   'KLEBER DA SILVA LEANDRO FILHO',
   'VICTOR MANUEL DE CARVALHO MORENO']},
 'cpf': {'Number of nulls': 0,
  'Sample of values': [62975057300,
   10429019378,
   63367232327,
   63128311323,
   62868464335,
   10464920396]},
 'data_nascimento': {'Number of nulls': 0,
  'Sample of values': ['2006-02-16',
   '2008-01-27',
   '2009-04-14',
   '2005-09-18',
   '2005-03-21',
   '2007-03-09']},
 'bairro': {'Number of nulls': 2640,
  'Sample of values': ['OLAVO OLIVEIRA',
   'JARDIM DAS OLIVEIRAS',
   'SERRINHA',
   'PAUPINA',
   'BOM JARDIM',
   'VILA PERY']},
 'cidade': {'Number of nulls': 2538,
  'Sample of values': ['FORTALEZA',
   'FORTALEZA',
   'FORTALEZA',
   'FORTALEZA',
   'FORTALEZA',
   'FORTALEZA']},
 'sexo': {'Number of nulls': 108631,
  'Sampl

## **Vacinas aplicadas**

In [54]:
# Summarize the applied-vaccines file (semicolon-delimited) and save the
# summary as JSON.
vacinas_key = "VACINAS APLICADAS"
vacinas_csv = os.path.join(data_paths[vacinas_key], data_names[vacinas_key])
vacinas_info = json_summary_data(vacinas_csv, delimiter=";")

with open("../output/vacinas_info.json", mode="w") as f:
    json.dump(obj=vacinas_info, fp=f, cls=NpEncoder)

In [65]:
vacinas_info

{'Number of rows': 2620105,
 'Number of columns': 56,
 'id': {'Number of nulls': 0,
  'Sample of values': [2429424, 2408713, 2093633, 320633, 1442731, 1756481]},
 'codigo_paciente': {'Number of nulls': 0,
  'Sample of values': ['00026cb6b60fc743434f7e71af394e46',
   '00027890cab70b2ae73c700899b7c175',
   '0001047232e91db26cd2951b3ab0b9c3',
   '0002720f951fc9ea53c27d128eb81ac0',
   '0002b0803c0b521f6bbe12aaf9edb7e1',
   '0000a5d049d9b04be1ba278797cb1552']},
 'codigo_usuario': {'Number of nulls': 0,
  'Sample of values': ['07054a04144c178228914c61fc7e50ce',
   '20703f8c714d823eceeeeec979e0220d',
   'b3defb5d0d3ab7eabe67bd13c486c30e',
   '106cf74440214457d7442599db31719f',
   'a82d6179297b8d092f67b10843ec862e',
   '2af27a52c103b0dbfa7372973040228e']},
 'vacina': {'Number of nulls': 0,
  'Sample of values': ['PFIZER',
   'ASTRAZENECA',
   'PFIZER',
   'PFIZER',
   'ASTRAZENECA',
   'ASTRAZENECA']},
 'frasco': {'Number of nulls': 2448302,
  'Sample of values': [nan, nan, nan, nan, nan, nan]

## **GAL**

In [53]:
# Summarize the GAL test-results file (comma-delimited) and save the summary
# as JSON.
gal_key = "TESTES COVID-19"
gal_csv = os.path.join(data_paths[gal_key], data_names[gal_key])
gal_info = json_summary_data(gal_csv, delimiter=",")

with open("../output/galtestes_info.json", mode="w") as f:
    json.dump(obj=gal_info, fp=f, cls=NpEncoder)

In [66]:
gal_info

{'Number of rows': 366981,
 'Number of columns': 101,
 'Unnamed: 0': {'Number of nulls': 0,
  'Sample of values': [80, 84, 33, 81, 93, 17]},
 'Index tabela original': {'Number of nulls': 0,
  'Sample of values': [80, 84, 33, 81, 93, 17]},
 'Requisição': {'Number of nulls': 0,
  'Sample of values': [210156000134,
   210160000325,
   210158000437,
   210159000502,
   210190000252,
   210164000764]},
 'Requisição Correlativo (S/N)': {'Number of nulls': 0,
  'Sample of values': ['N', 'N', 'N', 'N', 'N', 'N']},
 'Regional de Cadastro': {'Number of nulls': 0,
  'Sample of values': [1, 1, 1, 1, 1, 1]},
 'Laboratório de Cadastro': {'Number of nulls': 0,
  'Sample of values': ['UNIDADE PRONTO ATENDIMENTO AUTRAN NUNES',
   'UNIDADE PRONTO ATENDIMENTO PRAIA DO FUTURO',
   'UNIDADE PRONTO ATENDIMENTO MESSEJANA',
   'UNIDADE PRONTO ATENDIMENTO JANGURUSSU',
   'UNIDADE DE PRONTO ATENDIMENTO VILA VELHA',
   'UNIDADE PRONTO ATENDIMENTO ITAPERI']},
 'CNES Laboratório de Cadastro': {'Number of nulls': 0

## **IntegraSUS**

In [52]:
# Summarize the IntegraSUS test file (comma-delimited) and save the summary
# as JSON.
integra_key = "TESTES COVID-19 INTEGRA"
integra_csv = os.path.join(data_paths[integra_key], data_names[integra_key])
integra_info = json_summary_data(integra_csv, delimiter=",")

with open("../output/integratestes_info.json", mode="w") as f:
    json.dump(obj=integra_info, fp=f, cls=NpEncoder)

In [67]:
integra_info

{'Number of rows': 182073,
 'Number of columns': 101,
 'Unnamed: 0': {'Number of nulls': 0,
  'Sample of values': [80, 84, 33, 81, 93, 17]},
 'id': {'Number of nulls': 0,
  'Sample of values': [1105605664,
   1105735704,
   1105580241,
   1107327497,
   1105747854,
   1105605012]},
 'codigo_paciente': {'Number of nulls': 0,
  'Sample of values': ['00278c3b78887c00c96eafcfa8c913fc',
   '002b5f0b41ce64a198ebb1c700ecd91d',
   '000fda5df1f5f736180c40c3e8c489dd',
   '00291be0bc4f07f149c222fb52740f3d',
   '002e8f076e2948ed75c4736db75e6b76',
   '0009db13d6540e708cff05af72579249']},
 'codigo_obito': {'Number of nulls': 13484,
  'Sample of values': ['c8e36fdfc30c79778c175ff60e6f4967',
   '0617b27d7b3ceb007a37c25b3941a8cf',
   '0d7b9c87cfeda68ef90f9ee4c8ec17cc',
   'd4ac561d70a59f23cde16cbb46a13f1c',
   '91f4c866c78604431c0dcfedd24c021f',
   '5ed2417445fc1c74fb963c70d6823604']},
 'id_redcap': {'Number of nulls': 182073,
  'Sample of values': [nan, nan, nan, nan, nan, nan]},
 'classificacao_estad

## **Óbitos**

In [51]:
# Summarize the Covid-19 deaths file (semicolon-delimited, Latin encoding) and
# save the summary as JSON.
obito_key = "OBITOS COVID-19"
obito_csv = os.path.join(data_paths[obito_key], data_names[obito_key])
obito_info = json_summary_data(obito_csv, delimiter=";", encoding="latin")

with open("../output/obito_info.json", mode="w") as f:
    json.dump(obj=obito_info, fp=f, cls=NpEncoder)

In [68]:
obito_info

{'Number of rows': 11386,
 'Number of columns': 158,
 'codigo_obito': {'Number of nulls': 0,
  'Sample of values': ['643ab9d3d7a7cabcf7f9c27fdc44f197',
   'e94e3e8ec2c5aa2b8ca7ecd1570a6c96',
   'ff30502c0d15b28da404424304f53b9d',
   '232d89572a7c8587704813586dafc18d',
   'b8023cf8fb7868942ac53a69850888d3',
   '6a5679904cc8fff18e6071b6c7724cda']},
 'codigo_paciente': {'Number of nulls': 0,
  'Sample of values': ['d1fbc1c6e296d5814008a8eb875446d7',
   'c2be58635804df2e60e9ea17611351f5',
   'ebd25902375335de57040be145df7b1d',
   'c8ad24c0dc101c0e7956950da4cc83d0',
   '91b7e2c7797f7aa5df6626ac27979526',
   '6e28f38409073391fd0da6578a841532']},
 'codigo_usuario': {'Number of nulls': 0,
  'Sample of values': ['438c83e94ccf89dd10e5e3c2d2fc1c20',
   '1aaecf3bab520ea51b2cf6c2956ad320',
   '2f53e2c3fb62c338b1313e9ebb2c9a1e',
   'a84227b16610af2746e52202e81695bc',
   '88bb5afb08b10527958fc121159db23d',
   '2a34f7ba4c8e570565b5bd4400cdb25e']},
 'ordem': {'Number of nulls': 0,
  'Sample of values':