In [1]:
import numpy as np
import pandas as pd

import decouple
import sqlalchemy

import shutil
import os

In [2]:
#Using enviroment variables for security
user = decouple.config("db_user_mysql")
host = decouple.config('db_host_mysql')
password = decouple.config('db_password_mysql')
database = decouple.config("db_database_mysql")
connection = sqlalchemy.create_engine("mysql+mysqldb://"+user+":"+password+"@"+host+"/"+database)
# sudo apt-get install libmysqlclient-dev

In [3]:
df = pd.read_sql_table('patient01', con = connection)
df.head(3).T

Unnamed: 0,0,1,2
nomepaciente,Nicolas Freitas Teste,Gustavo Silveira Teste,Otávio Barros Teste
paciente,1,2,3
datacadastramentopaciente,2019-04-13 17:37:19,2021-04-26 01:54:13,2016-12-13 19:05:57
datanascimentopaciente,1975-03-04 15:28:17,1997-12-09 09:42:22,1983-06-17 22:41:41
conveniopaciente,29.0,34.0,29.0
matriculapaciente,75681864492,57851299323,91162597647
sexopaciente,M,M,F
estadocivilpaciente,S,D,V
corpaciente,Amarelo,Amarelo,Amarelo
documentoidentidadepaciente,734051682,423156809,543278104


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   nomepaciente                 428 non-null    object        
 1   paciente                     428 non-null    int64         
 2   datacadastramentopaciente    428 non-null    datetime64[ns]
 3   datanascimentopaciente       428 non-null    datetime64[ns]
 4   conveniopaciente             422 non-null    float64       
 5   matriculapaciente            428 non-null    object        
 6   sexopaciente                 428 non-null    object        
 7   estadocivilpaciente          428 non-null    object        
 8   corpaciente                  428 non-null    object        
 9   documentoidentidadepaciente  428 non-null    object        
 10  cpfpaciente                  428 non-null    object        
 11  graudeinstrucaopaciente      428 non-null    

In [5]:
# Colocando o nome das colunas do arquivo de acordo a documentação da iClinic
df = df.rename(
    columns = {
        "paciente":"patient_id",
        "nomepaciente": "name",
        "nomeoriginal": "civil_name",
        "datanascimentopaciente":"birth_date", #tratar depois
        "sexopaciente": "gender", #tratar depois
        "cpfpaciente": "cpf", 
        "documentoidentidadepaciente": "rg",
        "telefonespaciente": "mobile_phone", #apenas para inicio do tratamento
        # "email":"email", # como é o mesmo não será modificado
        "naturalidadepaciente": "birth_place",
        "ceppaciente": "zip_code", #tratar depois
        "logradouropaciente": "address", #tratar depois
        "complementopaciente":"complement",
        "bairropaciente": "neighborhood",
        "cidadepaciente": "city",
        "ufpaciente": "state",
        "corpaciente": "ethnicity", #tratar depois
        "estadocivilpaciente": "marital_status", #tratar depois
        "profissaopaciente": "occupation",
        "graudeinstrucaopaciente": "education", #tratar depois
        "observacoespaciente": "observation",
        "indicadoporpaciente": "indication", #tratar depois
        "inativo": "active", #tratar depois
        "pai": "patientrelatedness_father_names",
        "mae": "patientrelatedness_mother_names",
        "foto": "picture_filename", #tratar depois
        "conveniopaciente": "healthinsurance_pack", #tratar depois
    }
)

In [6]:
df['social_gender'] = np.nan
df['rg_issuer'] = np.nan
df['home_phone'] = np.nan
df['office_phone'] = np.nan
df['birth_state'] = np.nan 
df['country'] = "BR"
df['religion'] = np.nan
df['responsible'] = np.nan
df['email_secondary'] = np.nan
df['cns'] = np.nan
df['died'] = np.nan
df['death_info'] = np.nan
df['nationality'] = "BR" 
df['indication_observation'] = np.nan 
df['receive_email'] = np.nan
df['tag_names'] = np.nan # Não faço ideia de como tratar isso
df['tag_physician_id'] = np.nan

In [7]:
# Tratamento 'birth_date'
# Não tem valores nulos
df['birth_date'] = pd.to_datetime(df['birth_date'],format='%Y%m%d').dt.date

In [8]:
# Tratamento 'gender'
# Não tem valores nulos
df['gender'] = ['m' if x == 'M' else 'f' if x == 'F' else np.nan for x in df['gender']]
# print(df['gender'].value_counts())

In [9]:
# Tratamento 'mobile_phone'
# Não tem valores nulos
rows_t = df['mobile_phone'].astype('str').str.extractall('(\d+)').unstack().fillna('').sum(axis=1).astype(int).astype('str')
# print(rows_t)
# print(rows_t.str.len().value_counts())
# como só tenho numeros de telefone a partir de 10 digitos, então o tipo de telefone mais adequado é o 'home_phone' por acreditar que é o padrão dos pacientes
df['home_phone'] = '(' + rows_t.str[-10:-8] + ')' + rows_t.str[-8:-4] + '-' + rows_t.str[-4:]
df['mobile_phone'] = np.nan

In [10]:
# Tratamento 'zip_code'
# Não tem valores nulos
rows_with_dashes = df['zip_code'].str.contains('-')
# print(rows_with_dashes.value_counts())
# tem valores contendo '-'
df['zip_code'] = [df['zip_code'][i] if x == True else df['zip_code'][i][:5]+'-'+df['zip_code'][i][5:] if x == False else np.nan for i, x in enumerate(rows_with_dashes)]

In [11]:
# Tratamento 'address'
# Não tem valores nulos
# Separando as informações e distribuindo de acordo ao seu tipo
# print(df['address'])
row_with_adress = df['address'].str.split(',')
# print(row_with_adress.str.len().value_counts())
# existem linhas contrando 1 e dois valores apenas
df['number'] = [np.nan if len(x) == 1 else x[1].strip() for x in row_with_adress]
df['address'] = [x[0].strip() for x in row_with_adress]

In [12]:
# Tratamento 'ethnicity'
# Não tem valores nulos
# print(df['ethnicity'].value_counts())
df['ethnicity'] = df['ethnicity'].replace({'Branco': 'wh', 'Amarelo': 'ye', 'Pardo': 'br','Preto': 'bl'})

In [13]:
# Tratamento 'marital_status'
# Não tem valores nulos
# print(df['marital_status'].value_counts())
df['marital_status'] = df['marital_status'].replace({'S': 'si', 'C': 'ma', 'O': 'st', 'V': 'wi','D': 'se'})
# acima foi considerada a sigla "O" como união estável

In [14]:
# Tratamento 'education'
# Não tem valores nulos
# print(df['education'].value_counts())
df['education'] = df['education'].replace({'Ensino Médio': 's', 'Superior': 'h', 'Pós-graduação': 'p', 'Especialização': 'p'})
# acima foi considerada a sigla "Pós-graduação" e "Especialização" como 'p', já que não tem nada para específicar se a pós gradução é lato sensu ou se não foi informado ter sido mestrado

In [15]:
# Tratamento 'indication'
# Todos os valores são nulos
df['indication']= 'ot'

In [16]:
# Tratamento 'active'
# Não tem valores nulos
# print(df['active'].value_counts())
df['active'] = df['active'].replace({0: 1, 1: 0})
# df['active'] = df['active'].astype(np.dtype(bool)) #não necessário

In [17]:
# Tratamento 'picture_filename'
df['picture_filename'] = df['picture_filename'].replace({'NONE': np.nan})
df['picture_filename'] = df['picture_filename'].fillna(np.nan)
# print(df['picture_filename'])

In [18]:
# abaixo estamos removendo os valores que não representam um endereço na pasta 'extra/'
rows_begining_with_extra = df['picture_filename'].str.contains(r'^extra')
df['picture_filename'] = [df['picture_filename'][i][6:] if x == True else 'None' for i, x in enumerate(rows_begining_with_extra)]

In [19]:
for source in df['picture_filename'].values:

    source_path = "desafio-base2/extra/"+source
    # Destination path
    destination_path = "desafio-base2-output/picture/"+source

    # Create the directory 'pictures/' if it doesn't exist

    try:
        os.path.exists('desafio-base2-output/picture/')
        os.makedirs('desafio-base2-output/picture/')
        print("Directory created.")

    # If there is any permission issue
    except:
        print("Directory already exist.")

    # Copy the content of
    # source to destination

    try:
        shutil.copy(source_path, destination_path)
        
    # If there is any permission issue
    except shutil.SameFileError:
        print("Source and destination represents the same file.")

    # If there is any permission issue
    except PermissionError:
        print("Permission denied.")

    # For other errors
    except:
        print("Error occurred while copying file.")

print("Pictures copied successfully.")
    

Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occu

In [20]:
# modificicando o endereço onde estão contidas as fotos:  'picture/'.
df['picture_filename'] = ['picture/'+df['picture_filename'][i] if x == True else 'None' for i, x in enumerate(rows_begining_with_extra)]

In [27]:
# Tratamento 'healthinsurance_pack'
df_planos = pd.read_csv('desafio-base2-output/planos.csv')
df_planos.head(7).T

Unnamed: 0,0,1,2,3,4,5,6
plan,1,2,3,4,5,6,7
code,9118087,518103,1377066,3346654,429694,1561209,2031645
name,Unimed,Bradesco,Amil,NotreDame,Porto Seguro,SulAmérica,Allianz
account,9823,6813,5061,4479,4058,9954,2695
billsenddayinternal,1,20,21,25,1,13,12
returndaysexternal,20,21,25,29,29,21,22


In [26]:
# Tratamento 'healthinsurance_pack'
# print(df['healthinsurance_pack'].value_counts())
# Após uma breve análise foi assumido que
# Foi assumido que o indice 0 seria considerado igual a np.nan
# df['healthinsurance_pack'] = df['healthinsurance_pack'].replace({np.nan: 0})
# df['healthinsurance_pack'] = df['healthinsurance_pack'].astype(int)
# df['healthinsurance_pack'] = [x if x <= df_planos.size else 0 for x in df['healthinsurance_pack'] ]

24.0        19
31.0        18
2.0         18
1.0         18
15.0        18
34.0        15
23.0        15
13.0        15
32.0        14
12.0        14
27.0        13
4.0         13
19.0        13
11.0        12
14.0        12
29.0        11
20.0        11
0.0         11
8.0         11
30.0        10
17.0        10
21.0        10
18.0        10
35.0         9
3.0          9
5.0          9
6.0          9
22.0         9
25.0         9
33.0         9
7.0          9
9.0          8
26.0         8
16.0         7
100000.0     7
28.0         5
10.0         4
Name: healthinsurance_pack, dtype: int64


In [23]:
# Neste caso, como o 'healthinsurance_pack' inicia com 1 e está ordenada é fácil de encontrar seu nome correspondente
index = df['healthinsurance_pack'].values
# print(index)
df['healthinsurance_pack_name'] = [ df_planos['name'][i] if i != 0 else np.nan for i in index ]
df['healthinsurance_pack_code'] = [ df_planos['code'][i].astype(int).astype(str) if i != 0 else np.nan for i in index ]

# print(df['healthinsurance_pack_name'])

In [24]:
df['healthinsurance_pack'] = 'json::[\\n\\t{\\n\\t\\t"name":"'+df['healthinsurance_pack_name']+'",\\n\\t\\t"code":"'+df['healthinsurance_pack_code']+'"\\n\\t}\\n]'

In [25]:
df =  df.loc[:, ["patient_id","name","birth_date","gender","cpf","rg","rg_issuer","mobile_phone","home_phone","office_phone","email","email_secondary","birth_place","birth_state","zip_code","address","number","complement","neighborhood","city","state","country","picture_filename","ethnicity","marital_status","religion","occupation","education","responsible","cns","died","death_info","nationality","indication","indication_observation","active","receive_email","observation","healthinsurance_pack","patientrelatedness_mother_names","patientrelatedness_father_names","tag_names","tag_physician_id"]]

In [26]:
df.to_csv('desafio-base2-output/patient.csv',index=False, encoding='utf-8')

In [27]:
# Informações não utilizadas da tabela 'patient01'
#  2   datacadastramentopaciente    428 non-null    datetime64[ns]     
#  5   matriculapaciente            428 non-null    object
#  24  prontuario                   428 non-null    object        
#  25  retorno                      428 non-null    datetime64[ns]
#  29  leito                        428 non-null    object        
#  30  alteracao                    428 non-null    datetime64[ns]
#  31  timestamp                    428 non-null    datetime64[ns]
#  34  conjuge                      428 non-null    object        
#  35  filhos                       130 non-null    object        
#  36  empresa                      428 non-null    object  
#  38  cloud                        428 non-null    int64         
#  39  update001                    428 non-null    datetime64[ns]
