In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('desafio-base1/pacientes.csv', parse_dates=['DataNasc'], encoding='iso-8859-1',quotechar='"', delimiter='|')
print(df.dtypes)

Código                       int64
Nome                        object
DataNasc            datetime64[ns]
Sexo                        object
Estado                      object
Endereco                    object
Cidade                      object
CEP                         object
Naturalidade                object
Telefone                    object
TipoTelefone                object
Profissao                   object
Pai                         object
Mae                         object
Conjuge                     object
ProfissaoConjuge            object
Cor                         object
EstadoCivil                 object
dtype: object


In [3]:
# Colocando o nome das colunas do arquivo de acordo a documentacção da iClinic
# Tem informação adicional (pode ser colocado em observation) e que falta tratamento por estar em series separadas: Conjuge, ProfissaoConjuge, Telefone, TipoTelefone
df_renamed = df.rename(
    columns = {
        "Código":"patient_id",
        "Nome": "name",
        "DataNasc":"birth_date",
        "Sexo": "gender",
        "Estado": "birth_state",
        "Endereco": "address",
        "Cidade": "city",
        "CEP": "zip_code",
        "Naturalidade": "birth_place",
        "Profissao": "occupation",
        "Pai": "patientrelatedness_father_names",
        "Mae": "patientrelatedness_mother_names",
        "Cor": "ethnicity",
        "EstadoCivil": "marital_status",
    }
)

In [4]:
df_renamed.head(8)

Unnamed: 0,patient_id,name,birth_date,gender,birth_state,address,city,zip_code,birth_place,Telefone,TipoTelefone,occupation,patientrelatedness_father_names,patientrelatedness_mother_names,Conjuge,ProfissaoConjuge,ethnicity,marital_status
0,0,Pedro Henrique Porto,NaT,M,RJ,"Quadra Caroline Dias, 482, Salgado Filho",Ramos,31924-681,Campos do Amparo,(021) 9313-9128,T,Reumatologista,Diego Silveira,Emanuella Lima,Lorenzo Silveira,Juiz ou árbitro de futebol,,
1,1,João Vitor da Paz,1996-01-01,M,AP,"Setor Nunes, Santa Lúcia",Nogueira,28517-970,Araújo,(021) 5202 2566,T,Fotógrafo,André Campos,Isabel Gomes,Pietra Pereira,Astrônomo,B,VI
2,2,Theo Nunes,1959-01-01,,,,,,Lopes,,,Atendente,Eduardo Lima,Brenda Mendes,Marcos Vinicius Aragão,Armador,I,
3,3,Isis da Rosa,1983-01-01,,,,,,,,,Encarregado de manutenção predial,,Rebeca da Rocha,Raquel Castro,Assistente administrativo,A,
4,4,Srta. Beatriz Santos,NaT,F,MA,"Favela de Peixoto, 43, Pantanal",Barros de Moreira,59801-069,Peixoto,,,Psicólogo,,Gabrielly da Costa,,,P,ES
5,5,Sofia Pires,NaT,F,TO,"Vale de Porto, 9, Mangueiras",Jesus,35356581,Oliveira da Mata,+55 51 5247 5165,T,Chapeiro,Pietro Mendes,Gabriela Mendes,Carlos Eduardo das Neves,Revisor,A,
6,6,Francisco da Mota,2011-01-01,,,,,,Barbosa,+55 (011) 6998 0347,T,Agrimensor,,Alexia Rocha,,,P,VI
7,7,Renan Viana,NaT,M,SE,"Núcleo de Gonçalves, 85, Luxemburgo",Gomes,97431633,Santos,61 2746 3446,R,Agente comunitário de saúde,Diogo Moreira,Srta. Alícia Aragão,Alexandre Gomes,Gourmet,,


In [5]:
# Trantando a serie "gender"
# testei uma ideia originalmente, mas não foi correta
# df_renamed['gender'] = df_renamed['gender'].astype(str).str.lower()
# pelo que achei melhor utilizar list comprehension, desta forma se tiver algum outro caracter não correto ele receberá o valor NaN
# print(df_renamed['gender'].isna().sum())
df_renamed['gender'] = ['m' if x == 'M' else 'f' if x == 'F' else np.nan for x in df_renamed['gender']]
# print(df_renamed['gender'].isna().sum())
# Verifiquei que a quantidade de NaN ficou imutável após a mudança

In [6]:
# Trantando a serie "birth_state"
teste = df_renamed['birth_state'].str.contains(r'\b[A-Z]{2}')
print(teste.value_counts())
# Verifiquei se alguma informação não se adequava ao padrão estado (duas letras maiusculas), como só tive o valor True, o resto é NaN, logo está OK

True    632
Name: birth_state, dtype: int64


In [7]:
# Trantando a serie "birth_state"
# Primeiro verifiquei a quantidade de NaN para garantir que ele continuasse o mesmo
# Em seguida verifiquei qual 
# print(df_renamed['zip_code'].isna().sum())
rows_with_dashes = df_renamed['zip_code'].str.contains('-')
df_renamed['zip_code'] = [df_renamed['zip_code'][i] if x == True else df_renamed['zip_code'][i][:5]+'-'+df_renamed['zip_code'][i][5:] if x == False else np.nan for i, x in enumerate(rows_with_dashes)]
# print(df_renamed['zip_code'].isna().sum())

In [8]:
#Fazendo analise dos valores na serie TipoTelefone
print(df_renamed['TipoTelefone'].value_counts())

C    225
T    195
R    195
Name: TipoTelefone, dtype: int64


In [12]:
#Criando as colunas relativas ao tipo de phone e tratando as series 'Telefone' e 'TipoTelefone'
df_renamed['mobile_phone'] = np.nan
df_renamed['home_phone'] = np.nan
df_renamed['office_phone'] = np.nan

In [13]:
#Extraindo apenas os valores numericos dos numeros de telefone
df_renamed['Telefone'] = df_renamed['Telefone'].astype('str').str.extractall('(\d+)').unstack().fillna('').sum(axis=1).astype(int).astype('str')

In [9]:
#Verificando o tipo de telefone T, trabalho
rows_t = df_renamed['TipoTelefone'].str.contains('T', na=False)
df_renamed['office_phone'] = '(' + df_renamed['Telefone'][rows_t].str[-10:-8] + ')' + df_renamed['Telefone'][rows_t].str[-8:-4] + '-' + df_renamed['Telefone'][rows_t].str[-4:]
#print(df_renamed['Telefone'][rows_t],"\n")

In [10]:
#Verificando o tipo de telefone C, celular
rows_c = df_renamed['TipoTelefone'].str.contains('C', na=False)
df_renamed['mobile_phone'] = '(' + df_renamed['Telefone'][rows_c].str[:2] + ')9' + df_renamed['Telefone'][rows_c].str[-8:-4] + '-' + df_renamed['Telefone'][rows_c].str[-4:]
#print(df_renamed['Telefone'][rows_c],"\n")

In [11]:
#Verificando o tipo de telefone R, residencial
rows_r = df_renamed['TipoTelefone'].str.contains('R', na=False)
df_renamed['home_phone'] = '(' + df_renamed['Telefone'][rows_r].str[-10:-8] + ')' + df_renamed['Telefone'][rows_r].str[-8:-4] + '-' + df_renamed['Telefone'][rows_r].str[-4:]
#print(df_renamed['Telefone'][rows_r],"\n")

In [17]:
# Tratar serie "marital_status"
print(df_renamed['marital_status'].value_counts())
#CA = casado = ma
#ES = união estável = st
#VI = Viúvo = wi
#SE = Separado = se

CA    187
ES    170
VI    145
SE    132
Name: marital_status, dtype: int64


In [20]:
rows_with_something = df_renamed['marital_status'].str.contains('CA', na=False)
df_renamed['marital_status'] = ['ma' if x == True else df_renamed['marital_status'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['marital_status'].str.contains('ES', na=False)
df_renamed['marital_status'] = ['st' if x == True else df_renamed['marital_status'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['marital_status'].str.contains('VI', na=False)
df_renamed['marital_status'] = ['wi' if x == True else df_renamed['marital_status'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['marital_status'].str.contains('SE', na=False)
df_renamed['marital_status'] = ['se' if x == True else df_renamed['marital_status'][i] for i, x in enumerate(rows_with_something)]
#print(df_renamed['marital_status'])

0      NaN
1       wi
2      NaN
3      NaN
4       st
      ... 
778     wi
779     wi
780    NaN
781     ma
782     se
Name: marital_status, Length: 783, dtype: object


In [21]:
# Tratar serie "ethnicity"
print(df_renamed['ethnicity'].value_counts())
#B = branca = wh
#A = amarela = ye
#P = parda = br
#N = negra = bl
#I = indegena = br?

B    132
A    131
P    127
N    127
I    114
Name: ethnicity, dtype: int64


In [22]:
rows_with_something = df_renamed['ethnicity'].str.contains('B', na=False)
df_renamed['ethnicity'] = ['wh' if x == True else df_renamed['ethnicity'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['ethnicity'].str.contains('A', na=False)
df_renamed['ethnicity'] = ['ye' if x == True else df_renamed['ethnicity'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['ethnicity'].str.contains('P', na=False)
df_renamed['ethnicity'] = ['br' if x == True else df_renamed['ethnicity'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['ethnicity'].str.contains('N', na=False)
df_renamed['ethnicity'] = ['bl' if x == True else df_renamed['ethnicity'][i] for i, x in enumerate(rows_with_something)]
rows_with_something = df_renamed['ethnicity'].str.contains('I', na=False)
df_renamed['ethnicity'] = ['br' if x == True else df_renamed['ethnicity'][i] for i, x in enumerate(rows_with_something)]

In [5]:
#Tratar "address"
#Criando as colunas relativas ao endereço do paciente, 
df_renamed['number'] = np.nan
df_renamed['complement'] = np.nan
df_renamed['neighborhood'] = np.nan
df_renamed['state'] = np.nan
df_renamed['country'] = "BR"

In [20]:
row_with_adress = df_renamed['address'].str.split(',')
[np.nan if x == np.nan else True for x in row_with_adress]


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [12]:
#Criando novas series com valores nulos
df_renamed['status'] = 'cp'
df_renamed['patient_home_phone'] = np.nan
df_renamed['patient_mobile_phone'] = np.nan
df_renamed['description'] = np.nan
df_renamed['all_day'] = np.nan
df_renamed['cancel_reason'] = np.nan  
df_renamed['patient_email'] = np.nan  
df_renamed['event_blocked_scheduling'] = np.nan
df_renamed['quantity'] = np.nan  

In [6]:
df_renamed.head(5)

Unnamed: 0,patient_id,name,birth_date,gender,birth_state,address,city,zip_code,birth_place,Telefone,TipoTelefone,occupation,patientrelatedness_father_names,patientrelatedness_mother_names,Conjuge,ProfissaoConjuge,ethnicity,marital_status
0,0,Pedro Henrique Porto,NaT,m,RJ,"Quadra Caroline Dias, 482, Salgado Filho",Ramos,31924-681,Campos do Amparo,(021) 9313-9128,T,Reumatologista,Diego Silveira,Emanuella Lima,Lorenzo Silveira,Juiz ou árbitro de futebol,,
1,1,João Vitor da Paz,1996-01-01,m,AP,"Setor Nunes, Santa Lúcia",Nogueira,28517-970,Araújo,(021) 5202 2566,T,Fotógrafo,André Campos,Isabel Gomes,Pietra Pereira,Astrônomo,B,VI
2,2,Theo Nunes,1959-01-01,,,,,,Lopes,,,Atendente,Eduardo Lima,Brenda Mendes,Marcos Vinicius Aragão,Armador,I,
3,3,Isis da Rosa,1983-01-01,,,,,,,,,Encarregado de manutenção predial,,Rebeca da Rocha,Raquel Castro,Assistente administrativo,A,
4,4,Srta. Beatriz Santos,NaT,f,MA,"Favela de Peixoto, 43, Pantanal",Barros de Moreira,59801-069,Peixoto,,,Psicólogo,,Gabrielly da Costa,,,P,ES


In [14]:
df_renamed.loc[:, ["patient_id","name","birth_date","gender","cpf","rg","rg_issuer","mobile_phone","home_phone","office_phone","email","email_secondary","birth_place","birth_state","zip_code","address","number","complement","neighborhood","city","state","country","picture_filename","ethnicity","marital_status","religion","occupation","education","responsible","sms","cns","died","death_info","nationality","indication","indication_observation","active","receive_email","observation","healthinsurance_pack","patientrelatedness_mother_names","patientrelatedness_father_names","tag_names","tag_physician_id"]]

Unnamed: 0,patient_id,patient_name,physician_id,date,status,patient_home_phone,patient_mobile_phone,arrival_time,start_time,end_time,description,all_day,cancel_reason,patient_email,event_blocked_scheduling,healthinsurance_name,eventprocedure_pack
0,1,José,91,2020-09-10,cp,,,14:13:35,19:00:49,22:15:06,,,,,,9,"json::[{""name"":""1ª vez"",""value"":107.0}]"
1,2,Teste 1,100,1997-06-01,cp,,,NaT,NaT,NaT,,,,,,13,"json::[{""name"":""Consulta"",""value"":123.0}]"
2,2,Teste 1,564,1999-09-10,cp,,,18:31:24,22:31:27,23:11:29,,,,,,2,"json::[{""name"":""Retorno"",""value"":212.0}]"
3,2,Teste 1,585,1995-06-13,cp,,,21:55:27,22:03:14,23:02:27,,,,,,0,"json::[{""name"":""1ª vez"",""value"":19.0}]"
4,2,Teste 1,588,2015-11-20,cp,,,11:13:07,12:16:58,17:14:26,,,,,,12,"json::[{""name"":null,""value"":4.0}]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2,Teste 1,121,1973-10-31,cp,,,11:08:51,19:09:59,21:19:03,,,,,,0,"json::[{""name"":""1ª vez"",""value"":0.0}]"
1226,1,José,684,1988-12-15,cp,,,12:33:14,23:52:10,23:59:37,,,,,,4,"json::[{""name"":""Retorno"",""value"":128.0}]"
1227,1,José,58,1971-04-09,cp,,,16:51:01,22:33:45,23:43:33,,,,,,0,"json::[{""name"":""Retorno"",""value"":182.0}]"
1228,2,Teste 1,570,1981-07-23,cp,,,20:50:39,21:59:01,23:47:54,,,,,,5,"json::[{""name"":null,""value"":17.0}]"


In [16]:
df_renamed.to_csv('patient.csv',index=False, encoding='utf-8')
