In [2]:
import numpy as np
import pandas as pd

import decouple
import sqlalchemy

In [3]:
#Using enviroment variables for security
user = decouple.config("db_user_mysql")
host = decouple.config('db_host_mysql')
database = decouple.config("db_database_mysql")
connection = sqlalchemy.create_engine("mysql+mysqldb://"+user+"@"+host+"/"+database)

In [4]:
df = pd.read_sql_table('text01', con = connection)
df.head(3).T

Unnamed: 0,0,1,2
counter,1,2,3
paciente,356.0,317.0,188.0
nometexto,Pedido exame,Pedido exame,Atestado
texto,<h2>Requisição de exames</h2>\n<br/>\n<p>&nbsp...,<h2>Requisição de exames</h2>\n<br/>\n<p>&nbsp...,<h2>Atestado por Acidente de Trabalho</h2>\n<b...
datatexto,2016-07-03 09:41:03,2016-07-06 18:53:30,2016-07-09 16:31:48
timestamp,2021-07-01 18:06:19,2021-07-01 18:06:19,2021-07-01 18:06:19
cloud,1,0,0
update001,2021-05-18 12:51:57,2021-02-14 22:10:44,2021-05-24 20:12:22
ch3,1,1,1
sysuser,5,5,6


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   nomepaciente                 428 non-null    object        
 1   paciente                     428 non-null    int64         
 2   datacadastramentopaciente    428 non-null    datetime64[ns]
 3   datanascimentopaciente       428 non-null    datetime64[ns]
 4   conveniopaciente             422 non-null    float64       
 5   matriculapaciente            428 non-null    object        
 6   sexopaciente                 428 non-null    object        
 7   estadocivilpaciente          428 non-null    object        
 8   corpaciente                  428 non-null    object        
 9   documentoidentidadepaciente  428 non-null    object        
 10  cpfpaciente                  428 non-null    object        
 11  graudeinstrucaopaciente      428 non-null    

In [5]:
# Colocando o nome das colunas do arquivo de acordo a documentação da iClinic
df = df.rename(
    columns = {
        "paciente":"patient_id",
        "nomepaciente": "name",
        "nomeoriginal": "civil_name",
        "datanascimentopaciente":"birth_date", #tratar depois
        "sexopaciente": "gender", #tratar depois
        "cpfpaciente": "cpf", 
        "documentoidentidadepaciente": "rg",
        "telefonespaciente": "mobile_phone", #apenas para inicio do tratamento
        # "email":"email", # como é o mesmo não será modificado
        "naturalidadepaciente": "birth_place",
        "ceppaciente": "zip_code", #tratar depois
        "logradouropaciente": "address", #tratar depois
        "complementopaciente":"complement",
        "bairropaciente": "neighborhood",
        "cidadepaciente": "city",
        "ufpaciente": "state",
        "corpaciente": "ethnicity", #tratar depois
        "estadocivilpaciente": "marital_status", #tratar depois
        "profissaopaciente": "occupation",
        "graudeinstrucaopaciente": "education", #tratar depois
        "observacoespaciente": "observation",
        "indicadoporpaciente": "indication", #tratar depois
        "inativo": "active", #tratar depois
        "pai": "patientrelatedness_father_names",
        "mae": "patientrelatedness_mother_names",
        "foto": "picture_filename", #tratar depois
        "conveniopaciente": "healthinsurance_pack", #tratar depois
    }
)

In [6]:
df['social_gender'] = np.nan
df['rg_issuer'] = np.nan
df['home_phone'] = np.nan
df['office_phone'] = np.nan
df['birth_state'] = np.nan 
df['country'] = "BR"
df['religion'] = np.nan
df['responsible'] = np.nan
df['email_secondary'] = np.nan
df['cns'] = np.nan
df['died'] = np.nan
df['death_info'] = np.nan
df['nationality'] = "BR" 
df['indication_observation'] = np.nan 
df['receive_email'] = np.nan
df['tag_names'] = np.nan # Não faço ideia de como tratar isso
df['tag_physician_id'] = np.nan

In [7]:
# Tratamento 'birth_date'
# Não tem valores nulos
df['birth_date'] = pd.to_datetime(df['birth_date'],format='%Y%m%d').dt.date

In [8]:
# Tratamento 'gender'
# Não tem valores nulos
df['gender'] = ['m' if x == 'M' else 'f' if x == 'F' else np.nan for x in df['gender']]
# print(df['gender'].value_counts())

In [9]:
# Tratamento 'mobile_phone'
# Não tem valores nulos
rows_t = df['mobile_phone'].astype('str').str.extractall('(\d+)').unstack().fillna('').sum(axis=1).astype(int).astype('str')
# print(rows_t)
# print(rows_t.str.len().value_counts())
# como só tenho numeros de telefone a partir de 10 digitos, então o tipo de telefone mais adequado é o 'home_phone' por acreditar que é o padrão dos pacientes
df['home_phone'] = '(' + rows_t.str[-10:-8] + ')' + rows_t.str[-8:-4] + '-' + rows_t.str[-4:]
df['mobile_phone'] = np.nan

In [10]:
# Tratamento 'zip_code'
# Não tem valores nulos
rows_with_dashes = df['zip_code'].str.contains('-')
# print(rows_with_dashes.value_counts())
# tem valores contendo '-'
df['zip_code'] = [df['zip_code'][i] if x == True else df['zip_code'][i][:5]+'-'+df['zip_code'][i][5:] if x == False else np.nan for i, x in enumerate(rows_with_dashes)]

In [11]:
# Tratamento 'address'
# Não tem valores nulos
# Separando as informações e distribuindo de acordo ao seu tipo
# print(df['address'])
row_with_adress = df['address'].str.split(',')
# print(row_with_adress.str.len().value_counts())
# existem linhas contrando 1 e dois valores apenas
df['number'] = [np.nan if len(x) == 1 else x[1].strip() for x in row_with_adress]
df['address'] = [x[0].strip() for x in row_with_adress]

In [12]:
# Tratamento 'ethnicity'
# Não tem valores nulos
# print(df['ethnicity'].value_counts())
df['ethnicity'] = df['ethnicity'].replace({'Branco': 'wh', 'Amarelo': 'ye', 'Pardo': 'br','Preto': 'bl'})

In [13]:
# Tratamento 'marital_status'
# Não tem valores nulos
# print(df['marital_status'].value_counts())
df['marital_status'] = df['marital_status'].replace({'S': 'si', 'C': 'ma', 'O': 'st', 'V': 'wi','D': 'se'})
# acima foi considerada a sigla "O" como união estável

In [14]:
# Tratamento 'education'
# Não tem valores nulos
# print(df['education'].value_counts())
df['education'] = df['education'].replace({'Ensino Médio': 's', 'Superior': 'h', 'Pós-graduação': 'p', 'Especialização': 'p'})
# acima foi considerada a sigla "Pós-graduação" e "Especialização" como 'p', já que não tem nada para específicar se a pós gradução é lato sensu ou se não foi informado ter sido mestrado

In [15]:
# Tratamento 'indication'
# Todos os valores são nulos
df['indication']= 'ot'

In [16]:
# Tratamento 'active'
# Não tem valores nulos
# print(df['active'].value_counts())
df['active'] = df['active'].replace({0: 1, 1: 0})
# df['active'] = df['active'].astype(np.dtype(bool))

In [17]:
# Tratamento 'picture_filename'
df['picture_filename'] = df['picture_filename'].replace({'NONE': np.nan})
df['picture_filename'] = df['picture_filename'].fillna(np.nan)
# print(df['picture_filename'])

In [18]:
# abaixo estamos removendo os valores que não representam um endereço na pasta 'extra/'
rows_begining_with_extra = df['picture_filename'].str.contains(r'^extra')
df['picture_filename'] = [df['picture_filename'][i][6:] if x == True else 'None' for i, x in enumerate(rows_begining_with_extra)]

In [19]:
for source in df['picture_filename'].values:

    source_path = "desafio-base2/extra/"+source
    # Destination path
    destination_path = "desafio-base2-output/picture/"+source

    # Create the directory 'pictures/' if it doesn't exist

    try:
        os.path.exists('desafio-base2-output/picture/')
        os.makedirs('desafio-base2-output/picture/')
        print("Directory created.")

    # If there is any permission issue
    except:
        print("Directory already exist.")

    # Copy the content of
    # source to destination

    try:
        shutil.copy(source_path, destination_path)
        
    # If there is any permission issue
    except shutil.SameFileError:
        print("Source and destination represents the same file.")

    # If there is any permission issue
    except PermissionError:
        print("Permission denied.")

    # For other errors
    except:
        print("Error occurred while copying file.")

print("Pictures copied successfully.")
    

Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occurred while copying file.
Directory already exist.
Error occu

In [20]:
df['picture_filename'] = ['picture/'+df['picture_filename'][i] if x == True else 'None' for i, x in enumerate(rows_begining_with_extra)]
# os arquivos de foto do paciente devem ser armazenados junto com os arquivos CSV, na subpasta “picture”.

In [21]:
# Tratamento 'healthinsurance_pack'
df['healthinsurance_pack'] = df['healthinsurance_pack'].replace({np.nan: 0})
df['healthinsurance_pack'] = df['healthinsurance_pack'].astype(int)
df['healthinsurance_pack'] = [x if x <= 35 else 0 for x in df['healthinsurance_pack'] ]
# print(df['healthinsurance_pack'].values)

In [22]:
df_planos = pd.read_csv('desafio-base2-output/planos.csv',index_col = "plan")
df_planos.head(7).T

plan,1,2,3,4,5,6,7
code,9118087,518103,1377066,3346654,429694,1561209,2031645
name,Unimed,Bradesco,Amil,NotreDame,Porto Seguro,SulAmérica,Allianz
account,9823,6813,5061,4479,4058,9954,2695
billsenddayinternal,1,20,21,25,1,13,12
returndaysexternal,20,21,25,29,29,21,22


In [23]:
# Neste caso, como o 'healthinsurance_pack' inicia com 1 e está ordenada é fácil de encontrar seu nome correspondente
index = df['healthinsurance_pack'].values
# print(index)
df['healthinsurance_pack_name'] = [ df_planos['name'][i] if i != 0 else np.nan for i in index ]
df['healthinsurance_pack_code'] = [ df_planos['code'][i].astype(int).astype(str) if i != 0 else np.nan for i in index ]

# print(df['healthinsurance_pack_name'])

In [24]:
df['healthinsurance_pack'] = 'json::[\\n\\t{\\n\\t\\t"name":"'+df['healthinsurance_pack_name']+'",\\n\\t\\t"code":"'+df['healthinsurance_pack_code']+'"\\n\\t}\\n]'

In [25]:
df = df.loc[:, ["patient_id","patient_name","physician_id","date","status","patient_home_phone","patient_mobile_phone","arrival_time","start_time","end_time","description","all_day","cancel_reason","patient_email","event_blocked_scheduling","healthinsurance_name","eventprocedure_pack"]]

In [26]:
df.to_csv('desafio-base2-output/event_scheduling.csv',index=False, encoding='utf-8')