In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the dataset
# files referenced below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Base folder (on the mounted Drive) that holds the input CSV files.
# Keep the trailing slash: file names are appended by plain string concatenation.
main_path = '/content/drive/MyDrive/MyMasterThesis2025-26/Data/pre-datasets/'

In [14]:
# Full path of the dataset #4 CSV inside the base folder.
path_df_4 = f"{main_path}df_4_processed.csv"

In [15]:
# Load dataset #4 into memory.
df_4 = pd.read_csv(filepath_or_buffer=path_df_4)

In [16]:
# Preview the first four rows of the freshly loaded frame.
df_4.head(n=4)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Showed_up,Date.diff
0,29872500000000.0,5642903,0,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,0
1,558997800000000.0,5642503,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,0
2,4262962000000.0,5642549,0,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,0
3,867951200000.0,5642828,0,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,0


In [17]:
# Count rows whose PatientId already appeared earlier (i.e. repeat appointments).
df_4['PatientId'].duplicated(keep='first').sum()


np.int64(46717)

In [18]:
# The preview above shows PatientId as floats (e.g. 29872500000000.0).
# Casting float -> int64 silently truncates any fractional part, which could
# merge distinct patients under one ID, so verify the cast is lossless before
# turning the IDs into strings. The check also passes when this cell is re-run
# on the already-converted (digit-string) column.
patient_id_int = df_4['PatientId'].astype('int64')
assert (patient_id_int == df_4['PatientId'].astype('float64')).all(), (
    "PatientId contains non-integer values; casting to int64 would truncate them"
)
df_4['PatientId'] = patient_id_int.astype(str)

In [19]:
# Inspect column dtypes and non-null counts after the PatientId conversion.
df_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106987 entries, 0 to 106986
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   PatientId       106987 non-null  object
 1   AppointmentID   106987 non-null  int64 
 2   Gender          106987 non-null  int64 
 3   ScheduledDay    106987 non-null  object
 4   AppointmentDay  106987 non-null  object
 5   Age             106987 non-null  int64 
 6   Neighbourhood   106987 non-null  object
 7   Scholarship     106987 non-null  int64 
 8   Hipertension    106987 non-null  int64 
 9   Diabetes        106987 non-null  int64 
 10  Alcoholism      106987 non-null  int64 
 11  Handcap         106987 non-null  int64 
 12  SMS_received    106987 non-null  int64 
 13  Showed_up       106987 non-null  int64 
 14  Date.diff       106987 non-null  int64 
dtypes: int64(11), object(4)
memory usage: 12.2+ MB


## Extract patients into a separate table

In [20]:
# Build the patient table: keep the first row seen for each PatientId, then
# retain only the patient-level columns.
# NOTE(review): if a patient's attributes (e.g. Age) differ between
# appointments, only the first occurrence survives — confirm this is intended.
patients_df = df_4.drop_duplicates(subset='PatientId', keep='first')[
    ['PatientId', 'Gender', 'Age', 'Scholarship',
     'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap']
]


In [21]:
def gen_patient_id_unique(value, df_name: str, max_length: int) -> str:
    """
    Build a prefixed identifier string for a single value.

    The result is ``<DF_NAME>_<value>``, where the prefix is upper-cased and
    the value is converted with ``str()`` and left-padded with zeros.

    Args:
        value: Raw identifier; anything accepted by ``str()``.
        df_name: Dataset tag used as the prefix (e.g. ``"df5"`` -> ``"DF5"``).
        max_length: Minimum width of the padded part. Despite the name it is
            not a maximum: longer values are kept as-is, never truncated.

    Returns:
        The identifier, e.g. ``DF5_000001`` for ``(1, "df5", 6)``.

    Note:
        The function is a pure formatter; the output is only unique if the
        input values are unique within the dataset.
    """
    prefix = df_name.upper()
    padded_value = str(value).zfill(max_length)
    return prefix + "_" + padded_value

In [22]:
# Derive the prefixed key for every patient. max_length=0 disables
# zero-padding, so each ID is simply "DF4_" followed by the PatientId string.
patients_df['Patient_ID'] = patients_df['PatientId'].apply(
    gen_patient_id_unique, df_name='DF4', max_length=0
)


In [23]:
# Preview the patient table with the new Patient_ID column.
patients_df.head(n=4)

Unnamed: 0,PatientId,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,Patient_ID
0,29872499824296,0,62,0,1,0,0,0,DF4_29872499824296
1,558997776694438,1,56,0,0,0,0,0,DF4_558997776694438
2,4262962299951,0,62,0,0,0,0,0,DF4_4262962299951
3,867951213174,0,8,0,0,0,0,0,DF4_867951213174


In [24]:
# Sanity check: the generated key should have no repeated values (expect 0).
patients_df['Patient_ID'].duplicated(keep='first').sum()

np.int64(0)

In [31]:
# (rows, columns) of the patient table; there is one row per distinct PatientId.
patients_df.shape


(60270, 9)

## Map PatientId to Patient_ID in df_4 (appointments)


In [26]:
# Distinct PatientId values, in order of first appearance.
unique_patients = pd.unique(df_4['PatientId'])

In [27]:
# Number of distinct PatientId values in the appointments table.
unique_patients.shape

(60270,)

In [28]:
# Lookup table: raw PatientId -> prefixed Patient_ID.
# Use the same helper (and the same arguments) that builds
# patients_df['Patient_ID'], so the keys in both tables cannot drift apart
# if the ID format is ever changed.
patient_id_map = {
    pid: gen_patient_id_unique(pid, 'DF4', 0)
    for pid in unique_patients
}


In [29]:
# Translate every appointment's PatientId into the new prefixed key.
appointment_patient_ids = df_4['PatientId'].map(patient_id_map)
df_4['Patient_ID'] = appointment_patient_ids

In [34]:
# df_4.head(4)

In [32]:
# Remove the raw key now that Patient_ID replaces it.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df_4 = df_4.drop(columns=['PatientId'], errors='ignore')

In [33]:
# Preview the appointments frame after swapping PatientId for Patient_ID.
df_4.head(n=4)

Unnamed: 0,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Showed_up,Date.diff,Patient_ID
0,5642903,0,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,0,DF4_29872499824296
1,5642503,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,0,DF4_558997776694438
2,5642549,0,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,0,DF4_4262962299951
3,5642828,0,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,0,DF4_867951213174


## Create the appointments DataFrame

In [40]:
# Independent copy of df_4 to serve as the appointments table.
appointments_df = df_4.copy(deep=True)

## Drop the old PatientId column from patients_df

In [39]:
# Remove the raw key from the patient table; Patient_ID is the key from here on.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
patients_df = patients_df.drop(columns=['PatientId'], errors='ignore')

In [41]:
# Preview the patient table without the raw PatientId column.
patients_df.head(n=4)

Unnamed: 0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,Patient_ID
0,0,62,0,1,0,0,0,DF4_29872499824296
1,1,56,0,0,0,0,0,DF4_558997776694438
2,0,62,0,0,0,0,0,DF4_4262962299951
3,0,8,0,0,0,0,0,DF4_867951213174


---

In [43]:
# Final column order for the patient table, with the Patient_ID key first.
patient_cols = ['Patient_ID', 'Age', 'Gender', 'Scholarship',
                'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap']
patients_df = patients_df.loc[:, patient_cols]


In [44]:
# Preview the patient table in its final column order.
patients_df.head(n=4)

Unnamed: 0,Patient_ID,Age,Gender,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap
0,DF4_29872499824296,62,0,0,1,0,0,0
1,DF4_558997776694438,56,1,0,0,0,0,0
2,DF4_4262962299951,62,0,0,0,0,0,0
3,DF4_867951213174,8,0,0,0,0,0,0


In [45]:
# Keep only appointment-level columns; patient attributes live in patients_df
# and are reachable through the Patient_ID key.
appointment_cols = ['AppointmentID', 'Patient_ID', 'ScheduledDay', 'AppointmentDay',
                    'Neighbourhood', 'SMS_received', 'Showed_up', 'Date.diff']
appointments_df = appointments_df.loc[:, appointment_cols]


In [46]:
# Preview the appointments table in its final column order.
appointments_df.head(n=4)

Unnamed: 0,AppointmentID,Patient_ID,ScheduledDay,AppointmentDay,Neighbourhood,SMS_received,Showed_up,Date.diff
0,5642903,DF4_29872499824296,2016-04-29,2016-04-29,JARDIM DA PENHA,0,1,0
1,5642503,DF4_558997776694438,2016-04-29,2016-04-29,JARDIM DA PENHA,0,1,0
2,5642549,DF4_4262962299951,2016-04-29,2016-04-29,MATA DA PRAIA,0,1,0
3,5642828,DF4_867951213174,2016-04-29,2016-04-29,PONTAL DE CAMBURI,0,1,0
