In [2]:
import pandas as pd

# Load the dataset

In [3]:
df = pd.read_csv("medical-May-2016.csv")

In [4]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


# Identify missing values

In [6]:
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


# Remove duplicate rows

In [7]:
df = df.drop_duplicates()
print(f"\nAfter removing duplicates: {df.shape}")


After removing duplicates: (110527, 14)


# Standardize text values

In [9]:
if 'Gender' in df.columns:
    df['Gender'] = df['Gender'].str.strip().str.lower().replace({
        'm': 'male', 'f': 'female'
    })

if 'Country' in df.columns:
    df['Country'] = df['Country'].str.strip().str.title()


# Convert date formats

In [10]:
for col in df.columns:
    if 'date' in col.lower():
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[col] = df[col].dt.strftime('%d-%m-%Y')


In [11]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,female,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,male,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,female,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,female,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,female,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


# Rename column headers

In [12]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Check and fix data types

In [13]:
if 'age' in df.columns:
    df['age'] = pd.to_numeric(df['age'], errors='coerce').astype('Int64')

In [14]:
for col in df.columns:
    if 'date' in col:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Final check

In [15]:
print("\nData types after cleaning:")
print(df.dtypes)

print("\nCleaned dataset preview:")
print(df.head())


Data types after cleaning:
patientid         float64
appointmentid       int64
gender             object
scheduledday       object
appointmentday     object
age                 Int64
neighbourhood      object
scholarship         int64
hipertension        int64
diabetes            int64
alcoholism          int64
handcap             int64
sms_received        int64
no-show            object
dtype: object

Cleaned dataset preview:
      patientid  appointmentid  gender          scheduledday  \
0  2.987250e+13        5642903  female  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503    male  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549  female  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828  female  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494  female  2016-04-29T16:07:23Z   

         appointmentday  age      neighbourhood  scholarship  hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   5

# Save cleaned file

In [16]:
df.to_csv("medical_cleaned.csv", index=False)
print("\n Cleaned dataset 'medical_cleaned.csv'")



 Cleaned dataset 'medical_cleaned.csv'
