In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('KaggleV2-May-2016.csv')

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB
None


In [4]:
# Check for missing values
print(df.isnull().sum())


PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


In [5]:
# Check for duplicates
print(f"Duplicates before: {df.duplicated().sum()}")

# Remove duplicates
df = df.drop_duplicates()

# Confirm removal
print(f"Duplicates after: {df.duplicated().sum()}")


Duplicates before: 0
Duplicates after: 0


In [7]:
# Standardize Gender to uppercase
df['Gender'] = df['Gender'].str.upper()

# Strip whitespace and standardize No-show
df['No-show'] = df['No-show'].str.strip().str.capitalize()


In [8]:
# Convert date columns to datetime
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).dt.strftime('%d-%m-%Y')
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).dt.strftime('%d-%m-%Y')


In [9]:
# Clean column names: lowercase, replace spaces with underscores
df.columns = df.columns.str.strip().str.lower().str.replace('-', '_').str.replace(' ', '_')
print(df.columns)


Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show'],
      dtype='object')


In [10]:
# Check data types
print(df.dtypes)

# Convert age to integer if not already
df['age'] = df['age'].astype(int)

# Convert dates back to datetime if needed for analysis
df['scheduledday'] = pd.to_datetime(df['scheduledday'], format='%d-%m-%Y')
df['appointmentday'] = pd.to_datetime(df['appointmentday'], format='%d-%m-%Y')


patientid         float64
appointmentid       int64
gender             object
scheduledday       object
appointmentday     object
age                 int64
neighbourhood      object
scholarship         int64
hipertension        int64
diabetes            int64
alcoholism          int64
handcap             int64
sms_received        int64
no_show            object
dtype: object
