### Predict No Shows for Medical Appointments

#### Data Files
[1] Medical History = medical_history.csv
[2] Demographic Details = demographic_details.csv
[3] Train Dataset =train.csv
[4] Test Dataset = test_share.csv

In [1]:
# import dependencies

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [2]:
# read data and metadata

datafile_train="C:/Users/parik/OneDrive/Desktop/Python/Edvancer projects/P6_predict_no_shows_for_medical_appointments/Predict-no_shows_for_medical_appointments/data/raw_data/train.csv"

bd_train=pd.read_csv(datafile_train)

medical_hist='C:/Users/parik/OneDrive/Desktop/Python/Edvancer projects/P6_predict_no_shows_for_medical_appointments/Predict-no_shows_for_medical_appointments/data/raw_data/medical_history.csv'
demo_det='C:/Users/parik/OneDrive/Desktop/Python/Edvancer projects/P6_predict_no_shows_for_medical_appointments/Predict-no_shows_for_medical_appointments/data/raw_data/demographic_details.csv'
med=pd.read_csv(medical_hist)
dem=pd.read_csv(demo_det)

In [3]:
# combining metedata

comb=pd.merge(med,dem,how='outer',on='PatientId')
comb.head()

Unnamed: 0,PatientId,Hipertension,Diabetes,Alcoholism,Handcap,Gender,Age,Neighbourhood,Scholarship
0,29872500000000.0,1,0,0,0,F,62,JARDIM DA PENHA,0
1,558997800000000.0,0,0,0,0,M,56,JARDIM DA PENHA,0
2,4262962000000.0,0,0,0,0,F,62,MATA DA PRAIA,0
3,867951200000.0,0,0,0,0,F,8,PONTAL DE CAMBURI,0
4,8841186000000.0,1,1,0,0,F,56,JARDIM DA PENHA,0


In [4]:
# combining training data and metadata

train=pd.merge(bd_train,comb,how='left',on='PatientId')


In [5]:
# Quiz 1
a = train[train.Hipertension == 1]
a.count()

PatientId         19573
AppointmentID     19573
ScheduledDay      19573
AppointmentDay    19573
SMS_received      19573
No-show           19573
Hipertension      19573
Diabetes          19573
Alcoholism        19573
Handcap           19573
Gender            19573
Age               19573
Neighbourhood     19573
Scholarship       19573
dtype: int64

In [6]:
b = train[(train.Hipertension == 1) & (train.Diabetes == 1)]
b.count()

PatientId         5824
AppointmentID     5824
ScheduledDay      5824
AppointmentDay    5824
SMS_received      5824
No-show           5824
Hipertension      5824
Diabetes          5824
Alcoholism        5824
Handcap           5824
Gender            5824
Age               5824
Neighbourhood     5824
Scholarship       5824
dtype: int64

In [7]:
# Quiz 2
males = train[train.Gender == 'M']
females = train[train.Gender == 'F']
males["Age"].describe()
females["Age"].describe()

count    64694.000000
mean        38.793922
std         22.162029
min         -1.000000
25%         21.000000
50%         38.000000
75%         56.000000
max        115.000000
Name: Age, dtype: float64

In [8]:
# Quiz 3 & 4
# modifying dtype to datetime

# all_data.ScheduledDay.dtype
train.ScheduledDay = pd.to_datetime(train.ScheduledDay)
train.AppointmentDay = pd.to_datetime(train.AppointmentDay)


In [9]:
# extracting dayOfWeek and hour column 

train['dayOfWeek'] = train['AppointmentDay'].dt.day_name()
train.head()

Unnamed: 0,PatientId,AppointmentID,ScheduledDay,AppointmentDay,SMS_received,No-show,Hipertension,Diabetes,Alcoholism,Handcap,Gender,Age,Neighbourhood,Scholarship,dayOfWeek
0,29872500000000.0,5642903,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,0,No,1,0,0,0,F,62,JARDIM DA PENHA,0,Friday
1,558997800000000.0,5642503,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,0,No,0,0,0,0,M,56,JARDIM DA PENHA,0,Friday
2,867951200000.0,5642828,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,0,No,0,0,0,0,F,8,PONTAL DE CAMBURI,0,Friday
3,8841186000000.0,5642494,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,0,No,1,1,0,0,F,56,JARDIM DA PENHA,0,Friday
4,95985130000000.0,5626772,2016-04-27 08:36:51+00:00,2016-04-29 00:00:00+00:00,0,No,1,0,0,0,F,76,REPÚBLICA,0,Friday


In [10]:
#train.value_counts()
train['dayOfWeek'].value_counts()

Wednesday    23287
Tuesday      23048
Monday       20470
Friday       17066
Thursday     15585
Saturday        34
Name: dayOfWeek, dtype: int64

In [11]:
# Quiz 5

train['diff_bookingDays'] = (train['AppointmentDay']-train['ScheduledDay']).astype('timedelta64[D]')
train['diff_bookingDays'].head()
#train['diff_bookingDays'] = np.where(train['diff_bookingDays']< 0, 0, train['diff_bookingDays'])
train.head()
train['diff_bookingDays'].astype('int')

0        -1
1        -1
2        -1
3        -1
4         1
         ..
99485    34
99486    34
99487    34
99488    40
99489    40
Name: diff_bookingDays, Length: 99490, dtype: int32

In [12]:
train['diff_bookingDays'].describe()

count    99490.000000
mean         9.207508
std         15.260697
min         -7.000000
25%         -1.000000
50%          3.000000
75%         14.000000
max        178.000000
Name: diff_bookingDays, dtype: float64

In [13]:
# Quiz 6
a = train[train["No-show"] == "Yes"]
b = train[(train["No-show"] == "Yes") & (train["SMS_received"] == 0)]
a.count()
b.count()

PatientId           11267
AppointmentID       11267
ScheduledDay        11267
AppointmentDay      11267
SMS_received        11267
No-show             11267
Hipertension        11267
Diabetes            11267
Alcoholism          11267
Handcap             11267
Gender              11267
Age                 11267
Neighbourhood       11267
Scholarship         11267
dayOfWeek           11267
diff_bookingDays    11267
dtype: int64

In [14]:
# Quiz 7
train["Neighbourhood"].value_counts()

JARDIM CAMBURI                 6948
MARIA ORTIZ                    5203
RESISTÊNCIA                    3960
JARDIM DA PENHA                3509
ITARARÉ                        3171
                               ... 
ILHA DO BOI                      31
ILHA DO FRADE                    10
AEROPORTO                         7
ILHAS OCEÂNICAS DE TRINDADE       2
PARQUE INDUSTRIAL                 1
Name: Neighbourhood, Length: 81, dtype: int64