# Medical No Show Data
## Description
This notebook takes the raw medical no-show data, cleans it, and creates clean files for analysis.

### Import Dependencies

In [65]:
import pandas as pd
import numpy as np
import os
from pprint import pprint
import matplotlib.pyplot as plt
import datetime
import calendar

### Import the raw file, assess the initial data, and clean it

In [66]:
# Read the raw appointment records from the CSV file into a dataframe.
noshow_df = pd.read_csv('../data/rawData/Brasil_medical_noshow_V2-May-2016.csv')

# Record the number of rows in the raw data (110,527 rows total).
raw_count = noshow_df.shape[0]

# Show the first few raw rows.
noshow_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [67]:
# Map misspelled or ambiguous raw column headers to clearer names.
column_renames = {
    'PatientId': 'PatientID',
    'Neighbourhood': 'Neighborhood',
    'Scholarship': 'Welfare_Assistance',
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
}

# Apply the renames directly on the existing dataframe.
noshow_df.rename(columns=column_renames, inplace=True)

noshow_df.head()

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Welfare_Assistance,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [68]:
# The preview shows AppointmentDay values with a 00:00:00 time component, i.e.
# appointments appear to be booked without time information. Only the calendar
# date is needed, so the timestamp portion of both columns is stripped.
#
# The cast to str must be assigned back: a bare `.astype(str)` returns a new
# Series and leaves the dataframe untouched. Here the cast and the slice to the
# leading 'YYYY-MM-DD' characters are chained and stored in one step.
for date_column in ('AppointmentDay', 'ScheduledDay'):
    noshow_df[date_column] = noshow_df[date_column].astype(str).str[:10]

noshow_df.head()

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Welfare_Assistance,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [69]:
# Create Appointment Day of Week column
def findWeekday(date):
    """Return the weekday name (e.g. 'Friday') for a 'YYYY-MM-DD' date string.

    The string is parsed with the '%Y-%m-%d' format; the resulting weekday
    index (Monday == 0) is looked up in ``calendar.day_name``.
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    day_index = parsed.weekday()
    return calendar.day_name[day_index]

# Derive the weekday name of each appointment from its 'YYYY-MM-DD' string.
noshow_df['AppointmentDayofWeek'] = noshow_df['AppointmentDay'].apply(findWeekday)

# Preview a bounded number of rows; evaluating the bare dataframe would try to
# render the entire 110k-row frame in the cell output.
noshow_df.head(10)

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Welfare_Assistance,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,AppointmentDayofWeek
0,2.987250e+13,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday
1,5.589978e+14,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday
2,4.262962e+12,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday
3,8.679512e+11,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday
4,8.841186e+12,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday
5,9.598513e+13,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,0,1,0,0,0,0,No,Friday
6,7.336882e+14,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,0,0,0,0,0,0,Yes,Friday
7,3.449833e+12,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,0,0,0,0,0,0,Yes,Friday
8,5.639473e+13,5638447,F,2016-04-29,2016-04-29,21,ANDORINHAS,0,0,0,0,0,0,No,Friday
9,7.812456e+13,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,0,0,0,0,0,0,No,Friday


In [70]:
# Check the unique values of selected attributes to get a better understanding
# of the data and to spot typos or invalid entries.
columns_to_inspect = [
    'Age',
    'Gender',
    'Welfare_Assistance',
    'Hypertension',
    'Diabetes',
    'Alcoholism',
    'Handicap',
    'SMS_received',
]

# Print the unique values of each column, one line per column. A single loop
# replaces the repeated print statements and emits exactly the same text.
for column_name in columns_to_inspect:
    print("Unique Values in {} => {}".format(column_name, noshow_df[column_name].unique()))

Unique Values in Age => [ 62  56   8  76  23  39  21  19  30  29  22  28  54  15  50  40  46   4
  13  65  45  51  32  12  61  38  79  18  63  64  85  59  55  71  49  78
  31  58  27   6   2  11   7   0   3   1  69  68  60  67  36  10  35  20
  26  34  33  16  42   5  47  17  41  44  37  24  66  77  81  70  53  75
  73  52  74  43  89  57  14   9  48  83  72  25  80  87  88  84  82  90
  94  86  91  98  92  96  93  95  97 102 115 100  99  -1]
Unique Values in Gender => ['F' 'M']
Unique Values in Welfare_Assistance => [0 1]
Unique Values in Hypertension => [1 0]
Unique Values in Diabetes => [0 1]
Unique Values in Alcoholism => [0 1]
Unique Values in Handicap => [0 1 2 3 4]
Unique Values in SMS_received => [0 1]


In [71]:
# Check each column's datatype and non-null count to verify whether any values
# are missing (info() prints a per-column summary).

noshow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 15 columns):
PatientID               110527 non-null float64
AppointmentID           110527 non-null int64
Gender                  110527 non-null object
ScheduledDay            110527 non-null object
AppointmentDay          110527 non-null object
Age                     110527 non-null int64
Neighborhood            110527 non-null object
Welfare_Assistance      110527 non-null int64
Hypertension            110527 non-null int64
Diabetes                110527 non-null int64
Alcoholism              110527 non-null int64
Handicap                110527 non-null int64
SMS_received            110527 non-null int64
No-show                 110527 non-null object
AppointmentDayofWeek    110527 non-null object
dtypes: float64(1), int64(8), object(6)
memory usage: 12.6+ MB


In [72]:
# Clean up datatypes

# Convert PatientID from float to integer.
# NOTE(review): astype('int64') silently truncates any fractional part —
# confirm that no raw PatientID value is non-integral.
noshow_df['PatientID'] = noshow_df['PatientID'].astype('int64')

# Convert ScheduledDay and AppointmentDay from 'YYYY-MM-DD' strings to
# 'datetime64[ns]'. pd.to_datetime already yields datetime64[ns], so no detour
# through Python date objects (.dt.date) and back is needed. The explicit
# format documents the expected input and fails loudly on anything else.
for date_column in ('ScheduledDay', 'AppointmentDay'):
    noshow_df[date_column] = pd.to_datetime(noshow_df[date_column], format='%Y-%m-%d')

# Convert the integer-coded flag columns to 'object' dtype. One loop replaces
# six copy-pasted casts.
flag_columns = (
    'Welfare_Assistance',
    'Hypertension',
    'Diabetes',
    'Alcoholism',
    'Handicap',
    'SMS_received',
)
for flag_column in flag_columns:
    noshow_df[flag_column] = noshow_df[flag_column].astype('object')

noshow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 15 columns):
PatientID               110527 non-null int64
AppointmentID           110527 non-null int64
Gender                  110527 non-null object
ScheduledDay            110527 non-null datetime64[ns]
AppointmentDay          110527 non-null datetime64[ns]
Age                     110527 non-null int64
Neighborhood            110527 non-null object
Welfare_Assistance      110527 non-null object
Hypertension            110527 non-null object
Diabetes                110527 non-null object
Alcoholism              110527 non-null object
Handicap                110527 non-null object
SMS_received            110527 non-null object
No-show                 110527 non-null object
AppointmentDayofWeek    110527 non-null object
dtypes: datetime64[ns](2), int64(3), object(10)
memory usage: 12.6+ MB


In [74]:
# Remove rows with a negative Age (the unique-value check shows an Age of -1).
# .copy() makes the filtered result an independent dataframe, so columns added
# to it afterwards do not trigger pandas' SettingWithCopyWarning.
noshow_df = noshow_df.loc[noshow_df['Age'] >= 0].copy()

print("Unique Values in Age => {}".format(noshow_df.Age.unique()))

Unique Values in Age => [ 62  56   8  76  23  39  21  19  30  29  22  28  54  15  50  40  46   4
  13  65  45  51  32  12  61  38  79  18  63  64  85  59  55  71  49  78
  31  58  27   6   2  11   7   0   3   1  69  68  60  67  36  10  35  20
  26  34  33  16  42   5  47  17  41  44  37  24  66  77  81  70  53  75
  73  52  74  43  89  57  14   9  48  83  72  25  80  87  88  84  82  90
  94  86  91  98  92  96  93  95  97 102 115 100  99]


In [75]:
# Days booked in advance = appointment date minus the date the booking was made.
# Both columns are datetime64[ns], so the result is a timedelta (e.g. "2 days").
booking_lead_time = noshow_df['AppointmentDay'].sub(noshow_df['ScheduledDay'])
noshow_df['AdvanceBookingDays'] = booking_lead_time

noshow_df.head(10)

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Welfare_Assistance,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,AppointmentDayofWeek,AdvanceBookingDays
0,29872499824296,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday,0 days
1,558997776694438,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday,0 days
2,4262962299951,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday,0 days
3,867951213174,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday,0 days
4,8841186448183,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday,0 days
5,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,0,1,0,0,0,0,No,Friday,2 days
6,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,0,0,0,0,0,0,Yes,Friday,2 days
7,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,0,0,0,0,0,0,Yes,Friday,2 days
8,56394729949972,5638447,F,2016-04-29,2016-04-29,21,ANDORINHAS,0,0,0,0,0,0,No,Friday,0 days
9,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,0,0,0,0,0,0,No,Friday,2 days


### What about repeat patients? 
Time to dig deeper. Which patients have visited more than once? How frequently do most patients visit? 

In [76]:
# Tally the number of appointment rows per PatientID (value_counts lists the
# most frequent patients first).
visits_per_patient = noshow_df['PatientID'].value_counts()
visits_per_patient

822145925426128    88
99637671331        84
26886125921145     70
33534783483176     65
258424392677       62
75797461494159     62
871374938638855    62
6264198675331      62
66844879846766     57
872278549442       55
89239687393655     54
8435223536         51
853439686798559    50
14479974122994     46
65433599726992     46
81894521843749     42
9452745294842      42
188232341789524    40
2271579924275      38
9496196639835      38
13364929297498     37
1484143378533      35
88834999836575     34
986162815579582    34
712458866975343    33
416755661551767    30
6128878448536      30
81213966782532     29
8634164126317      24
1198157171975      23
                   ..
98683352133221      1
5394313945329       1
48689197872217      1
9675119787546       1
763619586595        1
867726299814126     1
77425139319169      1
39423418767468      1
961392519656997     1
737858311826761     1
271517596623238     1
8249496395977       1
137479426839        1
6529316371746       1
3212962263