In [53]:
import pandas as pd
import numpy as np
import libs.DataCleaning as dc

In [54]:
# Load the appointment show/no-show records from the project-relative CSV.
# NOTE(review): pandas >= 2.0 reportedly treats the literal "None" as missing by
# default in read_csv — verify the "None" Alcoholism level survives loading there.
df = pd.read_csv(filepath_or_buffer='data/show_no_show.csv')

# Cleaning

In [55]:
# Peek at the last five rows of the freshly loaded frame.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
110522,110522,2572134000000.0,5651768,F,2016-05-03T09:15:35Z,2016-06-07T00:00:00Z,56,MARIA ORTIZ,0,0,0,,0,1,No
110523,110523,3596266000000.0,5650093,F,2016-05-03T07:27:33Z,2016-06-07T00:00:00Z,51,MARIA ORTIZ,0,0,0,,0,1,No
110524,110524,15576630000000.0,5630692,F,2016-04-27T16:03:52Z,2016-06-07T00:00:00Z,21,MARIA ORTIZ,0,0,0,,0,1,No
110525,110525,92134930000000.0,5630323,F,2016-04-27T15:09:23Z,2016-06-07T00:00:00Z,38,MARIA ORTIZ,0,0,0,,0,1,No
110526,110526,377511500000000.0,5629448,F,2016-04-27T13:30:56Z,2016-06-07T00:00:00Z,54,MARIA ORTIZ,0,0,0,,0,1,No


## String Dates to Date

In [56]:
# Convert the two timestamp string columns to a datetime dtype.
# NOTE(review): the return value is not assigned, so the helper presumably
# mutates `df` in place — confirm in libs.DataCleaning.
dc.transform_dates_to_date_dtype(
    df,
    ['ScheduledDay', 'AppointmentDay'],
)

In [57]:
# Inspect the column dtypes to confirm the two date columns are now datetimes.
df.dtypes

Unnamed: 0                      int64
PatientId                     float64
AppointmentID                   int64
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Neighbourhood                  object
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                     object
Handcap                         int64
SMS_received                    int64
No-show                        object
dtype: object

## Eliminate Unnecessary Columns

In [58]:
# Drop the leftover CSV index column and the patient identifier; neither is
# used in the remaining cleaning steps.
df = df.drop(columns=["Unnamed: 0", "PatientId"])

## Rename Columns

In [59]:
# Fix the misspelled source column name.
df = df.rename(columns={"Handcap": "Handicap"})

## No-show to Numeric

In [60]:
# Encode the target as 0 (patient showed up, 'No') / 1 (patient missed, 'Yes').
# Map straight to integers instead of to the strings '0'/'1' followed by a
# parse; `downcast='integer'` then shrinks the result to the smallest integer
# dtype (int8). Any value other than 'No'/'Yes' becomes NaN, in which case the
# column stays float.
df['No-show'] = pd.to_numeric(
    df['No-show'].map({'No': 0, 'Yes': 1}),
    downcast='integer',
)

In [61]:
# Re-check the dtypes after the drop, rename and No-show conversion steps.
df.dtypes

AppointmentID                   int64
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Neighbourhood                  object
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                     object
Handicap                        int64
SMS_received                    int64
No-show                          int8
dtype: object

## Remove null gender values (dropna)

In [62]:
# Discard incomplete rows.
# NOTE(review): no `subset` is given, so a row with a null in ANY column is
# dropped, not only rows with a missing Gender — confirm that is intended.
df = df.dropna()

## Convert Gender Column to Integer

In [63]:
# One-hot encode Gender; `drop_first=True` drops the first category so only one
# indicator column remains (Gender_M in the output below).
# The dtype is pinned to uint8 so the indicator is an integer 0/1 column on
# every pandas version (pandas >= 2.0 would otherwise produce booleans).
df = pd.get_dummies(
    df,
    columns=["Gender"],
    drop_first=True,
    dtype="uint8",
)

In [64]:
# Display the frame after gender encoding.
# NOTE(review): consider df.head() to keep the rendered output small.
df

Unnamed: 0,AppointmentID,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,Gender_M
0,5642903,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,,0,0,0,0
1,5642503,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,,0,0,0,1
2,5642549,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,,0,0,0,0
3,5642828,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,,0,0,0,0
4,5642494,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,5651768,2016-05-03 09:15:35+00:00,2016-06-07 00:00:00+00:00,56,MARIA ORTIZ,0,0,0,,0,1,0,0
110523,5650093,2016-05-03 07:27:33+00:00,2016-06-07 00:00:00+00:00,51,MARIA ORTIZ,0,0,0,,0,1,0,0
110524,5630692,2016-04-27 16:03:52+00:00,2016-06-07 00:00:00+00:00,21,MARIA ORTIZ,0,0,0,,0,1,0,0
110525,5630323,2016-04-27 15:09:23+00:00,2016-06-07 00:00:00+00:00,38,MARIA ORTIZ,0,0,0,,0,1,0,0


## Alcoholism to Label

In [40]:
# Frequency of each Alcoholism level (missing values are excluded by default).
df['Alcoholism'].value_counts()

None        85273
Moderate      903
Low           897
High          878
Name: Alcoholism, dtype: int64

In [41]:
from sklearn.preprocessing import OrdinalEncoder

In [65]:
# Ordinal encoder for the Alcoholism levels. The explicit category order fixes
# the codes: "None" -> 0, "Low" -> 1, "Moderate" -> 2, "High" -> 3 (stored as int8).
encoder = OrdinalEncoder(
    categories=[["None", "Low", "Moderate", "High"]],
    dtype=np.int8,
)

In [66]:
# The encoder expects a 2-D (n_samples, n_features) array, so the column is
# reshaped into a single-feature matrix before being encoded and written back.
alcoholism_2d = df['Alcoholism'].values.reshape(-1, 1)
df['Alcoholism'] = encoder.fit_transform(alcoholism_2d)

In [71]:
# Display the frame after the Alcoholism column has been ordinal-encoded.
# NOTE(review): consider df.head() to keep the rendered output small.
df

Unnamed: 0,AppointmentID,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,Gender_M
0,5642903,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,0,0
1,5642503,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,0,1
2,5642549,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,0,0
3,5642828,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,0
4,5642494,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,5651768,2016-05-03 09:15:35+00:00,2016-06-07 00:00:00+00:00,56,MARIA ORTIZ,0,0,0,0,0,1,0,0
110523,5650093,2016-05-03 07:27:33+00:00,2016-06-07 00:00:00+00:00,51,MARIA ORTIZ,0,0,0,0,0,1,0,0
110524,5630692,2016-04-27 16:03:52+00:00,2016-06-07 00:00:00+00:00,21,MARIA ORTIZ,0,0,0,0,0,1,0,0
110525,5630323,2016-04-27 15:09:23+00:00,2016-06-07 00:00:00+00:00,38,MARIA ORTIZ,0,0,0,0,0,1,0,0
