In [1]:
import pandas as pd
import numpy as np
import libs.DataCleaning as dc

In [19]:
df = pd.read_csv('data/show_no_show.csv')

# Cleaning

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,,0,0,No
1,1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,,0,0,No
2,2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,,0,0,No
3,3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,,0,0,No
4,4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,,0,0,No


## String Dates to Date

In [21]:
dc.transform_dates_to_date_dtype(df,['ScheduledDay','AppointmentDay'])

In [22]:
df.dtypes

Unnamed: 0                      int64
PatientId                     float64
AppointmentID                   int64
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Neighbourhood                  object
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                     object
Handcap                         int64
SMS_received                    int64
No-show                        object
dtype: object

## Eliminate Unnecesary Columns

In [23]:
df.drop(columns=["Unnamed: 0","PatientId","Neighbourhood"], inplace= True)

## Set Index 

In [35]:
df.set_index('AppointmentID', inplace=True)

## Rename Columns

In [24]:
df.rename(columns={"Handcap": "Handicap"},inplace=True)

## NO SHOW TO NUMERIC

In [25]:
df['No-show'] = df['No-show'].map({'No':'0','Yes':'1'})
df['No-show'] = pd.to_numeric(df['No-show'],downcast = 'integer')

In [26]:
df.dtypes

AppointmentID                   int64
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                     object
Handicap                        int64
SMS_received                    int64
No-show                          int8
dtype: object

## Remove null gender values (dropna)

In [46]:
df.dropna(inplace=True)

## Drop Age values below 0 and over 99

In [47]:
df["Age"]=df.Age[(df.Age >= 0) & (df.Age < 100)]
df.Age= df.Age.astype(int)

## Convert Gender Columns to Integer

In [28]:
df = pd.get_dummies(df, columns = ["Gender"],drop_first=True)

In [48]:
df.head()

Unnamed: 0_level_0,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,Gender_M
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5642903,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,0,1,0,0,0,0,0,0
5642503,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,0,0,0,0,0,0,0,1
5642549,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,0,0,0,0,0,0,0,0
5642828,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,0,0,0,0,0,0,0,0
5642494,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,0,1,1,0,0,0,0,0


## Alcoholism to Label

In [39]:
df.Alcoholism.value_counts()

0    106829
1      1125
2      1116
3      1112
Name: Alcoholism, dtype: int64

In [40]:
from sklearn.preprocessing import OrdinalEncoder

In [41]:
encoder = OrdinalEncoder(categories=[["None", "Low", "Moderate","High"]],dtype=np.int8)

In [42]:
df['Alcoholism'] = encoder.fit_transform(df['Alcoholism'].values.reshape(-1, 1))

ValueError: invalid literal for int() with base 10: 'None'

In [43]:
df.head()

Unnamed: 0_level_0,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,Gender_M
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5642903,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62.0,0,1,0,0,0,0,0,0
5642503,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56.0,0,0,0,0,0,0,0,1
5642549,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62.0,0,0,0,0,0,0,0,0
5642828,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8.0,0,0,0,0,0,0,0,0
5642494,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56.0,0,1,1,0,0,0,0,0
