# Linear SVM

https://www.kaggle.com/joniarroba/noshowappointments   

In [36]:
import matplotlib.pyplot as plt
%matplotlib inline

import os, sys
import itertools
import numpy as np
import pandas as pd

In [37]:
datasource = "datasets/appointment_noshow.csv"

In [38]:
print(os.path.exists(datasource))

True


In [39]:
dataset = pd.read_csv(datasource)
del dataset["Unnamed: 0"]

In [40]:
dataset.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,914544900000000.0,5548703,F,2016-04-06T08:52:26Z,2016-05-02T00:00:00Z,26,SANTA TEREZA,0,0,0,0,0,1,No
1,95368950000000.0,5702942,M,2016-05-16T13:17:59Z,2016-05-31T00:00:00Z,4,TABUAZEIRO,0,0,0,0,0,0,No
2,4887647000000.0,5705960,F,2016-05-17T07:40:33Z,2016-05-18T00:00:00Z,1,NOVA PALESTINA,0,0,0,0,0,0,Yes
3,6541438000000.0,5681110,M,2016-05-10T13:35:24Z,2016-05-10T00:00:00Z,0,SÃO JOSÉ,0,0,0,0,0,0,No
4,6296736000000.0,5621693,F,2016-04-26T10:40:18Z,2016-04-29T00:00:00Z,20,MARUÍPE,0,0,0,0,0,1,No


In [41]:
dataset.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


## Preprocessing

We don't need PatientId and AppointmentId so let's remove it

In [42]:
deleteCols = ["PatientId", "AppointmentID"]

dataset.drop(deleteCols, axis = 1, inplace = True)

dataset.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,F,2016-04-06T08:52:26Z,2016-05-02T00:00:00Z,26,SANTA TEREZA,0,0,0,0,0,1,No
1,M,2016-05-16T13:17:59Z,2016-05-31T00:00:00Z,4,TABUAZEIRO,0,0,0,0,0,0,No
2,F,2016-05-17T07:40:33Z,2016-05-18T00:00:00Z,1,NOVA PALESTINA,0,0,0,0,0,0,Yes
3,M,2016-05-10T13:35:24Z,2016-05-10T00:00:00Z,0,SÃO JOSÉ,0,0,0,0,0,0,No
4,F,2016-04-26T10:40:18Z,2016-04-29T00:00:00Z,20,MARUÍPE,0,0,0,0,0,1,No


Now, we need to convert Gender and No-show to binaries

In [43]:
dataset["Gender"] = dataset["Gender"].apply(["M", "F"].index)
dataset["No-show"] = dataset["No-show"].apply(["No", "Yes"].index)
#dataset["No-show"] = list(map(["No", "Yes"].index, dataset["No-show"]))

dataset.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,1,2016-04-06T08:52:26Z,2016-05-02T00:00:00Z,26,SANTA TEREZA,0,0,0,0,0,1,0
1,0,2016-05-16T13:17:59Z,2016-05-31T00:00:00Z,4,TABUAZEIRO,0,0,0,0,0,0,0
2,1,2016-05-17T07:40:33Z,2016-05-18T00:00:00Z,1,NOVA PALESTINA,0,0,0,0,0,0,1
3,0,2016-05-10T13:35:24Z,2016-05-10T00:00:00Z,0,SÃO JOSÉ,0,0,0,0,0,0,0
4,1,2016-04-26T10:40:18Z,2016-04-29T00:00:00Z,20,MARUÍPE,0,0,0,0,0,1,0


In [44]:
dataset["ScheduledDay"].dtype

dtype('O')

In [45]:
dataset["AppointmentDay"].dtype

dtype('O')

Both are in object = string format. We'd have to convert them to datetime

In [46]:
dataset["ScheduledDay"] = dataset["ScheduledDay"].apply(np.datetime64)
dataset["AppointmentDay"] = dataset["AppointmentDay"].apply(np.datetime64)

dataset.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,1,2016-04-06 08:52:26,2016-05-02,26,SANTA TEREZA,0,0,0,0,0,1,0
1,0,2016-05-16 13:17:59,2016-05-31,4,TABUAZEIRO,0,0,0,0,0,0,0
2,1,2016-05-17 07:40:33,2016-05-18,1,NOVA PALESTINA,0,0,0,0,0,0,1
3,0,2016-05-10 13:35:24,2016-05-10,0,SÃO JOSÉ,0,0,0,0,0,0,0
4,1,2016-04-26 10:40:18,2016-04-29,20,MARUÍPE,0,0,0,0,0,1,0


In [47]:
print("ScheduledDay", dataset["ScheduledDay"].dtype)
print("AppointmentDay", dataset["AppointmentDay"].dtype)

ScheduledDay datetime64[ns]
AppointmentDay datetime64[ns]


We'll add another column for the time difference between scheduledDay and AppointmentDay

In [48]:
dataset["AwaitingTime"] = (dataset["AppointmentDay"] - dataset["ScheduledDay"]).apply(lambda dt: dt.days)

In [49]:
dataset.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,AwaitingTime
0,1,2016-04-06 08:52:26,2016-05-02,26,SANTA TEREZA,0,0,0,0,0,1,0,25
1,0,2016-05-16 13:17:59,2016-05-31,4,TABUAZEIRO,0,0,0,0,0,0,0,14
2,1,2016-05-17 07:40:33,2016-05-18,1,NOVA PALESTINA,0,0,0,0,0,0,1,0
3,0,2016-05-10 13:35:24,2016-05-10,0,SÃO JOSÉ,0,0,0,0,0,0,0,-1
4,1,2016-04-26 10:40:18,2016-04-29,20,MARUÍPE,0,0,0,0,0,1,0,2


### Explore the values to see if there are any "bad" values

In [50]:
for column_name in set(dataset.columns) - {"ScheduledDay", "AppointmentDay"}:
    print(column_name, "\n ================================ \n", sorted(np.unique(dataset[column_name])))

Scholarship 
 [0, 1]
Alcoholism 
 [0, 1]
Neighbourhood 
 ['AEROPORTO', 'ANDORINHAS', 'ANTÔNIO HONÓRIO', 'ARIOVALDO FAVALESSA', 'BARRO VERMELHO', 'BELA VISTA', 'BENTO FERREIRA', 'BOA VISTA', 'BONFIM', 'CARATOÍRA', 'CENTRO', 'COMDUSA', 'CONQUISTA', 'CONSOLAÇÃO', 'CRUZAMENTO', 'DA PENHA', 'DE LOURDES', 'DO CABRAL', 'DO MOSCOSO', 'DO QUADRO', 'ENSEADA DO SUÁ', 'ESTRELINHA', 'FONTE GRANDE', 'FORTE SÃO JOÃO', 'FRADINHOS', 'GOIABEIRAS', 'GRANDE VITÓRIA', 'GURIGICA', 'HORTO', 'ILHA DAS CAIEIRAS', 'ILHA DE SANTA MARIA', 'ILHA DO BOI', 'ILHA DO FRADE', 'ILHA DO PRÍNCIPE', 'ILHAS OCEÂNICAS DE TRINDADE', 'INHANGUETÁ', 'ITARARÉ', 'JABOUR', 'JARDIM CAMBURI', 'JARDIM DA PENHA', 'JESUS DE NAZARETH', 'JOANA D´ARC', 'JUCUTUQUARA', 'MARIA ORTIZ', 'MARUÍPE', 'MATA DA PRAIA', 'MONTE BELO', 'MORADA DE CAMBURI', 'MÁRIO CYPRESTE', 'NAZARETH', 'NOVA PALESTINA', 'PARQUE INDUSTRIAL', 'PARQUE MOSCOSO', 'PIEDADE', 'PONTAL DE CAMBURI', 'PRAIA DO CANTO', 'PRAIA DO SUÁ', 'REDENÇÃO', 'REPÚBLICA', 'RESISTÊNCIA', 'ROMÃO

## Remove outliers

In [51]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

Age cannot be less than 1 so let's remove those rows first

In [52]:
dataset = dataset[dataset["Age"] >= 1]
dataset = dataset.reset_index(drop = True)

print("Number of records", len(dataset))
print("Age", np.unique(dataset["Age"]))

Number of records 106987
Age [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 102 115]


Removing outliers for Awaiting Time with EllipticEnvelope

In [53]:
dataset["AwaitingTime"].head(10)

0    25
1    14
2     0
3     2
4    30
5    12
6    -1
7    20
8    -1
9     5
Name: AwaitingTime, dtype: int64

In [54]:
awaiting_time = np.array(dataset["AwaitingTime"]).reshape((-1, 1))
awaiting_time[:10]

array([[25],
       [14],
       [ 0],
       [ 2],
       [30],
       [12],
       [-1],
       [20],
       [-1],
       [ 5]], dtype=int64)

In [55]:
awaiting_time.shape

(106987, 1)

In [56]:
envelope = EllipticEnvelope(contamination = 0.003)
envelope.fit(awaiting_time)

EllipticEnvelope(assume_centered=False, contamination=0.003,
         random_state=None, store_precision=True, support_fraction=None)

In [57]:
outliers = envelope.predict(awaiting_time) == -1

In [58]:
dataset.drop(np.flatnonzero(outliers), inplace = True)
dataset.reset_index(drop = True, inplace = True)

In [60]:
outliers[0:10]

array([False, False, False, False, False, False, False, False, False, False], dtype=bool)

In [61]:
print({"inliers": np.sum(~ outliers), "outliers": np.sum(outliers)})

{'inliers': 106670, 'outliers': 317}


In [62]:
print("Number of records", len(dataset))

Number of records 106670


In [66]:
print("AwaitingTime: \n\n", np.unique(dataset["AwaitingTime"]))

AwaitingTime: 

 [-7 -2 -1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21
 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85]


## Encoding

Dealing with datetimes suck so let's create the day and month from the AppointmentDay column. We'll remove the ScheduledDay column because it can be derived from the 2 columns we'll create

In [67]:
dataset["AppointmentDate_day"] = dataset["AppointmentDay"].apply(lambda d: d.day)
dataset["AppointmentDate_month"] = dataset["AppointmentDay"].apply(lambda d: d.month)

In [68]:
colRemove = ["AppointmentDay", "ScheduledDay"]
dataset.drop(colRemove, inplace = True, axis = 1)

In [69]:
dataset.head()

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,AwaitingTime,AppointmentDate_day,AppointmentDate_month
0,1,26,SANTA TEREZA,0,0,0,0,0,1,0,25,2,5
1,0,4,TABUAZEIRO,0,0,0,0,0,0,0,14,31,5
2,1,1,NOVA PALESTINA,0,0,0,0,0,0,1,0,18,5
3,1,20,MARUÍPE,0,0,0,0,0,1,0,2,29,4
4,1,6,CARATOÍRA,0,0,0,0,0,0,1,30,16,5
