# Assignment 8
Using any dataset that has a categorical feature that needs to be predicted, use several algorithms, preprocessing techniques, feature extraction techniques to fit the data to the model and show the accuracy, confusion matrix, and the classification report. G
https://www.kaggle.com/ntnu-testimon/paysim1
https://www.kaggle.com/joniarroba/noshowappointments
https://archive.ics.uci.edu/ml/datasets.html?format=&task=cla&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=table
https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
https://archive.ics.uci.edu/ml/datasets/Adult

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets, preprocessing, metrics
from sklearn.preprocessing import scale, LabelEncoder, OneHotEncoder
import pandas as pd
from pandas import Series, DataFrame
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10

In [2]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import seaborn as sns
from sklearn.linear_model import LogisticRegression as Model



In [3]:
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix, auc, roc_curve
                            )

from sklearn import cross_validation



## Dataset: Medical Appointment No Show

#### *This dataset, made available on kaggle.com, looks at 15 variables across 300,000 medical appointments.  The goal is to predict if a patient is a no-show to their appointment.  I chose this dataset because it had 15 attributes (varying in data types) and thought it would be an interesting problem...and there would be some munging that would need to be done (e.g., parsing out time of appointment, converting strings to numerical values). I have in the past completely missed appointments because I completely forgot I had one scheduled.*  

In [4]:
#read in dataset

# Load the appointment records and keep only rows without any missing value.
appt = pd.read_csv('/Users/jMac/Documents/ML_Portfolio_Morgia/data/No-show-issue-Comma-300k.csv').dropna()

In [5]:
list(appt.columns)

['Age',
 'Gender',
 'AppointmentRegistration',
 'ApointmentData',
 'DayOfTheWeek',
 'Status',
 'Diabetes',
 'Alcoolism',
 'HiperTension',
 'Handcap',
 'Smokes',
 'Scholarship',
 'Tuberculosis',
 'Sms_Reminder',
 'AwaitingTime']

In [6]:
appt.dtypes

Age                         int64
Gender                     object
AppointmentRegistration    object
ApointmentData             object
DayOfTheWeek               object
Status                     object
Diabetes                    int64
Alcoolism                   int64
HiperTension                int64
Handcap                     int64
Smokes                      int64
Scholarship                 int64
Tuberculosis                int64
Sms_Reminder                int64
AwaitingTime                int64
dtype: object

In [7]:
#noticed a few spelling errors in the file and perhaps better naming conventions for the column.  
#This converts the current column names to more useful ones.

# Map each misspelled / unclear source column name to the name used in the rest of the notebook.
appt = appt.rename(columns={
    'AppointmentRegistration': 'Scheduled',
    'ApointmentData': 'Appointment',
    'Alcoolism': 'Alcoholism',
    'HiperTension': 'HyperTension',
    'Handcap': 'Handicap',
    'Scholarship': 'Aid Recipient',
    'AwaitingTime': 'DaysBetween',
})


In [8]:
list(appt.columns)

['Age',
 'Gender',
 'Scheduled',
 'Appointment',
 'DayOfTheWeek',
 'Status',
 'Diabetes',
 'Alcoholism',
 'HyperTension',
 'Handicap',
 'Smokes',
 'Aid Recipient',
 'Tuberculosis',
 'Sms_Reminder',
 'DaysBetween']

#### convert strings to numeric. Will do this for gender, dayoftheweek, and status

In [9]:
#figure out what values exist in the feature: gender
appt['Gender'].unique()

array(['M', 'F'], dtype=object)

In [10]:
#use a helper function to convert M, F to 0, 1 (note: tried using LabelEncoder, but it didn't do what I wanted, which was to number the values and then add them as a new column)

def num_Gender(Gender):
    """Encode a gender label as an integer.

    Returns 0 for "M", 1 for "F", and None for any other value.
    """
    gender_codes = {"M": 0, "F": 1}
    return gender_codes.get(Gender)

#add it to the dataframe
# Encode once and store the result. The previous extra `...apply(num_Gender).head`
# statement never called head() and only repeated the whole apply for nothing.
appt['NumGender'] = appt.Gender.apply(num_Gender)



In [None]:
#figure out what values exist for dayoftheweek
appt['DayOfTheWeek'].unique()

array(['Wednesday', 'Tuesday', 'Thursday', 'Friday', 'Monday', 'Saturday',
       'Sunday'], dtype=object)

In [None]:
#convert day of the week to numeric using a helper function (to control the assignment of numbers)


def num_Weekday(DayOfTheWeek):
    """Encode a weekday name as an integer, Sunday=1 through Saturday=7.

    Returns None when the value is not one of the seven capitalized
    English day names.
    """
    ordered_days = (
        "Sunday",
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
    )
    for position, day_name in enumerate(ordered_days, start=1):
        if DayOfTheWeek == day_name:
            return position
    return None
    
#add it to the dataframe
# Encode once and store the result. The previous extra `...apply(num_Weekday).head`
# statement never called head() and only repeated the whole apply for nothing.
appt['Weekday'] = appt.DayOfTheWeek.apply(num_Weekday)


In [None]:
#figure out what values exist for status (should be only 2)

appt['Status'].unique()

In [None]:
#convert to a numeric value

def num_Status(Status):
    """Encode the appointment outcome as an integer.

    Returns 0 for "No-Show", 1 for "Show-Up", and None for any other value.
    """
    status_codes = {"No-Show": 0, "Show-Up": 1}
    return status_codes.get(Status)
    
#add it to the dataframe
# Encode once and store the result. The previous extra `...apply(num_Status).head`
# statement never called head() and only repeated the whole apply for nothing.
appt['NumStatus'] = appt.Status.apply(num_Status)



#### Now relook at the new(er) dataset

In [None]:
appt.head()

#### Convert and parse out date, time values in Scheduled and Appointment columns.

In [None]:
#clean up the Scheduled and Appointment timestamps (i.e., turn 2014-12-16T14:46:25Z into 2014-12-16 14:46:25 by replacing the T with a space and dropping the Z)

def parsed_Scheduled(Scheduled):
    """Return the timestamp string with every "T" turned into a space and every "Z" removed.

    Example: "2014-12-16T14:46:25Z" becomes "2014-12-16 14:46:25".
    """
    marker_table = str.maketrans({"T": " ", "Z": None})
    return Scheduled.translate(marker_table)

def parsed_Appointment(Appointment):
    """Return the timestamp string with every "T" turned into a space and every "Z" removed.

    Example: "2015-01-14T00:00:00Z" becomes "2015-01-14 00:00:00".
    """
    marker_table = str.maketrans({"T": " ", "Z": None})
    return Appointment.translate(marker_table)

In [None]:
#add it to the dataframe
# Clean each timestamp column once and store the result. The previous extra
# `...apply(...).head` statements never called head() and only repeated the work.
appt['ParsedScheduled'] = appt.Scheduled.apply(parsed_Scheduled)
appt['ParsedAppointment'] = appt.Appointment.apply(parsed_Appointment)



In [None]:

#split off information in appointment
# Build each helper frame on appt's own index: column assignment aligns on index
# labels, and the earlier dropna() does not renumber rows, so a fresh 0..n-1
# index could pair values with the wrong rows or leave NaN behind.
appt[['AppointmentDate', 'AppointmentTime']] = pd.DataFrame([ x.split(' ') for x in appt['ParsedAppointment'].tolist() ], index=appt.index)

appt[['AppointmentYear', 'AppointmentMonth', 'AppointmentDay']] = pd.DataFrame([ x.split('-') for x in appt['AppointmentDate'].tolist() ], index=appt.index)

In [None]:
#now split the Scheduled Date and Time
# Keep appt's index on the helper frame so the assignment lines up row-for-row
# even if dropna() left gaps in the index labels.
appt[['ScheduledDate', 'ScheduledTime']] = pd.DataFrame([ x.split(' ') for x in appt['ParsedScheduled'].tolist() ], index=appt.index)


In [None]:
#now split ScheduledDate and ScheduledTime, to allow for more refined investigation (hypothesis: perhaps the hour it was scheduled affects people remembering the appointment)

# Keep appt's index on each helper frame so the assignment lines up row-for-row
# even if dropna() left gaps in the index labels.
appt[['ScheduledYear', 'ScheduledMonth', 'ScheduledDay']] = pd.DataFrame([ x.split('-') for x in appt['ScheduledDate'].tolist() ], index=appt.index)

appt[['ScheduledHour', 'ScheduledMinute', 'ScheduledSecond']] = pd.DataFrame([ x.split(':') for x in appt['ScheduledTime'].tolist() ], index=appt.index)

appt.head()

**Note:** Apparently there is no useful time stamp on the Appointment time (likely because the default was 00:00:00Z)

In [None]:
#now we should convert the Days Between appointment Scheduled and Appointment Date to absolute values (instead of negative integers)

# Take the magnitude of DaysBetween, keep it as a standalone Series,
# and add it to the dataframe as a new column in one step.
appt['Abs_DaysBetween'] = abs_DaysBetween = appt['DaysBetween'].abs()

appt.head()

In [None]:
appt.dtypes

In [None]:
#the date/time parts were produced by str.split, so they are strings (object dtype); convert them.
#each column is coerced to a number, unparseable values become 0, and the result is stored as int64.
for _part_name in (
    "AppointmentYear",
    "AppointmentMonth",
    "AppointmentDay",
    "ScheduledHour",
    "ScheduledMinute",
    "ScheduledSecond",
    "ScheduledYear",
    "ScheduledMonth",
    "ScheduledDay",
):
    appt[_part_name] = pd.to_numeric(appt[_part_name], errors="coerce").fillna(0).astype("int64")

#Not all features need to be changed from objects to numeric because I don't plan on using them for the analysis.

In [None]:
#see which columns are numeric

# names of every column whose dtype is not 'object'
num_columns = [name for name, kind in appt.dtypes.items() if kind != 'object']
num_columns


In [None]:
len(num_columns)

In [None]:
appt.head()

#### Hooray! Now the data is parsed, formatted, and ready to analyze...with 23 features to play with.

## Exploring the data (getting a better feel for how it's distributed, etc)

##### (1) How many times have we observed a missed appointment in the dataset?

In [None]:
appt.Status.value_counts()

**Observation:** According to the Status feature, patients miss the appointment 43% of the time!  That's shockingly high (and I guess creates quite a bit of revenue for a clinic).

##### (2) Which day is the most frequent for appointments?

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(font_scale=1.5)

sns.countplot(x='DayOfTheWeek', data=appt)

**Observation:** It appears that all of the weekdays are quite popular with Wednesday being the most and Friday as the least.  Wednesday and Tuesday appear to be very close. There are probably only a few places open on Saturday, which is why we see some values (I don't think I'll really use that portion of the data because it seems to be an outlier).

##### (3) What time is the most popular for scheduling?  (by hour and month)

In [None]:
sns.countplot(x='ScheduledHour', data=appt)

In [None]:
sns.countplot(x='ScheduledMonth', data=appt)

##### How about for appointments? (Month only because hour is nonexistent)


In [None]:
sns.countplot(x='AppointmentMonth', data=appt)

**Observation:** It appears that the most popular time for scheduling appointments is early in the morning or in the midafternoon.  The most popular month for scheduling these appointments (though there's not a ton of deviation) is May and July-October.  The appointment dates roughly follow the scheduling dates.

In [None]:
sns.countplot(x="DayOfTheWeek", hue='Status', data=appt)

In [None]:
sns.countplot(x="ScheduledMonth", hue='Status', data=appt)

In [None]:
sns.countplot(x="ScheduledYear", hue='Status', data=appt)

In [None]:
sns.countplot(x="AppointmentMonth", hue='Status', data=appt)

In [None]:
sns.countplot(x="AppointmentYear", hue='Status', data=appt)

**Observation:** Even when parsing out by Status, it shows a relatively consistent change in status throughout the year. Does this mean that each month a little less than half of the scheduled patients don't show up to their appointments? (also need to discard 2013 ScheduledYear information, because there's too little data for that time)

##### (4) Let's look at how the status is affected by the number of days between the scheduling and the appointment.  Are there points in time where a no-show is more likely to occur?

In [None]:
# vectorized Series.max() instead of the builtin max(), which walks the column element by element in Python
appt.Abs_DaysBetween.max()


In [None]:
# vectorized Series.min() instead of the builtin min(), which walks the column element by element in Python
appt.Abs_DaysBetween.min()

In [None]:
# Scatter of appointment month vs. absolute days between scheduling and appointment,
# with both status groups drawn on the same axes.
fig, ax = plt.subplots()
#remember that 0 is no-show and 1 is a show-up
# no-shows in red, show-ups in yellow; alpha=0.1 keeps overlapping points readable
appt[appt.NumStatus==0].plot(kind='scatter', x='AppointmentMonth', y='Abs_DaysBetween', color='red', alpha=0.1, ax=ax)
appt[appt.NumStatus==1].plot(kind='scatter', x='AppointmentMonth', y='Abs_DaysBetween', color='yellow', alpha=0.1, ax=ax)

In [None]:
# Scatter of the Sms_Reminder value vs. absolute days between scheduling and appointment,
# with both status groups drawn on the same axes.
fig, ax = plt.subplots()
#remember that 0 is no-show and 1 is a show-up
# no-shows in red, show-ups in yellow; alpha=0.1 keeps overlapping points readable
appt[appt.NumStatus==0].plot(kind='scatter', x='Sms_Reminder', y='Abs_DaysBetween', color='red', alpha=0.1, ax=ax)
appt[appt.NumStatus==1].plot(kind='scatter', x='Sms_Reminder', y='Abs_DaysBetween', color='yellow', alpha=0.1, ax=ax)

In [None]:
# Overlaid histograms of Abs_DaysBetween per status, one-day bins over 0-49 days.
# NOTE(review): bar colors come from the active plotting palette - confirm which color is which in the rendered figure.
appt[appt.NumStatus==0].Abs_DaysBetween.hist(bins=np.arange(0,50,1), alpha=.4) #no-show (NumStatus 0), drawn first
appt[appt.NumStatus==1].Abs_DaysBetween.hist(bins=np.arange(0,50,1), alpha=.4) #show-up (NumStatus 1), drawn second


**Observation:** So it appears that the longer the time that passes between the scheduling of an appointment and the appointment date, the more likely folks are to miss their appointment. The second plot suggests that the text message reminders are effective and may encourage people to schedule these visits more frequently. The third plot shows the ratio between those who show up and those who do not. Proportionally, there seems to be a greater probability of folks not showing up to their appointment the further apart the scheduled date is from the appointment date.


## Data Analysis:  Building a Model to Predict No-Shows

### incorporate accuracy score, classification, and confusion matrix

Given how the data is distributed when looking at it through various temporal features (scheduled hour, time between schedule and appointment, month of appointment), it appears that an SVM and a Logistic Regression approach should be tried. 

### SVM Model

In [None]:
#creates a subplot...colored background

def plot_svm(i, clf, title, X, y, col1, col2):
    """Draw a classifier's decision regions with the data points on top.

    The plot goes into cell ``i + 1`` of a 2x2 subplot grid on the current
    figure.

    Parameters:
        i: zero-based position of the subplot in the 2x2 grid.
        clf: fitted classifier exposing ``predict`` for two-feature input.
            Assumed to have been fit on the standardized ``col1``/``col2``
            columns of ``X`` (e.g. ``scale(X)``) - confirm at the call site.
        title: title shown above the subplot.
        X: table-like object indexable by column name (``X[col1]``).
        y: class labels, used to color the scattered points.
        col1: name of the feature placed on the x axis.
        col2: name of the feature placed on the y axis.
    """
    h = .2  # step size in the mesh

    x1 = X[col1]
    x2 = X[col2]

    # create a mesh to plot in, padded by one unit beyond the observed range
    x_min, x_max = x1.min() - 1, x1.max() + 1
    y_min, y_max = x2.min() - 1, x2.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid_stack = np.stack([xx.flatten(), yy.flatten()]).T

    # Standardize the mesh with the mean/std of the plotted data rather than
    # the mesh's own statistics. Scaling the mesh by itself puts the grid in a
    # different coordinate system from the one the model saw, which shifts the
    # drawn decision boundary.
    observed = np.column_stack([x1, x2]).astype(float)
    scaler = preprocessing.StandardScaler().fit(observed)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    Z = clf.predict(scaler.transform(grid_stack)).reshape(xx.shape)
    # Put the result into a color plot
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

    # Plot also the training points (in their original, unscaled units)
    plt.scatter(x1, x2, c=y, cmap=plt.cm.coolwarm)

    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title(title)

In [None]:
#let's build our initial model looking at the relationship among a patient's Show status, Days Between schedule and appointment, and if a person received SMS reminders.
col1, col2 ='Abs_DaysBetween', 'Sms_Reminder'

X = appt[[col1, col2]]                    
y = appt['NumStatus']
# create a model to predict if a patient is a no-show.
# we create an instance of SVM and fit our data. Note: the two features ARE standardized with scale() before fitting.

svc = svm.SVC(kernel='linear', C=1.0).fit(scale(X), y)
# svc now holds the fitted model

# draw the decision regions in the first cell of the 2x2 subplot grid; X is passed unscaled
plot_svm(0, svc,'SVC with linear kernel' , X, y, col1, col2)


plt.show()