# The Logistic Regression Model

Below you will find code that demonstrates how to run and interpret a logistic regression model. As before, please refer to the slides for a full understanding of the motivations and derivations behind logistic regression and, importantly, its relationship to the linear model.

In [None]:
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [None]:
#Load the Titanic training data from CSV into a DataFrame
titanic = pd.read_csv(filepath_or_buffer="../../datasets/titanic/train.csv")

## Dealing with Categorical Data (One-Hot-Encoding)

Categorical data, or data whose values (often strings) denote something other than a numeric quantity, are extremely common in datasets. The catch is that, at least in Python, the vast majority of models do not know how to deal with categorical data - they accept numeric data types only. For linear and logistic regression this makes intuitive sense, because it doesn't make sense to invert a matrix of strings. What we do instead is something called "One-Hot-Encoding".

In [None]:
#One-hot encode the categorical columns; drop_first=True leaves out one
#dummy column per encoded variable
titanic_only = pd.get_dummies(
    titanic,
    columns=['Sex', 'Pclass', 'Embarked'],
    drop_first=True
)
#Preview the first rows of the encoded frame
titanic_only.head()

If you look closely, a categorical variable can now be represented by more than one column! Sex is split into a male-only column (1 if the corresponding Sex element was male) and a female-only column, which is NOT shown because we chose to drop it with drop_first=True. drop_first drops one of the newly generated columns for each categorical variable; this again has to do with multicollinearity. If I know that someone is male, then I know for sure that they are not female. As a result, the male column alone carries all the information our model needs, and we won't need to worry about multicollinearity issues!

This process of converting a categorical column into multiple columns containing 0's and 1's is called one-hot-encoding, and it is by far the most common way of feeding categorical data into a model. Another way of describing this process is getting "dummy variables" (hence pd.get_dummies), which simply refers to these variables made of 1's and 0's.

## Validation Method

In [None]:
#Remove the columns we are not using (yet) or that contain missing values
#(the models cannot handle missing values)
titanic_only.drop(
    labels=['PassengerId', 'Name', 'Ticket', 'Age', 'Cabin'],
    axis=1,
    inplace=True
)

In [None]:
#Hold out 20% of the rows as a local test set; the fixed random_state keeps
#the split reproducible
local_train, local_test = train_test_split(
    titanic_only,
    test_size=0.2,
    random_state=123
)

In [None]:
#Dimensions of the training split, shown as (rows, columns)
local_train.shape

In [None]:
#Dimensions of the held-out test split, shown as (rows, columns)
local_test.shape

In [None]:
#Separate the "Survived" target from the predictor columns in each split
local_train_x, local_train_y = local_train.drop(["Survived"], axis=1), local_train["Survived"]
local_test_x, local_test_y = local_test.drop("Survived", axis=1), local_test["Survived"]

In [None]:
#Fit a logistic regression on the training split, then predict for the test rows
#NOTE(review): no constant column is added to local_train_x; the statsmodels
#docs say an intercept is not included by default - confirm this is intended.
clf = sm.Logit(endog=local_train_y, exog=local_train_x)
result = clf.fit()
preds = result.predict(exog=local_test_x)

In [None]:
#Accuracy: fraction of test rows where (prediction > 0.5) equals the true label
((preds > 0.5) == local_test_y).mean()

In [None]:
#Display the summary table of the fitted model
result.summary()

## Now let's put some of the Data Cleaning and Feature Engineering from before to work!

In [None]:
#Reload the raw Titanic training data so feature engineering starts from scratch
titanic = pd.read_csv(filepath_or_buffer="../../datasets/titanic/train.csv")

In [None]:
#Work on an independent copy so the raw frame is left untouched
titanic_engineered = titanic.copy(deep=True)

In [None]:
#Imputing Age: tag every passenger with the title found in their Name, then
#fill missing ages with the mean age of the passengers sharing that title
titanic_engineered['title'] = 'other'
for honorific in ('Master', 'Miss', 'Mr', 'Mrs'):
    marker = honorific + '.'
    has_marker = [marker in name for name in titanic_engineered['Name']]
    titanic_engineered.loc[has_marker, 'title'] = honorific

#transform() applies the function per group and returns the values on the
#original index, so the result lines up with titanic_engineered
ages_by_title = titanic_engineered[['title', 'Age']].groupby('title')
titanic_engineered['age_filled'] = ages_by_title.transform(
    lambda ages: ages.fillna(ages.mean())
)

#The raw Age column is superseded by age_filled
titanic_engineered.drop(labels=['Age'], axis=1, inplace=True)

In [None]:
#Cabin Side Feature: decided by the last character of the Cabin string
#(odd digit -> 'starboard', even digit -> 'port', anything else stays 'Unknown')
titanic_engineered['cabin_side'] = 'Unknown'
cabin_last_char = titanic_engineered['Cabin'].str[-1]
titanic_engineered.loc[cabin_last_char.isin(list("13579")), 'cabin_side'] = 'starboard'
titanic_engineered.loc[cabin_last_char.isin(list("24680")), 'cabin_side'] = 'port'

In [None]:
#Deck Feature: first character of Cabin where Cabin is present, otherwise
#'Unknown'; deck 'T' is folded back into 'Unknown'
titanic_engineered['deck'] = 'Unknown'
has_cabin = titanic_engineered['Cabin'].notnull()
titanic_engineered.loc[has_cabin, 'deck'] = titanic_engineered['Cabin'].str[0]
is_deck_t = titanic_engineered['deck'] == 'T'
titanic_engineered.loc[is_deck_t, 'deck'] = "Unknown"

In [None]:
#Deck Feature (including some cleaning)
#Start from 'Unknown', then use the first character of Cabin wherever Cabin is
#present; deck 'T' is folded back into 'Unknown'
titanic_engineered['deck'] = 'Unknown'
titanic_engineered.loc[titanic_engineered['Cabin'].notnull(),'deck'] = titanic_engineered['Cabin'].str[0]
titanic_engineered.loc[titanic_engineered['deck'] == 'T','deck'] = "Unknown"

#Cleaning: some Cabin values have the shape "<letter> <letter>..." (a capital
#letter, one whitespace character, then another capital letter). For those rows
#the deck is taken from the third character (index 2) instead of the first.
#The pattern is a raw string so that \s reaches the regex engine unchanged
#instead of being treated as an (invalid) Python string escape.
pattern = r"[A-Z]\s[A-Z]"
#na=False: rows with a missing Cabin count as "no match" so the mask stays boolean
mask = titanic_engineered['Cabin'].str.contains(pattern,na=False)
titanic_engineered.loc[mask,'deck'] = titanic_engineered.loc[mask,'Cabin'].str[2]

In [None]:
#Number of cabins listed for each passenger: count the whitespace-separated
#entries in Cabin. Rows where Cabin is missing do not produce a list from
#str.split(), so they are counted as 1.
titanic_engineered['num_in_group'] = titanic_engineered['Cabin'].str.split().apply(
    lambda cabins: len(cabins) if isinstance(cabins, list) else 1
)

In [None]:
#Remove the columns that no longer make sense to keep: identifiers, free text,
#and the helper 'title' column used for the age imputation.
#'Age' itself was already dropped earlier; its imputed replacement
#'age_filled' stays in the frame because it has no missing values left.
titanic_engineered.drop(
    labels=['PassengerId', 'Name', 'Ticket', 'Cabin', 'title'],
    axis=1,
    inplace=True
)

In [None]:
#One-hot encode every categorical column (original and engineered);
#drop_first=True leaves out one dummy column per encoded variable
titanic_engineered = pd.get_dummies(
    titanic_engineered,
    columns=['Sex', 'Pclass', 'Embarked', 'cabin_side', 'deck'],
    drop_first=True
)

In [None]:
#Preview the first rows of the engineered, one-hot encoded frame
titanic_engineered.head()

In [None]:
#Train/test split of the engineered data: 20% held out, fixed random_state so
#the split is reproducible
local_train, local_test = train_test_split(
    titanic_engineered,
    test_size=0.2,
    random_state=123
)

#Separate the "Survived" target from the predictor columns in each split
local_train_x, local_train_y = local_train.drop(["Survived"], axis=1), local_train["Survived"]
local_test_x, local_test_y = local_test.drop("Survived", axis=1), local_test["Survived"]

In [None]:
#Fit the logistic regression on the engineered training split, then predict
#for the held-out rows
#NOTE(review): no constant column is added to local_train_x; the statsmodels
#docs say an intercept is not included by default - confirm this is intended.
clf = sm.Logit(endog=local_train_y, exog=local_train_x)
result = clf.fit()
preds = result.predict(exog=local_test_x)

In [None]:
#Accuracy: fraction of test rows where (prediction > 0.5) equals the true label
((preds > 0.5) == local_test_y).mean()

In [None]:
#Display the summary table of the model fitted on the engineered features
result.summary()

## K-Fold Cross Validation (Basic Data Set)

In [None]:
from sklearn.cross_validation import KFold

In [None]:
#Build the 10-fold splitter over the row count of the data; iterating it
#yields the (train indices, test indices) pair for each fold
#NOTE(review): this is the KFold(n, n_folds=...) signature from
#sklearn.cross_validation; newer scikit-learn exposes KFold through
#sklearn.model_selection with a different API - confirm the target version.
kf = KFold(len(titanic_only), n_folds=10)

In [None]:
#Reset the cross-validation bookkeeping:
#  outcomes - accuracy score collected for each fold
#  fold     - counter of the fold currently being processed
outcomes, fold = [], 0

In [None]:
#Fit and score the model once per fold on the basic data set; every fold's
#accuracy is appended to `outcomes` and printed
for train_index, test_index in kf:
    fold = fold + 1

    #Rows belonging to this fold, selected by position
    local_train_xy = titanic_only.iloc[train_index]
    local_test_xy = titanic_only.iloc[test_index]

    #Separate the 'Survived' target from the predictor columns
    local_train_x, local_train_y = local_train_xy.drop(['Survived'], axis=1), local_train_xy['Survived']
    local_test_x, local_test_y = local_test_xy.drop(['Survived'], axis=1), local_test_xy['Survived']

    #Train on the fold's training rows, predict on its test rows
    clf = sm.Logit(endog=local_train_y, exog=local_train_x)
    result = clf.fit()
    preds = result.predict(exog=local_test_x)

    #Fraction of test rows where (prediction > 0.5) equals the true label
    accuracy = ((preds > 0.5) == local_test_y).mean()
    outcomes.append(accuracy)
    print("Fold {0} accuracy: {1}".format(fold, accuracy))

In [None]:
#Final cross-validated score: the average of the per-fold accuracies
mean_outcome = np.asarray(outcomes).mean()
mean_outcome

## K-Fold Cross Validation (Feature Engineered Data Set)

In [None]:
#Start the second cross-validation run with fresh bookkeeping:
#  outcomes - accuracy score collected for each fold
#  fold     - counter of the fold currently being processed
outcomes, fold = [], 0

In [None]:
#Fit and score the model once per fold on the feature-engineered data set;
#every fold's accuracy is appended to `outcomes` and printed
for train_index, test_index in kf:
    fold = fold + 1

    #Rows belonging to this fold, selected by position
    local_train_xy = titanic_engineered.iloc[train_index]
    local_test_xy = titanic_engineered.iloc[test_index]

    #Separate the 'Survived' target from the predictor columns
    local_train_x, local_train_y = local_train_xy.drop(['Survived'], axis=1), local_train_xy['Survived']
    local_test_x, local_test_y = local_test_xy.drop(['Survived'], axis=1), local_test_xy['Survived']

    #Train on the fold's training rows, predict on its test rows
    clf = sm.Logit(endog=local_train_y, exog=local_train_x)
    result = clf.fit()
    preds = result.predict(exog=local_test_x)

    #Fraction of test rows where (prediction > 0.5) equals the true label
    accuracy = ((preds > 0.5) == local_test_y).mean()
    outcomes.append(accuracy)
    print("Fold {0} accuracy: {1}".format(fold, accuracy))

    

In [None]:
#Final cross-validated score for the feature-engineered data set: the
#average of the per-fold accuracies
mean_outcome = np.asarray(outcomes).mean()
mean_outcome