# Classification example

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling

from sklearn import (
    ensemble,
    model_selection,    
    preprocessing,
    tree,
)

from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)

from sklearn.experimental import (
    enable_iterative_imputer,
)

from sklearn import impute

from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)

from yellowbrick.model_selection import (
    LearningCurve,
)



## Business Understanding: Problem Statement
Based on Titanic passenger characteristics predict if they survive or not (binary classification).

## Data Understanding: Dataset

Independent variables:
- pclass - Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd)
- name - Name
- sex - Sex
- age - Age
- sibsp - Number of siblings/spouses aboard
- parch - Number of parents/children aboard
- ticket - Ticket number
- fare - Passenger fare
- cabin - Cabin
- embarked - Point of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
- boat - Lifeboat
- body - Body identification number
- home.dest - Home/destination

Dependent variable:
- survival - Survival (0 = No, 1 = Yes)

In [3]:
url = (
    "http://biostat.mc.vanderbilt.edu/"
    "wiki/pub/Main/DataSets/titanic3.xls"
)

In [4]:
url = "data/titanic3.xls"
df = pd.read_excel(url)
orig_df = df

In [5]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Data Preparation
- Check if data is numerical (only some models allow categorical data) and encode it if needed
    - Check data types (convert data types to proper ones if needed)
- Check if there are missing values
- Try normalizing data (mean of 0, std. deviation of 1)
- Check if there are leaking data (containing information about the target/outcome)

In [6]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [7]:
df.shape

(1309, 14)

### Data Profiling

In [8]:
pandas_profiling.ProfileReport(df)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=28.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






In [9]:
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [10]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

Columns boat and body contain leaked data. Boat informs if someone survived and body (identifier) if someone died. Columns cabin, boat and body should be removed.

In [11]:
# Show rows with missing data (in any column)
mask = df.isnull().any(axis=1)
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [12]:
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [13]:
# Show how many observations we have per column
df[mask].count()

pclass       1309
survived     1309
name         1309
sex          1309
age          1046
sibsp        1309
parch        1309
ticket       1309
fare         1308
cabin         295
embarked     1307
boat          486
body          121
home.dest     745
dtype: int64

In [14]:
# Show percent of missing values per column
df.isnull().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [15]:
# Count unique values for categorical columns
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [16]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [17]:
# Get the counts of missing features for each sample
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

In [16]:
# Show percent of missing values per row
df.isnull().mean(axis=1).head()

0    0.071429
1    0.071429
2    0.142857
3    0.071429
4    0.142857
dtype: float64

### Define features for modeling
- We can drop columns where variance or std. deviation is equal 0 (no signal).
- We can drop text columns where each observation has different value (if this is no NLP problem).
- We can drop columns that leak information.

In [19]:
columns = ['name', 'ticket', 'home.dest', 'boat', 'body', 'cabin']
df = df.drop(columns=columns)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


In [20]:
# Create dummy columns from string columns
df = pd.get_dummies(df)
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,1,0,0,0,1
1,1,1,0.9167,1,2,151.55,0,1,0,0,1
2,1,0,2.0,1,2,151.55,1,0,0,0,1
3,1,0,30.0,1,2,151.55,0,1,0,0,1
4,1,0,25.0,1,2,151.55,1,0,0,0,1


At this point the sex_male and sex_female columns are perfectly inverse correlated. Typically we remove any columns with perfect or very high positive or negative correlation. Multicollinearity can impact interpretation of feature importance and coefficients in some models.

In [21]:
df = df.drop(columns="sex_male")
# or if we use
#df = pd.get_dummies(df, drop_first=True) 
# we already have this done by get_dummies()

In [22]:
y = df.survived
X = df.drop(columns='survived')

### Train-test Split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [52]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
306,1,54.0,0,1,77.2875,0,0,0,1
927,3,,1,0,14.4542,0,1,0,0
642,3,13.0,4,2,31.3875,0,0,0,1
1294,3,28.5,0,0,16.1,0,0,0,1
1015,3,55.5,0,0,8.05,0,0,0,1


In [53]:
y_train.head()

306     0
927     0
642     0
1294    0
1015    0
Name: survived, dtype: int64

### Data Imputation

The age column has missing values. We need to impute age from the numeric values. We only want to impute on the training set and then use that imputer to fill in the date for the test set. Otherwise we are leaking data (cheating by giving future information to the model). Now that we have test and train data, we can impute missing values on the training set, and use the trained imputers to fill in the test dataset.

In [54]:
num_cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "sex_female",
]

In [55]:
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])

In [56]:
imputed[1] # new row 1 values in X_train

array([ 3.        , 24.27089623,  1.        ,  0.        , 14.4542    ,
        0.        ])

In [57]:
X_train.loc[:, num_cols] = imputed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [58]:
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


### Impute with Median
Alternatively but take it only from X_train

In [31]:
#meds = X_train.median()
#X_train = X_train.fillna(meds)
#X_test = X_test.fillna(meds)

### Normalize Data
Normalizing or preprocessing the data will help many models perform better after this is done. Particularly those that depend on a distance metric to determine similarity. (Note that tree models, which treat each feature on its own, don’t have this requirement.) Standardizing is translating the data so that it has a mean value of zero and a standard deviation of one. This way models don’t treat variables with larger scales as more important than smaller scaled variables.

In [59]:
# Normally don’t standardize dummy columns
cols = "pclass,age,sibsp,fare".split(",")

In [60]:
sca = preprocessing.StandardScaler()
X_train = sca.fit_transform(X_train[cols])
X_train = pd.DataFrame(X_train, columns=cols)

In [61]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,fare
0,-1.54372,1.86361,-0.487688,0.906108
1,0.834829,-0.378656,0.525287,-0.366368
2,0.834829,-1.228744,3.564212,-0.023441
3,0.834829,-0.059683,-0.487688,-0.333038
4,0.834829,1.976745,-0.487688,-0.496063


In [62]:
X_test = sca.transform(X_test[cols])
X_test = pd.DataFrame(X_test, columns=cols)

### Refactor
Gather everything needed into reusable functions

In [63]:
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

In [64]:
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=size, random_state=42)
    cols = X.columns
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = impute.IterativeImputer()
    X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])

    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])
    
    return X_train, X_test, y_train, y_test

In [65]:
ti_df = tweak_titanic(orig_df)

std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived", std_cols=std_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [66]:
ti_df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,0,0,1
1,1,1,0.9167,1,2,151.55,1,0,1
2,1,0,2.0,1,2,151.55,0,0,1
3,1,0,30.0,1,2,151.55,1,0,1
4,1,0,25.0,1,2,151.55,0,0,1


## Modeling

### Baseline Model
Creating a baseline model that does something really simple can give us something to compare our model to. Note that using the default .score result gives us the accuracy which can be misleading. 

In [67]:
from sklearn.dummy import DummyClassifier
bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)  # accuracy



0.5292620865139949

In [68]:
from sklearn import metrics
metrics.precision_score(y_test, bm.predict(X_test))

0.3945578231292517

### Find Best Algorithm
No algorithm performs well on all data. However, for some finite set of data, there may be an algorithm that does well on that set.

Here we use a few different families and compare the AUC (Area Under the ROC Curve) score and standard deviation using k-fold cross-validation. AUC ranges in value from 0 to 1. A model whose predictions are 100% wrong has an AUC of 0.0; one whose predictions are 100% correct has an AUC of 1.0. An algorithm that has a slightly smaller average score but tighter standard deviation might be a better choice.

Because we are using k-fold cross-validation, we will feed the model all of X and y.

In [69]:
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [70]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [75]:
for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier,
]:
    
    cls = model()
    kfold = model_selection.KFold(n_splits=10, random_state=4)
    s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc", cv=kfold)

    print(f"{model.__name__:22}  AUC: "
          f"{s.mean():.3f} STD: {s.std():.2f}")



DummyClassifier         AUC: 0.502 STD: 0.05
LogisticRegression      AUC: 0.843 STD: 0.03
DecisionTreeClassifier  AUC: 0.761 STD: 0.03
KNeighborsClassifier    AUC: 0.830 STD: 0.05
GaussianNB              AUC: 0.817 STD: 0.04
SVC                     AUC: 0.837 STD: 0.05




RandomForestClassifier  AUC: 0.848 STD: 0.03




XGBClassifier           AUC: 0.859 STD: 0.03


### Stacking
If you want maximum performance at the cost of interpretability, stacking is an option. A stacking classifier takes other models and uses their output to predict a target or label. We will use the previous models’ outputs and combine them to see if a stacking classifier can do better:

## Evaluation

## Deployment