# Classification and data wrangling examples using the [Titanic dataset](https://www.kaggle.com/c/titanic) on Kaggle

In [1]:
import pandas as pd, sklearn, numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Set matplotlib's default font size for every figure drawn in this notebook.
rcParams["font.size"] = "16"

In [2]:
# Raw Kaggle splits, read from the local dataset folder.
train = pd.read_csv('./kaggle_titanic_dataset/train.csv')
test = pd.read_csv('./kaggle_titanic_dataset/test.csv')

# Number of rows and columns of the training split.
n_train = train.shape[0]
m_train = train.shape[1]

# Independent copy of the training frame, used by the exploration cells.
data = train.copy()

# Target labels as a plain NumPy array.
y_train = train['Survived'].values

# Feature frame: the label is removed, and PassengerId and Name are assumed
# to carry no useful information, so they are removed as well.
X_train = train.drop(labels=['PassengerId', 'Survived', 'Name'], axis='columns')

In [3]:
# Preview the first rows of the raw training data.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Column dtypes and non-null counts; shows which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Value frequencies for the embarkation port, the first ten entries of the
# ticket frequency table, and the target label.
print('---Embarked---', X_train["Embarked"].value_counts(), sep='\n')
print('---Tickets---', X_train["Ticket"].value_counts().iloc[:10], sep='\n')
print('--Survived--', data["Survived"].value_counts(), sep='\n')

---Embarked---
S    644
C    168
Q     77
Name: Embarked, dtype: int64
---Tickets---
CA. 2343        7
1601            7
347082          7
3101295         6
CA 2144         6
347088          6
382652          5
S.O.C. 14879    5
LINE            4
19950           4
Name: Ticket, dtype: int64
--Survived--
0    549
1    342
Name: Survived, dtype: int64


In [7]:
# Split the feature columns by dtype: numeric columns vs. object (string) columns.
num_attribs = X_train.select_dtypes(include=['number']).columns.tolist()
cat_attribs = X_train.select_dtypes(include=['object']).columns.tolist()

print('Numerical attributes:', num_attribs)
print('Categorical attributes:', cat_attribs)

Numerical attributes: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical attributes: ['Sex', 'Ticket', 'Cabin', 'Embarked']


### Build pipeline for processing numerical attributes

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameAttributesSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps a chosen subset of DataFrame columns.

    attribute_names: list of column names to keep; None keeps every column.
    """

    def __init__(self, attribute_names=None):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return X restricted to the configured columns (all of them if none were given)."""
        if self.attribute_names is None:
            return X[list(X)]
        return X[self.attribute_names]
    
class DataFrameToValuesTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that converts a DataFrame into its underlying array.

    The column names of the most recently transformed frame are stored in
    ``attribute_names`` so they remain available after the conversion.
    """

    def __init__(self):
        """No configuration is needed."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Remember X's column names and return ``X.values``."""
        column_names = list(X)
        self.attribute_names = column_names
        return X.values


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
try:
    # scikit-learn >= 0.20 ships the imputer as sklearn.impute.SimpleImputer;
    # the old preprocessing.Imputer was removed in 0.22. Binding it to the
    # historical name keeps any code that refers to `Imputer` working.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn releases only provide preprocessing.Imputer.
    from sklearn.preprocessing import Imputer

# Numerical branch: select the numeric columns, convert them to an array,
# fill missing entries with the column median, then standardize each column.
num_pipeline = Pipeline([
    ('selector', DataFrameAttributesSelector(attribute_names=num_attribs)),
    ('to_numpy', DataFrameToValuesTransformer()),
    ('imputer', Imputer(strategy='median')),
    ('scaler', StandardScaler())
])


### Pipeline for processing categorical attributes

In [11]:
class CategoricalToIntegerFactorizer(BaseEstimator, TransformerMixin):
    """Integer-encode the low-cardinality columns of a DataFrame.

    Columns with more than ``max_categories`` distinct values (NaN counts as
    one value) are dropped. The category -> code mapping is learned in ``fit``
    and reused by every ``transform`` call, so a given category always receives
    the same code regardless of which frame is being transformed.

    Parameters
    ----------
    max_categories : int
        Largest number of distinct values a column may have to be kept.

    Attributes
    ----------
    categories : list of str
        ``'<column>_<category>'`` labels of the learned categories, in code
        order, with ``'<column>_nan'`` appended for each column that contained
        missing values when fitted. Rebuilt from scratch on every ``fit``.
    """

    def __init__(self, max_categories):
        self.max_categories = max_categories
        self.categories = []

    def fit(self, X, y=None):
        """Learn, for each kept column of X, the integer code of every category.

        Returns the transformer itself.
        """
        encodings = []
        labels = []
        for col in list(X):
            n_unique = len(X[col].unique())
            if n_unique > self.max_categories:
                continue
            # Categories are numbered in order of first appearance.
            levels = list(X[col].dropna().unique())
            codes = {level: code for code, level in enumerate(levels)}
            # `n_unique` is never used as a category code (codes stop at
            # len(levels) - 1), so it serves as the code for missing values.
            encodings.append((col, codes, n_unique))
            labels.extend('%s_%s' % (col, level) for level in levels)
            if X[col].isnull().any():
                labels.append('%s_nan' % col)
        # Assigned (not extended) so that repeated fits never accumulate labels.
        self.encodings_ = encodings
        self.categories = labels
        return self

    def transform(self, X):
        """Return a DataFrame holding the integer codes of the kept columns.

        Missing values, and categories that were not seen during ``fit``, are
        mapped to the column's missing-value code.
        """
        if not hasattr(self, 'encodings_'):
            # Calling transform() without a prior fit() used to work, so learn
            # the mapping from the first frame seen instead of failing.
            self.fit(X)
        Y = pd.DataFrame()
        for col, codes, missing_code in self.encodings_:
            encoded = X[col].astype(object).map(codes).fillna(missing_code)
            # .values drops X's index so Y gets a fresh positional index.
            Y[col] = encoded.astype('int64').values
        return Y

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: select the string columns, integer-encode those with at
# most 5 distinct values (the others are dropped), then one-hot encode the codes.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameAttributesSelector(attribute_names=cat_attribs)),
    ('cat_to_int_encoder', CategoricalToIntegerFactorizer(max_categories=5)),
    ('one_hot_encoder', OneHotEncoder()),
])

# To inspect this branch on its own:
# cat_pipeline.fit_transform(X_train).toarray()

In [14]:
from sklearn.pipeline import FeatureUnion

# Run the numerical and categorical branches on the same input and place
# their outputs side by side.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train_prepared = full_pipeline.fit_transform(X_train)
print('Size of prepared X:', X_train_prepared.shape)

# Every label must still have exactly one prepared row.
assert len(y_train) == X_train_prepared.shape[0]

Size of prepared X: (891, 11)


In [15]:
# Show the prepared feature matrix as a dense array.
X_train_prepared.toarray()

array([[ 0.82737724, -0.56573646,  0.43279337, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56610693,  0.66386103,  0.43279337, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82737724, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.82737724, -0.1046374 ,  0.43279337, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56610693, -0.25833709, -0.4745452 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82737724,  0.20276197, -0.4745452 , ...,  0.        ,
         1.        ,  0.        ]])

### Try RandomForestClassifier and GridSearchCV with the prepared data

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, make_scorer

# Two hyper-parameter grids: the first uses the forest's default bootstrap
# setting, the second turns bootstrapping off.
param_grid = [
    { 'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8] },
    { 'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

# A fixed seed makes the forests, and therefore the scores and the selected
# model, identical on every run of the notebook.
forest_clf = RandomForestClassifier(random_state=42)
# Both metrics are recorded for every candidate; refit='accuracy' makes
# accuracy the metric that selects best_estimator_ and best_score_.
grid_search = GridSearchCV(forest_clf, param_grid, cv=5, return_train_score=True, refit='accuracy',
                           scoring={ 'accuracy': make_scorer(accuracy_score),
                                     'precision': make_scorer(precision_score)
                                   })

cv = grid_search.fit(X_train_prepared, y_train)
cv_results = pd.DataFrame(grid_search.cv_results_)
print('Best test score accuracy is:', grid_search.best_score_)

Best test score accuracy is: 0.8215488215488216


In [18]:
# Keep only the parameter, mean train/test score and rank columns of the
# cross-validation results.
cols = list(grid_search.cv_results_.keys())
cols_of_interest = [
    key for key in cols
    if key.startswith(('param_', 'mean_train', 'mean_test_', 'rank'))
]
cv_results[cols_of_interest]

Unnamed: 0,mean_test_precision,param_n_estimators,param_bootstrap,rank_test_precision,mean_train_precision,mean_test_accuracy,param_max_features,rank_test_accuracy,mean_train_accuracy
0,0.697172,3,,18,0.927445,0.767677,2,18,0.939959
1,0.754492,10,,7,0.983134,0.800224,2,7,0.970263
2,0.756799,30,,6,0.978463,0.802469,2,6,0.977838
3,0.721922,3,,16,0.932077,0.780022,4,15,0.942203
4,0.77055,10,,4,0.969377,0.803591,4,5,0.966893
5,0.752669,30,,8,0.979345,0.805836,4,4,0.979802
6,0.742226,3,,12,0.933344,0.795735,6,12,0.946412
7,0.776676,10,,2,0.972299,0.81257,6,2,0.968858
8,0.786288,30,,1,0.980102,0.821549,6,1,0.980642
9,0.746101,3,,11,0.937626,0.796857,8,11,0.9408


In [20]:
# Importance of each prepared feature according to the best forest found.
feature_importances = grid_search.best_estimator_.feature_importances_

# Feature names in the order the pipeline emits them: the numerical columns
# first, followed by the encoded categorical ones.
num_attribs_prepared = num_pipeline.named_steps["to_numpy"].attribute_names
cat_attribs_prepared = cat_pipeline.named_steps["cat_to_int_encoder"].categories
attributes = num_attribs_prepared + cat_attribs_prepared

# (importance, name) pairs, most important first.
sorted(zip(feature_importances, attributes), reverse=True)

[(0.2516636690246802, 'Age'),
 (0.2383150145681014, 'Fare'),
 (0.17363694365309573, 'Sex_female'),
 (0.12475361452866167, 'Sex_male'),
 (0.09381114742435191, 'Pclass'),
 (0.04928684519872581, 'SibSp'),
 (0.03330916573702174, 'Parch'),
 (0.014693967204695773, 'Embarked_S'),
 (0.013662809015841654, 'Embarked_C'),
 (0.006866823644824141, 'Embarked_Q'),
 (0.0, 'Embarked_nan')]