# Titanic: Machine Learning from Disaster
https://www.kaggle.com/c/titanic

Firstly, make imports, read data, pick out target column and find missings in data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

y_train = np.array(train['Survived'])
del train['Survived']

X_all = pd.concat([train, test])
print(X_all.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB
None


Almost all machine learning algorithms work with numerical vectors. Convert data to appropriate format, filling in blanks and encode categorial features. Missing may be useful information, save it in additional column.

In [2]:
X_all['age_null'] = pd.isnull(X_all['Age'])
X_all['cabin_null'] = pd.isnull(X_all['Cabin'])

X_all['isMale'] = X_all['Sex'] == 'male'

X_all.fillna(value = {'Age': X_all['Age'].mean(),
                      'Fare' : X_all['Fare'].mean(),
                      'Embarked' : X_all['Embarked'].mode()[0]},
             inplace = True)

X_all['Embarked_S'] = X_all['Embarked'] == 'S'
X_all['Embarked_C'] = X_all['Embarked'] == 'C'
X_all['Embarked_Q'] = X_all['Embarked'] == 'Q'

del X_all['Cabin'], X_all['PassengerId'], X_all['Name'], X_all['Ticket'], X_all['Embarked'], X_all['Sex']

X_train = np.array(X_all.iloc[train.index], dtype=np.dtype('float64'))
X_test = np.array(X_all.iloc[X_train.shape[0]:], dtype=np.dtype('float64'))

Scale all features with StandardScaler thus mean will be zero and standard deviation will be one.

In [3]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Try linear model - logistic regression. Pick up regulazation parameter.

In [4]:
kfold = KFold(n_splits=10, shuffle=True, random_state=44)

for C in [2.0 ** i for i in range(-5, 10)]:
    clf = LogisticRegression(C = C)
    scores = cross_val_score(clf, X_train, y_train, cv=kfold)
    print("C = %0.3f \nAccuracy: %0.3f (+/- %0.3f)\n" % (C, scores.mean(), scores.std() * 2))


C = 0.031 
Accuracy: 0.788 (+/- 0.103)

C = 0.062 
Accuracy: 0.792 (+/- 0.092)

C = 0.125 
Accuracy: 0.795 (+/- 0.090)

C = 0.250 
Accuracy: 0.796 (+/- 0.087)

C = 0.500 
Accuracy: 0.798 (+/- 0.093)

C = 1.000 
Accuracy: 0.798 (+/- 0.093)

C = 2.000 
Accuracy: 0.798 (+/- 0.093)

C = 4.000 
Accuracy: 0.798 (+/- 0.093)

C = 8.000 
Accuracy: 0.799 (+/- 0.091)

C = 16.000 
Accuracy: 0.799 (+/- 0.091)

C = 32.000 
Accuracy: 0.799 (+/- 0.091)

C = 64.000 
Accuracy: 0.799 (+/- 0.091)

C = 128.000 
Accuracy: 0.799 (+/- 0.091)

C = 256.000 
Accuracy: 0.799 (+/- 0.091)

C = 512.000 
Accuracy: 0.799 (+/- 0.091)



So, our baseline is ** 0.799 (+/- 0.091) **

Now try SVM with RBF.

In [5]:
kfold = KFold(n_splits=10, shuffle=True, random_state=44)

for C in [2.0 ** i for i in range(-5, 10)]:
    clf = SVC(C = C, kernel='rbf')
    scores = cross_val_score(clf, X_train, y_train, cv=kfold)
    print("C = %0.3f \nAccuracy: %0.3f (+/- %0.3f)\n" % (C, scores.mean(), scores.std() * 2))


C = 0.031 
Accuracy: 0.739 (+/- 0.073)

C = 0.062 
Accuracy: 0.788 (+/- 0.092)

C = 0.125 
Accuracy: 0.805 (+/- 0.100)

C = 0.250 
Accuracy: 0.810 (+/- 0.092)

C = 0.500 
Accuracy: 0.825 (+/- 0.077)

C = 1.000 
Accuracy: 0.820 (+/- 0.063)

C = 2.000 
Accuracy: 0.826 (+/- 0.091)

C = 4.000 
Accuracy: 0.834 (+/- 0.075)

C = 8.000 
Accuracy: 0.824 (+/- 0.081)

C = 16.000 
Accuracy: 0.814 (+/- 0.075)

C = 32.000 
Accuracy: 0.803 (+/- 0.071)

C = 64.000 
Accuracy: 0.786 (+/- 0.054)

C = 128.000 
Accuracy: 0.781 (+/- 0.067)

C = 256.000 
Accuracy: 0.779 (+/- 0.081)

C = 512.000 
Accuracy: 0.783 (+/- 0.074)



Results improve to ** 0.834 (+/- 0.075) **

Maybe it means that data have a cluster structure. 
Try K-nearest neighbor.

In [6]:
kfold = KFold(n_splits=10, shuffle=True, random_state=44)

for n_neighbors in range(1, 26):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(clf, X_train, y_train, cv=kfold)
    print("n_neighbors = %d \nAccuracy: %0.3f (+/- %0.3f)\n" % (n_neighbors, scores.mean(), scores.std() * 2))


n_neighbors = 1 
Accuracy: 0.746 (+/- 0.102)

n_neighbors = 2 
Accuracy: 0.781 (+/- 0.096)

n_neighbors = 3 
Accuracy: 0.787 (+/- 0.080)

n_neighbors = 4 
Accuracy: 0.783 (+/- 0.083)

n_neighbors = 5 
Accuracy: 0.794 (+/- 0.069)

n_neighbors = 6 
Accuracy: 0.800 (+/- 0.071)

n_neighbors = 7 
Accuracy: 0.813 (+/- 0.072)

n_neighbors = 8 
Accuracy: 0.805 (+/- 0.085)

n_neighbors = 9 
Accuracy: 0.811 (+/- 0.069)

n_neighbors = 10 
Accuracy: 0.809 (+/- 0.088)

n_neighbors = 11 
Accuracy: 0.807 (+/- 0.071)

n_neighbors = 12 
Accuracy: 0.814 (+/- 0.074)

n_neighbors = 13 
Accuracy: 0.806 (+/- 0.078)

n_neighbors = 14 
Accuracy: 0.808 (+/- 0.084)

n_neighbors = 15 
Accuracy: 0.815 (+/- 0.079)

n_neighbors = 16 
Accuracy: 0.806 (+/- 0.084)

n_neighbors = 17 
Accuracy: 0.807 (+/- 0.089)

n_neighbors = 18 
Accuracy: 0.803 (+/- 0.083)

n_neighbors = 19 
Accuracy: 0.810 (+/- 0.063)

n_neighbors = 20 
Accuracy: 0.814 (+/- 0.070)

n_neighbors = 21 
Accuracy: 0.801 (+/- 0.078)

n_neighbors = 22 
Accu

Results are better than logistic regression: ** 0.815 (+/- 0.079) **.
Then try add information about cluster structure to features.

In [7]:
for n_clusters in range(2,101):
    kmeans = KMeans(n_clusters=n_clusters, random_state=44)
    
    X_train_cluster_dist = kmeans.fit_transform(X_train)
    X_test_cluster_dist = kmeans.transform(X_test)
    
    scaler_cluster_dist = StandardScaler()
    
    X_train_cluster_dist = scaler_cluster_dist.fit_transform(X_train_cluster_dist)
    X_test_cluster_dist = scaler_cluster_dist.transform(X_test_cluster_dist)
    
    X_train2 = np.hstack([X_train, X_train_cluster_dist])
    X_test2 = np.hstack([X_test, X_test_cluster_dist])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=44)
    
    best_mean = 0
    best_std = 0
    best_C = 0
    for C in [2.0 ** i for i in range(-5, 10)]:
        clf = LogisticRegression(C = C)
        scores = cross_val_score(clf, X_train2, y_train, cv=kfold)
        if (best_mean < scores.mean()):
            best_mean = scores.mean()
            best_std = scores.std()
            best_C = C
    print("n_clusters = %d \nC = %0.3f \nAccuracy: %0.3f (+/- %0.3f)\n" % (n_clusters, best_C, best_mean, best_std * 2))


n_clusters = 2 
C = 0.250 
Accuracy: 0.795 (+/- 0.103)

n_clusters = 3 
C = 0.250 
Accuracy: 0.796 (+/- 0.105)

n_clusters = 4 
C = 64.000 
Accuracy: 0.803 (+/- 0.102)

n_clusters = 5 
C = 64.000 
Accuracy: 0.807 (+/- 0.103)

n_clusters = 6 
C = 8.000 
Accuracy: 0.804 (+/- 0.109)

n_clusters = 7 
C = 32.000 
Accuracy: 0.806 (+/- 0.099)

n_clusters = 8 
C = 128.000 
Accuracy: 0.807 (+/- 0.105)

n_clusters = 9 
C = 2.000 
Accuracy: 0.805 (+/- 0.103)

n_clusters = 10 
C = 4.000 
Accuracy: 0.806 (+/- 0.098)

n_clusters = 11 
C = 8.000 
Accuracy: 0.813 (+/- 0.104)

n_clusters = 12 
C = 32.000 
Accuracy: 0.807 (+/- 0.110)

n_clusters = 13 
C = 16.000 
Accuracy: 0.804 (+/- 0.105)

n_clusters = 14 
C = 16.000 
Accuracy: 0.820 (+/- 0.086)

n_clusters = 15 
C = 16.000 
Accuracy: 0.815 (+/- 0.096)

n_clusters = 16 
C = 128.000 
Accuracy: 0.824 (+/- 0.093)

n_clusters = 17 
C = 8.000 
Accuracy: 0.822 (+/- 0.101)

n_clusters = 18 
C = 16.000 
Accuracy: 0.818 (+/- 0.092)

n_clusters = 19 
C = 8.000 

Results are better than baseline, but worse than SVM with RBF: ** 0.832 (+/- 0.061) **.

Now test our model on different splitting to make sure, that we didn't overfit.

In [8]:
kmeans = KMeans(n_clusters=32, random_state=44)

X_train_cluster_dist = kmeans.fit_transform(X_train)
X_test_cluster_dist = kmeans.transform(X_test)

scaler_cluster_dist = StandardScaler()

X_train_cluster_dist = scaler_cluster_dist.fit_transform(X_train_cluster_dist)
X_test_cluster_dist = scaler_cluster_dist.transform(X_test_cluster_dist)

X_train2 = np.hstack([X_train, X_train_cluster_dist])
X_test2 = np.hstack([X_test, X_test_cluster_dist])

kfold = KFold(n_splits=10, shuffle=True, random_state=42) # random_state = 42, before used 44

clf = LogisticRegression(C = 8.0)

scores = cross_val_score(clf, X_train2, y_train, cv=kfold)

print("Accuracy: %0.3f (+/- %0.3f)\n" % (scores.mean(), scores.std() * 2))

Accuracy: 0.820 (+/- 0.098)



This result is within the limits of error.

Make a submission to Kaggle.

In [9]:
clf.fit(X_train2, y_train)
y_pred = clf.predict(X_test2)
ans = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived':y_pred})
ans.to_csv("ans.csv", index=False)

The result is 0.80383. It is within the limits of error too.