# 1. Goal

The goal of this work to predict if passenger of Titanic will survive or not.

# 2. Data and hypothesis

Let's have a look on the data:

In [3]:
# imports
import os
import pandas as pd
import matplotlib as plt
import sklearn
import xgboost
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
# from sklearn import feature_selection
# from sklearn import model_selection
# from sklearn import metrics

from sklearn.model_selection import cross_val_score


In [4]:
data_train = pd.read_csv('./input/train.csv')
data_test = pd.read_csv('./input/test.csv')
data_all = [data_train, data_test]

data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data summary's:

In [5]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
data_train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Moss, Mr. Albert Johan",male,,,,CA. 2343,,B96 B98,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [7]:
for data in data_all:
    # 'Sex' to bool    
    data['Sex'] = [int(x == 'male') for x in data["Sex"]]
    

In [8]:
for data in data_all:

# Working with NaN's
    data['Age'].fillna(data['Age'].mean(), inplace=True); # Mean value
    data['Embarked'].fillna('S', inplace=True); # Most frequent value

In [9]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [10]:
X_cols = ['Pclass', 'Sex', 'Age']
X_cols_num = ['Age', 'Sex', 'SibSp', 'Parch']
X_cols_cat = ['Pclass', 'Embarked']
Y_cols = 'Survived'

In [15]:
def Training(X, y):
    models = [
        #linear_model.LogisticRegression(),
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.RidgeClassifier(),
        
        svm.SVC(),
        
        xgboost.XGBClassifier(),
        xgboost.XGBRFClassifier(),
        
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier()
        ]

    scores = []
    scores_cv = []

    for model in models:
        scores_cv.append(cross_val_score(model, X, y, cv=5).mean())
        model.fit(X, y)
        scores.append(model.score(X, y))
        print('{:<30}{:>8.3f}{:>8.3f}'.format(model.__class__.__name__, scores[-1], scores_cv[-1]))
    return models[scores_cv.index(max(scores_cv))]

In [11]:
# Normalization and working with categorical data

oh_enc = OneHotEncoder(handle_unknown='ignore')
or_enc = OrdinalEncoder()

data_oh_enc_all = []
data_or_enc_all = []
for data in data_all:
    data_oh_enc_all.append(oh_enc.fit_transform(data[X_cols_cat]))
    data_or_enc_all.append(or_enc.fit_transform(data[X_cols_cat]))

In [17]:
X_oh_all = []
for data, data_enc in zip(data_all, data_oh_enc_all):
    X_oh_all.append(data[X_cols_num].join(pd.DataFrame(data_enc.toarray())))

X_or_all = []
for data, data_enc in zip(data_all, data_or_enc_all):
    X_or_all.append(data[X_cols_num].join(pd.DataFrame(data_enc)))    

X_all = [X_oh_all, X_or_all]
    
X = X_oh_all[0]
y = data_train[Y_cols]

In [None]:
model = Training(X, y)
model.__class__.__name__

In [29]:
for X in X_all:
    print(20*'-')
    model = Training(X[0], y)
    model.__class__.__name__

--------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV             0.805   0.789
SGDClassifier                    0.810   0.740
RidgeClassifier                  0.799   0.793
SVC                              0.639   0.640
XGBClassifier                    0.919   0.818
XGBRFClassifier                  0.859   0.809
DecisionTreeClassifier           0.937   0.794
ExtraTreeClassifier              0.937   0.780
AdaBoostClassifier               0.826   0.816
BaggingClassifier                0.928   0.806
ExtraTreesClassifier             0.937   0.788
GradientBoostingClassifier       0.871   0.824
RandomForestClassifier           0.937   0.803
--------------------
LogisticRegressionCV             0.804   0.789
SGDClassifier                    0.783   0.694
RidgeClassifier                  0.797   0.791
SVC                              0.640   0.641
XGBClassifier                    0.917   0.820
XGBRFClassifier                  0.855   0.818
DecisionTreeClassifier           0.937   0.792
ExtraTreeClassifier              0.937 

In [62]:
for data in data_all:
    # 'Sex' to bool    
    data['SibSp'] = [int(x > 0) for x in data['SibSp']]
    data['Parch'] = [int(x > 0) for x in data['Parch']]
    data['Age'] = data['Age']/data['Age'].mean()
    
    
X_all = []
for data, data_enc in zip(data_all, data_enc_all):
    X_all.append(data[X_cols_num].join(pd.DataFrame(data_enc.toarray())))

X = X_all[0]

In [63]:
model = Training(X, y)
model.__class__.__name__

LogisticRegressionCV             0.797   0.795
SGDClassifier                    0.800   0.768
RidgeClassifier                  0.791   0.787
SVC                              0.824   0.813
XGBClassifier                    0.917   0.807
XGBRFClassifier                  0.853   0.803
DecisionTreeClassifier           0.930   0.789
ExtraTreeClassifier              0.930   0.779
AdaBoostClassifier               0.823   0.806
BaggingClassifier                0.918   0.808
ExtraTreesClassifier             0.930   0.791
GradientBoostingClassifier       0.865   0.808
RandomForestClassifier           0.930   0.801


'SVC'

In [64]:
X_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
X_cols_num = ['Age', 'Sex', 'SibSp', 'Parch', 'Pclass']
X_cols_cat = ['Embarked']
Y_cols = 'Survived'

In [65]:
# Normalization and working with categorical data

enc = OneHotEncoder(handle_unknown='ignore')

data_enc_all = []
for data in data_all:
    data_enc_all.append(enc.fit_transform(data[X_cols_cat]))

In [66]:
X_all = []
for data, data_enc in zip(data_all, data_enc_all):
    X_all.append(data[X_cols_num].join(pd.DataFrame(data_enc.toarray())))

X = X_all[0]

In [67]:
model = Training(X, y)
model.__class__.__name__

LogisticRegressionCV             0.797   0.795
SGDClassifier                    0.744   0.772
RidgeClassifier                  0.791   0.787
SVC                              0.824   0.813
XGBClassifier                    0.917   0.807
XGBRFClassifier                  0.853   0.803
DecisionTreeClassifier           0.930   0.792
ExtraTreeClassifier              0.930   0.782
AdaBoostClassifier               0.823   0.806
BaggingClassifier                0.920   0.791
ExtraTreesClassifier             0.930   0.799
GradientBoostingClassifier       0.865   0.808
RandomForestClassifier           0.930   0.803


'SVC'

In [19]:
y_test = model.predict(X_test)

ans = pd.DataFrame()
ans["Survived"] = y_test
ans["PassengerId"] = data_test["PassengerId"]
ans.set_index("PassengerId", inplace=True)
ans.to_csv('./output/submission.csv')