In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
train_data = pd.read_csv("titanic_train.csv")
test_data = pd.read_csv("titanic_test.csv")

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train_data.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [8]:
train_data.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [9]:
train_data.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
train_data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
train_data.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [12]:
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [13]:
X_train = preprocessor.fit_transform(train_data)

In [14]:
X_train

array([[-0.56573646, -0.50244517,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.66386103,  0.78684529,  0.43279337, ...,  0.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.48885426, -0.4745452 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.1046374 , -0.17626324,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.25833709, -0.04438104, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.49237783, -0.4745452 , ...,  1.        ,
         0.        ,  0.        ]])

In [15]:
y_train = train_data['Survived']

In [18]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
from sklearn.svm import SVC

param_grid = {
    'C': [.001, .1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

SVC = SVC()

SVC_clf = RandomizedSearchCV(SVC, param_grid, scoring='accuracy', n_jobs=-1, verbose=10, random_state=42)
SVC_search = SVC_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1929s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0893s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  43 out of  50 | elapsed:    2.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.3s finished


In [23]:
print("Best score: ", SVC_search.best_score_)
print("Best params: ", SVC_search.best_params_)

Best score:  0.7957880861213986
Best params:  {'kernel': 'poly', 'C': 100}


In [24]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [1, 10, 50, 100, 250, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 50, 100, None],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf = RandomForestClassifier()
rf_clf = RandomizedSearchCV(rf, param_grid, scoring='accuracy', n_jobs=-1, random_state=42, verbose=10)
rf_search = rf_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1798s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.8s finished


In [25]:
print("Best score: ", rf_search.best_score_)
print("Best params: ", rf_search.best_params_)

Best score:  0.8283284162952732
Best params:  {'n_estimators': 250, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy'}


In [27]:
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [.0001, .01, .1, 1, 10, 100],
    'fit_intercept': [True, False],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

log_reg = LogisticRegression()
log_reg_clf = RandomizedSearchCV(log_reg, param_grid, scoring='accuracy', n_jobs=-1, random_state=42, verbose=5)
log_reg_search = log_reg_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  43 out of  50 | elapsed:    2.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.2s finished


In [28]:
print("Best score: ", log_reg_search.best_score_)
print("Best params: ", log_reg_search.best_params_)

Best score:  0.7923796371853619
Best params:  {'solver': 'newton-cg', 'fit_intercept': False, 'C': 10}
