In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
#sklearn imports
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

In [3]:
#Function declaration
def analyze(df):
    print(df.info(), df.describe())
    
def feature_impotances(X, y, model=RandomForestClassifier(random_state=42)):
    model.fit(X, y)
    imp_dict = {}
    for col, score in zip(X.columns, model.feature_importances_):
        imp_dict[col] = score.round(4)
    print(imp_dict)
    print("Unimportant features: ", "\n")
    print([col for col in imp_dict if imp_dict[col] == 0])
    
def score_model(X_train, y_train, X_valid, y_valid, model):
    model.fit(X_train, y_train)
    y_predt = model.predict(X_train)
    y_predv = model.predict(X_valid)
    print("train accuracy:", accuracy_score(y_train, y_predt), precision_score(y_train, y_predt), recall_score(y_train, y_predt))
    print("valid accuracy:", accuracy_score(y_valid, y_predv), precision_score(y_valid, y_predv), recall_score(y_valid, y_predv))
    print("")
    
def tune_hyperparameter(X_train, y_train, X_valid, y_valid, gridsearch):
    score_model(X_train, y_train, X_valid, y_valid, gridsearch)
    print(gridsearch.best_params_, "\n")

In [4]:
#Prepare datasets
X = pd.read_csv('/kaggle/input/titanic/train.csv')
X.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
y = X.pop('Survived')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')
X_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

num_cols = X.select_dtypes(['number']).columns
cat_cols = X.select_dtypes(['object']).columns

to_drop = X[X['Embarked'].isnull()].index
X.drop(to_drop, inplace=True)
y.drop(to_drop, inplace=True)

print(num_cols, "\n", cat_cols)
print(y.value_counts())
print(X.shape, y.shape)
analyze(X)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object') 
 Index(['Sex', 'Cabin', 'Embarked'], dtype='object')
Survived
0    549
1    340
Name: count, dtype: int64
(889, 8) (889,)
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    int64  
 1   Sex       889 non-null    object 
 2   Age       712 non-null    float64
 3   SibSp     889 non-null    int64  
 4   Parch     889 non-null    int64  
 5   Fare      889 non-null    float64
 6   Cabin     202 non-null    object 
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 62.5+ KB
None            Pclass         Age       SibSp       Parch        Fare
count  889.000000  712.000000  889.000000  889.000000  889.000000
mean     2.311586   29.642093    0.524184    0.382452   32.096681
std      0.834700   14.492933    1.103705    0.806761  

In [5]:
#Feature engineering
def feature_engineer(df):
    df = df.copy()
    #df['Ticket'] = df['Ticket'].apply(lambda x: x[:3] if len(x) > 7 else 'normal')
    df['Cabin'] = df['Cabin'].apply(lambda x: 1 if type(x) == str else 0)
    return df
X_eng = feature_engineer(X)

#plt.hist(X_eng.Age)
print(X_eng.Cabin.unique())

num_cols = X_eng.select_dtypes(['number']).columns
cat_cols = X_eng.select_dtypes(['object']).columns
print(num_cols, "\n", cat_cols)
analyze(X_eng)

[0 1]
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin'], dtype='object') 
 Index(['Sex', 'Embarked'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    int64  
 1   Sex       889 non-null    object 
 2   Age       712 non-null    float64
 3   SibSp     889 non-null    int64  
 4   Parch     889 non-null    int64  
 5   Fare      889 non-null    float64
 6   Cabin     889 non-null    int64  
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB
None            Pclass         Age       SibSp       Parch        Fare       Cabin
count  889.000000  712.000000  889.000000  889.000000  889.000000  889.000000
mean     2.311586   29.642093    0.524184    0.382452   32.096681    0.227222
std      0.834700   14.492933    1.103705    0.806761   49.697504    0.419273
min 

In [6]:
#Data preprocessing
num_pipeline = make_pipeline(
    SimpleImputer(strategy='median'), 
    #PCA(0.95, random_state=42), 
    StandardScaler()
)
cat_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
col_transformer = make_column_transformer(
    (num_pipeline, num_cols), 
    (cat_pipeline, cat_cols)
)

col_transformer.fit(X_eng)
X_trans = pd.DataFrame(col_transformer.transform(X_eng), columns=col_transformer.get_feature_names_out(), index=X_eng.index)
print(X_trans.shape, y.shape)
analyze(X_trans)

(889, 11) (889,)
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pipeline-1__Pclass      889 non-null    float64
 1   pipeline-1__Age         889 non-null    float64
 2   pipeline-1__SibSp       889 non-null    float64
 3   pipeline-1__Parch       889 non-null    float64
 4   pipeline-1__Fare        889 non-null    float64
 5   pipeline-1__Cabin       889 non-null    float64
 6   pipeline-2__Sex_female  889 non-null    float64
 7   pipeline-2__Sex_male    889 non-null    float64
 8   pipeline-2__Embarked_C  889 non-null    float64
 9   pipeline-2__Embarked_Q  889 non-null    float64
 10  pipeline-2__Embarked_S  889 non-null    float64
dtypes: float64(11)
memory usage: 83.3 KB
None        pipeline-1__Pclass  pipeline-1__Age  pipeline-1__SibSp  \
count        8.890000e+02     8.890000e+02       8.890000e+02   
mean        -2.43



In [7]:
#feature_impotances(X_trans, y)

In [8]:
#Train & Test set preparation
X_train, X_valid, y_train, y_valid = train_test_split(X_trans, y, random_state=42)

print(y_train.value_counts(), y_valid.value_counts())
print(X_train.shape, y_train.shape)

Survived
0    408
1    258
Name: count, dtype: int64 Survived
0    141
1     82
Name: count, dtype: int64
(666, 11) (666,)


In [9]:
#Model training & selection
lr = LogisticRegression(random_state=42, max_iter=1000, tol=1e-3)
svc = SVC(random_state=42)
# rfc = RandomForestClassifier(random_state=42)
etc = ExtraTreesClassifier(random_state=42)
abc = AdaBoostClassifier(random_state=42)
# knc = KNeighborsClassifier()

score_model(X_train, y_train, X_valid, y_valid, lr)
score_model(X_train, y_train, X_valid, y_valid, svc)
# score_model(X_train, y_train, X_valid, y_valid, rfc)
score_model(X_train, y_train, X_valid, y_valid, etc)
score_model(X_train, y_train, X_valid, y_valid, abc)
# score_model(X_train, y_train, X_valid, y_valid, knc)

train accuracy: 0.7972972972972973 0.7530864197530864 0.7093023255813954
valid accuracy: 0.7982062780269058 0.7126436781609196 0.7560975609756098

train accuracy: 0.8408408408408409 0.8333333333333334 0.7364341085271318
valid accuracy: 0.8251121076233184 0.7654320987654321 0.7560975609756098

train accuracy: 0.9864864864864865 0.9960159362549801 0.9689922480620154
valid accuracy: 0.7982062780269058 0.7228915662650602 0.7317073170731707

train accuracy: 0.8453453453453453 0.8112449799196787 0.7829457364341085
valid accuracy: 0.8116591928251121 0.7272727272727273 0.7804878048780488



In [10]:
#Hyperparameter tuning
param_grid1 = {'C': [0.1, 1.0, 10]}
param_grid2 = {'C': [0.1, 1.0, 10, 100], 'degree': [1, 2, 3], 'kernel': ["rbf", "poly"]}
# param_grid3 = {'max_depth': [5, 10, 50, 100], 'min_samples_split': [4, 6, 8, 10], 'n_estimators': [50, 100, 500]}
param_grid3 = {'n_estimators': range(100, 1001), 'min_samples_split': range(2, 10), 'max_depth': range(1, 15)}
param_grid4 = {'learning_rate': [0.1, 1.0, 10], 'n_estimators': [10, 50, 100, 500]}
# param_grid5 = {'n_neighbors': range(2, 11)}

gs_lr = GridSearchCV(lr, param_grid1, cv=3)
gs_svc = GridSearchCV(svc, param_grid2, cv=3)
# gs_etc = GridSearchCV(etc, param_grid3, cv=5)
rs_etc = RandomizedSearchCV(etc, param_grid3, n_iter=15, cv=5, random_state=42)
gs_abc = GridSearchCV(abc, param_grid4, cv=3)

tune_hyperparameter(X_train, y_train, X_valid, y_valid, gs_lr)
tune_hyperparameter(X_train, y_train, X_valid, y_valid, gs_svc)
tune_hyperparameter(X_train, y_train, X_valid, y_valid, rs_etc)
tune_hyperparameter(X_train, y_train, X_valid, y_valid, gs_abc)

train accuracy: 0.7972972972972973 0.7530864197530864 0.7093023255813954
valid accuracy: 0.7982062780269058 0.7126436781609196 0.7560975609756098

{'C': 1.0} 

train accuracy: 0.8408408408408409 0.8333333333333334 0.7364341085271318
valid accuracy: 0.8251121076233184 0.7654320987654321 0.7560975609756098

{'C': 1.0, 'degree': 1, 'kernel': 'rbf'} 

train accuracy: 0.9504504504504504 0.987012987012987 0.8837209302325582
valid accuracy: 0.8116591928251121 0.7702702702702703 0.6951219512195121

{'n_estimators': 201, 'min_samples_split': 3, 'max_depth': 13} 

train accuracy: 0.8093093093093093 0.788546255506608 0.6937984496124031
valid accuracy: 0.8161434977578476 0.7530864197530864 0.7439024390243902

{'learning_rate': 10, 'n_estimators': 100} 



In [11]:
#Ensemble training & selection
lr.set_params(**gs_lr.best_params_)
svc.set_params(**gs_svc.best_params_)
svc.probability = True
svc.fit(X_train, y_train)
etc.set_params(**rs_etc.best_params_)
etc.min_samples_leaf = 5
# score_model(X_train, y_train, X_valid, y_valid, etc)
abc.set_params(**gs_abc.best_params_)

vc = VotingClassifier([
#     ('lr', lr), 
    ('svc', svc), 
#     ('etc', etc),
    ('abc', abc), 
#     ('knc', knc), 
], voting='soft')
sc = StackingClassifier([
#     ('lr', lr), 
    ('svc', svc), 
#     ('etc', etc),
    ('abc', abc), 
#     ('knc', knc), 
], cv=5)
score_model(X_train, y_train, X_valid, y_valid, vc)
score_model(X_train, y_train, X_valid, y_valid, sc)

train accuracy: 0.8393393393393394 0.8385650224215246 0.7248062015503876
valid accuracy: 0.8251121076233184 0.7654320987654321 0.7560975609756098

train accuracy: 0.8378378378378378 0.8348214285714286 0.7248062015503876
valid accuracy: 0.8251121076233184 0.7654320987654321 0.7560975609756098



In [12]:
#Final prediction
vc.fit(X_trans, y)
sc.fit(X_trans, y)
svc.fit(X_trans, y)
print(vc.score(X_valid, y_valid), sc.score(X_valid, y_valid), svc.score(X_valid, y_valid))

X_test_eng = feature_engineer(X_test)
# print(num_cols, "\n", cat_cols)
# analyze(X_test_eng)

X_test_trans = pd.DataFrame(col_transformer.transform(X_test_eng), columns=col_transformer.get_feature_names_out(), index=X_test_eng.index)
# print(X_test_trans.shape)
# analyze(X_test_trans)

predictions = pd.DataFrame({"PassengerId": X_test_trans.index+892, "Survived": svc.predict(X_test_trans)})
predictions.to_csv("submission.csv", index=False)
pd.read_csv('/kaggle/working/submission.csv')

0.820627802690583 0.8295964125560538 0.8295964125560538


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
