In [1]:
# Import Library Yang diguanakan

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Import data dan drop kolom
df = pd.read_csv("titanic.csv", index_col="PassengerId")
df.drop(columns=['Name', 'Ticket', 'Age', 'Cabin'], inplace=True)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


In [3]:
# Splitting Dataset
X = df.drop(columns=['Survived'])
y = df.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 6), (179, 6), (712,), (179,))

In [8]:
# Preprocesor
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder())])



In [9]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
693,3,male,0,0,56.4958,S
482,2,male,0,0,0.0,S
528,1,male,0,0,221.7792,S
856,3,female,0,1,9.35,S
802,2,female,1,1,26.25,S


In [10]:
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ['SibSp', 'Parch', 'Fare']), 
    ("categorical", categorical_pipeline, ['Pclass', 'Sex', 'Embarked'])])

In [11]:
# Pipeline
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())])


In [12]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                                  ['Pclass', 'Sex',
                         

In [14]:
pipeline.score(X_test, y_test)

0.776536312849162

In [15]:
pipeline.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                                  ['Pcla

In [16]:
pipeline.get_params()

{'memory': None,
 'steps': [('prep', ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('imputer', SimpleImputer()),
                                                    ('scaler', MinMaxScaler())]),
                                    ['SibSp', 'Parch', 'Fare']),
                                   ('categorical',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehot', OneHotEncoder())]),
                                    ['Pclass', 'Sex', 'Embarked'])])),
  ('algo', KNeighborsClassifier())],
 'verbose': False,
 'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler', MinMaxScaler())]),
                                  ['SibSp', 'Parch', 'Far

In [17]:
# Parameter Tuning
parameter = {
    'algo__n_neighbors': np.arange(1,51,2),
    'algo__weights': ['uniform', 'distance'],
    'algo__p': [1,2]
}

model = GridSearchCV(pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('im

In [24]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
40,0.026002,0.009201,0.040003,0.016268,21,1,uniform,"{'algo__n_neighbors': 21, 'algo__p': 1, 'algo_...",0.815126,0.818565,0.810127,0.814606,0.003465,1
36,0.025002,0.006482,0.030336,0.00309,19,1,uniform,"{'algo__n_neighbors': 19, 'algo__p': 1, 'algo_...",0.815126,0.818565,0.805907,0.8132,0.005344,2
24,0.023668,0.002625,0.029669,0.002625,13,1,uniform,"{'algo__n_neighbors': 13, 'algo__p': 1, 'algo_...",0.819328,0.810127,0.810127,0.813194,0.004337,3
28,0.030003,0.011342,0.038335,0.016051,15,1,uniform,"{'algo__n_neighbors': 15, 'algo__p': 1, 'algo_...",0.819328,0.810127,0.810127,0.813194,0.004337,3
38,0.023335,0.003399,0.026668,0.000471,19,2,uniform,"{'algo__n_neighbors': 19, 'algo__p': 2, 'algo_...",0.815126,0.814346,0.805907,0.811793,0.004174,5


In [22]:
model.best_params_

{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}

In [23]:
model.score(X_train, y_train), model.score(X_test, y_test)

(0.8174157303370787, 0.7821229050279329)

### Model Prediction


In [26]:
data = [
    [1, 'female', 1, 1, 80, 'S' ],
    [3, 'male', 0, 0, 5, 'S']
]

X_pred = pd.DataFrame(data, index=["Rose", "Jack"], columns=X.columns)
X_pred

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
Rose,1,female,1,1,80,S
Jack,3,male,0,0,5,S


In [28]:
model.predict(X_pred)

array([1, 0], dtype=int64)

In [29]:
X_pred["Survived"] = model.predict(X_pred)
X_pred

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived
Rose,1,female,1,1,80,S,1
Jack,3,male,0,0,5,S,0
