# Summary

In [1]:
# Import the libraries used in this notebook

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


# Load the Titanic dataset (indexed by PassengerId) and discard the columns
# that are not fed to the model. The drop is chained onto the read so the
# frame is built in a single expression rather than mutated in place.
df = (
    pd.read_csv("titanic.csv", index_col="PassengerId")
    .drop(columns=['Name', 'Ticket', 'Age', 'Cabin'])
)
df.head()


# Separate the target column from the features, then hold out 20% of the
# rows as a test set. Stratifying on y keeps the class proportions close to
# equal in both splits; the fixed random_state makes the split reproducible.
y = df['Survived']
X = df.drop(columns=['Survived'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


# Preprocessor
# Numeric features: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown="ignore" makes the encoder emit an
# all-zero row for a category that was absent when it was fitted, instead of
# raising at transform time (e.g. on a cross-validation fold or on new data).
# When every category has been seen, the output is the same as the default.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Route each group of columns to the pipeline that handles its type.
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ['SibSp', 'Parch', 'Fare']),
    ("categorical", categorical_pipeline, ['Pclass', 'Sex', 'Embarked'])])


# Full model: the preprocessing step followed by a k-nearest-neighbours
# classifier. The step names ("prep", "algo") are the prefixes used when
# addressing hyperparameters in a grid search.
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier()),
])


# Search space for the KNN step ("algo" in the pipeline): odd neighbour
# counts from 1 to 49, both weighting schemes, and Minkowski power p=1 and
# p=2.
parameter = {
    'algo__n_neighbors': np.arange(1, 51, 2),
    'algo__weights': ['uniform', 'distance'],
    'algo__p': [1, 2],
}

# Exhaustive grid search with 3-fold cross-validation on the training split,
# run on all available cores.
model = GridSearchCV(
    pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)


# Evaluation: report the best hyperparameters, then three scores in order:
# the training split, the best mean cross-validation score, and the
# held-out test split.
print(model.best_params_)
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}
0.8174157303370787 0.8146060111808436 0.7821229050279329


In [2]:
from jcopml.pipeline import num_pipe, cat_pipe

In [3]:
# Same column routing as the hand-written preprocessor, built from jcopml's
# pipeline helpers.
# NOTE(review): num_pipe / cat_pipe are presumed to wrap imputation plus the
# requested scaler / encoder — confirm against the jcopml documentation.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe(scaling='minmax'), ['SibSp', 'Parch', 'Fare']),
        ('categoric', cat_pipe(encoder='onehot'), ['Pclass', 'Sex', 'Embarked']),
    ]
)

In [5]:
# jcopml's grid_search_params module supplies the search grid used below
# (gsp.knn_params); imported first so the cell reads top-down.
from jcopml.tuning import grid_search_params as gsp

# Rebuild the pipeline so it uses the current `preprocessor`.
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier()),
])

# Grid search over gsp.knn_params with 3-fold cross-validation on the
# training split, run on all available cores.
model = GridSearchCV(
    pipeline,
    param_grid=gsp.knn_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
model.fit(X_train, y_train)

# Evaluation: report the best hyperparameters, then three scores in order:
# the training split, the best mean cross-validation score, and the
# held-out test split.
print(model.best_params_)
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

Fitting 3 folds for each of 90 candidates, totalling 270 fits
{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}
0.8174157303370787 0.8146060111808436 0.7821229050279329
