In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Load the Titanic data, using PassengerId as the row index, and discard the
# columns this model does not use (Name, Ticket, Age, Cabin).
# NOTE(review): the path is an absolute, machine-specific location; consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(
    '/home/yulius/Latihan-Data-Science/Data/titanic.csv',
    index_col='PassengerId',
).drop(columns=["Name", "Ticket", "Age", "Cabin"])

In [3]:


# Data splitting: separate the feature columns from the "Survived" target.
X = df.drop(columns="Survived")
y = df["Survived"]

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# A bare expression in the middle of a cell is evaluated and then discarded,
# so print the shapes explicitly to make the split sizes visible.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Preprocessing.
# Numeric columns: fill missing values with the column mean, then rescale each
# column with MinMaxScaler (KNN is distance based, so features need a
# comparable scale).
numerical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="mean")),
    ("Scaler", MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising; without it, a cross-validation fold (or
# the test set) containing a category absent from the fitted data would fail.
# NOTE: the variable name keeps its original spelling ("categorcal") because
# other cells may reference it.
categorcal_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("onehotencoding", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own sub-pipeline. Columns not listed
# here are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ["SibSp", "Parch", "Fare"]),
    ("Categorical", categorcal_pipeline, ["Sex", "Pclass", "Embarked"]),
])

# Full modelling pipeline: the preprocessing step feeds a k-nearest-neighbours
# classifier. The step names ("Prep", "algo") are the prefixes used when
# addressing step parameters, e.g. "algo__n_neighbors".
pipeline = Pipeline(
    steps=[
        ("Prep", preprocessor),
        ("algo", KNeighborsClassifier()),
    ]
)

# Hyperparameter search space for the "algo" (KNN) step:
#   n_neighbors - odd values from 1 to 49
#   weights     - uniform vs. distance-weighted voting
#   p           - Minkowski power (1 = Manhattan, 2 = Euclidean)
# 25 * 2 * 2 = 100 candidate combinations.
parameter = {
    "algo__n_neighbors": range(1, 51, 2),
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2],
}

# Exhaustive grid search with 3-fold cross-validation, using all available
# cores (n_jobs=-1) and printing progress (verbose=1).
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Evaluation: show the best hyperparameter combination found by the search,
# then the fitted search object's score on the training and test sets (no
# `scoring` was passed, so this is the estimator's default metric).
print(model.best_params_)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, test_score)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    2.4s


{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}
0.8174157303370787 0.7821229050279329


[Parallel(n_jobs=-1)]: Done 293 out of 300 | elapsed:    5.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.1s finished
