# K-NN

refs: 
    
- PCA: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from spaceship_titanic import feature_enginnering as fe

In [2]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [3]:
def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    df = df.drop(['Name'],axis=1)
    df = fe.fillna_homeplanet_and_destination(df)
    df = fe.fill_with_0_people_with_no_other_wastes(df)
    df = fe.fill_0_wastes_people_cryosleep(df)
    df = fe.fill_with_non_0_median(df)
    df = fe.fill_cryosleep(df)
    df = fe.feature_inputer(df, to_mode=[], to_median=['Age'])
    df = fe.cabin_inputer(df)
    df = fe.vip_knn_input(df)
    df['TotalService'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df = fe.outliers_to_log(df, ['TotalService','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
    df = fe.dtype_memory_reducer(df)
    return df
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    df['Deck'] = df['Deck'].map(dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'],[1,2,3,4,5,6,7])))
    df = pd.get_dummies(df,columns=['HomePlanet','Destination','Side'],drop_first=True)
    return df

def train_test_treatments(df, full_df):
    df['GroupSize'] = fe.calculate_groupsize(df.index, full_df.index)
    df['GroupLastNameSize'] = fe.calculate_group_lastname_size(df.index, full_df)
    df.drop(['Side_U','VIP'],axis=1, inplace=True)
    return df

In [4]:
df = treat_dataset(df_train.copy())
df = feature_enginnering(df)
df = train_test_treatments(df, pd.concat([df_train,df_test]))

In [20]:
X, y = df.drop('Transported',axis=1), df['Transported']
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

In [33]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('skb', SelectKBest(chi2, k = 10)),
    ('model', KNeighborsClassifier())
])

## Tuning

In [37]:
params = {
    'skb__k':[5, 7, 9, 11, 13, 15],
    'model__n_neighbors' : [5, 7, 9, 13, 17, 25],
    'model__weights' : ['uniform'],
    'model__metric' : ['minkowski','euclidean','manhattan']
}

In [38]:
grid = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=KFold(5, shuffle=True, random_state=42),
    verbose=3,
    return_train_score=True
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=5;, score=(train=0.812, test=0.794) total time=   0.0s
[CV 2/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=5;, score=(train=0.814, test=0.771) total time=   0.0s
[CV 3/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=5;, score=(train=0.819, test=0.768) total time=   0.0s
[CV 4/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=5;, score=(train=0.800, test=0.775) total time=   0.0s
[CV 5/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=5;, score=(train=0.816, test=0.772) total time=   0.0s
[CV 1/5] END model__metric=minkowski, model__n_neighbors=5, model__weights=uniform, skb__k=7;, score=(train=0.806, test=0.768) total time=   0.0s
[CV 2/5] END model__metric=minkowski, model__n_neighbors=5, m

[CV 2/5] END model__metric=minkowski, model__n_neighbors=7, model__weights=uniform, skb__k=15;, score=(train=0.827, test=0.768) total time=   0.0s
[CV 3/5] END model__metric=minkowski, model__n_neighbors=7, model__weights=uniform, skb__k=15;, score=(train=0.828, test=0.768) total time=   0.0s
[CV 4/5] END model__metric=minkowski, model__n_neighbors=7, model__weights=uniform, skb__k=15;, score=(train=0.823, test=0.790) total time=   0.0s
[CV 5/5] END model__metric=minkowski, model__n_neighbors=7, model__weights=uniform, skb__k=15;, score=(train=0.830, test=0.771) total time=   0.0s
[CV 1/5] END model__metric=minkowski, model__n_neighbors=9, model__weights=uniform, skb__k=5;, score=(train=0.798, test=0.785) total time=   0.0s
[CV 2/5] END model__metric=minkowski, model__n_neighbors=9, model__weights=uniform, skb__k=5;, score=(train=0.807, test=0.783) total time=   0.0s
[CV 3/5] END model__metric=minkowski, model__n_neighbors=9, model__weights=uniform, skb__k=5;, score=(train=0.803, test=

[CV 3/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=13;, score=(train=0.805, test=0.789) total time=   0.0s
[CV 4/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=13;, score=(train=0.799, test=0.786) total time=   0.0s
[CV 5/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=13;, score=(train=0.802, test=0.772) total time=   0.0s
[CV 1/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=15;, score=(train=0.815, test=0.758) total time=   0.0s
[CV 2/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=15;, score=(train=0.814, test=0.786) total time=   0.1s
[CV 3/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=15;, score=(train=0.808, test=0.777) total time=   0.0s
[CV 4/5] END model__metric=minkowski, model__n_neighbors=13, model__weights=uniform, skb__k=15;, score=(train=0.

[CV 4/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=11;, score=(train=0.792, test=0.798) total time=   0.0s
[CV 5/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=11;, score=(train=0.794, test=0.768) total time=   0.0s
[CV 1/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=13;, score=(train=0.793, test=0.774) total time=   0.0s
[CV 2/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=13;, score=(train=0.789, test=0.789) total time=   0.0s
[CV 3/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=13;, score=(train=0.793, test=0.774) total time=   0.0s
[CV 4/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=13;, score=(train=0.789, test=0.797) total time=   0.1s
[CV 5/5] END model__metric=minkowski, model__n_neighbors=25, model__weights=uniform, skb__k=13;, score=(train=0.

[CV 5/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=9;, score=(train=0.800, test=0.778) total time=   0.0s
[CV 1/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=11;, score=(train=0.816, test=0.765) total time=   0.0s
[CV 2/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=11;, score=(train=0.807, test=0.787) total time=   0.0s
[CV 3/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=11;, score=(train=0.809, test=0.776) total time=   0.0s
[CV 4/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=11;, score=(train=0.800, test=0.774) total time=   0.0s
[CV 5/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=11;, score=(train=0.773, test=0.737) total time=   0.0s
[CV 1/5] END model__metric=euclidean, model__n_neighbors=7, model__weights=uniform, skb__k=13;, score=(train=0.818, tes

[CV 1/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=9;, score=(train=0.802, test=0.783) total time=   0.0s
[CV 2/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=9;, score=(train=0.797, test=0.784) total time=   0.0s
[CV 3/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=9;, score=(train=0.804, test=0.776) total time=   0.0s
[CV 4/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=9;, score=(train=0.796, test=0.805) total time=   0.0s
[CV 5/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=9;, score=(train=0.802, test=0.787) total time=   0.0s
[CV 1/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=11;, score=(train=0.805, test=0.774) total time=   0.0s
[CV 2/5] END model__metric=euclidean, model__n_neighbors=13, model__weights=uniform, skb__k=11;, score=(train=0.802, 

[CV 2/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=7;, score=(train=0.791, test=0.790) total time=   0.0s
[CV 3/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=7;, score=(train=0.793, test=0.769) total time=   0.0s
[CV 4/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=7;, score=(train=0.789, test=0.804) total time=   0.0s
[CV 5/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=7;, score=(train=0.793, test=0.779) total time=   0.0s
[CV 1/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=9;, score=(train=0.796, test=0.786) total time=   0.0s
[CV 2/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=9;, score=(train=0.795, test=0.786) total time=   0.0s
[CV 3/5] END model__metric=euclidean, model__n_neighbors=25, model__weights=uniform, skb__k=9;, score=(train=0.798, te

[CV 3/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=5;, score=(train=0.811, test=0.774) total time=   0.0s
[CV 4/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=5;, score=(train=0.795, test=0.779) total time=   0.0s
[CV 5/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=5;, score=(train=0.807, test=0.780) total time=   0.0s
[CV 1/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=7;, score=(train=0.803, test=0.779) total time=   0.0s
[CV 2/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=7;, score=(train=0.798, test=0.762) total time=   0.0s
[CV 3/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=7;, score=(train=0.807, test=0.766) total time=   0.0s
[CV 4/5] END model__metric=manhattan, model__n_neighbors=7, model__weights=uniform, skb__k=7;, score=(train=0.801, test=0.79

[CV 4/5] END model__metric=manhattan, model__n_neighbors=9, model__weights=uniform, skb__k=15;, score=(train=0.818, test=0.800) total time=   0.0s
[CV 5/5] END model__metric=manhattan, model__n_neighbors=9, model__weights=uniform, skb__k=15;, score=(train=0.829, test=0.768) total time=   0.1s
[CV 1/5] END model__metric=manhattan, model__n_neighbors=13, model__weights=uniform, skb__k=5;, score=(train=0.796, test=0.789) total time=   0.0s
[CV 2/5] END model__metric=manhattan, model__n_neighbors=13, model__weights=uniform, skb__k=5;, score=(train=0.799, test=0.785) total time=   0.0s
[CV 3/5] END model__metric=manhattan, model__n_neighbors=13, model__weights=uniform, skb__k=5;, score=(train=0.800, test=0.784) total time=   0.0s
[CV 4/5] END model__metric=manhattan, model__n_neighbors=13, model__weights=uniform, skb__k=5;, score=(train=0.788, test=0.781) total time=   0.0s
[CV 5/5] END model__metric=manhattan, model__n_neighbors=13, model__weights=uniform, skb__k=5;, score=(train=0.800, te

[CV 5/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=13;, score=(train=0.797, test=0.769) total time=   0.0s
[CV 1/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=15;, score=(train=0.807, test=0.758) total time=   0.0s
[CV 2/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=15;, score=(train=0.810, test=0.787) total time=   0.0s
[CV 3/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=15;, score=(train=0.807, test=0.788) total time=   0.0s
[CV 4/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=15;, score=(train=0.805, test=0.800) total time=   0.0s
[CV 5/5] END model__metric=manhattan, model__n_neighbors=17, model__weights=uniform, skb__k=15;, score=(train=0.811, test=0.773) total time=   0.0s
[CV 1/5] END model__metric=manhattan, model__n_neighbors=25, model__weights=uniform, skb__k=5;, score=(train=0.7

In [39]:
pd.DataFrame(grid.cv_results_)[['mean_train_score','std_train_score','mean_test_score','std_test_score']].loc[grid.best_index_]

mean_train_score    0.796736
std_train_score     0.002306
mean_test_score     0.791343
std_test_score      0.008594
Name: 104, dtype: float64

In [40]:
grid.best_params_

{'model__metric': 'manhattan',
 'model__n_neighbors': 25,
 'model__weights': 'uniform',
 'skb__k': 9}

In [42]:
model = grid.best_estimator_
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

       False       0.73      0.79      0.76       861
        True       0.77      0.72      0.75       878

    accuracy                           0.75      1739
   macro avg       0.75      0.75      0.75      1739
weighted avg       0.75      0.75      0.75      1739



## Cross Validation

# Submission

In [44]:
X_test = treat_dataset(df_test.copy())
X_test = feature_enginnering(X_test)
X_test = train_test_treatments(X_test, pd.concat([df_train,df_test]))
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,TotalService,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_S,GroupSize,GroupLastNameSize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0013_01,True,27.0,0.0,0.0,0.0,0.0,0.0,7,3,0.0,0,0,0,1,1,1,1
0018_01,False,19.0,0.0,2.197225,0.0,7.945555,0.0,6,4,7.948738,0,0,0,1,1,1,1


In [49]:
y_test = model.predict(X_test)

In [50]:
submission = pd.Series(y_test, index=[X_test.index]).to_frame('Transported')
submission.to_csv('data/submission_knn_2022-11-05.csv')

# To Ensemble model

In [50]:
df = pd.concat([df_train,df_test]).drop(['Transported'],axis=1)
df = treat_dataset(df)
df = feature_enginnering(df)
df = train_test_treatments(df, pd.concat([df_train,df_test]))
ensemble = model.predict_proba(df)
ensemble = pd.Series(ensemble[:,1], index=[df.index]).to_frame('KNN')
ensemble.to_csv('data/ensemble/knn.csv')