In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import StandardScaler
import itertools

In [2]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [3]:
def feature_inputer(
    df:pd.DataFrame,
    to_mode = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP'],
    to_median = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
) -> pd.DataFrame:
    for col in to_mode:
        mode_inputer = SimpleImputer(strategy='most_frequent')
        df[col] = mode_inputer.fit_transform(df[[col]])

    
    for col in to_median:
        inputer = SimpleImputer(strategy='median')
        df[col] = inputer.fit_transform(df[[col]])
    return df

def dtype_memory_reducer(df: pd.DataFrame) -> pd.DataFrame:
    df['CryoSleep'] = df['CryoSleep'].astype(bool)
    df['VIP'] = df['VIP'].astype(bool)

    df['HomePlanet'] = df['HomePlanet'].astype('category')
    df['Destination'] = df['Destination'].astype('category')
    return df

def outliers_to_log(df:pd.DataFrame) -> pd.DataFrame:
    to_log = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in to_log:
        df[col] = np.log(df[col] + 1)
    return df

def cabin_inputer(df:pd.DataFrame) -> pd.DataFrame:
    cabin_features = df['Cabin'].str.split("/",expand=True)[[0,2]].rename(columns={0:'Deck',2:'side'})
    df = pd.concat([df,cabin_features],axis=1)
    df['Deck'] = df['Deck'].fillna('G').replace("T","G")
    df = df.drop(['side','Cabin'],axis=1)
    return df

def vip_knn_input(df: pd.DataFrame) -> pd.DataFrame:
    inputer = KNNImputer(n_neighbors=5)
    df['VIP'] = inputer.fit_transform(df[['VIP','RoomService']])[:,0]
    return df

def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    df = df.drop(['Name'],axis=1)
    df = cabin_inputer(df)
    df = feature_inputer(df)
    df = vip_knn_input(df)
    df['0_bills'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1) == 0
    df = outliers_to_log(df)
    df = dtype_memory_reducer(df)
    return df

def apply_interactions(df:pd.DataFrame):
    for x in itertools.combinations(df.columns, 2):
        df[f'{x[0]}_{x[1]}'] = df[x[0]]*df[x[1]]
    df = df.drop(df.columns[df.nunique() == 1].tolist(),axis=1)
    return df

def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.get_dummies(df,columns=['HomePlanet','Destination','Deck'],drop_first=True)
    return df

In [4]:
df_train = treat_dataset(df_train.copy())
df_train = feature_enginnering(df_train)

In [5]:
X_train,y_train = df_train.drop('Transported',axis=1), df_train['Transported']
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Cross Validation

In [7]:
from sklearn.model_selection import cross_val_score, KFold

def model_evaluation(model,x_train,y_train):
    model.fit(x_train, y_train)
    scores = cross_val_score(model, x_train, y_train, cv=KFold(5, shuffle=True, random_state=42), scoring = "accuracy")
    return {
        'model':model,
        'score_avg':np.mean(scores),
        'score_std':np.std(scores)
    }

In [8]:
models = [
    LogisticRegression(max_iter=1000),
    #RandomForestClassifier(**{'bootstrap': True,'max_depth': 10,'max_features': 'sqrt','min_samples_leaf': 5,'n_estimators': 200}),
    KNeighborsClassifier(**{'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'uniform'}),
]

In [9]:
df_eva = [model_evaluation(model,X_train,y_train) for model in models]
pd.DataFrame(df_eva)

Unnamed: 0,model,score_avg,score_std
0,LogisticRegression(max_iter=1000),0.771312,0.008186
1,"KNeighborsClassifier(metric='manhattan', n_nei...",0.784771,0.007396


# Remodelling with Bagging

In [10]:
X_train_bagging = {
'Logistic Regression':df_eva[0]['model'].predict_proba(X_train)[:,1],
#'Random Forest':df_eva[1]['model'].predict_proba(X_train)[:,1],
'KNN':df_eva[1]['model'].predict_proba(X_train)[:,1],
}
X_train_bagging = pd.DataFrame(X_train_bagging, X_train.index)
X_train_bagging = apply_interactions(X_train_bagging)
X_train_bagging.head(2)

Unnamed: 0,Logistic Regression,KNN,Logistic Regression_KNN
0,0.8655,0.652174,0.564456
1,0.072053,0.304348,0.021929


In [11]:
bagging_model = LogisticRegression()
eva_bagging = model_evaluation(bagging_model,X_train_bagging,y_train)
eva_bagging

{'model': LogisticRegression(),
 'score_avg': 0.8005297808152643,
 'score_std': 0.004927627796781884}

In [16]:
bagging_model.coef_

array([[-0.19194945,  5.14277828,  1.37052044]])

# Submission

In [12]:
X_test = treat_dataset(df_test)
X_test = feature_enginnering(X_test)
X_test = pd.DataFrame(scaler.fit_transform(X_test), index=X_test.index, columns=X_test.columns)
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,0_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0013_01,1.330443,-0.114147,-0.132689,-0.64483,-0.646513,-0.628742,-0.661615,-0.624214,1.17083,-0.553131,-0.525314,-0.315862,0.634992,-0.30408,-0.300857,-0.244898,-0.341629,-0.714311,1.490179
0018_01,-0.75163,-0.684313,-0.132689,-0.64483,0.140685,-0.628742,2.226792,-0.624214,-0.854095,-0.553131,-0.525314,-0.315862,0.634992,-0.30408,-0.300857,-0.244898,-0.341629,1.399951,-0.67106


In [13]:
X_test_bagging = {
'Logistic Regression':df_eva[0]['model'].predict_proba(X_test)[:,1],
#'Random Forest':df_eva[1]['model'].predict_proba(X_test)[:,1],
'KNN':df_eva[1]['model'].predict_proba(X_test)[:,1],
}
X_test_bagging = pd.DataFrame(X_test_bagging, X_test.index)
X_test_bagging = apply_interactions(X_test_bagging)
X_test_bagging.head(2)

Unnamed: 0_level_0,Logistic Regression,KNN,Logistic Regression_KNN
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013_01,0.701838,0.565217,0.396691
0018_01,0.184649,0.130435,0.024085


In [14]:
y_test = bagging_model.predict(X_test_bagging)

In [15]:
submission = pd.Series(y_test, index=[X_test.index]).to_frame('Transported')
submission.to_csv('data/submission_bagging_2022-11-05.csv')