In [45]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer,KNNImputer

In [66]:
# Load the train and test splits, using the 'PassengerId' column as the row index.
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
# Preview the first two training rows.
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [47]:
def feature_inputer(df:pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in the categorical/boolean and numeric feature columns.

    'HomePlanet', 'CryoSleep', 'Destination' and 'VIP' receive their most
    frequent observed value; 'Age' and the five spending columns receive
    their median. The fill values are computed from ``df`` itself.

    Columns without missing values are left untouched, and a column with no
    observed value at all is skipped because there is nothing to derive a
    fill value from.

    Args:
        df: Frame containing all ten columns listed above.

    Returns:
        The same ``df`` object, modified in place.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    to_mode = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    to_median = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Each strategy turns a column into the scalar that replaces its NaNs.
    # mode() lists tied values in sorted order, so ties resolve to the smallest.
    strategies = (
        (to_mode, lambda column: column.mode().iloc[0]),
        (to_median, lambda column: column.median()),
    )

    for columns, fill_value_of in strategies:
        for col in columns:
            missing = df[col].isna()
            # Nothing to fill, or no observed value to compute a fill from.
            if not missing.any() or missing.all():
                continue
            # Writing through the mask keeps the column 1-D and its dtype intact.
            df.loc[missing, col] = fill_value_of(df[col])
    return df

def dtype_memory_reducer(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the flag columns to ``bool`` and the label columns to ``category``.

    'CryoSleep' and 'VIP' become ``bool``; 'HomePlanet' and 'Destination'
    become ``category``. The frame is modified in place and returned.

    NOTE: missing values are truthy under ``astype(bool)``, so impute the
    flag columns before calling this.
    """
    target_dtypes = {
        'CryoSleep': bool,
        'VIP': bool,
        'HomePlanet': 'category',
        'Destination': 'category',
    }
    for column, dtype in target_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

def outliers_to_log(df:pd.DataFrame) -> pd.DataFrame:
    """Compress the spending columns with a ``log(1 + x)`` transform.

    Applies to 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and
    'VRDeck'; zero stays zero. The frame is modified in place and returned.

    Args:
        df: Frame containing the five spending columns.

    Returns:
        The same ``df`` object with the spending columns transformed.
    """
    to_log = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in to_log:
        # log1p evaluates log(1 + x) without the rounding loss of forming 1 + x first.
        df[col] = np.log1p(df[col])
    return df

def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    """Run the full cleaning pipeline on a raw passenger frame.

    Drops 'Cabin' and 'Name', then imputes missing values, tightens dtypes
    and log-transforms the spending columns, in that order. The drop creates
    a new frame, which is what the later steps modify and what is returned.
    """
    return (
        df.drop(columns=['Cabin', 'Name'])
        .pipe(feature_inputer)
        .pipe(dtype_memory_reducer)
        .pipe(outliers_to_log)
    )

In [67]:
# Rebinds df_train to its cleaned version. treat_dataset drops 'Cabin' and
# 'Name', so running this cell a second time fails until the data is reloaded.
df_train = treat_dataset(df_train)

In [49]:
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode 'HomePlanet' and 'Destination'.

    The first level of each column is dropped, so it acts as the implicit
    baseline. Returns a new frame; ``df`` is not modified.
    """
    categorical_columns = ['HomePlanet', 'Destination']
    encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    return encoded

In [50]:
# One-hot encode the categorical columns of the cleaned training frame (rebinds df_train).
df_train = feature_enginnering(df_train)

In [57]:
# Separate the target column from the feature matrix.
y_train = df_train['Transported']
X_train = df_train.drop(columns='Transported')

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## Cross Validation

In [59]:
from sklearn.model_selection import cross_val_score

def model_evaluation(model,x_train,y_train,cv=5,scoring="accuracy"):
    """Fit ``model`` on the full training data and cross-validate it.

    Args:
        model: Estimator exposing ``fit`` and accepted by ``cross_val_score``.
        x_train: Training features.
        y_train: Training target.
        cv: Splitting strategy forwarded to ``cross_val_score`` (default 5).
        scoring: Metric forwarded to ``cross_val_score`` (default "accuracy").

    Returns:
        dict with keys 'model' (the estimator fitted on all of
        ``x_train``/``y_train``), 'score_avg' and 'score_std' (mean and
        standard deviation of the per-fold scores).
    """
    # Fitted on everything so the returned estimator is ready for predict();
    # cross_val_score fits clones of the estimator per fold, so this fit is kept.
    model.fit(x_train, y_train)
    scores = cross_val_score(model, x_train, y_train, cv=cv, scoring=scoring)
    return {
        'model':model,
        'score_avg':np.mean(scores),
        'score_std':np.std(scores)
    }

In [60]:
# Candidate classifiers to compare with cross-validation.
log_reg = LogisticRegression(max_iter=1000)
# random_state pins the tie-breaking between equally good splits, so a rerun
# grows the same tree and reports the same scores.
dec_tree = DecisionTreeClassifier(min_samples_leaf=30, random_state=42)

models = [log_reg,dec_tree]

In [61]:
# Fit and cross-validate every candidate. Despite its name, df_eva is a list
# of result dicts (one per model, in the order of `models`).
df_eva = [model_evaluation(model,X_train,y_train) for model in models]
# Tabulate the results for display.
pd.DataFrame(df_eva)

Unnamed: 0,model,score_avg,score_std
0,LogisticRegression(max_iter=1000),0.771197,0.00658
1,DecisionTreeClassifier(min_samples_leaf=30),0.783622,0.011759


In [68]:
# Apply the same cleaning and encoding steps to the test split.
X_test = treat_dataset(df_test)
X_test = feature_enginnering(X_test)
# The dummy columns are derived from the test data alone, so align them with
# the training features: same columns, same order, 0 for any level that is
# absent from the test split.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,True,27.0,False,0.0,0.0,0.0,0.0,0.0,0,0,0,1
0018_01,False,19.0,False,0.0,2.302585,0.0,7.94591,0.0,0,0,0,1


# Submission

In [71]:
# Index 0 is the logistic regression (first entry of `models`), already
# fitted on the full training set by model_evaluation.
model = df_eva[0]['model']
# Despite its name, y_test holds the model's predictions for the test set.
y_test = model.predict(X_test)

In [78]:
# Pair each prediction with its PassengerId. The index is passed directly:
# wrapping it in a list makes pandas build a one-level MultiIndex.
submission = pd.Series(y_test, index=X_test.index).to_frame('Transported')
submission.to_csv('data/submission_logreg_2022-10-29.csv')