### Train a basic logistic regression model to predict who will be transported to an alternate dimension in the Space Ship Titanic

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [6]:
# Load data
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')
train_df.fillna(inplace=True, value=0)
test_df.fillna(inplace=True, value=0)

p_ids = test_df.PassengerId

In [7]:
# Preprocess columns. Split cabin into deck, roomnum, and side.
def preprocess(df):
    df[["deck", "roomNum", "side"]] = df["Cabin"].str.split("/",expand=True)
    df["isCryoSleep"] = df["CryoSleep"].astype(int)
    df["isVIP"] = df["VIP"].astype(int)
    df["totalSpend"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
    return df

train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [8]:
# Drop unused columns. Experiments showed decreased accuracy when keeping these.
drop_cols = ["Cabin", "Name", "Age", "roomNum", "CryoSleep", "VIP", "PassengerId", "Destination"]
train_df.drop(drop_cols, axis=1, inplace=True)
test_df.drop(drop_cols, axis=1, inplace=True)

In [9]:
# One-hot encode categorical variables
cat_vars = ["deck", "side", "HomePlanet"]

def one_hot_encode(colname, df):
    one_hot = pd.get_dummies(df[f"{colname}"], prefix=f"{colname}")
    df.drop(colname, axis=1, inplace=True)
    df = df.join(one_hot)
    return df

for var in cat_vars:
    train_df = one_hot_encode(var, train_df)
    test_df = one_hot_encode(var, test_df)

train_df

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,isCryoSleep,isVIP,totalSpend,deck_A,...,deck_E,deck_F,deck_G,deck_T,side_P,side_S,HomePlanet_0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,0.0,0.0,0.0,0.0,0.0,False,0,0,0.0,0,...,0,0,0,0,1,0,0,0,1,0
1,109.0,9.0,25.0,549.0,44.0,True,0,0,736.0,0,...,0,1,0,0,0,1,0,1,0,0
2,43.0,3576.0,0.0,6715.0,49.0,False,0,1,10383.0,1,...,0,0,0,0,0,1,0,0,1,0
3,0.0,1283.0,371.0,3329.0,193.0,False,0,0,5176.0,1,...,0,0,0,0,0,1,0,0,1,0
4,303.0,70.0,151.0,565.0,2.0,True,0,0,1091.0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,6819.0,0.0,1643.0,74.0,False,0,1,8536.0,1,...,0,0,0,0,1,0,0,0,1,0
8689,0.0,0.0,0.0,0.0,0.0,False,1,0,0.0,0,...,0,0,1,0,0,1,0,1,0,0
8690,0.0,0.0,1872.0,1.0,0.0,True,0,0,1873.0,0,...,0,0,1,0,0,1,0,1,0,0
8691,0.0,1049.0,0.0,353.0,3235.0,False,0,0,4637.0,0,...,1,0,0,0,0,1,0,0,1,0


In [10]:
# Create train/test split

X = train_df[[i for i in train_df.columns if "Transported" not in i]]
y = train_df["Transported"]

train_data, test_data, train_labels, test_labels  = train_test_split(X, y)

Norms supported by each logistic regression solver

    ‘lbfgs’ - [‘l2’, None]

    ‘liblinear’ - [‘l1’, ‘l2’]

    ‘newton-cg’ - [‘l2’, None]

    ‘newton-cholesky’ - [‘l2’, None]

    ‘sag’ - [‘l2’, None]

    ‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, None]


In [11]:
# Train basic logistic regression model

params = {
    "random_state": 42,
    "penalty": "l2",
    "solver": "lbfgs",
    "max_iter": 100,
    "l1_ratio": None
}

log_reg = LogisticRegression(**params).fit(train_data, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
# Get accuracy metrics

preds = log_reg.predict(test_data)
tn, fp, fn, tp = confusion_matrix(test_labels, preds).ravel()
acc = accuracy_score(test_labels, preds)
scores = {
    "acc": acc,
    "tp": tp,
    "tn": tn,
    "fp": fp,
    "fn": fn
}
print(scores)

{'acc': 0.7851885924563018, 'tp': 863, 'tn': 844, 'fp': 209, 'fn': 258}


In [13]:
# Create submission file
answer_preds = log_reg.predict(test_df)
d = {"PassengerId": p_ids.values, "Transported": answer_preds}
answer_df = pd.DataFrame(d)

answer_df.to_csv("submission.csv", index=False)