In [7]:
# import libararies
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import random

In [8]:
# import data
work_dir = Path.cwd()
train_df = pd.read_csv(work_dir/"kaggle-data"/"train.csv")
print(train_df)
test_df = pd.read_csv(work_dir/"kaggle-data"/"test.csv")
print(test_df)

pd.set_option("display.max_columns", 200)

     PassengerId HomePlanet CryoSleep     Cabin    Destination   Age    VIP  \
0        0001_01     Europa     False     B/0/P    TRAPPIST-1e  39.0  False   
1        0002_01      Earth     False     F/0/S    TRAPPIST-1e  24.0  False   
2        0003_01     Europa     False     A/0/S    TRAPPIST-1e  58.0   True   
3        0003_02     Europa     False     A/0/S    TRAPPIST-1e  33.0  False   
4        0004_01      Earth     False     F/1/S    TRAPPIST-1e  16.0  False   
...          ...        ...       ...       ...            ...   ...    ...   
8688     9276_01     Europa     False    A/98/P    55 Cancri e  41.0   True   
8689     9278_01      Earth      True  G/1499/S  PSO J318.5-22  18.0  False   
8690     9279_01      Earth     False  G/1500/S    TRAPPIST-1e  26.0  False   
8691     9280_01     Europa     False   E/608/S    55 Cancri e  32.0  False   
8692     9280_02     Europa     False   E/608/S    TRAPPIST-1e  44.0  False   

      RoomService  FoodCourt  ShoppingMall     Spa 

In [9]:
def map_cabin_num_to_bins(num):
    try:
        num = int(num)
    except ValueError:
        return pd.NA
    if num <= 300:
        return 1
    elif num <= 600:
        return 2
    elif num <= 900:
        return 3
    elif num <= 1200:
        return 4
    elif num <= 1500:
        return 5
    else:
        return 6

def process_data(df, file_name, scaler_name, save_scaler=False):
    # splitting passenger id and cabin information into multiple columns, removing not needed columns
    df["PassengerGroup"] = df["PassengerId"].str.split("_").str[0]
    df["GroupSize"] = df.groupby("PassengerGroup")["PassengerGroup"].transform("count")

    df["CabinDeck"] = df["Cabin"].str.split("/").str[0]
    df["CabinNum"] = df["Cabin"].str.split("/").str[1]
    df["CabinSide"] = df["Cabin"].str.split("/").str[2]

    df.drop(["PassengerId", "Cabin", "Name", "PassengerGroup"], axis=1, inplace=True)

    # gathering CabinNum into bins
    df["CabinNumBin"] = df["CabinNum"].map(map_cabin_num_to_bins)
    df.drop("CabinNum", axis=1, inplace=True)

    # one hot encoding "HomePlanet", "Destination", "CabinDeck", "CabinSide" and "CabinNumBin"
    columns_to_encode = ["HomePlanet", "Destination", "CabinDeck", "CabinSide", "CabinNumBin"]
    encoder = OneHotEncoder(sparse_output=False)
    encoded_array = encoder.fit_transform(df[columns_to_encode])
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(columns_to_encode))
    df.drop(columns_to_encode, axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, encoded_df], axis=1)

    # changing columns CryoSleep and VIP to bool type
    train_df[["CryoSleep", "VIP"]] = train_df[["CryoSleep", "VIP"]].astype(bool)

    # scaling columns containing numerical values
    columns_to_scale = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "GroupSize"]
    if save_scaler:
        scaler = MinMaxScaler()
        df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
        joblib.dump(scaler, scaler_name + ".joblib")
    else:
        scaler = joblib.load(scaler_name + ".joblib")
        df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    df.to_csv(file_name + ".csv", index=False)

In [10]:
# first dataset - all rows containing NA values removed
# train data
dataset_01_train = train_df.dropna().copy()
process_data(df=dataset_01_train, file_name="dataset_01_train", scaler_name="scaler_01", save_scaler=True)
# test data
# dataset_01_test = test_df.dropna().copy() # rows shouldnt be deleted from the test dataset
dataset_01_test = test_df
# missing categorical data with be imputed in a way that conserves proportions
dataset_01_test["CryoSleep"] = dataset_01_test["CryoSleep"].apply(lambda x: random.choice(dataset_01_test["CryoSleep"].dropna().tolist()) if pd.isna(x) else x)
dataset_01_test["HomePlanet"] = dataset_01_test["HomePlanet"].apply(lambda x: random.choice(dataset_01_test["HomePlanet"].dropna().tolist()) if pd.isna(x) else x)
dataset_01_test["Cabin"] = dataset_01_test["Cabin"].apply(lambda x: random.choice(dataset_01_test["Cabin"].dropna().tolist()) if pd.isna(x) else x)
dataset_01_test["Destination"] = dataset_01_test["Destination"].apply(lambda x: random.choice(dataset_01_test["Destination"].dropna().tolist()) if pd.isna(x) else x)
dataset_01_test["VIP"] = dataset_01_test["VIP"].apply(lambda x: random.choice(dataset_01_test["VIP"].dropna().tolist()) if pd.isna(x) else x)

# missing values from the numerical columns imputed with the median
dataset_01_test["Age"] = dataset_01_test["Age"].fillna(dataset_01["Age"].median())
dataset_01_test["RoomService"] = dataset_01_test["RoomService"].fillna(dataset_01_test["RoomService"].median())
dataset_01_test["FoodCourt"] = dataset_01_test["FoodCourt"].fillna(dataset_01_test["FoodCourt"].median())
dataset_01_test["ShoppingMall"] = dataset_01_test["ShoppingMall"].fillna(dataset_01_test["ShoppingMall"].median())
dataset_01_test["Spa"] = dataset_01_test["Spa"].fillna(dataset_01_test["Spa"].median())
dataset_01_test["VRDeck"] = dataset_01_test["VRDeck"].fillna(dataset_01_test["VRDeck"].median())
process_data(df=dataset_01_test, file_name="dataset_01_test", scaler_name="scaler_01")

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['NAType', 'int']

In [None]:
# second dataset - removing NA values from the CryoSleep column, imputing the rest
#train data
dataset_02_train = train_df.dropna(subset="CryoSleep").copy()
# missing categorical data with be imputed in a way that conserves proportions
dataset_02_train["HomePlanet"] = dataset_02_train["HomePlanet"].apply(lambda x: random.choice(dataset_02_train["HomePlanet"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_train["Cabin"] = dataset_02_train["Cabin"].apply(lambda x: random.choice(dataset_02_train["Cabin"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_train["Destination"] = dataset_02_train["Destination"].apply(lambda x: random.choice(dataset_02_train["Destination"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_train["VIP"] = dataset_02_train["VIP"].apply(lambda x: random.choice(dataset_02_train["VIP"].dropna().tolist()) if pd.isna(x) else x)

# missing values from the numerical columns imputed with the median
dataset_02_train["Age"] = dataset_02_train["Age"].fillna(dataset_02["Age"].median())
dataset_02_train["RoomService"] = dataset_02_train["RoomService"].fillna(dataset_02_train["RoomService"].median())
dataset_02_train["FoodCourt"] = dataset_02_train["FoodCourt"].fillna(dataset_02_train["FoodCourt"].median())
dataset_02_train["ShoppingMall"] = dataset_02_train["ShoppingMall"].fillna(dataset_02_train["ShoppingMall"].median())
dataset_02_train["Spa"] = dataset_02_train["Spa"].fillna(dataset_02_train["Spa"].median())
dataset_02_train["VRDeck"] = dataset_02_train["VRDeck"].fillna(dataset_02_train["VRDeck"].median())
process_data(df=dataset_02_train, file_name="dataset_02_train", save_scaler=True, scaler_name="scaler_02")

#test data
dataset_02_test = test_df
# missing categorical data with be imputed in a way that conserves proportions
dataset_02_test["CryoSleep"] = dataset_02_test["CryoSleep"].apply(lambda x: random.choice(dataset_02_test["CryoSleep"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_test["HomePlanet"] = dataset_02_test["HomePlanet"].apply(lambda x: random.choice(dataset_02_test["HomePlanet"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_test["Cabin"] = dataset_02_test["Cabin"].apply(lambda x: random.choice(dataset_02_test["Cabin"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_test["Destination"] = dataset_02_test["Destination"].apply(lambda x: random.choice(dataset_02_test["Destination"].dropna().tolist()) if pd.isna(x) else x)
dataset_02_test["VIP"] = dataset_02_test["VIP"].apply(lambda x: random.choice(dataset_02_test["VIP"].dropna().tolist()) if pd.isna(x) else x)

# missing values from the numerical columns imputed with the median
dataset_02_test["Age"] = dataset_02_test["Age"].fillna(dataset_02["Age"].median())
dataset_02_test["RoomService"] = dataset_02_test["RoomService"].fillna(dataset_02_test["RoomService"].median())
dataset_02_test["FoodCourt"] = dataset_02_test["FoodCourt"].fillna(dataset_02_test["FoodCourt"].median())
dataset_02_test["ShoppingMall"] = dataset_02_test["ShoppingMall"].fillna(dataset_02_test["ShoppingMall"].median())
dataset_02_test["Spa"] = dataset_02_test["Spa"].fillna(dataset_02_test["Spa"].median())
dataset_02_test["VRDeck"] = dataset_02_test["VRDeck"].fillna(dataset_02_test["VRDeck"].median())
process_data(df=dataset_02_test, file_name="dataset_02_test", scaler_name="scaler_02")