# 0. Installs and imports

In [75]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import *
import matplotlib.pyplot as plt

# 1. Variables

In [76]:
# Input and output locations, relative to the notebook's working directory.
raw_data_path, save_data_path = (
    "../data/01_raw",           # raw CSV files are read from here
    "../data/02_preprocessed",  # preprocessed parquet files are written here
)

# 2. Load Data

In [77]:
# Read the labelled training split from the raw data directory.
df_train = pd.read_csv(filepath_or_buffer=f"{raw_data_path}/train.csv")

In [78]:
# Read the test split from the raw data directory.
df_test = pd.read_csv(filepath_or_buffer=f"{raw_data_path}/test.csv")

In [79]:
# Capture the training-set dimensions; n_rows is reused by the uniqueness check below.
n_rows, n_cols = df_train.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns.")

Dataset has 891 rows and 12 columns.


In [80]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 3. Preprocessing

## Duplicates

In [81]:
# PassengerId must take a distinct value on every row (it becomes the index below).
passenger_id_count = df_train["PassengerId"].nunique()
assert n_rows == passenger_id_count
# Number of distinct values in each column.
df_train.nunique(axis=0)

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [82]:
# Promote PassengerId from a column to the row index of both frames.
# Note: this cell is not re-runnable on its own; after one run the
# PassengerId column no longer exists, so a second run raises KeyError.
df_train = df_train.set_index(keys="PassengerId")
df_test = df_test.set_index(keys="PassengerId")

## Drop Irrelevant Features

In [83]:
# Remove the columns not used downstream. drop() returns a new frame,
# so df_train itself is left untouched.
df_dropped = df_train.drop(columns=["Cabin", "Name", "Ticket"])

In [84]:
# Same column removal for the test split; df_test itself is left untouched.
df_dropped_test = df_test.drop(columns=["Cabin", "Name", "Ticket"])

## Null values

In [85]:
# Work on independent copies so the imputation below never alters the dropped frames.
df_train_imputted = df_dropped.copy(deep=True)
df_test_imputted = df_dropped_test.copy(deep=True)

In [86]:
# Count the missing entries in each training column.
df_train_imputted.isnull().sum(axis=0)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [87]:
# Training ages as a NumPy array (missing values are still NaN at this point).
age = df_train_imputted.loc[:, "Age"].to_numpy()

In [88]:
# Jarque-Bera normality test on the observed ages; NaNs are left out of the computation.
jarque_bera(age, nan_policy="omit")

SignificanceResult(statistic=np.float64(18.787551273344455), pvalue=np.float64(8.32405760033658e-05))

In [89]:
# Median imputation for Age: the normality test above rejects the null
# hypothesis of normality, and the median is robust to outliers.
# The imputer is fitted on the training ages only.
age_column = age.reshape(-1, 1)  # reshape the 1-D ages into a single-column 2-D array
imp_median = SimpleImputer(strategy="median", missing_values=np.nan)
imp_median_fit = imp_median.fit(age_column)
df_train_imputted["Age"] = imp_median_fit.transform(age_column)

In [90]:
# Fill missing test ages with the median learned from the training split.
age_test = df_test_imputted.loc[:, "Age"].to_numpy()
age_test_column = age_test.reshape(-1, 1)
df_test_imputted["Age"] = imp_median_fit.transform(age_test_column)

In [91]:
# Most-frequent imputation for Embarked, fitted on the training split and
# then applied to both splits.
imp_mode = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imp_mode_fit = imp_mode.fit(df_train_imputted[["Embarked"]])
for frame in (df_train_imputted, df_test_imputted):
    # transform() returns a single-column 2-D array; squeeze it back to 1-D.
    frame["Embarked"] = imp_mode_fit.transform(frame[["Embarked"]]).squeeze()

In [92]:
# Fill missing test fares with the training-set median.
# - The statistic is taken from the training split, matching the Age and
#   Embarked imputers, so nothing is learned from the test data.
# - Series.median() is a scalar. Series.mode() returns a Series that can hold
#   several values, which np.where cannot broadcast against the Fare column.
# - Fare is a continuous variable, so the median is used as for Age.
fare_fill_value = df_train_imputted["Fare"].median()
df_test_imputted["Fare"] = df_test_imputted["Fare"].fillna(fare_fill_value)

In [93]:
# Count the missing entries remaining in each test column after imputation.
df_test_imputted.isnull().sum(axis=0)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

## Feature Encoding

In [94]:
# Encode on independent copies so the imputed frames stay unchanged.
df_train_encoded = df_train_imputted.copy(deep=True)
df_test_encoded = df_test_imputted.copy(deep=True)

In [95]:
# Fit the one-hot encoder on the training categories of Sex and Embarked.
oh_encoder = OneHotEncoder()
oh_encoder_fit = oh_encoder.fit(df_train_encoded.loc[:, ["Sex", "Embarked"]])

In [96]:
# Derive the indicator column names from the fitted encoder instead of
# hard-coding them, so each name always matches the column transform() emits
# at that position (features in fit order, categories in the encoder's order).
encoded_columns = [
    str(category)
    for feature_categories in oh_encoder_fit.categories_
    for category in feature_categories
]
# transform() returns a sparse matrix; densify it before assigning the new columns.
df_train_encoded.loc[:, encoded_columns] = oh_encoder_fit.transform(
    df_train_encoded[["Sex", "Embarked"]]
).toarray()
df_test_encoded.loc[:, encoded_columns] = oh_encoder_fit.transform(
    df_test_encoded[["Sex", "Embarked"]]
).toarray()

In [97]:
# The original categorical columns are replaced by their indicator columns.
df_train_encoded = df_train_encoded.drop(columns=["Sex", "Embarked"])

In [98]:
# Same removal of the original categorical columns for the test split.
df_test_encoded = df_test_encoded.drop(columns=["Sex", "Embarked"])

In [99]:
# Inspect the column dtypes of the encoded training frame.
df_train_encoded.dtypes

Survived      int64
Pclass        int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
female      float64
male        float64
C           float64
Q           float64
S           float64
dtype: object

In [100]:
# Inspect the column dtypes of the encoded test frame.
df_test_encoded.dtypes

Pclass      int64
Age       float64
SibSp       int64
Parch       int64
Fare      float64
female    float64
male      float64
C         float64
Q         float64
S         float64
dtype: object

## Type conversion

In [101]:
# Cast every training column in one astype call. astype returns a new frame,
# so df_train_encoded is left unchanged.
train_dtypes = {
    "Survived": "category",
    "Pclass": "category",
    "Age": "float",
    "SibSp": "int",
    "Parch": "int",
    "Fare": "float",
    "female": "category",
    "male": "category",
    "C": "category",
    "Q": "category",
    "S": "category",
}
df_train_casted = df_train_encoded.astype(train_dtypes)

In [103]:
# Cast every test column in one astype call (the test split has no Survived
# column). astype returns a new frame, so df_test_encoded is left unchanged.
test_dtypes = {
    "Pclass": "category",
    "Age": "float",
    "SibSp": "int",
    "Parch": "int",
    "Fare": "float",
    "female": "category",
    "male": "category",
    "C": "category",
    "Q": "category",
    "S": "category",
}
df_test_casted = df_test_encoded.astype(test_dtypes)

# 4. Save

In [44]:
# Persist the type-converted training frame. Writing df_train_encoded here
# would discard the casts made in the "Type conversion" section.
# NOTE(review): the stored dtypes now include category columns; confirm that
# downstream readers of this parquet file accept them.
df_train_casted.to_parquet(
    f"{save_data_path}/train_preprocessed.parquet"
)

In [104]:
# Persist the type-converted test frame. Writing df_test_encoded here
# would discard the casts made in the "Type conversion" section.
# NOTE(review): the stored dtypes now include category columns; confirm that
# downstream readers of this parquet file accept them.
df_test_casted.to_parquet(
    f"{save_data_path}/test_preprocessed.parquet"
)

In [106]:
# Row count of the indexed training frame.
df_train.shape[0]

891

#