In [15]:
from pprint import pprint

import numpy as np
import pandas as pd
from IPython.display import Image

from helper_functions import prepare_data, replace_strings

# Data Preparation

In [16]:
# load data
df_train = pd.read_csv("../../data/train.csv", index_col="PassengerId")
df_test = pd.read_csv("../../data/test.csv", index_col="PassengerId")
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column afterwards yields the same Series.
test_labels = pd.read_csv("../../data/test_labels.csv", index_col="PassengerId").squeeze("columns")

# prepare data
df_train = prepare_data(df_train)
df_test = prepare_data(df_test, train_set=False)

# handle missing values in training data
# (build a new frame instead of the chained `df[col].fillna(..., inplace=True)`,
# which does not write back to the frame under pandas copy-on-write)
embarked_mode = df_train["Embarked"].mode()[0]
df_train = df_train.assign(Embarked=df_train["Embarked"].fillna(embarked_mode))

df_train.head()

Unnamed: 0_level_0,Sex,Pclass,Age_Group,Embarked,SibSp,ParCh,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,male,3,Adult,S,1,0,0
2,female,1,Adult,C,1,0,1
3,female,3,Adult,S,0,0,1
4,female,1,Adult,S,1,0,1
5,male,3,Adult,S,0,0,0


## Step 1 of the Algorithm

In [None]:
def create_age_groups(age):
    """Map a numeric age to a coarse age-group label.

    Returns "Child" for ages up to and including 12, "Teenager" for ages
    above 12 up to and including 19, and "Adult" for ages above 19.
    A value for which every comparison is false (e.g. NaN) falls through
    to "Unknown".
    """
    if age <= 12:
        return "Child"
    elif age <= 19:
        return "Teenager"
    elif age > 19:
        return "Adult"

    # none of the comparisons held (missing age)
    return "Unknown"
    

In [None]:

def prepare_data(df, train_set=True):
    """Return a prepared copy of a raw Titanic frame.

    The input frame is left untouched, so the function can safely be called
    again on the same raw data (the previous in-place version dropped "Age"
    from the caller's frame and failed on a second call).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns "Sex", "Pclass", "Age",
        "Embarked", "SibSp", "Parch", "Name", "Ticket", "Fare", "Cabin"
        (plus "Survived" when ``train_set`` is true).
    train_set : bool, default True
        When true, the label column "Survived" is kept as the last column.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns "Sex", "Pclass", "Age_Group", "Embarked",
        "SibSp", "ParCh" and, for the training set, "Survived".
    """
    # final column order
    columns = ["Sex", "Pclass", "Age_Group", "Embarked", "SibSp", "ParCh"]
    if train_set:
        columns.append("Survived")

    prepared = (
        # create new feature
        df.assign(Age_Group=df["Age"].apply(create_age_groups))
        # drop features that we are not going to use
        .drop(columns=["Name", "Age", "Ticket", "Fare", "Cabin"])
        # rename column "Parch" to "ParCh"
        .rename(columns={"Parch": "ParCh"})
    )

    # rearrange order of columns
    return prepared[columns]


In [None]:

def replace_strings(df):
    """Replace the string categories of ``df`` with integer codes, in place.

    The columns "Age_Group", "Embarked" and "Sex" are overwritten on ``df``
    itself and the same frame is returned. Values without a code in the
    mappings below are left unchanged.

    Each column is assigned back explicitly: the chained form
    ``df.col.replace(..., inplace=True)`` only modifies a temporary Series
    under pandas copy-on-write and would leave ``df`` unchanged.
    """
    encodings = {
        "Age_Group": {"Adult": 0, "Unknown": 1, "Teenager": 2, "Child": 3},
        "Embarked": {"S": 0, "C": 1, "Q": 2},
        "Sex": {"male": 0, "female": 1},
    }

    for column, mapping in encodings.items():
        # infer_objects() gives a numeric dtype once all strings are replaced
        df[column] = df[column].replace(mapping).infer_objects()

    return df

In [17]:
# Hand-written illustration of the lookup-table layout.
# Each feature maps a feature value to a list with one entry per class,
# in the order given by "class_names"; within a feature the entries for
# one class add up to (roughly) 1.
example_table = {
    "Sex": {
        "female": [0.15, 0.68],
        "male": [0.85, 0.32],
    },
    "Pclass": {
        1: [0.15, 0.40],
        2: [0.18, 0.25],
        3: [0.68, 0.35],
    },
    # class labels and how often each one occurs
    "class_names": [0, 1],
    "class_counts": [549, 342],
}

In [18]:
def create_table(df, label_column):
    """Build the Naive Bayes lookup table for ``df``.

    The returned dict holds
    - "class_names": the distinct label values, sorted,
    - "class_counts": how often each of those labels occurs,
    - one entry per feature column, mapping every observed feature value
      to an array with the share of that value within each class
      (same order as "class_names").

    If some value/class combination of a feature never occurs, the missing
    counts are set to 0 and 1 is added to every count of that feature before
    the shares are computed.
    """
    # determine values for the label
    label_counts = df[label_column].value_counts().sort_index()
    table = {
        "class_names": label_counts.index.to_numpy(),
        "class_counts": label_counts.values,
    }

    # determine probabilities for the features
    feature_names = df.drop(label_column, axis=1).columns
    for feature in feature_names:
        # rows: feature values, columns: classes
        per_class = (
            df.groupby(label_column)[feature]
            .value_counts()
            .unstack(label_column)
        )

        # add one count to avoid "problem of rare values"
        if per_class.isna().any(axis=None):
            per_class = per_class.fillna(value=0) + 1

        # normalise every class column so that it sums to 1
        shares = per_class / per_class.sum()
        table[feature] = {
            value: shares.loc[value].to_numpy() for value in shares.index
        }

    return table

In [19]:
# Build the lookup table from the training frame, using "Survived" as the label column,
# and pretty-print the resulting nested dict.
lookup_table = create_table(df_train, label_column="Survived")
pprint(lookup_table)

{'Age_Group': {'Adult': array([0.61748634, 0.61695906]),
               'Child': array([0.05282332, 0.11695906]),
               'Teenager': array([0.10200364, 0.11403509]),
               'Unknown': array([0.2276867 , 0.15204678])},
 'Embarked': {'C': array([0.13661202, 0.27192982]),
              'Q': array([0.0856102, 0.0877193]),
              'S': array([0.77777778, 0.64035088])},
 'ParCh': {0: array([0.80215827, 0.67048711]),
           1: array([0.0971223 , 0.18911175]),
           2: array([0.07374101, 0.11747851]),
           3: array([0.00539568, 0.01146132]),
           4: array([0.00899281, 0.00286533]),
           5: array([0.00899281, 0.00573066]),
           6: array([0.00359712, 0.00286533])},
 'Pclass': {1: array([0.14571949, 0.39766082]),
            2: array([0.17668488, 0.25438596]),
            3: array([0.67759563, 0.34795322])},
 'Sex': {'female': array([0.14754098, 0.68128655]),
         'male': array([0.85245902, 0.31871345])},
 'SibSp': {0: array([0.7176259 , 

## Step 2 of the Algorithm

In [20]:
def predict_example(row, lookup_table):
    """Predict the class of a single example with the Naive Bayes lookup table.

    Starting from the class counts, the estimate of every class is multiplied
    by the per-class probability of each feature value found in ``row``; the
    class with the largest resulting estimate is returned.

    Parameters
    ----------
    row : pandas.Series
        One example; the index holds the feature names.
    lookup_table : dict
        Table with the entries "class_names", "class_counts" and one
        ``{value: per-class probabilities}`` dict per feature. The per-class
        sequences may be NumPy arrays (as produced by ``create_table``) or
        plain lists (as in ``example_table``).

    Returns
    -------
    The entry of ``lookup_table["class_names"]`` with the highest estimate.
    """
    # asarray makes list-based tables work too (list * list would raise TypeError)
    class_estimates = np.asarray(lookup_table["class_counts"], dtype=float)

    for feature in row.index:
        try:
            probabilities = lookup_table[feature][row[feature]]

        # skip in case the feature or its value only occurs in the test set
        # but not in the train set (i.e. it is not in "lookup_table")
        except KeyError:
            continue

        class_estimates = class_estimates * np.asarray(probabilities, dtype=float)

    index_max_class = class_estimates.argmax()
    prediction = lookup_table["class_names"][index_max_class]

    return prediction

In [21]:
# Classify every row of the test frame: `axis=1` hands each row to `predict_example`
# as a Series and `args` forwards the lookup table as its second argument.
predictions = df_test.apply(predict_example, axis=1, args=(lookup_table,))
predictions.head()

PassengerId
892    0
893    1
894    0
895    0
896    1
dtype: int64

# Check Accuracy

In [22]:
# Element-wise comparison of the predictions with the true labels
# (both are indexed by PassengerId).
predictions_correct = predictions == test_labels
# The mean of the boolean Series is the share of correct predictions.
accuracy = predictions_correct.mean()
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.766


# Comparison to Sklearn

In [23]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB

In [24]:
# data preparation
# sklearn needs numeric inputs, so the string categories are replaced by integer codes first
df_train = replace_strings(df_train)
# split the training frame into features and label
X_train = df_train.drop("Survived", axis=1)
y_train = df_train.Survived

# NOTE(review): replace_strings, as shown in this notebook, also modifies df_test itself — confirm that is intended
X_test = replace_strings(df_test)
y_test = test_labels

In [25]:
# fit each sklearn Naive Bayes variant on the training data and report its test accuracy
clfs_names = ["GaussianNB", "MultinomialNB", "ComplementNB", "BernoulliNB"]
clfs = [GaussianNB(), MultinomialNB(), ComplementNB(), BernoulliNB()]

# table header
print("NB Model\tAccuracy", "--------\t--------", sep="\n")
for clf, clf_name in zip(clfs, clfs_names):
    # fit() returns the estimator itself, so scoring can be chained
    acc = clf.fit(X_train, y_train).score(X_test, y_test)
    print(f"{clf_name}\t{acc:.3f}")

NB Model	Accuracy
--------	--------
GaussianNB	0.763
MultinomialNB	0.768
ComplementNB	0.761
BernoulliNB	0.766
