In [1]:
# %pip install numpy
# %pip install pandas
# %pip install networkx
# %pip install matplotlib

In [2]:
import numpy
import pandas
import math

In [3]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (pandas.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Preview the first ten rows of the training split.
train_df.head(10)

Unnamed: 0,pclass,age,gender,survived
0,1st,adult,male,yes
1,1st,adult,male,yes
2,1st,adult,male,yes
3,1st,adult,male,yes
4,1st,adult,male,yes
5,1st,adult,male,yes
6,1st,adult,male,yes
7,1st,adult,male,yes
8,1st,adult,male,yes
9,1st,adult,male,yes


In [5]:
# Preview the first ten rows of the test split.
test_df.head(10)

Unnamed: 0,pclass,age,gender,survived
0,1st,adult,male,yes
1,1st,adult,male,yes
2,1st,adult,male,yes
3,1st,adult,male,yes
4,1st,adult,male,yes
5,1st,adult,male,yes
6,1st,adult,male,yes
7,1st,adult,male,yes
8,1st,adult,male,yes
9,1st,adult,male,yes


In [6]:
# Per-column summary of the training split (count / unique / top / freq in the recorded output).
train_df.describe()

Unnamed: 0,pclass,age,gender,survived
count,2150,2150,2150,2150
unique,4,2,2,2
top,crew,adult,male,no
freq,885,2046,1710,1485


In [7]:
# Show how often each category occurs, one column at a time.
for column_name in train_df.columns:
    category_counts = train_df[column_name].value_counts()
    print(category_counts)

pclass
crew    885
3rd     699
1st     307
2nd     259
Name: count, dtype: int64
age
adult    2046
child     104
Name: count, dtype: int64
gender
male      1710
female     440
Name: count, dtype: int64
survived
no     1485
yes     665
Name: count, dtype: int64


# Model: Naive Bayes Classifier

In [8]:
def calculate_prior_probabilities(target_values: numpy.ndarray):
    """Estimate Laplace-smoothed prior probabilities of the target classes.

    Each prior is ``(count + 1) / (n_samples + n_classes)``, where
    ``n_classes`` is the number of distinct labels in ``target_values``.
    Using the observed class count (rather than a fixed 2) keeps the priors
    summing to 1 for any number of classes; for a binary target the result
    is unchanged.

    Args:
        target_values: 1-D array-like of class labels.

    Returns:
        dict mapping each distinct label to its smoothed prior (a float).
        Empty input yields an empty dict.
    """
    unique_targets, target_counts = numpy.unique(target_values, return_counts=True)

    total_targets = len(target_values)
    num_classes = len(unique_targets)

    return {
        target: float((count + 1) / (total_targets + num_classes))
        for target, count in zip(unique_targets, target_counts)
    }

def calculate_feature_probabilities(train_data: pandas.DataFrame, target_column: str, alpha: float = 1):
    """Estimate smoothed conditional likelihoods P(feature value | class).

    For every class in ``target_column`` and every other column, each
    category value gets
    ``(count_in_class + alpha) / (class_size + alpha * n_categories)``.
    ``n_categories`` is the number of distinct values of that column in the
    whole of ``train_data``, so a value that never occurs within a class
    still receives a small non-zero probability.

    Args:
        train_data: training frame holding the feature columns and the target.
        target_column: name of the class-label column.
        alpha: additive smoothing strength (1 = Laplace smoothing).

    Returns:
        dict keyed by ``(feature_value, target_class)`` with float values.
        NOTE: the key does not include the column name, so identical
        category labels appearing in two different columns would overwrite
        each other.
    """
    feature_columns = train_data.drop(target_column, axis=1)
    # Category sets come from the full training data, not the per-class
    # subset, so the smoothing denominator is consistent across classes.
    categories = {column: feature_columns[column].unique() for column in feature_columns}

    feature_probabilities = {}
    for target_class in train_data[target_column].unique():
        class_rows = feature_columns[train_data[target_column] == target_class]
        class_size = len(class_rows)

        for column, column_values in categories.items():
            value_counts = class_rows[column].value_counts()
            denominator = class_size + alpha * len(column_values)

            for value in column_values:
                # .get(..., 0): the value may be absent from this class.
                feature_probabilities[(value, target_class)] = float(
                    (value_counts.get(value, 0) + alpha) / denominator
                )

    return feature_probabilities

In [9]:
# Fit the per-class conditional likelihoods on the training split and show them.
feature_probs = calculate_feature_probabilities(
    train_data=train_df,
    target_column='survived',
)
print(feature_probs)

{('1st', 'yes'): 0.27802690582959644, ('2nd', 'yes'): 0.14648729446935724, ('3rd', 'yes'): 0.2571001494768311, ('crew', 'yes'): 0.3183856502242152, ('adult', 'yes'): 0.9205397301349325, ('child', 'yes'): 0.07946026986506746, ('male', 'yes'): 0.527736131934033, ('female', 'yes'): 0.47226386806596704, ('1st', 'no'): 0.08260577568838147, ('2nd', 'no'): 0.10946944257891202, ('3rd', 'no'): 0.3552719946272666, ('crew', 'no'): 0.4526527871054399, ('adult', 'no'): 0.9643577673167452, ('child', 'no'): 0.03564223268325487, ('male', 'no'): 0.9145931405514459, ('female', 'no'): 0.08540685944855414}


In [10]:
# Fit the class priors from the training labels and show them.
training_labels = train_df.survived
target_probs = calculate_prior_probabilities(target_values=training_labels)
print(target_probs)

{'no': 0.6905204460966543, 'yes': 0.30947955390334575}


In [11]:
# Start the text report afresh ('w' truncates) with one "class : prior" line per class.
with open('./probabilities.txt', 'w') as file:
    file.writelines(f"{key} : {value}\n" for key, value in target_probs.items())

In [12]:
# Append one "value, class : likelihood" line per conditional probability to the same report.
with open('./probabilities.txt', 'a') as file:
    file.writelines(f"{key[0]}, {key[1]} : {value}\n" for key, value in feature_probs.items())

In [13]:
import json

json_file_path = './probabilities.json'

# JSON object keys must be strings, so every (value, class) tuple key is stringified first.
string_key_dict = {str(key): value for key, value in feature_probs.items()}

with open(json_file_path, 'w') as json_file:
    json.dump(string_key_dict, json_file, indent=4)

In [14]:
def predict_target_value(test_features: numpy.ndarray, target_values: numpy.ndarray,
                         feature_probabilities=None, prior_probabilities=None):
    """Return the most probable class for one sample under Naive Bayes.

    The score of a class is its prior multiplied by the likelihood of every
    feature value given that class. A feature value with no likelihood entry
    for *any* candidate class carries no information and is skipped, instead
    of zeroing every score and making the result depend on candidate order.

    Args:
        test_features: feature values of a single sample.
        target_values: candidate class labels to score.
        feature_probabilities: dict keyed by ``(feature_value, target_class)``.
            Defaults to the notebook-level ``feature_probs``.
        prior_probabilities: dict mapping class label to prior.
            Defaults to the notebook-level ``target_probs``.

    Returns:
        The candidate with the highest score (the earliest one on ties), or
        ``None`` when ``target_values`` is empty.
    """
    # Fall back to the fitted notebook globals so existing two-argument calls keep working.
    if feature_probabilities is None:
        feature_probabilities = feature_probs
    if prior_probabilities is None:
        prior_probabilities = target_probs

    candidates = list(target_values)
    likelihoods = [1] * len(candidates)
    for feat in test_features:
        per_class = [feature_probabilities.get((feat, candidate), 0) for candidate in candidates]
        if not any(per_class):
            # Value unknown for every class: ignore it rather than zero all scores.
            continue
        likelihoods = [running * p for running, p in zip(likelihoods, per_class)]

    max_prob = float('-inf')
    target_class = None
    for candidate, likelihood in zip(candidates, likelihoods):
        prob = prior_probabilities.get(candidate, 0) * likelihood
        # Strict '>' keeps the earliest candidate when scores tie.
        if prob > max_prob:
            max_prob = prob
            target_class = candidate

    return target_class

In [15]:
# Candidate classes are the ones learned from the training labels (the keys of
# the priors), not whatever labels happen to be present in the test split, and
# they are computed once rather than once per row.
candidate_classes = list(target_probs)

# Select the features by dropping the label column by name instead of relying
# on it being the last column.
test_features = test_df.drop('survived', axis=1)

predictions = [
    predict_target_value(row.values, candidate_classes)
    for _, row in test_features.iterrows()
]

print (predictions)
print (numpy.unique(numpy.array(predictions), return_counts=True))

['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no']
(array(['no', 'yes'], dtype='<U3'), array([36, 30]))


In [16]:
def calculate_confusion_matrix(predictions, actual):
    """Count the four binary confusion-matrix cells for 'yes'/'no' labels.

    'yes' is treated as the positive class. ``predictions`` and ``actual``
    are paired element-wise; pairs containing any other label are counted
    in no cell.

    Returns:
        dict with the keys 'TP', 'FP', 'TN' and 'FN', in that order.
    """
    # cell name -> (predicted label, actual label) that defines the cell
    cell_definitions = {
        'TP': ('yes', 'yes'),
        'FP': ('yes', 'no'),
        'TN': ('no', 'no'),
        'FN': ('no', 'yes'),
    }
    paired = list(zip(predictions, actual))

    return {
        cell: sum(pred == wanted_pred and label == wanted_label for pred, label in paired)
        for cell, (wanted_pred, wanted_label) in cell_definitions.items()
    }

In [17]:
# Ground-truth labels of the test split as a flat 1-D array.
test_df_labels = test_df['survived'].values

In [18]:
# Compare the predicted labels with the ground truth of the test split.
confusion_matrix = calculate_confusion_matrix(
    predictions=predictions,
    actual=test_df_labels,
)

In [19]:
# Show the TP / FP / TN / FN counts.
print (confusion_matrix)

{'TP': 30, 'FP': 0, 'TN': 5, 'FN': 31}


In [20]:
def calculate_evaluation_metrics(confusion_matrix):
    """Derive precision, recall and accuracy from confusion-matrix counts.

    Args:
        confusion_matrix: mapping with the counts 'TP', 'FP', 'TN' and 'FN'.

    Returns:
        Tuple ``(precision, recall, accuracy)``. A metric whose denominator
        is zero (no positive predictions, no positive labels, or no samples
        at all) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    TP = confusion_matrix['TP']
    FP = confusion_matrix['FP']
    TN = confusion_matrix['TN']
    FN = confusion_matrix['FN']

    def _safe_ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    accuracy = _safe_ratio(TP + TN, TP + FP + TN + FN)

    return precision, recall, accuracy

In [21]:
# Unpack the three metrics in the order the function returns them.
precision, recall, accuracy = calculate_evaluation_metrics(confusion_matrix)

In [22]:
# Report each metric on its own line, rounded to two decimals.
print(
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'Accuracy: {accuracy:.2f}',
    sep='\n',
)

Precision: 1.00
Recall: 0.49
Accuracy: 0.53


In [23]:
# Actual:           yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes no  no  no  no  no  yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes 
# Predictions (DT): no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes yes yes yes yes yes yes yes yes yes yes no  no  no  no  no  no  no  
# Predictions (NB): no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes yes yes yes yes yes yes yes yes yes yes no  no  no  no  no  no  no  