# Titanic Dataset Analysis

This analysis consists of the following steps:
1. Data Loading and Preprocessing
2. Naive Bayes Algorithm
3. K-Nearest Neighbors Algorithm
4. Evaluation and Accuracy Calculation

In [74]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [75]:
# Prepare the data: drop unused columns, encode categorical variables as numbers, and fill missing values
def preprocess_data(df):
    """Clean a raw Titanic passenger frame and encode its categorical columns.

    Steps, in order:
      1. Drop 'PassengerId', 'Name', 'Ticket' and 'Cabin'.
      2. Encode 'Sex' (female -> 0, male -> 1) and 'Embarked'
         (C -> 0, Q -> 1, S -> 2). Values outside these mappings become NaN.
      3. Fill missing 'Age' and 'Fare' with the column median and missing
         'Embarked' with the most frequent encoded value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in by the caller is not modified.

    Raises
    ------
    KeyError
        If one of the columns to drop, encode or fill is missing.
    """
    # Drop unnecessary columns (returns a new frame, so the input stays intact)
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Convert categorical variables to numerical
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    # Fill missing values with median or mode.
    # Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
    # the in-place call acts on a temporary column object, which pandas deprecates
    # and which does not update `df` when Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    return df


In [76]:
def split_data(df, target_col=None):
    """Turn a DataFrame into NumPy arrays, optionally separating a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    target_col : str, optional
        Name of the label column. When omitted, every column is a feature.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        `df.values` when `target_col` is None; otherwise `(features, labels)`
        where `features` holds all columns except `target_col`.
    """
    # No label requested: the whole frame is the feature matrix
    if target_col is None:
        return df.values

    feature_matrix = df.drop(target_col, axis=1).values
    label_vector = df[target_col].values
    return feature_matrix, label_vector

In [77]:
def normalize_data(x_train, x_test):
    """Standardise both feature matrices using statistics of the training set.

    Each column is shifted by the training mean and divided by the training
    standard deviation, so the test set is scaled with exactly the same
    parameters as the training set.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training features, one row per sample.
    x_test : numpy.ndarray
        Test features with the same column layout as `x_train`.

    Returns
    -------
    tuple of numpy.ndarray
        `(x_train_norm, x_test_norm)`.
    """
    # Compute the training statistics once and reuse them for both matrices
    train_mean = np.mean(x_train, axis=0)
    train_std = np.std(x_train, axis=0)

    # A constant training column has std == 0, which would produce NaN/inf on
    # division. Dividing by 1 instead leaves such a column merely mean-centred.
    safe_std = np.where(train_std == 0, 1.0, train_std)

    x_train_norm = (x_train - train_mean) / safe_std
    x_test_norm = (x_test - train_mean) / safe_std
    return x_train_norm, x_test_norm

In [78]:
def prior_prob(y_train):
    """Estimate the class priors from the training labels.

    Parameters
    ----------
    y_train : array-like
        Labels; the mean of this array is taken as the prior of class 1.

    Returns
    -------
    tuple
        `(p_survived, p_not_survived)`, where the second value is the
        complement of the first.
    """
    survived_share = np.mean(y_train)
    return survived_share, 1 - survived_share

In [79]:
def calculate_likelihoods(x_train, y_train):
    """Fit per-feature Gaussian parameters separately for each class.

    Parameters
    ----------
    x_train : numpy.ndarray
        Two-dimensional feature matrix, one row per sample.
    y_train : numpy.ndarray
        Labels aligned with the rows of `x_train`; rows with label 1 and
        rows with label 0 are summarised separately.

    Returns
    -------
    tuple of list of dict
        `(likelihoods_survived, likelihoods_not_survived)`. Each list has one
        entry per feature column, of the form `{'mean': ..., 'std': ...}`.
    """
    survived_mask = y_train == 1
    not_survived_mask = y_train == 0
    n_features = x_train.shape[1]

    def _gaussian_params(row_mask):
        # Mean and standard deviation of every column, restricted to the masked rows
        return [
            {
                'mean': np.mean(x_train[row_mask, col]),
                'std': np.std(x_train[row_mask, col]),
            }
            for col in range(n_features)
        ]

    return _gaussian_params(survived_mask), _gaussian_params(not_survived_mask)

In [80]:
# Naive bayes Algorithm
def predict_naive_bayes(x_test, p_survived, p_not_survived, likelihoods_survived, likelihoods_not_survived):
    """Classify each test sample with a Gaussian Naive Bayes decision rule.

    For every sample the class score is the prior multiplied by the Gaussian
    density of each feature under that class. The scores are accumulated as
    sums of logarithms rather than raw products: a product of many small
    densities can underflow to 0.0 for both classes, after which the `>=`
    comparison would always pick class 1.

    Parameters
    ----------
    x_test : iterable of sequences
        Samples to classify, one sequence of feature values per sample.
    p_survived, p_not_survived : float
        Prior probabilities of class 1 and class 0.
    likelihoods_survived, likelihoods_not_survived : list of dict
        Per-feature `{'mean': ..., 'std': ...}` parameters for each class,
        indexed by feature position.

    Returns
    -------
    numpy.ndarray
        One prediction per sample: 1 when the class-1 score is greater than
        or equal to the class-0 score, otherwise 0.
    """
    # log(0) legitimately yields -inf (an impossible class); silence only that warning
    with np.errstate(divide='ignore'):
        log_prior_survived = np.log(p_survived)
        log_prior_not_survived = np.log(p_not_survived)

    predictions = []

    for test_sample in x_test:
        log_survived_sample = log_prior_survived
        log_not_survived_sample = log_prior_not_survived

        for feature in range(len(test_sample)):
            p_survived_feature = calculate_probability(test_sample[feature], likelihoods_survived[feature])
            p_not_survived_feature = calculate_probability(test_sample[feature], likelihoods_not_survived[feature])

            # Adding logs is the overflow-safe equivalent of multiplying densities
            with np.errstate(divide='ignore'):
                log_survived_sample += np.log(p_survived_feature)
                log_not_survived_sample += np.log(p_not_survived_feature)

        # Ties go to class 1; a NaN score compares False and falls through to 0
        if log_survived_sample >= log_not_survived_sample:
            predictions.append(1)
        else:
            predictions.append(0)

    return np.array(predictions)

In [81]:
def calculate_probability(x, likelihood, min_std=1e-9):
    """Evaluate the Gaussian probability density of `x`.

    Parameters
    ----------
    x : float
        Value at which the density is evaluated.
    likelihood : dict
        Gaussian parameters with keys 'mean' and 'std'.
    min_std : float, optional
        Lower bound applied to the standard deviation. A feature that is
        constant within a class has std == 0, which would otherwise cause a
        division by zero; the default is small enough to leave any
        non-degenerate std unchanged.

    Returns
    -------
    float
        Density of N(mean, std**2) at `x`.
    """
    mean = likelihood['mean']
    # Floor the spread so a zero std cannot produce a division by zero
    std = np.maximum(likelihood['std'], min_std)

    exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
    probability = (1 / (np.sqrt(2 * np.pi) * std)) * exponent
    return probability


In [82]:
# K-Nearest Neighbors Algorithm
def knn_algo(x_train, y_train, x_test, k):
    """Predict labels with a k-nearest-neighbours majority vote.

    For each test sample, the Euclidean distance to every training sample is
    computed, the `k` closest training samples are selected, and the most
    frequent label among them is returned. When several labels are equally
    frequent, the smallest label wins (`np.unique` returns sorted labels and
    `np.argmax` picks the first maximum).

    Parameters
    ----------
    x_train : array-like
        Training features, one row per sample.
    y_train : array-like
        Training labels aligned with the rows of `x_train`.
    x_test : array-like
        Samples to classify, with the same column layout as `x_train`.
    k : int
        Number of neighbours that vote; values larger than the training set
        size use every training sample.

    Returns
    -------
    numpy.ndarray
        One predicted label per test sample.

    Raises
    ------
    ValueError
        If `k` is smaller than 1, the training set is empty, or `x_train`
        and `y_train` have different lengths.
    """
    # Accept plain lists as well as arrays: the arithmetic and fancy indexing
    # below require NumPy arrays
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one sample")
    if len(x_train) != len(y_train):
        raise ValueError(
            f"x_train and y_train must have the same length, got {len(x_train)} and {len(y_train)}"
        )

    predictions = []

    for test_sample in x_test:
        distances = np.sqrt(np.sum((x_train - test_sample) ** 2, axis=1))
        nearest_indices = np.argsort(distances)[:k]
        nearest_labels = y_train[nearest_indices]
        unique_labels, counts = np.unique(nearest_labels, return_counts=True)
        predicted_label = unique_labels[np.argmax(counts)]
        predictions.append(predicted_label)

    return np.array(predictions)

In [83]:
# calculating accuracy 
def calculate_accuracy(y_pred, y_true):
    """Return the fraction of predictions that equal the reference labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Reference labels, same shape as `y_pred`.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or are empty.
    """
    # Coerce to arrays so `==` is element-wise; on plain Python lists it would
    # compare the lists as a whole and yield a single bool
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"y_pred and y_true must have the same shape, got {y_pred.shape} and {y_true.shape}"
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    accuracy = np.sum(y_pred == y_true) / len(y_true)
    return accuracy

In [84]:
# Load the Titanic dataset
# The paths are relative, so all three CSV files must be in the notebook's working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Used further down as the reference labels for the accuracy figures.
# NOTE(review): presumably this is Kaggle's sample submission rather than the true test labels — confirm.
gender_sub_df = pd.read_csv('gender_submission.csv')

In [85]:
# Preprocess the data
# Both names are rebound to the cleaned frames, so this cell cannot be re-run on its own:
# preprocess_data drops columns that are already gone after the first run, which raises
# KeyError. Re-run the loading cell first to restore the raw frames.
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

In [86]:
# Separate the training frame into a feature matrix and the 'Survived' label vector
x_train, y_train = split_data(df=train_df, target_col='Survived')
# No target column is passed for the test frame, so all of its columns are treated as features
x_test = split_data(df=test_df)

In [87]:
# Standardise both feature matrices with the statistics of the training set
x_train_norm, x_test_norm = normalize_data(x_train=x_train, x_test=x_test)

# Class priors estimated from the training labels
p_survived, p_not_survived = prior_prob(y_train=y_train)


In [88]:
# Fit per-class, per-feature Gaussian parameters on the normalised training data
likelihoods_survived, likelihoods_not_survived = calculate_likelihoods(
    x_train=x_train_norm,
    y_train=y_train,
)

In [89]:
# Classify the normalised test set with Naive Bayes
nb_predictions = predict_naive_bayes(
    x_test=x_test_norm,
    p_survived=p_survived,
    p_not_survived=p_not_survived,
    likelihoods_survived=likelihoods_survived,
    likelihoods_not_survived=likelihoods_not_survived,
)

# Classify the same test set with K-Nearest Neighbors, using 3 neighbours per vote
knn_predictions = knn_algo(
    x_train=x_train_norm,
    y_train=y_train,
    x_test=x_test_norm,
    k=3,
)

In [90]:
# Score both models against the 'Survived' column of gender_submission.csv
# NOTE(review): if that file is Kaggle's sample submission rather than the true test
# labels, these figures measure agreement with that baseline, not accuracy — confirm.
nb_accuracy = calculate_accuracy(y_pred=nb_predictions, y_true=gender_sub_df['Survived'])
knn_accuracy = calculate_accuracy(y_pred=knn_predictions, y_true=gender_sub_df['Survived'])


In [91]:

# Report both scores, one per line, rounded to two decimals
print(
    f'Naive Bayes Accuracy: {nb_accuracy:.2f}',
    f'K-Nearest Neighbors Accuracy: {knn_accuracy:.2f}',
    sep='\n',
)

Naive Bayes Accuracy: 0.90
K-Nearest Neighbors Accuracy: 0.78
