In [2]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [3]:
# 1 - load dataset

df = pd.read_csv('titanic.csv')
df.head()


Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [4]:
# ----Encoding-----
# Encoder gender using LabelEncoder

le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])  # male = 1 or 0 depending on alphabetic order



In [5]:
# ----- Age Cleaning + Scaling  -------
df['age'] = df['age'].fillna(df['age'].median())

scaler = StandardScaler()
df['age_scaled'] = scaler.fit_transform(df[['age']])



In [6]:
# ---- FEATURES ------
x = df[['pclass','gender','age_scaled']]
y = df['survived']

In [7]:
# ---- Train-Test Split ----

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Train Naive Bayes

model = GaussianNB()
model.fit(x_train,y_train)

In [9]:
# Predict

y_pred = model.predict(x_test)


In [10]:
# Evalution ------------
print(accuracy_score(y_test,y_pred))
print('classification_report')
print(classification_report(y_test,y_pred))
print("Confusion MAtrix")
print(confusion_matrix(y_test,y_pred))

0.776536312849162
classification_report
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       110
           1       0.73      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.78      0.77       179

Confusion MAtrix
[[93 17]
 [23 46]]


In [17]:
# ----- print prediction bv actual--------------

result = pd.DataFrame({
    "Actual ": y_test.values,
    "Predicted ": y_pred
    })

print("Predicted vs Actual (Test Data): ")
print(result.to_string(index=False))

Predicted vs Actual (Test Data): 
 Actual   Predicted 
       0           0
       0           0
       1           0
       0           0
       1           1
       1           1
       1           1
       0           0
       0           0
       0           0
       0           0
       0           0
       1           1
       0           0
       0           0
       0           0
       0           1
       0           0
       0           0
       1           1
       0           0
       1           1
       0           0
       1           1
       0           0
       1           1
       0           0
       1           0
       1           0
       0           0
       0           0
       1           1
       1           0
       0           0
       1           0
       1           0
       1           0
       0           1
       0           0
       1           1
       0           1
       0           1
       0           0
       1           1
       1           1


In [11]:
import numpy as np

# Pick one test sample
x = x_test.iloc[0].values     # row vector
classes = model.classes_

means = model.theta_          # (n_classes, n_features)
variances = model.var_        # (n_classes, n_features)
priors = model.class_prior_   # (n_classes,)

contributions = {}

for idx, cls in enumerate(classes):
    mu = means[idx]
    var = variances[idx]

    # Gaussian log-likelihood for each feature
    loglik = -0.5 * np.log(2 * np.pi * var) - ((x - mu) ** 2) / (2 * var)

    total_score = np.log(priors[idx]) + loglik.sum()

    contributions[cls] = {
        "feature_contrib": loglik,
        "total_score": total_score
    }

for cls, info in contributions.items():
    print(f"\nClass: {cls}")
    print("Feature contributions:")
    for f, val in zip(x_train.columns, info["feature_contrib"]):
        print(f"  {f}: {val:.4f}")
    print("Total class score:", info["total_score"])



Class: 0
Feature contributions:
  pclass: -0.7987
  gender: 0.0293
  age_scaled: -1.0015
Total class score: -2.254431859826826

Class: 1
Feature contributions:
  pclass: -1.5218
  gender: -1.2549
  age_scaled: -1.0223
Total class score: -4.757608510115297
