# Importing Modules

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Loading and Preprocessing Data

Preprocessing of the data includes the following operations, in order:
1. Encoding of the string (categorical) columns (**One-Hot Encoding**)
2. Renaming the columns of the Training Data
3. Renaming the columns of the Test Data
4. Adding Missing Features to the Test Data and reordering its columns to match the Training Data

In [2]:
# Read the training and testing splits from CSV.
# Point these two paths at your own data files if they live elsewhere.
train_path, test_path = 'train.csv', 'test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [3]:
# One-hot encode every categorical column (the target 'survived' included).
# drop_first=True removes the first level of each column, so a column with
# k levels becomes k-1 indicator columns.
categorical_columns = ['pclass', 'age', 'gender', 'survived']
encode_options = dict(columns=categorical_columns, drop_first=True)

# Both frames are encoded independently with the same settings.
train_data = pd.get_dummies(train_data, **encode_options)
test_data = pd.get_dummies(test_data, **encode_options)

In [4]:
# Preview the first ten rows of the encoded training frame.
train_data.head(n=10)

Unnamed: 0,pclass_2nd,pclass_3rd,pclass_crew,age_child,gender_male,survived_yes
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
5,False,False,False,False,True,True
6,False,False,False,False,True,True
7,False,False,False,False,True,True
8,False,False,False,False,True,True
9,False,False,False,False,True,True


In [5]:
# Give each one-hot column a clearer "is<category>" name and call the encoded
# target simply 'survived'.
# The rename is done by name (old -> new) rather than by position, so a change
# in the order of the dummy columns can no longer silently mislabel a feature.
train_column_names = {
    'pclass_2nd': 'pclass_is2nd',
    'pclass_3rd': 'pclass_is3rd',
    'pclass_crew': 'pclass_iscrew',
    'age_child': 'age_ischild',
    'gender_male': 'gender_ismale',
    'survived_yes': 'survived',
}

# Selecting the renamed columns fixes their order and raises KeyError if an
# expected column is absent. Re-running this cell is a no-op because already
# renamed columns are left untouched by rename().
train_data = train_data.rename(columns=train_column_names)[
    list(train_column_names.values())
]

train_data

Unnamed: 0,pclass_is2nd,pclass_is3rd,pclass_iscrew,age_ischild,gender_ismale,survived
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
...,...,...,...,...,...,...
2145,False,False,True,False,False,True
2146,False,False,True,False,False,True
2147,False,False,True,False,False,False
2148,False,False,True,False,False,False


In [6]:
# Positional rename of the encoded test frame: it must have exactly these five
# columns, in this order, for the assignment to succeed.
# NOTE(review): there is no 'pclass_iscrew' here, presumably because the test
# file contains no crew rows - confirm against test.csv.
test_column_names = [
    'pclass_is2nd',
    'pclass_is3rd',
    'age_ischild',
    'gender_ismale',
    'survived',
]
test_data.columns = test_column_names

test_data

Unnamed: 0,pclass_is2nd,pclass_is3rd,age_ischild,gender_ismale,survived
0,False,False,False,True,True
1,False,False,False,True,True
2,False,False,False,True,True
3,False,False,False,True,True
4,False,False,False,True,True
...,...,...,...,...,...
61,False,True,False,True,True
62,False,True,False,True,True
63,False,True,False,True,True
64,False,True,False,True,True


In [7]:
# Columns present in the training frame but absent from the test frame.
missing_features = {
    column for column in train_data.columns
    if column not in test_data.columns
}

# Add each absent indicator to the test frame as an all-False column.
for feat in missing_features:
    test_data[feat] = False

In [8]:
# Put the test columns in exactly the training column order.
# The order is taken from train_data.columns instead of repeating the names
# here, so the two frames stay aligned if the training schema changes.
# Indexing (rather than reindexing) raises KeyError if the test frame is still
# missing a training column.
test_data = test_data[train_data.columns]

test_data

Unnamed: 0,pclass_is2nd,pclass_is3rd,pclass_iscrew,age_ischild,gender_ismale,survived
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
...,...,...,...,...,...,...
61,False,True,False,False,True,True
62,False,True,False,False,True,True
63,False,True,False,False,True,True
64,False,True,False,False,True,True


# Separating features and targets

In [9]:
# Split each frame into its feature matrix (every column except the target)
# and its target vector.
target_column = 'survived'

X_train, y_train = train_data.drop(columns=target_column), train_data[target_column]
X_test, y_test = test_data.drop(columns=target_column), test_data[target_column]

# Training and Evaluating Models

### Decision Tree Classifier

In [10]:
# Create a DecisionTreeClassifier with Information Gain (Entropy) as the splitting criterion.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split; without a fixed seed, ties between equally good
# splits can be broken differently from run to run, changing the metrics below.
dct_clf = DecisionTreeClassifier(criterion="entropy", random_state=42)

# Fit the model to the training data
dct_clf.fit(X_train, y_train)

# Predicted labels for the test data
dct_y_pred = dct_clf.predict(X_test)

# Score the predictions against the true test labels
dct_precision = precision_score(y_test, dct_y_pred)
dct_recall = recall_score(y_test, dct_y_pred)
dct_accuracy = accuracy_score(y_test, dct_y_pred)

print("Decision Tree Classifier Evaluation Metrics: ")
print("Precision: {:.5f}".format(dct_precision))
print("Recall: {:.5f}".format(dct_recall))
print("Accuracy: {:.5f}".format(dct_accuracy))

Decision Tree Classifier Evaluation Metrics: 
Precision: 1.00000
Recall: 0.49180
Accuracy: 0.53030


### Naive Bayes Classifiers (Multinomial and Bernoulli)

In [11]:
# Build a Multinomial Naive Bayes classifier and fit it on the training split
# (fit() returns the fitted estimator itself).
mnb_clf = MultinomialNB().fit(X_train, y_train)

# Label the test split with the fitted model
mnb_y_pred = mnb_clf.predict(X_test)

# Compare the predictions with the true test labels
mnb_precision, mnb_recall, mnb_accuracy = (
    precision_score(y_test, mnb_y_pred),
    recall_score(y_test, mnb_y_pred),
    accuracy_score(y_test, mnb_y_pred),
)

# One print call, one line per item
print(
    "Multinomial Naive Bayes Classifier Evaluation Metrics: ",
    "Precision: {:.5f}".format(mnb_precision),
    "Recall: {:.5f}".format(mnb_recall),
    "Accuracy: {:.5f}".format(mnb_accuracy),
    sep="\n",
)

Multinomial Naive Bayes Classifier Evaluation Metrics: 
Precision: 1.00000
Recall: 0.08197
Accuracy: 0.15152


In [12]:
# Build a Bernoulli Naive Bayes classifier and fit it on the training split
# (fit() returns the fitted estimator itself).
bnb_clf = BernoulliNB().fit(X_train, y_train)

# Label the test split with the fitted model
bnb_y_pred = bnb_clf.predict(X_test)

# Compare the predictions with the true test labels
bnb_precision, bnb_recall, bnb_accuracy = (
    precision_score(y_test, bnb_y_pred),
    recall_score(y_test, bnb_y_pred),
    accuracy_score(y_test, bnb_y_pred),
)

# One print call, one line per item
print(
    "Bernoulli Naive Bayes Classifier Evaluation Metrics:",
    "Precision: {:.5f}".format(bnb_precision),
    "Recall: {:.5f}".format(bnb_recall),
    "Accuracy: {:.5f}".format(bnb_accuracy),
    sep="\n",
)

Bernoulli Naive Bayes Classifier Evaluation Metrics:
Precision: 1.00000
Recall: 0.49180
Accuracy: 0.53030
