### Ahmad Ryan - 22i-0781 - CS-B - Lab13

In [140]:
# ==================================================================#
# Titanic Dataset Decision Tree Lab Task (Simplified Node Version)  #
# ==================================================================#

import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load and Preprocess Data ---
# Load titanic.csv and perform preprocessing (handle missing values, encoding, etc.)
data = pd.read_csv('titanic.csv')

# Simple preprocessing: drop rows with missing 'Age' or 'Embarked' or 'Fare' (for simplicity)
data = data.dropna(subset=['Age', 'Embarked', 'Fare'])

# Encode 'Gender' and 'Embarked' as numbers
data['Gender'] = data['Gender'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# We'll use a subset of columns for simplicity
features = ['Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Split the data
train_df, test_df = train_test_split(data, test_size=0.2)

train_data = train_df[features + [target]].values.tolist()
test_data = test_df[features + [target]].values.tolist()


In [141]:
# This section is already implemented for you
# --- Node and Question Classes ---
class Node:
    def __init__(self, is_leaf=False, prediction=None, question=None, true_branch=None, false_branch=None):
        self.is_leaf = is_leaf
        self.prediction = prediction
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value


# --- Helper Functions ---

In [142]:

def unique_vals(rows, col):
    return set([row[col] for row in rows])



In [None]:
def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


In [144]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [145]:
def partition(rows, question):
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [146]:
def entropy(rows):
    from math import log2
    counts = class_counts(rows)
    impurity = 0.0
    total = len(rows)
    for lbl in counts:
        prob_of_lbl = counts[lbl] / total
        impurity -= prob_of_lbl * log2(prob_of_lbl)
    return impurity

In [147]:
def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * entropy(left) - (1 - p) * entropy(right)

In [148]:
def find_best_split(rows):
    best_gain = 0
    best_question = None
    current_uncertainty = entropy(rows)
    n_features = len(rows[0]) - 1  # number of columns minus label

    for col in range(n_features):
        values = unique_vals(rows, col)

        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)

            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [149]:
def build_tree(rows):
#TODO: Build the tree by recursively splitting the data
    gain, question = find_best_split(rows)

    if gain == 0:
        return Node(is_leaf=True, prediction=class_counts(rows))

    true_rows, false_rows = partition(rows, question)

    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)

    return Node(is_leaf=False, question=question, true_branch=true_branch, false_branch=false_branch)


In [150]:
def print_tree(node, spacing=""):
    if node.is_leaf:
        print(spacing + "Predict", node.prediction)
        return

    print(spacing + f"Is column[{node.question.column}] == {node.question.value}?")

    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")


In [151]:
def classify(row, node):
    if node.is_leaf:
        return node.prediction

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [152]:
def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts:
        probs[lbl] = f"{int(counts[lbl] / total * 100)}%"
    return probs

In [153]:
# --- Build and Print Tree ---

my_tree = build_tree(train_data)
print_tree(my_tree)

# --- Predictions on Test Data ---

y_true = []
y_pred = []

for row in test_data:
    prediction = classify(row, my_tree)
    predicted_label = max(prediction, key=prediction.get)
    y_pred.append(predicted_label)
    y_true.append(row[-1])

Is column[1] == 1.0?
--> True:
  Is column[0] == 3.0?
  --> True:
    Is column[5] == 21.075?
    --> True:
      Is column[5] == 31.3875?
      --> True:
        Is column[5] == 34.375?
        --> True:
          Predict {0.0: 5}
        --> False:
          Predict {1.0: 2}
      --> False:
        Predict {0.0: 12}
    --> False:
      Is column[2] == 9.0?
      --> True:
        Is column[3] == 1.0?
        --> True:
          Is column[5] == 20.25?
          --> True:
            Predict {1.0: 2}
          --> False:
            Is column[2] == 25.0?
            --> True:
              Predict {0.0: 7}
            --> False:
              Is column[5] == 9.825?
              --> True:
                Is column[2] == 24.0?
                --> True:
                  Predict {1.0: 1}
                --> False:
                  Is column[6] == 1.0?
                  --> True:
                    Is column[5] == 15.2458?
                    --> True:
                      Predict {0

## **Evaluate the model**

Find the following performance matrices results:
- Accuracy (overall correct rate)

- Precision (true positives ÷ predicted positives)

- Recall (true positives ÷ actual positives)

- Confusion Matrix

- Classfication Report

In [154]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# --- Calculate Performance Metrics ---

# 1. Accuracy
acc = accuracy_score(y_true, y_pred)
print(f"\nAccuracy: {acc:.4f}")

# 2. Precision
precision = precision_score(y_true, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# 3. Recall
recall = recall_score(y_true, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# 4. Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(cm)

# 5. Classification Report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))



Accuracy: 0.7902
Precision: 0.6923
Recall: 0.7200

Confusion Matrix:
[[77 16]
 [14 36]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84        93
         1.0       0.69      0.72      0.71        50

    accuracy                           0.79       143
   macro avg       0.77      0.77      0.77       143
weighted avg       0.79      0.79      0.79       143

