In [6]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [14]:
def load_and_preprocess(path):
    """Load the passenger CSV at ``path`` and return a train/test split.

    Processing steps:
      1. Keep the columns Pclass, Gender, Age, Fare, Embarked and Survived.
      2. Drop every row that has a missing value in any of those columns.
      3. Replace Age with AgeGroup: 0 (child) when Age < 16, otherwise 1 (adult).
      4. Encode Gender (male=0, female=1) and Embarked (S=0, C=1, Q=2).
         Categories outside these mappings become NaN, as ``Series.map`` does.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
        Feature columns are ordered Pclass, Gender, Fare, Embarked, AgeGroup.
    """
    feature_columns = ['Pclass', 'Gender', 'Fare', 'Embarked', 'AgeGroup']

    raw = pd.read_csv(path)

    # Select and clean in one non-mutating chain. Using inplace=True on a
    # column subset is what triggers pandas' SettingWithCopyWarning, and it
    # makes the function harder to reason about on re-runs.
    df = raw[['Pclass', 'Gender', 'Age', 'Fare', 'Embarked', 'Survived']].dropna()

    df = df.assign(
        # Age has no NaN left at this point, so a vectorised comparison is
        # equivalent to the per-row Child/Adult labelling followed by 0/1 mapping.
        AgeGroup=np.where(df['Age'] < 16, 0, 1),
        Gender=df['Gender'].map({'male': 0, 'female': 1}),
        Embarked=df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
    ).drop(columns='Age')

    X = df[feature_columns]
    y = df['Survived']

    # Fixed seed keeps the split reproducible across kernel restarts.
    return train_test_split(X.values, y.values, test_size=0.2, random_state=42)



In [18]:
def entropy(y):
    """Return the Shannon entropy (base 2) of the labels in ``y``.

    ``y`` is any sized iterable of hashable labels. An empty ``y`` yields 0
    because there are no classes to accumulate over.
    """
    n_samples = len(y)
    result = 0
    # Accumulate -p * log2(p) over the observed classes, one class at a time.
    for class_count in Counter(y).values():
        prob = class_count / n_samples
        result = result - prob * np.log2(prob)
    return result




In [7]:
def information_gain(X_column, y, threshold):
    """Return the information gain of splitting ``y`` at ``X_column <= threshold``.

    Parameters
    ----------
    X_column : array-like
        Values of one feature, one entry per sample.
    y : array-like
        Labels aligned with ``X_column``.
    threshold : scalar
        Samples with ``X_column <= threshold`` go left, the rest go right.

    Returns
    -------
    float or int
        Parent entropy minus the size-weighted entropy of the two children.
        Returns 0 when the split leaves either side empty.
    """
    # Accept plain lists as well as arrays so the masks below always work.
    X_column = np.asarray(X_column)
    y = np.asarray(y)

    left_mask = X_column <= threshold
    right_mask = X_column > threshold

    # Count each side once with NumPy; the builtin sum() walks the mask
    # element by element in Python and was being repeated four times.
    n_left = np.count_nonzero(left_mask)
    n_right = np.count_nonzero(right_mask)

    # A split that sends everything to one side tells us nothing.
    if n_left == 0 or n_right == 0:
        return 0

    n = len(y)
    child_entropy = (
        (n_left / n) * entropy(y[left_mask])
        + (n_right / n) * entropy(y[right_mask])
    )
    return entropy(y) - child_entropy








In [9]:
def best_split(X, y):
    """Find the (feature, threshold) pair with the highest information gain.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, indexed as ``X[:, feature_index]``.
    y : array-like
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no feature has at least two distinct values to split between.
        Ties are resolved in favour of the first candidate encountered.
    """
    best_gain = -1
    best_feature = None
    best_threshold = None

    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]
        # np.unique returns sorted values. The largest one is skipped because
        # "<= max" sends every sample left and leaves the right child empty;
        # choosing such a split makes the tree builder recurse on no data.
        candidates = np.unique(column)[:-1]
        for threshold in candidates:
            gain = information_gain(column, y, threshold)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_index
                best_threshold = threshold

    return best_feature, best_threshold





In [8]:
class Node:
    """A single node of the decision tree.

    Internal nodes carry ``feature``, ``threshold`` and the two children
    ``left`` / ``right``; leaves carry the predicted label in ``value``.
    Every attribute defaults to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: which feature index to test and against what value.
        self.feature, self.threshold = feature, threshold
        # Child subtrees for the two outcomes of the split.
        self.left, self.right = left, right
        # Predicted label; only set on leaves.
        self.value = value

In [16]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a decision tree by greedy information-gain splits.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix for the samples reaching this node.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``; must not be empty.
    depth : int
        Depth of the current node (the root is 0).
    max_depth : int
        Nodes at this depth become leaves regardless of purity.

    Returns
    -------
    Node
        A leaf holding the majority label, or an internal node whose
        ``left`` child receives samples with ``feature <= threshold``.

    Raises
    ------
    IndexError
        If ``y`` is empty, since no majority label exists.
    """
    # Majority label is the prediction for any leaf created at this node.
    majority_label = Counter(y).most_common(1)[0][0]

    # Stop when the node is pure or the depth budget is exhausted.
    if len(set(y)) == 1 or depth >= max_depth:
        return Node(value=majority_label)

    feature, threshold = best_split(X, y)

    if feature is None:
        return Node(value=majority_label)

    left_indices = X[:, feature] <= threshold
    right_indices = X[:, feature] > threshold

    # A split that leaves one side empty cannot be recursed into (there
    # would be no labels to vote on), so treat this node as a leaf instead.
    if not left_indices.any() or not right_indices.any():
        return Node(value=majority_label)

    left = build_tree(X[left_indices], y[left_indices], depth + 1, max_depth)
    right = build_tree(X[right_indices], y[right_indices], depth + 1, max_depth)

    return Node(feature=feature, threshold=threshold, left=left, right=right)






In [12]:
def predict_one(x, node):
    """Return the label the tree rooted at ``node`` assigns to sample ``x``.

    Walks down from ``node``: at each internal node the sample goes left when
    ``x[node.feature] <= node.threshold`` and right otherwise, until a node
    with a non-``None`` ``value`` is reached.
    """
    current = node
    # A node is a leaf exactly when it carries a value.
    while current.value is None:
        goes_left = x[current.feature] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value



In [11]:
def predict(X, tree):
    """Return a list with the predicted label for every sample in ``X``.

    Each sample is classified independently by ``predict_one`` using the
    tree rooted at ``tree``; results keep the order of ``X``.
    """
    predictions = []
    for sample in X:
        predictions.append(predict_one(sample, tree))
    return predictions




In [19]:
# Driver cell: load the data, fit a depth-limited tree, and report test metrics.
if __name__ == "__main__":
    # NOTE(review): absolute path at the filesystem root; presumably where the
    # CSV was uploaded in a hosted notebook -- confirm before running elsewhere.
    X_train, X_test, y_train, y_test = load_and_preprocess("/titanic.csv")

    # Fit on the training split only; max_depth caps how deep the tree may grow.
    tree = build_tree(X_train, y_train, max_depth=5)
    # predict returns a plain list of labels, one per test row.
    y_pred = predict(X_test, tree)

    # Overall accuracy followed by per-class precision / recall / F1.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Accuracy: 0.7902097902097902
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        80
           1       0.81      0.68      0.74        63

    accuracy                           0.79       143
   macro avg       0.79      0.78      0.78       143
weighted avg       0.79      0.79      0.79       143

