In [6]:
import numpy as np
import seaborn as sns
import pandas as pd
import random

In [7]:
from sklearn.model_selection import train_test_split

df=pd.read_csv('Titanic.csv')
df.drop(['PassengerId','Name','Ticket','Cabin',"Embarked"],axis=1,inplace=True)
df["Sex"]=df['Sex'].map({"male":0, "female":1}).astype(int)
df.head()

df['Age'].fillna(df['Age'].mean(),inplace=True)
def Z_norm(col):
    return (col-col.mean())/col.std()

df['Age']=Z_norm(df['Age'])
df['Fare']=Z_norm(df['Fare'])


x = df.iloc[:, 1:5]
y = df.iloc[:, :1]

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,-0.592148,1,0,-0.502163
1,1,1,1,0.63843,1,0,0.786404
2,1,3,1,-0.284503,0,0,-0.48858
3,1,1,1,0.407697,1,0,0.420494
4,0,3,0,0.407697,0,0,-0.486064


In [8]:
def entropy(y):
    # Convert y to 1D numpy array if it's not already
    y = np.array(y).flatten()
    # Use np.bincount for integer labels
    proportions = np.bincount(y) / len(y)
    return -np.sum([p * np.log2(p) for p in proportions if p > 0])

def information_gain(X_column, y, threshold):
    left_mask = X_column <= threshold
    right_mask = X_column > threshold

    if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
        return 0

    parent_entropy = entropy(y)
    n = len(y)
    n_left, n_right = len(y[left_mask]), len(y[right_mask])

    e_left = entropy(y[left_mask])
    e_right = entropy(y[right_mask])

    child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
    return parent_entropy - child_entropy

# Now you can proceed with the rest of the code as before

def best_split(X, y):
    best_gain = -1
    best_feature, best_threshold = None, None

    for feature_idx in range(X.shape[1]):
        thresholds = np.unique(X.iloc[:, feature_idx].values)
        for threshold in thresholds:
            gain = information_gain(X.iloc[:, feature_idx].values, y, threshold)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
                best_threshold = threshold

    return best_feature, best_threshold

def build_tree(X, y, depth=0, max_depth=None):
    # Ensure y is a 1D numpy array
    y = np.array(y).flatten()

    # Base case: if all values are the same or max depth reached
    if len(np.unique(y)) == 1 or (max_depth is not None and depth >= max_depth):
        return np.bincount(y).argmax()

    feature, threshold = best_split(X, y)
    if feature is None:
        return Counter(y).most_common(1)[0][0]

    left_mask = X.iloc[:, feature].values <= threshold
    right_mask = X.iloc[:, feature].values > threshold

    left_subtree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_subtree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return {"feature": feature, "threshold": threshold, "left": left_subtree, "right": right_subtree}

def predict_sample(x, tree):
    if not isinstance(tree, dict):
        return tree
    feature, threshold = tree["feature"], tree["threshold"]
    feature_value = x[feature]
    if feature_value <= threshold:
        return predict_sample(x, tree["left"])
    else:
        return predict_sample(x, tree["right"])
    
def predict(x,tree):
    return np.array([predict_sample(inp,tree) for inp in x.values])

In [9]:

from sklearn.model_selection import train_test_split
from collections import Counter

# Function for bootstrap sampling
def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    X_sample = X.iloc[indices]
    y_sample = y.iloc[indices]
    return X_sample, y_sample

def train_decision_tree(X_train, y_train, max_depth=None):
    return build_tree(X_train, y_train, max_depth=max_depth)

def predict_with_bagging(X, trees):
    predictions = np.array([predict(X, tree) for tree in trees])
    return [np.bincount(pred).argmax() for pred in predictions.T]

def bagging(X_train, y_train, n_trees=10, max_depth=None):
    trees = []
    for _ in range(n_trees):
        X_sample, y_sample = bootstrap_sample(X_train, y_train)
        tree = train_decision_tree(X_sample, y_sample,max_depth)
        trees.append(tree)
    return trees



In [10]:
from sklearn.model_selection import train_test_split
for i in range(100):
    x = df.iloc[:, 1:5]
    y = df.iloc[:, :1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    colls=len(x.columns)-1
    n_trees = colls**0.5
    n_trees = int(n_trees)
    max_depth = 5
    trees = bagging(x_train, y_train, n_trees=n_trees, max_depth=max_depth)
    
    y_pred = predict_with_bagging(x_test, trees)
    
    accuracy = np.mean(y_pred == y_test.values.flatten())
    print(f"Accuracy with Bagging: {accuracy*100:.4f}")

Accuracy with Bagging: 82.1229
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 80.4469
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 82.6816
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 80.4469
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 83.2402
Accuracy with Bagging: 80.4469
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 80.4469
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 82.1229
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 79.3296
Accuracy with Bagging: 81.0056
Accuracy with Bagging: 81.5642
Accuracy with Bagging: 81.5642
Accuracy