# Random Forest on 1st Dataset

In [80]:
import pandas as pd
import numpy as np
import itertools
import random
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt

In [81]:
dataset = pd.read_csv('/content/advertising.csv')

In [82]:
def gini_index(y):
    m = len(y)
    if m <= 1:
        return 0

    counts = y.value_counts()
    gini = 1 - sum((count / m) ** 2 for count in counts)
    return gini

def entropy(y):
    a = y.value_counts() / y.shape[0]
    return np.sum(-a * np.log2(a + 1e-9))

In [83]:
def gini_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size
    if left_size == 0 or right_size == 0:
        return 0
    left_gini = gini_index(y[mask])
    right_gini = gini_index(y[~mask])
    gain = gini_index(y) - (left_size / total_size * left_gini + right_size / total_size * right_gini)
    return gain

In [84]:
def entropy_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size
    if left_size == 0 or right_size == 0:
        return 0
    left_entropy = entropy(y[mask])
    right_entropy = entropy(y[~mask])
    gain = entropy(y) - (left_size / total_size * left_entropy + right_size / total_size * right_entropy)
    return gain

In [85]:
def best_split(dataset, y, func=entropy):
    best_gain = -float('inf')
    best_split_value = None
    best_split_variable = None
    best_mask = None
    for column in dataset.columns:
        values = dataset[column].unique()
        for value in values:
            if dataset[column].dtype != 'O':  # If it's numeric
                mask = dataset[column] < value
            else:
                mask = dataset[column].isin([value])
            if func == entropy:
                gain = entropy_information_gain(y, mask)
            else:
                gain = gini_information_gain(y, mask)
            if gain > best_gain:
                best_gain = gain
                best_split_value = value
                best_split_variable = column
                best_mask = mask
    return best_split_variable, best_split_value, best_mask, best_gain

In [86]:
def train_tree(dataset, y, max_depth=None, min_samples_split=2, min_information_gain=1e-5, depth=0, func=entropy):
    if len(y.unique()) == 1:  # Base case: Only one class
        return y.iloc[0]
    if depth == max_depth or len(dataset) < min_samples_split:
        return y.mode()[0]  # Majority class
    best_variable, best_value, best_mask, best_gain = best_split(dataset, y, func)
    if best_gain < min_information_gain:
        return y.mode()[0]
    left_dataset = dataset[best_mask]
    right_dataset = dataset[~best_mask]
    left_y = y[best_mask]
    right_y = y[~best_mask]
    left_tree = train_tree(left_dataset, left_y, max_depth, min_samples_split, min_information_gain, depth+1, func)
    right_tree = train_tree(right_dataset, right_y, max_depth, min_samples_split, min_information_gain, depth+1, func)
    tree = {f'{best_variable} <= {best_value}': [left_tree, right_tree]}
    return tree

In [87]:
def classify(observation, tree):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        feature, value = question.split(' <= ')
        feature_value = observation[feature]
        if isinstance(tree[question][0], dict):
            if feature_value <= float(value):
                return classify(observation, tree[question][0])
            else:
                return classify(observation, tree[question][1])
        else:
            return tree[question][0]
    else:
        return tree

In [88]:
def create_bootstrapped_subsets(dataset, n_subsets=4):
    subsets = []
    for _ in range(n_subsets):
        subset = dataset.sample(frac=1, replace=True)
        subsets.append(subset)
    return subsets

In [89]:
def train_multiple_trees(subsets, target_column, max_depth=5):
    trees = []
    for subset in subsets:
        X = subset.drop(columns=[target_column])
        y = subset[target_column]
        tree = train_tree(X, y, max_depth=max_depth, func=gini_index)
        trees.append(tree)
    return trees

In [90]:
def evaluate_trees(trees, X, y):
    accuracies = []
    for tree in trees:
        predictions = X.apply(lambda observation: classify(observation, tree), axis=1)
        accuracy = accuracy_score(y, predictions)
        accuracies.append(accuracy)
    return accuracies

In [91]:
def select_best_trees(trees, X, y, n_best=2):
    accuracies = evaluate_trees(trees, X, y)
    best_tree_indices = np.argsort(accuracies)[-n_best:]
    best_trees = [trees[i] for i in best_tree_indices]
    return best_trees

In [92]:
def random_forest_predict(observation, best_trees):
    tree_predictions = [classify(observation, tree) for tree in best_trees]
    return max(set(tree_predictions), key=tree_predictions.count)

In [93]:
subsets = create_bootstrapped_subsets(dataset, n_subsets=4)
target_column = 'Clicked on Ad'
X = dataset.drop(columns=[target_column])
y = dataset[target_column]

trees = train_multiple_trees(subsets, target_column)
best_trees = select_best_trees(trees, X, y, n_best=2)

In [94]:
predictions = X.apply(lambda observation: random_forest_predict(observation, best_trees), axis=1)
random_forest_accuracy = accuracy_score(y, predictions)
print(f"Random Forest Accuracy: {random_forest_accuracy}")

Random Forest Accuracy: 0.494


In [95]:
def print_tree(tree, indent=""):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        print(f"{indent}Decision: {question}")
        print(f"{indent}-> Left:")
        print_tree(tree[question][0], indent + "  ")
        print(f"{indent}-> Right:")
        print_tree(tree[question][1], indent + "  ")
    else:
        print(f"{indent}Predict: {tree}")

for i, tree in enumerate(best_trees):
    print(f"Tree {i+1}:\n")
    print_tree(tree)
    print("\n" + "="*50 + "\n")

Tree 1:

Decision: Daily Internet Usage <= 175.14
-> Left:
  Decision: Daily Time Spent on Site <= 71.4
  -> Left:
    Decision: Area Income <= 76984.21
    -> Left:
      Predict: 1
    -> Right:
      Predict: 0
  -> Right:
    Decision: Daily Internet Usage <= 161.42
    -> Left:
      Decision: Ad Topic Line <= Progressive asynchronous adapter
      -> Left:
        Predict: 0
      -> Right:
        Decision: Ad Topic Line <= Progressive clear-thinking open architecture
        -> Left:
          Predict: 0
        -> Right:
          Predict: 1
    -> Right:
      Decision: Area Income <= 50950.24
      -> Left:
        Decision: Male <= 1
        -> Left:
          Predict: 1
        -> Right:
          Predict: 0
      -> Right:
        Predict: 0
-> Right:
  Decision: Daily Time Spent on Site <= 58.18
  -> Left:
    Decision: Age <= 28
    -> Left:
      Decision: Area Income <= 38817.4
      -> Left:
        Predict: 1
      -> Right:
        Decision: Daily Time Spent on Sit

# Random Forest on 2nd Dataset

In [96]:
dataset = pd.read_csv('/content/dataset_traffic_accident.csv')

In [97]:
dataset = dataset.dropna()

In [98]:
categorical_columns = dataset.select_dtypes(include=['object']).columns
for column in categorical_columns:
    dataset[column] = dataset[column].astype('category').cat.codes

In [99]:
target_column = 'Accident'
X = dataset.drop(columns=[target_column])
y = dataset[target_column]

In [100]:
max_depth_values = [5, 10, 15]
min_samples_split_values = [2, 5, 10]
min_samples_leaf_values = [1, 2, 4]

best_trees = []

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        for min_samples_leaf in min_samples_leaf_values:
            print(f"Training trees with max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")

            subsets = create_bootstrapped_subsets(dataset, n_subsets=4)

            trees = train_multiple_trees(subsets, target_column, max_depth=max_depth)

            accuracies = evaluate_trees(trees, X, y)
            best_trees.append((max_depth, min_samples_split, min_samples_leaf, accuracies))

            print(f"Accuracy for this combination: {accuracies}")

Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=1
Accuracy for this combination: [0.7061728395061728, 0.5580246913580247, 0.7086419753086419, 0.4419753086419753]
Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=2
Accuracy for this combination: [0.7061728395061728, 0.4617283950617284, 0.7012345679012346, 0.6987654320987654]
Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=4
Accuracy for this combination: [0.671604938271605, 0.4271604938271605, 0.6938271604938272, 0.4074074074074074]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=1
Accuracy for this combination: [0.7086419753086419, 0.5728395061728395, 0.7061728395061728, 0.7086419753086419]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=2
Accuracy for this combination: [0.7037037037037037, 0.6024691358024692, 0.6641975308641975, 0.5506172839506173]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=4
Acc

In [101]:
for (max_depth, min_samples_split, min_samples_leaf, accuracies) in best_trees:
    print(f"Results for max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")
    print(f"Accuracy: {accuracies}")

Results for max_depth=5, min_samples_split=2, min_samples_leaf=1
Accuracy: [0.7061728395061728, 0.5580246913580247, 0.7086419753086419, 0.4419753086419753]
Results for max_depth=5, min_samples_split=2, min_samples_leaf=2
Accuracy: [0.7061728395061728, 0.4617283950617284, 0.7012345679012346, 0.6987654320987654]
Results for max_depth=5, min_samples_split=2, min_samples_leaf=4
Accuracy: [0.671604938271605, 0.4271604938271605, 0.6938271604938272, 0.4074074074074074]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=1
Accuracy: [0.7086419753086419, 0.5728395061728395, 0.7061728395061728, 0.7086419753086419]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=2
Accuracy: [0.7037037037037037, 0.6024691358024692, 0.6641975308641975, 0.5506172839506173]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=4
Accuracy: [0.6049382716049383, 0.7061728395061728, 0.6444444444444445, 0.5728395061728395]
Results for max_depth=5, min_samples_split=10, min_samples_leaf=1

In [102]:
def print_tree(tree, indent=""):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        print(f"{indent}Decision: {question}")
        print(f"{indent}-> Left:")
        print_tree(tree[question][0], indent + "  ")
        print(f"{indent}-> Right:")
        print_tree(tree[question][1], indent + "  ")
    else:
        print(f"{indent}Predict: {tree}")

for i, tree in enumerate(best_trees):
    print(f"Tree {i+1}:\n")
    print_tree(tree)
    print("\n" + "="*50 + "\n")

Tree 1:

Predict: (5, 2, 1, [0.7061728395061728, 0.5580246913580247, 0.7086419753086419, 0.4419753086419753])


Tree 2:

Predict: (5, 2, 2, [0.7061728395061728, 0.4617283950617284, 0.7012345679012346, 0.6987654320987654])


Tree 3:

Predict: (5, 2, 4, [0.671604938271605, 0.4271604938271605, 0.6938271604938272, 0.4074074074074074])


Tree 4:

Predict: (5, 5, 1, [0.7086419753086419, 0.5728395061728395, 0.7061728395061728, 0.7086419753086419])


Tree 5:

Predict: (5, 5, 2, [0.7037037037037037, 0.6024691358024692, 0.6641975308641975, 0.5506172839506173])


Tree 6:

Predict: (5, 5, 4, [0.6049382716049383, 0.7061728395061728, 0.6444444444444445, 0.5728395061728395])


Tree 7:

Predict: (5, 10, 1, [0.5679012345679012, 0.6814814814814815, 0.7135802469135802, 0.6938271604938272])


Tree 8:

Predict: (5, 10, 2, [0.7061728395061728, 0.5876543209876544, 0.6419753086419753, 0.4024691358024691])


Tree 9:

Predict: (5, 10, 4, [0.6123456790123457, 0.6172839506172839, 0.7061728395061728, 0.54814814814