<a href="https://colab.research.google.com/github/hitanshu5/ML-LAB/blob/main/MLEXP5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implementing Random Forest from scratch

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import seaborn as sns
import numpy as np
from scipy.stats import norm
import math
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

Loading the dataset

In [None]:
# Read the breast-cancer CSV from the Colab working directory and preview
# the first rows.
df1 = pd.read_csv(filepath_or_buffer='/content/Breast_cancer_data (1).csv')
df1.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


Splitting of data

In [None]:
# Features: every column except the last one. Target: the 'diagnosis' column.
X = df1.iloc[:, 0:df1.shape[1] - 1]
y = df1.loc[:, 'diagnosis']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Scaling the data for better fitting of the model

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Implementing Decision tree

In [None]:
import numpy as np

class DecisionTree:
    """Classification tree grown greedily by minimising weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts:

    * internal node: ``{'feature', 'threshold', 'left', 'right'}`` where
      samples with ``x[feature] < threshold`` go to ``'left'`` and all others
      go to ``'right'``;
    * leaf: ``{'class': label}``.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.

    Parameters
    ----------
    max_depth : int or None, default=None
        Depth at which a node is turned into a majority-class leaf. With
        ``None`` the tree grows until every leaf is pure or no further split
        separates the samples.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = {}

    def fit(self, X, y):
        """Grow the tree on features ``X`` (2-D) and integer labels ``y``.

        Inputs are converted with ``np.asarray`` so pandas objects are indexed
        positionally rather than by label.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a DecisionTree on zero samples")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._grow_tree(X, y, depth=0)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        X = np.asarray(X)
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _grow_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Pure node: nothing left to separate.
        if len(np.unique(y)) == 1:
            return {'class': y[0]}
        # Depth budget exhausted: fall back to the majority class.
        if depth == self.max_depth:
            return {'class': np.bincount(y).argmax()}

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples (e.g. identical rows with
            # different labels). Recursing would never terminate, so emit a
            # majority-class leaf instead.
            return {'class': np.bincount(y).argmax()}

        left_indices = X[:, best_feature] < best_threshold
        # The complement mirrors the ``else`` branch used at prediction time.
        right_indices = ~left_indices
        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature': best_feature, 'threshold': best_threshold,
                'left': left_subtree, 'right': right_subtree}

    def _best_split(self, X, y):
        """Return ``(feature, threshold)`` with the lowest weighted Gini.

        Only splits that leave at least one sample on each side are
        considered. Returns ``(None, None)`` when no such split exists.
        """
        best_feature = None
        best_threshold = None
        best_gini = float('inf')
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # np.unique sorts its output; the smallest value would always
            # produce an empty left child, so it is skipped.
            for threshold in np.unique(column)[1:]:
                left_indices = column < threshold
                if not left_indices.any() or left_indices.all():
                    continue
                gini = self._gini_impurity(y[left_indices], y[~left_indices])
                if gini < best_gini:
                    best_feature = feature
                    best_threshold = threshold
                    best_gini = gini
        return best_feature, best_threshold

    def _predict_tree(self, x, tree):
        """Walk ``tree`` for a single sample ``x`` and return the leaf label."""
        if 'class' in tree:
            return tree['class']
        feature = tree['feature']
        threshold = tree['threshold']
        if x[feature] < threshold:
            return self._predict_tree(x, tree['left'])
        else:
            return self._predict_tree(x, tree['right'])

    @staticmethod
    def _node_gini(labels):
        """Gini impurity of one node; an empty node counts as 0.0."""
        if len(labels) == 0:
            return 0.0
        proportions = np.bincount(labels) / len(labels)
        return 1.0 - np.sum(proportions ** 2)

    def _gini_impurity(self, left_labels, right_labels):
        """Size-weighted Gini impurity of a candidate left/right split."""
        n_left = len(left_labels)
        n_right = len(right_labels)
        total = n_left + n_right
        if total == 0:
            return 0.0
        p_left = n_left / total
        p_right = n_right / total
        gini = p_left * self._node_gini(left_labels) + p_right * self._node_gini(right_labels)
        return gini

Implementing Random Forest

In [None]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to grow.
    max_depth : int or None, default=None
        Passed to every ``DecisionTree``.
    max_features : int or None, default=None
        Stored for interface compatibility; this implementation does not use
        it, so every tree sees all features.
    random_state : int or None, default=None
        Seed for bootstrap sampling. With ``None`` the global ``np.random``
        state is used.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features=None,
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on a bootstrap resample of ``(X, y)``.

        Calling ``fit`` again discards previously fitted trees.

        Raises
        ------
        ValueError
            If ``X`` contains no samples.
        """
        # Positional indexing below requires plain arrays, not pandas objects.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit a RandomForest on zero samples")

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Start from an empty ensemble so a refit does not keep stale trees.
        self.trees = []
        for _ in range(self.n_estimators):
            # Bootstrap: draw n_samples row indices with replacement.
            bootstrap_indices = rng.choice(n_samples, size=n_samples, replace=True)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X[bootstrap_indices], y[bootstrap_indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label (``argmax`` returns
        the first maximum). Labels must be non-negative integers.

        Raises
        ------
        ValueError
            If the forest has no fitted trees.
        """
        if not self.trees:
            raise ValueError("RandomForest has no fitted trees; call fit() first")
        X = np.asarray(X)
        # Shape (n_trees, n_samples): one row of predictions per tree.
        predictions = np.array([tree.predict(X) for tree in self.trees])
        # Vote column by column, i.e. once per sample.
        return np.array([np.bincount(sample_votes).argmax()
                         for sample_votes in predictions.T])


Data preprocessing

In [None]:
# Count missing values in each column of the breast-cancer frame.
df1.isna().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [None]:
# Print the positional index of each feature column.
for idx in range(len(X.columns)):
    column_name = X.columns[idx]
    print(f"Column '{column_name}' has index {idx}.")

Column 'mean_radius' has index 0.
Column 'mean_texture' has index 1.
Column 'mean_perimeter' has index 2.
Column 'mean_area' has index 3.
Column 'mean_smoothness' has index 4.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Library random forest on the scaled breast-cancer features.
# random_state is fixed, as in the other forests in this notebook, so the
# accuracy reported below is reproducible on a fresh kernel.
random_forest = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

random_forest.fit(X_train_scaled, y_train)

Checking accuracy

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, random_forest.predict(X_test_scaled))
print("Accuracy:", accuracy)

Accuracy: 0.956140350877193


Building the random forest using standard Libraries

Loading The Iris dataset

In [None]:
# Read the Iris CSV from the Colab working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# 'Id' is a row identifier, not a measurement, so it is dropped together with
# the target. In the preview rows it simply counts upward alongside the
# species, so keeping it would let the model key on row order.
X = df.drop(columns=['Id', 'Species'])
y = df['Species']

In [None]:
# Hold out 20% of the Iris rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seeded 100-tree forest, trained on the Iris training split.
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X=X_train, y=y_train)

In [None]:
# Predicted species for every held-out row.
y_pred = random_forest.predict(X=X_test)

In [None]:
# Share of held-out rows that were classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


Comparing the accuracy with and without feature selection for each dataset

1) For Breast Cancer

In [None]:
# Rebuild the breast-cancer features (all columns but the last) and target.
X = df1.iloc[:, 0:df1.shape[1] - 1]
y = df1.loc[:, 'diagnosis']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Initialize Random Forest classifier without feature selection

In [None]:
# Baseline: forest trained on every feature, scored on the held-out split.
rf_full = RandomForestClassifier(random_state=42, n_estimators=100)
rf_full.fit(X=X_train, y=y_train)
y_pred_full = rf_full.predict(X=X_test)
accuracy_full = accuracy_score(y_true=y_test, y_pred=y_pred_full)

Initialize Random Forest classifier with feature selection

In [None]:
from sklearn.feature_selection import SelectFromModel

# Select features using the baseline forest with an importance cut-off of 0.05.
# The selector is fitted on the training split only and then applied to both
# splits.
sfm = SelectFromModel(estimator=rf_full, threshold=0.05)
sfm.fit(X_train, y_train)
X_train_sfm = sfm.transform(X_train)
X_test_sfm = sfm.transform(X_test)

Comparing

In [None]:
# Forest trained on the selected features only, scored on the same held-out rows.
rf_sfm = RandomForestClassifier(random_state=42, n_estimators=100)
rf_sfm.fit(X=X_train_sfm, y=y_train)
y_pred_sfm = rf_sfm.predict(X=X_test_sfm)
accuracy_sfm = accuracy_score(y_true=y_test, y_pred=y_pred_sfm)

# Report both accuracies side by side.
print("Breast Cancer Dataset:")
print("Accuracy without feature selection:", accuracy_full)
print("Accuracy with feature selection:", accuracy_sfm)

Breast Cancer Dataset:
Accuracy without feature selection: 0.9473684210526315
Accuracy with feature selection: 0.9473684210526315


2) For Iris Dataset

In [None]:
# 'Id' is a row identifier, not a measurement, so it is dropped together with
# the target. Leaving it in would let both the baseline forest and the
# feature selector key on row order.
X = df.drop(columns=['Id', 'Species'])
y = df['Species']

In [None]:
# Hold out 20% of the Iris rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Initialize Random Forest classifier without feature selection

In [None]:
# Baseline: forest trained on every Iris feature, scored on the held-out split.
rf_full = RandomForestClassifier(random_state=42, n_estimators=100)
rf_full.fit(X=X_train, y=y_train)
y_pred_full = rf_full.predict(X=X_test)
accuracy_full = accuracy_score(y_true=y_test, y_pred=y_pred_full)

In [None]:
# Select features using the baseline forest with an importance cut-off of 0.05.
# The selector is fitted on the training split only and then applied to both
# splits.
sfm = SelectFromModel(estimator=rf_full, threshold=0.05)
sfm.fit(X_train, y_train)
X_train_sfm = sfm.transform(X_train)
X_test_sfm = sfm.transform(X_test)

In [None]:
# Forest trained on the selected features only, scored on the same held-out rows.
rf_sfm = RandomForestClassifier(random_state=42, n_estimators=100)
rf_sfm.fit(X=X_train_sfm, y=y_train)
y_pred_sfm = rf_sfm.predict(X=X_test_sfm)
accuracy_sfm = accuracy_score(y_true=y_test, y_pred=y_pred_sfm)

# Report both accuracies side by side.
print("Iris Dataset:")
print("Accuracy without feature selection:", accuracy_full)
print("Accuracy with feature selection:", accuracy_sfm)

Iris Dataset:
Accuracy without feature selection: 1.0
Accuracy with feature selection: 1.0


Conclusion: Thus we have implemented Random Forest from scratch as well as with standard libraries.