In [54]:
import numpy
import numpy as np
import pandas as pd

In [55]:
data = pd.read_csv("Bike_Rentals.csv")
data.head(5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,02-01-2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,03-01-2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,04-01-2011,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,05-01-2011,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [56]:
from itertools import product
import numpy as np
class DecisionTreeRegressor():
   # called every time an object is created from a class
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
       
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split #specifies the minimum number of samples required to split an internal node
        self.max_depth = max_depth  #determines the maximum depth of the decision tree that will be constructed
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features in the dataset
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            #it will assighn the unique values in the dataset
            possible_thresholds = np.unique(feature_values)
            # loop over all the unique feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null 
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute varience reduction for target variable
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    # if curr_var_red is greater than max_var_red then it will update the best split as this value
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        # if feature value or index is less than or equal to the threshold then the value is assighned to left 
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        # if feature value or index is greater than  to the threshold then the value is assighned to right 
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        # after splitiing it will return the result
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction gfeature'''
        
        weight_l = len(l_child) / len(parent)# it will calculate the varience of the left chaild
        weight_r = len(r_child) / len(parent)# it will calculate the varience of the right chld
        #after calculating varience of left and right child then using this we are going to calculate the varience reduction using this formula 
        #taking sum of the right and left child and subtracting with the varience of parent 
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        #return the varience reduction
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        #it is used to find value of leaf node
        val = np.mean(Y)
        return val  

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        #it will separate the data into independent and dependet 
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        #this dictionory will store best split value
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
              #if the best split varience reduction is greater than 0 it will build left and right subtree and increases the size of the depth +1
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        #the feature value is less than or equal to the threshol value of the tree it will make prediction on left tree else make prediction on right tree
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
         # Traverse the decision tree to make a prediction for a single instance
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

    def r2_score(y_true, y_pred):
      ss_res = np.sum((y_true - y_pred) ** 2)
      ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
      r2 = 1 - ss_res / ss_tot
      return r2
       
    def mean_squared_error(self,y_true, y_pred):
   
      # Check if the lengths of both arrays are equal
      if len(y_true) != len(y_pred):
          raise ValueError("Length of y_true and y_pred should be the same.")
      
      # Calculate the squared differences between the true and predicted values
      squared_differences = [(y_true[i] - y_pred[i])**2 for i in range(len(y_true))]
      
      # Calculate the mean of the squared differences
      mse = sum(squared_differences) / len(squared_differences)
      
      return mse

  


    

In [57]:
import numpy as np

from decision_tree import DecisionTreeRegressor

class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
      
       
        self.trees = []

    def fit(self, X, y):
     
          for i in range(self.n_estimators):
            
                indices = np.random.choice(X.shape[0], size=X.shape[0], replace=True)
                X_subset = X[indices]
                y_subset = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth, 
                                              min_samples_split=self.min_samples_split
                                             
                                              )
                tree.fit(X_subset, y_subset)
                self.trees.append(tree)
    def predict(self, X):
        y_preds = np.zeros((X.shape[0], len(self.trees)))
        for i, tree in enumerate(self.trees):
            y_preds[:, i] = tree.predict(X)
        return np.mean(y_preds, axis=1)            
    def mean_squared_error(self,y_true, y_pred):
   
      # Check if the lengths of both arrays are equal
      if len(y_true) != len(y_pred):
          raise ValueError("Length of y_true and y_pred should be the same.")
      
      # Calculate the squared differences between the true and predicted values
      squared_differences = [(y_true[i] - y_pred[i])**2 for i in range(len(y_true))]
      
      # Calculate the mean of the squared differences
      mse = sum(squared_differences) / len(squared_differences)
      
      return mse
    
    
  


In [58]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)
#print(Y)

In [59]:
import random

def train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state:
        random.seed(random_state)
    
    n = len(X)
    test_data = set(random.sample(range(n), int(n * test_size)))
    train_data = set(range(n)) - test_data
    
    X_train = [X[i] for i in train_data]
    X_train=np.array(X_train)
    X_test = [X[i] for i in test_data]
    X_test=np.array(X_test)
    y_train = [y[i] for i in train_data]
    y_train=np.array(y_train)
    y_test = [y[i] for i in test_data]
    y_test=np.array(y_test)
    return X_train, X_test, y_train, y_test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [60]:
import numpy as np
regressor = RandomForestRegressor(n_estimators=10, max_depth=4, min_samples_split=2)
regressor.fit(X_train,Y_train)


In [61]:
y_pred = regressor.predict(X_test)

In [62]:
y_pred

array([1384.38664851, 7544.52033217, 1240.31357309, 1781.5552793 ,
       7415.04216533, 1064.70943888,  506.8109127 , 1240.92958208,
       6847.81481291, 1740.73501787, 1435.59661354, 1712.37073215,
       5530.53562476, 6251.44794131, 1712.37073215, 6766.95247525,
       7054.00460995, 6661.29914191, 5610.87038137, 1942.66056548,
       7458.8561655 , 1426.74444904, 2234.45894673, 1938.24502751,
       2018.08442778, 6995.29964408, 2108.0575333 , 6860.35331558,
       6860.35331558, 2124.51871385, 1970.62647457, 2400.85369364,
       6860.35331558, 2262.63356212, 7552.58227879, 2988.8980326 ,
       2066.64896187, 7461.39525134, 7461.39525134, 4024.21186011,
       7734.77632507, 1863.98495358, 7992.15474126, 4458.96307102,
       7544.83127655, 4361.97803881, 7221.7626491 , 4256.01448863,
       4629.41001582, 4382.83967851, 4145.38682327, 4575.86966824,
       4458.96307102, 4751.47465617, 4218.35326335, 7427.58066801,
       5317.79257292, 4488.8032287 , 5539.7753357 , 4412.75883

In [63]:
np.sqrt(regressor.mean_squared_error(Y_test,y_pred))

array([214.92449689])

In [64]:
# Define the random forest regression function
def random_forest_reg(X_train, y_train, X_test, n_estimators, max_depth,min_samples_split):
  # Fit the random forest regression model
  tree_reg = RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split)
  tree_reg.fit(X_train, Y_train)
  
  # Make predictions on the test data
  y_pred = tree_reg.predict(X_test)
  
  # Calculate the mean squared error
  mse = mean_squared_error(Y_test, y_pred)
  
  return mse

In [68]:
from sklearn.metrics import mean_squared_error

# Define the hyperparameter tuning function using randomized cross-validation
def random_search_cv(X_train, Y_train, X_test, param_dist, num_iter):
      # Define the best parameters and score
      best_params = {}
      best_score = np.inf

      # Perform the specified number of iterations
      for i in range(num_iter):
          # Select a random set of parameters
          params = {k: v[np.random.randint(len(v))] for k, v in param_dist.items()}

          # Evaluate the model with the selected parameters
          score = random_forest_reg(X_train, Y_train, X_test, **params)

          # Check if the model is the best so far
          if score < best_score:
              best_score = score
              best_params = params

      return best_params

# Define the hyperparameters and their possible values
param_dist = {"n_estimators": [10,20,40],
              "max_depth": [1, 2, 3, 4, 5,10,20],
              "min_samples_split": [2,4,6,8,10]
              }


In [None]:
# Perform the hyperparameter tuning with randomized cross-validation
best_params = random_search_cv(X_train, Y_train, X_test, param_dist, 5)

# Print the best parameters
print("Best parameters:", best_params)

# Fit the random forest regression model with the best parameters
tree_reg = RandomForestRegressor(**best_params)
tree_reg.fit(X_train, Y_train)

# Make predictions on the test data
y_pred = tree_reg.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(Y_test, y_pred)

print("Mean squared error:", mse)