In [1]:
import numpy as np
import regex as re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
np.random.seed(4)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X.drop(['boat', 'body', 'home.dest'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

  warn(


In [2]:
#decision tree classifier with some default hyperparameters
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, criterion='gini', min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.criterion = criterion
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    #calculates the Gini impurity of a target labels
    
    def _gini_index (self, lab):
        _, cou_S = np.unique(lab, return_counts=True)
        impu = 1.0
        for count in cou_S:
            final = count / len(lab)
            impu -= final ** 2
        return impu
    
    #calculates the misclassification rate impurity measure class labels.
    
    def _misclassification_rate(self, ans):
        _, counts = np.unique(ans, return_counts=True)
        n0_of_samples = len(ans)
        if n0_of_samples == 0:
            return 0
        maximum = np.max(counts)
        RATE_of_MS = 1 - (maximum / n0_of_samples)
        return RATE_of_MS
    
    #calculates the entropy of a set of samples with respect to their labels

  
    def _entropy(self, y):
      _, counts = np.unique(y, return_counts=True)
      probs = counts / len(y)
      entropy = 0
      for p in probs:
          entropy -= p * np.log2(p) if p > 0 else 0
      return entropy

        
    def _grow_tree(self, X, y, depth=0):
        n_samples, updated_Fea = X.shape
        updated_labels = len(np.unique(y))
        
        #checks if a leaf node has been reached by checking if the maximum depth has been reached, if there is only one label, or if the number of samples is less than the minimum number of samples to split
        if (self.max_depth != None and depth >= self.max_depth) or updated_labels == 1 or n_samples < self.min_samples_split:
            return {'predicted_class': np.argmax(np.bincount(y))}
        
        impurity_func = {'gini': self._gini_index, 'misclassification_rate': self._misclassification_rate, 'entropy': self._entropy}.get(self.criterion.lower())

            
        #splitting the dataset at a given node based on the best feature and threshold value.
        fe__ind = np.arange(updated_Fea)
        if updated_Fea > 1:
            np.random.permutation(fe__ind)
            fe__ind = fe__ind[:np.random.randint(1, updated_Fea)]
        
        #shuffles the indices of the features and selects a random subset  
        impurity = np.inf
        for feature in fe__ind:
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                y_left = y[X[:, feature] < t]
                y_right = y[X[:, feature] >= t]
                if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf: #calculates the unique thresholds and iterates over them
                    continue
                impu_Curr = (len(y_left)/n_samples) * impurity_func(y_left) + (len(y_right)/n_samples) * impurity_func(y_right)
                impurity, ideal_Fea, ideal_threshold, leftside_ind, righttside_ind = (impu_Curr, feature, t, X[:, feature] < t, X[:, feature] >= t) if impu_Curr < impurity else (impurity, ideal_Fea, ideal_threshold, leftside_ind, righttside_ind)
       
        if impurity == np.inf: return {'predicted_class': np.bincount(y).argmax()}
        
        left = self._grow_tree(X[leftside_ind], y[leftside_ind], depth+1)
        right = self._grow_tree(X[righttside_ind], y[righttside_ind], depth+1)
        return {'feature': ideal_Fea, 'threshold': ideal_threshold, 'left': left, 'right': right}

    def fit(self, X, y): # sets the X and y attributes of the class to the input training data
        self.X = X
        self.y = y
        self.n_classes = len(np.nonzero(np.bincount(y))[0])
        self.tree = self._grow_tree(X, y, depth=0)
        
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])
    

    # traverse the decision tree and make a prediction for a single sample x
    def _traverse_tree(self, x, node):
        if 'predicted_class' in node: return node['predicted_class']
        return self._traverse_tree(x, node['left']) if x[node['feature']] < node['threshold'] else self._traverse_tree(x, node['right'])

    

In [3]:
#`y_train` and `y_test` arrays from float to integer using `.astype(int)` method. 
y_train = y_train.astype(int)
y_test= y_test.astype(int)
mergeDATA = [X_train, X_test]

In [4]:
#fulldata after conerting
mergeDATA

[      pclass                                       name     sex   age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male  48.0    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male  50.0    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female  62.0    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male  36.0    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   NaN    0.0   
 ...      ...                                        ...     ...   ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male  26.0    0.0   
 755      3.0                         Davies, Mr. Joseph    male  17.0    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   NaN    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   NaN    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male  17.0    0.0   
 
       parch     ticket   

In [5]:
#creating a new column called newcolumn_cabin in both X_train and X_test dataframes. The new column will contain 1 if the ‘cabin’ column contains a value other than NaN, otherwise it will contain 0.
X_train['newColumn_cabin'] = X_train["cabin"].apply(lambda x: 0 if type(x) == float else 1)
#lambda function checks if the value in the ‘cabin’ column is of type float or not. If it is, then it returns 1, otherwise it returns 0
X_test['newColumn_cabin'] = X_test["cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [6]:
mergeDATA

[      pclass                                       name     sex   age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male  48.0    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male  50.0    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female  62.0    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male  36.0    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   NaN    0.0   
 ...      ...                                        ...     ...   ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male  26.0    0.0   
 755      3.0                         Davies, Mr. Joseph    male  17.0    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   NaN    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   NaN    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male  17.0    0.0   
 
       parch     ticket   

In [7]:
#creating a new column called all_in_one_family.The new column will contain the sum of ‘sibsp’ and ‘parch’ columns plus 1 
for dataset in mergeDATA:
    dataset['all_in_one_family'] = dataset['sibsp'] + dataset['parch'] + 1

mergeDATA

[      pclass                                       name     sex   age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male  48.0    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male  50.0    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female  62.0    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male  36.0    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   NaN    0.0   
 ...      ...                                        ...     ...   ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male  26.0    0.0   
 755      3.0                         Davies, Mr. Joseph    male  17.0    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   NaN    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   NaN    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male  17.0    0.0   
 
       parch     ticket   

In [8]:
#filling missing values in the ‘fare’ column with the median value of ‘fare’ column.

for dataset in mergeDATA:
    dataset['fare'] = dataset['fare'].fillna(X_train['fare'].median())


In [9]:
mergeDATA

[      pclass                                       name     sex   age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male  48.0    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male  50.0    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female  62.0    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male  36.0    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   NaN    0.0   
 ...      ...                                        ...     ...   ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male  26.0    0.0   
 755      3.0                         Davies, Mr. Joseph    male  17.0    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   NaN    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   NaN    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male  17.0    0.0   
 
       parch     ticket   

In [10]:
#filling missing values in the ‘embarked’ column with ‘S’

for dataset in mergeDATA:
    dataset['embarked'] = dataset['embarked'].fillna('S')

In [11]:
mergeDATA

[      pclass                                       name     sex   age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male  48.0    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male  50.0    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female  62.0    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male  36.0    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   NaN    0.0   
 ...      ...                                        ...     ...   ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male  26.0    0.0   
 755      3.0                         Davies, Mr. Joseph    male  17.0    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   NaN    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   NaN    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male  17.0    0.0   
 
       parch     ticket   

In [12]:
#filling missing values in the AGE column.
for dataset in mergeDATA:
    #taking average of age column
    average_AGE = dataset['age'].mean()
    #taking std of age column
    standardD_AGE = dataset['age'].std()
    Countnull_AGE = dataset['age'].isnull().sum()
    #generates a list of random integers using NumPy's random.randint() function. 
    NULLlist_AGE = np.random.randint(average_AGE - standardD_AGE, average_AGE + standardD_AGE, size=Countnull_AGE)
    # Next line has been improved to avoid warning
    #loc[] function : select rows with null values in the age column and the astype() function : convert the age column to integers.
    dataset.loc[np.isnan(dataset['age']), 'age'] = NULLlist_AGE
    dataset['age'] = dataset['age'].astype(int)

In [13]:
mergeDATA

[      pclass                                       name     sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male   48    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male   50    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female   62    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male   36    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   17    0.0   
 ...      ...                                        ...     ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male   26    0.0   
 755      3.0                         Davies, Mr. Joseph    male   17    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   28    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   33    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male   17    0.0   
 
       parch     ticket      fare cabi

In [14]:
# defines  function called ‘get_title’ that extracts titles from passenger names.
#‘re.search’ method is used to search for a pattern in a string .

def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [15]:
mergeDATA

[      pclass                                       name     sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male   48    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male   50    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female   62    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male   36    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   17    0.0   
 ...      ...                                        ...     ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male   26    0.0   
 755      3.0                         Davies, Mr. Joseph    male   17    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   28    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   33    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male   17    0.0   
 
       parch     ticket      fare cabi

In [16]:
#creating a new column called 'Title'
for dataset in mergeDATA:
    dataset['Title'] = dataset['name'].apply(get_title)

In [17]:
mergeDATA

[      pclass                                       name     sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male   48    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male   50    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female   62    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male   36    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   17    0.0   
 ...      ...                                        ...     ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male   26    0.0   
 755      3.0                         Davies, Mr. Joseph    male   17    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   28    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   33    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male   17    0.0   
 
       parch     ticket      fare cabi

In [18]:
#replaces certain titles with ‘Rare’ and other titles with ‘Miss’ or ‘Mrs’ in the ‘Title’ column of each dataset in mergeDATA
for dataset in mergeDATA:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [19]:
mergeDATA

[      pclass                                       name     sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    male   48    0.0   
 119      1.0              Frauenthal, Dr. Henry William    male   50    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)  female   62    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    male   36    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    male   17    0.0   
 ...      ...                                        ...     ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    male   26    0.0   
 755      3.0                         Davies, Mr. Joseph    male   17    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    male   28    0.0   
 955      3.0                         Lefebre, Miss. Ida  female   33    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    male   17    0.0   
 
       parch     ticket      fare cabi

In [20]:
for dataset in mergeDATA:
    
    #maps the values in the sex column to integers, with female being mapped to 0 and male being mapped to 1. 
    #The column is then converted to integers using the astype() function.
    
    dataset['sex'] = dataset['sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    #"Mr" being mapped to 1, "Master" being mapped to 2, "Mrs" being mapped to 3, "Miss" being mapped to 4, and "Rare" being mapped to 5. If the Title column has any missing values, they are filled with 0
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    #embarked column to integers, with "S" being mapped to 0, "C" being mapped to 1, and "Q" being mapped to 2. The column is then converted to integers using the astype() function.

    dataset['embarked'] = dataset['embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    #fare column is less than or equal to 7.91, it is mapped to 0. If the value is greater than 7.91 and less than or equal to 14.454, it is mapped to 1. If the value is greater than 14.454 and less than or equal to 31, it is mapped to 2. If the value is greater than 31, it is mapped to 3
    dataset.loc[ dataset['fare'] <= 7.91, 'fare'] = 0
    dataset.loc[(dataset['fare'] > 7.91) & (dataset['fare'] <= 14.454), 'fare'] = 1
    dataset.loc[(dataset['fare'] > 14.454) & (dataset['fare'] <= 31), 'fare']   = 2
    dataset.loc[ dataset['fare'] > 31, 'fare'] 							        = 3
    dataset['fare'] = dataset['fare'].astype(int)
    
    # Mapping Age
    # age column is less than or equal to 16, it is mapped to 0. If the value is greater than 16 and less than or equal to 32, it is mapped to 1. If the value is greater than 32 and less than or equal to 48, it is mapped to 2. If the value is greater than 48 and less than or equal to 64, it is mapped to 3. If the value is greater than 64,it is left unchanged.


    dataset.loc[ dataset['age'] <= 16, 'age'] 					       = 0
    dataset.loc[(dataset['age'] > 16) & (dataset['age'] <= 32), 'age'] = 1
    dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
    dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
    dataset.loc[ dataset['age'] > 64, 'age'] ;

In [21]:
mergeDATA

[      pclass                                       name  sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    1    2    0.0   
 119      1.0              Frauenthal, Dr. Henry William    1    3    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)    0    3    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    1    2    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    1    1    0.0   
 ...      ...                                        ...  ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    1    1    0.0   
 755      3.0                         Davies, Mr. Joseph    1    1    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    1    1    0.0   
 955      3.0                         Lefebre, Miss. Ida    0    2    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    1    1    0.0   
 
       parch     ticket  fare cabin  embarked  newColumn_cabin  \
 39     

In [22]:
#droping the ‘name’, ‘ticket’, ‘cabin’, and ‘sibsp’ columns from the ‘X_train’ and ‘X_test’ datasets.
Trash_ele = ['name', 'ticket', 'cabin', 'sibsp']
X_train = X_train.drop(Trash_ele, axis = 1)
X_test  = X_test.drop(Trash_ele, axis = 1)

In [23]:
mergeDATA

[      pclass                                       name  sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    1    2    0.0   
 119      1.0              Frauenthal, Dr. Henry William    1    3    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)    0    3    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    1    2    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    1    1    0.0   
 ...      ...                                        ...  ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    1    1    0.0   
 755      3.0                         Davies, Mr. Joseph    1    1    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    1    1    0.0   
 955      3.0                         Lefebre, Miss. Ida    0    2    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    1    1    0.0   
 
       parch     ticket  fare cabin  embarked  newColumn_cabin  \
 39     

In [24]:
mergeDATA

[      pclass                                       name  sex  age  sibsp  \
 39       1.0                         Brandeis, Mr. Emil    1    2    0.0   
 119      1.0              Frauenthal, Dr. Henry William    1    3    2.0   
 284      1.0  Stone, Mrs. George Nelson (Martha Evelyn)    0    3    0.0   
 1298     3.0                  Wittevrongel, Mr. Camille    1    2    0.0   
 241      1.0                      Rood, Mr. Hugh Roscoe    1    1    0.0   
 ...      ...                                        ...  ...  ...    ...   
 613      3.0                Albimona, Mr. Nassef Cassem    1    1    0.0   
 755      3.0                         Davies, Mr. Joseph    1    1    2.0   
 1199     3.0                Shorney, Mr. Charles Joseph    1    1    0.0   
 955      3.0                         Lefebre, Miss. Ida    0    2    3.0   
 53       1.0                     Carrau, Mr. Jose Pedro    1    1    0.0   
 
       parch     ticket  fare cabin  embarked  newColumn_cabin  \
 39     

In [25]:
#trained the model on the training data X_train and y_train using the fit() method.
classifier = DecisionTreeClassifier(max_depth=3, criterion = 'gini', min_samples_split=10, min_samples_leaf= 2)
classifier.fit(X_train.values, y_train.values)

In [26]:
y_pred = classifier.predict(X_test.values)
y_true = y_test.values

In [27]:
#compute performance metrics for the decision tree classifier 

from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Accuracy is :', accuracy_score(y_true, y_pred))
print('Precision is :', precision_score(y_true, y_pred))
print('Recall is :', recall_score(y_true, y_pred))

Accuracy is : 0.7519083969465649
Precision is : 0.7058823529411765
Recall is : 0.6


In [28]:
import numpy as np
from collections import Counter

class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, n_feature=None):
        self.n_trees = n_trees
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.updated_Fea=n_feature
        self.trees = []
    
    #trains the model on the input data X and labels y
    def fit(self, X, y):
        self.trees = []
        #loop to create n_trees decision trees, each trained on a different random subset of the training data
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._sam_bootstrap(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
    
    #generate a bootstrap sample of the input data X and labels y
    def _sam_bootstrap(self, X, y):  #implements the process of bootstrap aggregating
        n_samples = len(X)
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]
    #used to reduce variance in the model by training individual trees on different subsets of the data
    
    
    #used to determine the majority label in a set of labels y
    def _most_common_lbl(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common
   
    #predicts labels for a new set of input data X using the trained decision trees
    def predict(self, X):
        PR = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(PR, 0, 1)
        PR = np.array([self._most_common_lbl(pred) for pred in tree_preds])
        return PR

In [29]:
rand_classifier = RandomForest()
rand_classifier.fit(X_train.values, y_train.values)

In [30]:
y_pred = rand_classifier.predict(X_test.values)
y_true = y_test.values

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Accuracy is :', accuracy_score(y_true, y_pred))
print('Precision is :', precision_score(y_true, y_pred))
print('Recall is :', recall_score(y_true, y_pred))

Accuracy is : 0.7748091603053435
Precision is : 0.7530864197530864
Recall is : 0.61


In [32]:
import numpy as np

class AdaBoost:
    def __init__(self, base_estimator, n_estimators=50, learning_rate=1, max_depth = None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.weights = []
        self.max_depth = max_depth
        
    def fit(self, X, y):
        
        SW = np.ones(len(X)) / len(X)
        
        for i in range(self.n_estimators):
            estimator = self.base_estimator(max_depth = self.max_depth)
            estimator.fit(X, y)
            y_pred = estimator.predict(X)
            mis_class = np.sum(np.where(y != y_pred, SW, 0))
            total = np.sum(SW)
            error = mis_class / total
            weight = np.log((1 - error) / error)
            weight *= self.learning_rate
            weight = np.exp(weight)
            self.weights.append(weight)
            self.estimators.append(estimator)
            misclassed = (y != y_pred)
            SW = SW * np.exp(weight * misclassed)
            SW = SW / np.sum(SW)

 
    def predict(self, X):
      y_pred = np.zeros(len(X))
      for i in range(len(self.estimators)):
          y_pred = y_pred + self.weights[i] * self.estimators[i].predict(X)
      return np.sign(y_pred)


In [33]:
clf_ada = AdaBoost(base_estimator= DecisionTreeClassifier)
clf_ada.fit(X_train.values, y_train.values)

In [34]:
y_pred = clf_ada.predict(X_test.values)

In [35]:
y_pred = y_pred.astype(int)

In [36]:
y_true = y_test.values

In [37]:
#calculates the accuracy of the AdaBoost classifier on the test data
accuracy = np.mean(y_true == y_pred)
print("Adaboost Accuracy ", accuracy)

Adaboost Accuracy  0.7290076335877863
