In [24]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

In [22]:
class myadaboost:

    def __init__(self,trees=4):
        self.trees_ = trees
        self.alpha_ = []
        self.clfs_ = []

    #creating a function for calculating upscaled sample from given df
    def sample_in_df(self, df):
        n = df.shape[0]
        random_vals = np.random.rand(n)  # n random numbers between 0 and 1
        indices = np.searchsorted(df['cum_sum'].values, random_vals)
        return indices.tolist()

    def fit(self,x_train,y_train):
        #making y +1 or -1
        y_train = y_train.iloc[:,0]
        y_train = y_train.apply(lambda x:1 if x==1 else -1)
        #creating temporary dfs to work
        df = x_train.copy()
        df_x = x_train.copy()
        df_y = y_train.copy()
        df['y'] = y_train.copy()
        #assigning weights to rows
        df['weights'] = 1/df.shape[0]
        
        for i in range(self.trees_):
            clf = DecisionTreeClassifier(max_depth=1)
            clf.fit(df_x,df_y)
            df['y_pred'] = clf.predict(df_x)
            incorrect = df['y'] != df['y_pred'] #rows where ypred != y
            error = df.loc[incorrect , 'weights'].sum() #using these rows to calculate error
            if error == 0:
                alpha = np.log(1e10) #high value of alpha for flawless model
            else:
                alpha = 0.5*np.log((1-error)/error)
                
            #appending models and alphas    
            self.alpha_.append(alpha)
            self.clfs_.append(clf)
            
            #updating weights based on classification    
            df['weights'] = df['weights']*np.exp(-alpha * df['y'] * df['y_pred'])
            df['weights'] = df['weights']/df['weights'].sum()
            #calculating cumulative sum for upscaled sampling
            df['cum_sum'] = df['weights'].cumsum()

            #creating new temporary df having upscaled samples
            indices = self.sample_in_df(df)
            df_x = df_x.iloc[indices].reset_index(drop=True)
            df_y = df_y.iloc[indices].reset_index(drop=True)
            df = df_x.copy()
            df['y'] = df_y.copy()
            df['weights'] = 1/df.shape[0]

    def predict(self,x_test):
        preds = []
        for clf,alpha in zip(self.clfs_,self.alpha_):
            pred = alpha*clf.predict(x_test)
            preds.append(pred)
        #calculating the value of h(x) = sum(alpha[i]*h[i](x)) (i goes from 0 to trees-1)
        final_pred = np.sum(preds,axis=0)
        #classifying based on sign (to original 0/1 format)
        final_pred = np.where(final_pred < 0 , 0, 1)
        #return predictions
        return final_pred

In [18]:
x,y = make_classification(n_samples = 500 , n_features=5 , random_state =42)

In [19]:
x = pd.DataFrame(x)
y = pd.DataFrame(y)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [20]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 249 to 102
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       400 non-null    int64
dtypes: int64(1)
memory usage: 6.2 KB


In [42]:
myada = myadaboost(trees=19)
myada.fit(x_train,y_train)
y_pred = myada.predict(x_test)
accuracy_score(y_test,y_pred)

0.93

In [51]:
m = AdaBoostClassifier(n_estimators=13,algorithm='SAMME')

In [55]:
m.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [49]:
y_pred_m = m.predict(x_test)

In [50]:
accuracy_score(y_test,y_pred_m)

0.94