In [None]:
#This file contains the implementations from scratch of 
#numerous machine learning algorithms and auxiliary functions.


In [None]:
import copy
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, mean_squared_error, precision_score, recall_score, roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Function to plot decision boundary

In [None]:
def plot_decision_boundary(model, X, yp,feature1,feature2, plot_step=0.02, cmap='viridis', alpha=0.8):
  """Draw the decision regions of `model` over the first two columns of X.

  model     : object exposing `predict` on an (n, 2) array
  X         : pandas DataFrame; only columns 0 and 1 (by position) are used
  yp        : values used to colour the scattered samples
  feature1  : label for the x axis
  feature2  : label for the y axis
  plot_step : step size of the mesh grid the model is evaluated on
  cmap      : colour map shared by the regions and the samples
  alpha     : opacity of the filled decision regions

  NOTE(review): `plt` is used here but no import of it is visible in this
  file - presumably matplotlib.pyplot; confirm it is imported before calling.
  """
  first = X.iloc[:, 0]
  second = X.iloc[:, 1]
  margin = 0.5
  # Grid covering the data range, padded by `margin` on every side.
  grid_x = np.arange(first.min() - margin, first.max() + margin, plot_step)
  grid_y = np.arange(second.min() - margin, second.max() + margin, plot_step)
  xx, yy = np.meshgrid(grid_x, grid_y)
  grid_points = np.column_stack((xx.ravel(), yy.ravel()))
  regions = model.predict(grid_points).reshape(xx.shape)
  plt.contourf(xx, yy, regions, cmap=cmap, alpha=alpha)
  plt.scatter(first, second, c=yp, cmap=cmap, edgecolors='k',alpha=0.3)
  plt.xlabel(feature1)
  plt.ylabel(feature2)
  plt.title('Decision Boundary')

MSE calculation with repeated 5-fold cross-validation (model = DecisionTreeRegressor)

In [None]:
def repeated_5_fold_validation_mse(X,Y,n_repeats,n_splits):
  """Mean squared error of a depth-10 regression tree under repeated k-fold CV.

  X         : feature table (pandas object or array-like), one row per sample
  Y         : targets aligned with X
  n_repeats : how many times the whole k-fold procedure is repeated (>= 1)
  n_splits  : number of folds per repetition

  Each repetition shuffles the data into `n_splits` disjoint folds, so every
  sample is used for testing exactly once per repetition. The previous
  implementation drew `n_splits` independent random hold-outs instead, which
  lets test sets overlap and leaves some samples untested.

  Returns the average over repetitions of the per-repetition mean fold MSE.
  Raises ValueError when n_repeats < 1.
  """
  if n_repeats < 1:
    raise ValueError("n_repeats must be at least 1")

  def take(data, idx):
    # Positional row selection for both pandas objects and plain arrays.
    return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

  repeat_means = []
  for _ in range(n_repeats):
    folds = KFold(n_splits=n_splits, shuffle=True)
    fold_mse = []
    for train_idx, test_idx in folds.split(X):
      model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1)
      model.fit(take(X, train_idx), take(Y, train_idx))
      predicted = model.predict(take(X, test_idx))
      fold_mse.append(mean_squared_error(take(Y, test_idx), predicted))
    repeat_means.append(sum(fold_mse) / len(fold_mse))
  return(sum(repeat_means) / len(repeat_means))

# DecisionTreeClassifier from Scratch 

Using Gini Index and ID3 algorithm

In [None]:
def gini_index(coll):
  """Gini impurity 1 - sum(p_i^2) of the values in `coll`.

  coll : iterable of hashable values (e.g. a column or a list of labels)

  Returns 0.0 for an empty collection: an empty node holds no mixed labels,
  whereas the previous code reported the maximum impurity 1 for it.
  """
  values = list(coll)
  total = len(values)
  if total == 0:
    return 0.0
  # Counter keeps first-seen order, so the terms are summed in the same
  # order as the original hand-rolled frequency list.
  frequencies = Counter(values)
  return 1 - sum((count / total) ** 2 for count in frequencies.values())

def min_gini_indices(X):
  """Return the name of the column of X whose values have the lowest Gini index.

  X : pandas DataFrame of candidate features

  Ties go to the column that appears first in X. The debug print of the
  sorted column list was removed because this function is called once per
  tree node and flooded the output.

  Raises ValueError when X has no columns.
  """
  names = list(X.columns)
  if not names:
    raise ValueError("X has no columns to choose from")
  # min() returns the first minimal element, matching the previous stable sort.
  return min(names, key=lambda name: gini_index(X[name]))

  

In [None]:
#This function returns the threshold value used to binarise a continuous feature:
#samples whose feature value is < threshold go to the left leaf,
#all other samples go to the right leaf
def cont_to_cat(feature,target):
  """Pick the split threshold of a continuous feature with the lowest weighted Gini.

  feature : iterable of numeric feature values
  target  : iterable of class labels, aligned element-by-element with `feature`

  Candidate thresholds are the midpoints between consecutive distinct feature
  values. For each candidate, samples with value < threshold form the left
  leaf and the rest form the right leaf; the candidate with the smallest
  size-weighted Gini impurity wins (first one on ties).

  The previous version sorted the feature values on their own and then paired
  the sorted value j with the unsorted label j, so the leaves were built from
  mismatched (value, label) pairs. The pairing is now kept intact.

  Returns the chosen threshold; a feature with a single distinct value has no
  possible split and that value is returned.
  Raises ValueError on empty input or when the two inputs differ in length.
  """
  values = list(feature)
  labels = list(target)
  if len(values) != len(labels):
    raise ValueError("feature and target must have the same length")
  if not values:
    raise ValueError("cannot choose a threshold for an empty feature")

  distinct = sorted(set(values))
  if len(distinct) == 1:
    return distinct[0]
  candidates = [(low + high) / 2 for low, high in zip(distinct, distinct[1:])]

  total = len(labels)
  best_threshold = None
  best_impurity = None
  for threshold in candidates:
    left_leaf = [label for value, label in zip(values, labels) if value < threshold]
    right_leaf = [label for value, label in zip(values, labels) if not value < threshold]
    impurity = ((gini_index(left_leaf) * len(left_leaf)) +
                (gini_index(right_leaf) * len(right_leaf))) / total
    if best_impurity is None or impurity < best_impurity:
      best_impurity = impurity
      best_threshold = threshold

  return(best_threshold)

In [None]:
#Returns 2 X-dataframes which will  be the left and right respective leaves
def binning(X1,column,threshold):
  """Split X1 into a left and a right frame on `column` < `threshold`.

  X1        : pandas DataFrame (features, optionally followed by the target)
  column    : name of the column to split on
  threshold : rows with X1[column] < threshold go left, all others go right

  Returns (left, right). Both frames have `column` removed and a fresh
  0..n-1 index; X1 itself is not modified.

  The previous implementation located the split by sorting on the binarised
  column and looking for the first change of value. When every row fell on
  the same side it returned all rows as the right frame, even if they were
  all below the threshold, and it required X1 to have a 0..n-1 index.
  """
  # Plain boolean array so the selection is positional and index-agnostic.
  goes_left = (X1[column] < threshold).to_numpy()
  remaining = X1.drop(column, axis=1)
  df1 = remaining.loc[goes_left].reset_index(drop=True)
  df2 = remaining.loc[~goes_left].reset_index(drop=True)

  return(df1,df2)

In [None]:
def mode_dataframe(U):
  """Return the most frequent value of the column U.

  U : iterable of values (a column)

  When several values share the highest count, the one that occurs first in
  U is returned. An empty U makes max() raise ValueError.
  """
  values = list(U)
  # Distinct values in order of first appearance.
  distinct = []
  for value in values:
    if value not in distinct:
      distinct.append(value)
  counts = [values.count(value) for value in distinct]
  return distinct[counts.index(max(counts))]

In [None]:
def DecisionTree(XX,yy,max_depth,dtree,depth=0,min_features=3,thresholds=None):
  """Recursively grow a binary decision tree into the nested dict `dtree`.

  XX           : pandas DataFrame of features for the samples at this node
  yy           : pandas Series of class labels aligned with XX
  max_depth    : nodes deeper than this become leaves
  dtree        : dict filled in place; pass {} for the root
  depth        : depth of the current node (0 for the root)
  min_features : a node whose feature count equals this value becomes a leaf
                 (previously a hard-coded 3 - presumably dataset specific)
  thresholds   : mapping column name -> split threshold; when None the
                 module-level `threshold_values` is used (it is not defined in
                 the visible part of this file)

  Tree layout: an internal node is {column: {0: left_subtree, 1: right_subtree}}
  and a leaf is {'class': label}, where 0 means "value < threshold".

  Changes from the previous version:
    * every stopping path now stores a majority 'class' (the "all features
      constant" and "max depth" paths used to return a node without one);
    * a split that sends every sample to one side makes the node a leaf
      instead of recursing on an empty frame;
    * XX and yy are no longer re-indexed in place;
    * the tree is returned on every path.

  Returns `dtree`. Raises ValueError when called with no samples.
  """
  Xg = XX.reset_index(drop=True)
  yg = yy.reset_index(drop=True)
  if len(yg) == 0:
    raise ValueError("DecisionTree needs at least one training sample")
  if thresholds is None:
    thresholds = threshold_values

  def make_leaf():
    dtree['class'] = mode_dataframe(yg)
    return(dtree)

  if(len(Xg.columns) == min_features):
    return make_leaf()

  # No feature varies any more, so no further split can change anything.
  if all(gini_index(Xg[name]) == 0 for name in Xg.columns):
    return make_leaf()

  if(depth > max_depth):
    return make_leaf()

  # Features and target travel together so binning keeps the rows aligned.
  data = pd.concat([Xg, yg], axis=1)
  column = min_gini_indices(Xg)
  df1, df2 = binning(data, column, thresholds[column])
  if len(df1) == 0 or len(df2) == 0:
    # Degenerate split: all samples on one side.
    return make_leaf()

  dtree[column] = {0: dict(), 1: dict()}
  DecisionTree(df1.iloc[:, :-1], df1.iloc[:, -1], max_depth,
               dtree[column][0], depth+1, min_features, thresholds)
  DecisionTree(df2.iloc[:, :-1], df2.iloc[:, -1], max_depth,
               dtree[column][1], depth+1, min_features, thresholds)
  return(dtree)

In [None]:
#The test function takes a row in the series:
#And outputs the predicted class
def classification_dtree(tdata):
  """Predict the class of one sample with the module-level tree `dptree`.

  tdata : one row of feature values, ordered like the keys of the
          module-level `threshold_values` mapping

  Each value is first binarised against its column threshold (0 when the
  value is < threshold, 1 otherwise). The tree is then walked from the root:
  a node keyed by a column name (str) is entered, a node keyed by 0/1 is
  followed along the branch matching the sample's binarised value for the
  most recently seen column.

  Returns the 'class' stored at the leaf that is reached, or -1 when the
  walk ends on a node without one (an empty node or an empty tree used to
  raise IndexError).
  """
  row = list(tdata)
  binned = {}
  for position, name in enumerate(threshold_values.keys()):
    binned[name] = 0 if row[position] < threshold_values[name] else 1

  node = dptree
  latest_column = next(iter(dptree), None)
  while node:
    if 'class' in node:
      return(node['class'])
    first_key = next(iter(node))
    if isinstance(first_key, str):
      # Column node: remember the column and step into its {0: ..., 1: ...} dict.
      latest_column = first_key
      node = node[latest_column]
    else:
      # Branch node: follow the side this sample falls on.
      node = node[binned[latest_column]]

  return(-1)

# Gaussian Naive Bayes Classifier from scratch

In [None]:
class GaussianClassifer:
  """Gaussian Bayes classifier with one full covariance matrix per class.

  Usage: call train(X_train, y_train), then predict(row) for a single sample
  or test(X_test, y_test) for a labelled set.

  Attributes set by train():
    prior      - list of class priors, ordered like `classes`
    means      - list of per-class mean vectors (numpy arrays)
    covv       - list of per-class sample covariance matrices
    likelihood - per class, the class-conditional density of each of that
                 class's training rows
    class_data - dict class -> DataFrame of that class's rows (features + label)
    classes    - class labels in order of first appearance
  """

  def __cov(self,X_train):
    """Sample covariance matrix (divisor n-1) of the columns of X_train.

    Unlike the previous version this does not overwrite self.X_train, which
    used to end up holding only the rows of the last class after train().
    """
    values = np.asarray(X_train, dtype=float)
    n_rows = values.shape[0]
    if n_rows < 2:
      raise ValueError("at least two samples are needed to estimate a covariance matrix")
    centered = values - values.mean(axis=0)
    return(centered.T.dot(centered) / (n_rows - 1))

  def __density(self,x,mean,cov):
    """Multivariate normal density of the vector x for the given mean and covariance."""
    d = len(mean)  # dimensionality
    x_u = np.asarray(x, dtype=float) - mean
    exponent = -0.5 * (x_u.dot(np.linalg.inv(cov)).dot(x_u))
    # abs() guards against a slightly negative determinant from round-off.
    denom = ((2 * np.pi) ** (d / 2)) * (abs(np.linalg.det(cov)) ** 0.5)
    # numpy is used for pi/exp because the file never imports `math`.
    return(float(np.exp(exponent)) / denom)

  def train(self,X_train,y_train):
    """Estimate priors, means and covariance matrices from labelled data.

    X_train : pandas DataFrame of numeric features
    y_train : pandas Series of class labels aligned with X_train
    """
    self.X_train = X_train
    self.y_train = y_train
    data = pd.concat([X_train, y_train], axis=1)
    label_column = data.columns[-1]
    classes = data[label_column].unique()
    # class -> rows of that class (features followed by the label column)
    class_data = {c: data[data[label_column] == c] for c in classes}
    prior = [len(class_data[c]) / len(y_train) for c in classes]

    means = []
    covv = []
    likelihood = []
    for c in class_data:
      features = class_data[c].iloc[:, :-1].reset_index(drop=True)
      rows = features.to_numpy(dtype=float)
      mean = rows.mean(axis=0)
      cov = self.__cov(features)
      means.append(mean)
      covv.append(cov)
      likelihood.append([self.__density(row, mean, cov) for row in rows])

    self.prior = prior
    self.likelihood = likelihood
    self.class_data = class_data
    self.classes = classes
    self.means = means
    self.covv = covv

  def predict(self,Xt):
    """Return the class with the largest prior * density for one sample.

    Xt : sequence of feature values for a single row
    """
    self.Xt = Xt
    x = np.asarray(Xt, dtype=float)
    posterior = [self.__density(x, mean, cov) * p
                 for mean, cov, p in zip(self.means, self.covv, self.prior)]
    # One (unnormalised) posterior per class; ties go to the first class.
    ind = posterior.index(max(posterior))
    return(self.classes[ind])

  def test(self,X_test,y_test):
    """Classify every row of X_test and score against y_test.

    Returns [accuracy_in_percent, predicted_classes].
    """
    data = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    rows = data.iloc[:, :-1].to_numpy(dtype=float)
    actual = list(data.iloc[:, -1])
    predicted = [self.predict(row) for row in rows]
    correct = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    accuracy = (correct / len(y_test)) * 100
    return([accuracy, predicted])

# Bagging from scratch

In [None]:
class Baggy:
  """Bagging ensemble: n_estimators copies of a base model, majority vote.

  base_estimator : unfitted model exposing fit(X, y) and predict(X); it is
                   used as a template and is never fitted itself
  n_estimators   : number of bootstrap-trained copies in the ensemble
  """

  def __init__(self,base_estimator,n_estimators=10):
    self.base_estimator=base_estimator
    self.n_estimators=n_estimators

  def fit(self,Xt,yt):
    """Train each ensemble member on its own bootstrap sample of (Xt, yt).

    Xt : pandas DataFrame of features
    yt : pandas Series of labels aligned with Xt
    """
    self.bag=[]
    dft=pd.concat([Xt,yt],axis=1)
    for _ in range(self.n_estimators):
      # Bootstrap: same size as the data, drawn with replacement.
      sample_df=dft.sample(n=len(yt),replace=True)
      # A fresh copy per member. Previously the same object was refitted and
      # appended each time, so the bag held n references to one model.
      model=copy.deepcopy(self.base_estimator)
      model.fit(sample_df.iloc[:,:-1],sample_df.iloc[:,-1])
      self.bag.append(model)

  def predict(self,Xtst):
    """Return, for every row of Xtst, the label most members voted for.

    Ties go to the label predicted by the earliest member.
    Raises ValueError when the ensemble holds no fitted model.
    """
    if not getattr(self,'bag',None):
      raise ValueError("Baggy has no fitted estimators; call fit() first")
    member_predictions=[member.predict(Xtst) for member in self.bag]
    y_pred_final=[]
    for row in range(len(member_predictions[0])):
      votes=[prediction[row] for prediction in member_predictions]
      y_pred_final.append(Counter(votes).most_common(1)[0][0])
    return(y_pred_final)


# KMeans Clustering from scratch

In [None]:
class Kmeansss:
  """K-means clustering (Lloyd's algorithm) with random initial centroids.

  n_clusters : number of clusters k
  max_iter   : maximum number of centroid updates

  After fit(): `labels_` holds the cluster index of every sample (stored as
  floats, as before) and `cluster_centers_` the (k, n_features) centroids.
  """
  def __init__(self,n_clusters=40,max_iter=100):
    self.n_clusters=n_clusters#Cluster value k
    self.max_iter=max_iter# Max iterations

  def __assign(self,points,centroids):
    """Index of the nearest centroid (Euclidean) for every point; ties -> lowest index."""
    distances=np.linalg.norm(points[:,None,:]-centroids[None,:,:],axis=2)
    return(distances.argmin(axis=1).astype(float))

  def fit(self,X):
    """Cluster the rows of X (DataFrame, array or nested list of numbers).

    Changes from the previous version: the centroid update now uses the
    numeric array instead of the raw input, so list input works; initial
    centroids are distinct rows whenever there are enough samples; a cluster
    that loses all members keeps its centroid instead of becoming NaN; and
    max_iter=0 no longer fails.

    Raises ValueError when X has no rows.
    """
    Xp=np.asarray(X,dtype=float)
    n_samples,n_features=Xp.shape
    if n_samples==0:
      raise ValueError("cannot cluster an empty dataset")

    # Sample without replacement unless there are fewer rows than clusters.
    seeds=np.random.choice(n_samples,size=self.n_clusters,
                           replace=n_samples<self.n_clusters)
    centroids=Xp[seeds].copy()
    labels=self.__assign(Xp,centroids)

    for _ in range(self.max_iter):
      updated=centroids.copy()
      for j in range(self.n_clusters):
        members=Xp[labels==j]
        if len(members):
          updated[j]=members.mean(axis=0)
      converged=np.array_equal(updated,centroids)#Condition for convergence
      centroids=updated
      if converged:
        break
      labels=self.__assign(Xp,centroids)

    self.labels_=labels#Cluster labels
    self.cluster_centers_=centroids#Cluster centers stored



# PCA(Principal Component Analysis) from scratch

In [None]:
#PCA implementation:-

#Reduced no. of features is given by reduced_features argument

class PCA:
  """Principal component analysis via the covariance matrix of `data`.

  data             : pandas DataFrame (or 2-D array-like) of numeric features
  reduced_features : number of principal components to keep

  After transformed_data(): `eigen_values_` and `eigen_vectors_` hold the
  eigen-decomposition of the covariance matrix as returned by the solver
  (column i of eigen_vectors_ belongs to eigen_values_[i]).
  """
  def __init__(self,data,reduced_features):
    self.data=data
    self.reduced_features=reduced_features

  def __covv(self,X,Y):
    """Sample covariance (divisor n-1) of two equally long sequences."""
    x=np.asarray(X,dtype=float)
    y=np.asarray(Y,dtype=float)
    if len(x)<2:
      raise ValueError("at least two samples are needed to compute a covariance")
    return(float(np.dot(x-x.mean(),y-y.mean())/(len(x)-1)))

  def __cov_mat(self,data1):
    """Sample covariance matrix (divisor n-1) of the columns of data1."""
    values=np.asarray(data1,dtype=float)
    n_rows=values.shape[0]
    if n_rows<2:
      raise ValueError("at least two samples are needed to compute a covariance matrix")
    centered=values-values.mean(axis=0)
    return(centered.T.dot(centered)/(n_rows-1))

  def transformed_data(self,center=False):
    """Project the data onto the top `reduced_features` principal directions.

    center : when True the column means are subtracted before projecting
             (the usual PCA convention); the default False keeps the previous
             behaviour of projecting the raw values.

    Returns a DataFrame with one column per kept component, ordered by
    decreasing eigenvalue.
    """
    cov_mat=self.__cov_mat(self.data)

    # The covariance matrix is symmetric, so eigh applies and always yields
    # real eigenvalues/eigenvectors (eig may return complex dtypes).
    eig_vals,eig_vecs=np.linalg.eigh(cov_mat)

    sorted_index=np.argsort(eig_vals)[::-1]
    selected_eig_vec=eig_vecs[:,sorted_index][:,:self.reduced_features]

    np_data=np.asarray(self.data,dtype=float)
    if center:
      np_data=np_data-np_data.mean(axis=0)
    transformed_df=pd.DataFrame(np.dot(np_data,selected_eig_vec))

    self.eigen_values_=eig_vals
    self.eigen_vectors_=eig_vecs
    return(transformed_df)

  def get_cov(self,data1):
    """Public access to the covariance matrix of data1."""
    return(self.__cov_mat(data1))





# LDA (Linear Discriminant Analysis) from scratch

In [None]:
class LDA:
  """Linear discriminant analysis projection plus ROC helpers.

  dataset  : stored as `self.data`; not used by the methods visible here
  variance : variance the automatically chosen projection must reach; when
             None the variance of the full projection is used as the target
  """
  def __init__(self,dataset=None,variance=None):
    self.data=dataset
    self.variance=variance

  def __matrices(self,Xl,yl):
    """Return (within-class scatter, between-class scatter) of the data."""
    X=np.array(Xl)
    y=np.array(yl)
    rows,cols=X.shape
    uniq_classes=np.unique(y)
    # Total scatter = covariance * (n-1); between = total - within.
    total_scatter=np.cov(X.T)*(rows-1)

    scatter_intra=0
    for label in uniq_classes:
      class_items=np.flatnonzero(y==label)
      scatter_intra=scatter_intra+np.cov(X[class_items].T)*(len(class_items)-1)

    scatter_inter=total_scatter-scatter_intra

    return(scatter_intra,scatter_inter)

  def __abss(self,x):
    return(abs(x))

  def transform(self,Xp,yp,linear_discriminants):
    """Project Xp onto its leading linear discriminants.

    Xp                   : feature table (DataFrame or 2-D array-like)
    yp                   : class labels aligned with Xp
    linear_discriminants : number of discriminants to keep, or -1 to pick
                           the smallest number whose projection variance
                           reaches the target (see `variance`)

    Returns a DataFrame of the element-wise absolute values of the
    projection (as before) and stores it in `self.df`; `self.y` keeps yp.

    Fixes: the automatic search now runs over the number of features (it ran
    over the number of rows), and when the target variance is never reached
    all discriminants are kept - previously n_components stayed -1, which
    silently dropped the last discriminant.
    """
    scatter_intra,scatter_inter=self.__matrices(Xp,yp)
    self.Sw_=scatter_intra
    self.Sb_=scatter_inter
    inv_scat_intra=np.linalg.pinv(scatter_intra)
    eig_vals,eig_vectors=np.linalg.eig(np.dot(inv_scat_intra,scatter_inter))
    X=np.array(Xp)
    n_features=X.shape[1]
    sorted_index=np.argsort(eig_vals)[::-1]
    sorted_eig_vecs=eig_vectors[:,sorted_index]
    self.sorted_eig_vecs_=sorted_eig_vecs
    xpx=np.dot(X,sorted_eig_vecs)
    self.default_var_conserved_=xpx.var()
    if(self.variance is None):
      total_var=xpx.var()#Amount of variance to be preserved
    else:
      total_var=self.variance

    n_components=n_features
    for i in range(1,n_features+1):
      if(np.dot(X,sorted_eig_vecs[:,:i]).var()>=total_var):
        n_components=i
        break

    #Auto-selection done
    if(linear_discriminants!=-1):
      n_components=linear_discriminants
    selected_eig_vecs=sorted_eig_vecs[:,:n_components]
    # np.abs is the vectorised form of the former element-wise applymap(abs).
    tdf=pd.DataFrame(np.abs(np.dot(X,selected_eig_vecs)))
    self.df=tdf
    self.y=yp
    return(tdf)

  def _roc_multiclass(self,model,Xtest,ytest):
    """One-vs-rest and micro-averaged ROC curves of a fitted classifier.

    Assumes the class labels are the integers 0..n_classes-1 (they index the
    columns of predict_proba and the rows of the identity matrix) and that
    `self.y` is a pandas Series - TODO confirm against callers.

    Returns (fpr, tpr, roc_auc) dicts keyed by class index and "micro".
    """
    n_classes=len(list(self.y.unique()))
    y_true=np.asarray(ytest).ravel()
    y_prob=model.predict_proba(Xtest)
    fpr={}
    tpr={}
    roc_auc={}
    for i in range(n_classes):
      # y_true==i is the boolean one-vs-rest mask for class i.
      fpr[i],tpr[i],_=roc_curve(y_true==i,y_prob[:,i])
      roc_auc[i]=auc(fpr[i],tpr[i])

    # Micro average: flatten the one-hot labels so they line up with the
    # flattened probabilities (roc_curve expects 1-D binary labels).
    one_hot=np.eye(n_classes)[y_true]
    fpr["micro"],tpr["micro"],_=roc_curve(one_hot.ravel(),y_prob.ravel())
    roc_auc["micro"]=auc(fpr["micro"],tpr["micro"])

    return(fpr,tpr,roc_auc)

  def lda_classifier_roc_auc_5_fold(self,K_fold=5):
    """Fit GaussianNB on K_fold random train/test splits and print the ROC data.

    Uses the projection stored by transform(). Each round is an independent
    train_test_split, not a partition into folds. Prints the false-positive
    rates and AUC values of every round and also returns them as a list of
    (fpr, tpr, roc_auc) tuples.
    """
    results=[]
    for _ in range(K_fold):
      X_tr,X_te,y_tr,y_te=train_test_split(self.df,self.y)
      model=GaussianNB()
      model.fit(X_tr,y_tr)
      fpr1,tpr1,roc_auc1=self._roc_multiclass(model,X_te,y_te)
      print(fpr1)
      print(roc_auc1)
      results.append((fpr1,tpr1,roc_auc1))
    return(results)


# Bi-Directional Feature Selection

In [None]:


def bi_directional_feature_selection(X, y, n_features):
    """Select features in two passes driven by a decision tree.

    X          : pandas DataFrame of candidate features
    y          : target labels
    n_features : upper bound on the number of features to return

    Pass 1 keeps the columns chosen by RFECV (recursive feature elimination
    with 5-fold stratified cross-validation, scored on accuracy). Note that
    this is an elimination procedure, not forward selection.
    Pass 2 repeatedly fits the tree on the surviving columns and drops the one
    with the smallest feature importance until at most `n_features` remain.

    Returns the DataFrame restricted to the surviving columns; if pass 1
    already leaves `n_features` or fewer, that result is returned as is.
    """
    tree = DecisionTreeClassifier()

    selector = RFECV(estimator=tree, step=1, cv=StratifiedKFold(5),
                     scoring='accuracy', n_jobs=-1, verbose=0)
    selector.fit(X, y)

    kept_names = np.array(X.columns)[selector.support_]
    kept_X = X[kept_names]

    remaining = kept_names.tolist()
    reduced_X = kept_X.copy()
    while len(remaining) > n_features:
        tree.fit(reduced_X, y)
        # Importances are ordered like the columns of reduced_X / `remaining`.
        weakest = remaining[np.argmin(tree.feature_importances_)]
        remaining.remove(weakest)
        reduced_X = kept_X[remaining]

    return reduced_X


# Neural Network from scratch

**Artificial Neural Network (ANN) using MLP (Multi Layer Perceptron)**

In [None]:
class MLP_ANN:
  """Fully connected feed-forward network trained by per-sample gradient descent.

  layers              : list with the number of neurons of every layer,
                        input layer first, output layer last
  learning_rate       : step size of the weight updates
  activation_function : 'sigmoid', 'ReLU', 'tanh' or 'Leaky ReLU', applied
                        to every layer
  weights             : optional list of weight matrices, weights[i] of shape
                        (layers[i], layers[i+1]); random in [0, 1) when None

  Biases (one row vector per layer, initialised to zero) are stored in
  `self.biases`; the previous code updated them without ever creating them.
  """
  def __init__(self,layers,learning_rate=0.01,activation_function='sigmoid',weights=None):
    self.layers = layers
    self.activation_function = activation_function
    self.learning_rate = learning_rate
    if(weights is None):
      self.weights = [np.random.rand(layers[i],layers[i+1]) for i in range(len(layers)-1)]
    else:
      self.weights = weights
    self.biases = [np.zeros((1,layers[i+1])) for i in range(len(layers)-1)]

  def activate(self,x,name='sigmoid'):
    """Apply the named activation element-wise to x (scalar or array)."""
    x = np.asarray(x,dtype=float)
    if(name=='sigmoid'):
      return(1/(1+np.exp(-x)))
    elif(name=='ReLU'):
      return(np.maximum(x,0))
    elif(name=='tanh'):
      return(np.tanh(x))
    elif(name=='Leaky ReLU'):
      return(np.maximum(0.1*x,x))
    else:
      raise ValueError("Invalid Activation Function")

  def forward_propagation(self,inputs):
    """Run inputs through the network and return the output-layer activations.

    inputs : one sample (1-D) or a batch (2-D, one row per sample)

    The activations of every layer, starting with the reshaped inputs, are
    kept in `self.activations` for backward_propagation.
    """
    current = np.asarray(inputs,dtype=float).reshape(-1,self.weights[0].shape[0])
    activations = [current]
    for w,b in zip(self.weights,self.biases):
      current = self.activate(np.dot(current,w)+b,name=self.activation_function)
      activations.append(current)

    self.activations = activations
    return(activations[-1])

  def __activation_derivative(self,x,name=None):
    """Derivative of the activation, expressed through its OUTPUT value x.

    Uses the network's configured activation when `name` is None. (The old
    code always fell back to sigmoid, re-applied the sigmoid to values that
    were already activated, and returned x as the Leaky ReLU slope.)
    """
    if name is None:
      name = self.activation_function
    x = np.asarray(x,dtype=float)
    if(name=="sigmoid"):
      return(x*(1-x))
    elif(name=='ReLU'):
      return((x>0.0).astype(float))
    elif(name=='tanh'):
      return(1-x**2)
    elif(name=='Leaky ReLU'):
      # The output is positive exactly when the pre-activation was positive.
      return(np.where(x>0.0,1.0,0.1))
    else:
      raise ValueError("Invalid Activation Function")

  def backward_propagation(self, inputs, outputs, act_vals):
    """Back-propagate the error of the latest forward pass and update the model.

    inputs   : kept for interface compatibility; the layer inputs are read
               from `self.activations`
    outputs  : what forward_propagation returned for those inputs
    act_vals : target values for the same samples

    Returns the raw error (targets - outputs).
    """
    targets = np.asarray(act_vals,dtype=float).reshape(outputs.shape)
    error = targets - outputs
    delta = error * self.__activation_derivative(outputs)
    for i in reversed(range(len(self.weights))):
      layer_input = self.activations[i]
      if i > 0:
        # Push the delta through the not-yet-updated weights of this layer.
        upstream = np.dot(delta,self.weights[i].T) * self.__activation_derivative(layer_input)
      # error = target - output, so adding the gradient term reduces the error.
      self.weights[i] = self.weights[i] + self.learning_rate * np.dot(layer_input.T,delta)
      self.biases[i] = self.biases[i] + self.learning_rate * np.sum(delta,axis=0,keepdims=True)
      if i > 0:
        delta = upstream

    return error

  def train(self,inputs,act_vals,epochs):
    """Stochastic gradient descent: one update per sample, `epochs` passes.

    Returns (and stores in `self.errors`) the list of per-sample errors of
    all updates. The running mean absolute error is printed every 100 epochs.
    """
    inputs = np.asarray(inputs,dtype=float)
    act_vals = np.asarray(act_vals,dtype=float)
    errors=[]
    for i in range(epochs):
      for j in range(len(inputs)):
        ip = inputs[j]
        op = self.forward_propagation(ip)
        errors.append(self.backward_propagation(ip,op,act_vals[j]))

      if(i%100==0):
        mean_error = np.mean(np.abs(errors))
        print(f"Epoch {i}: Error = {mean_error}")

    self.errors = errors
    return(errors)

  def predict(self,inputs):
    """Return the network output for `inputs` (previously the input was ignored)."""
    return(self.forward_propagation(inputs))

  def accuracy(self,act_vals,outputs):
    """Fraction of positions where act_vals and outputs are equal."""
    expected = list(act_vals)
    produced = list(outputs)
    hits = 0
    for i in range(len(expected)):
      if(expected[i]==produced[i]):
        hits+=1
    return(hits/len(expected))


# SVM from Scratch

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier


def remove_correlated_features(X):
    """Drop, in place, every column that is highly correlated with an earlier one.

    X : pandas DataFrame; it is modified by this call.

    A column is dropped when its correlation (as computed by X.corr()) with
    any column to its left is >= 0.9. Only positive correlation counts;
    strongly negative correlation is left alone.

    Returns the labels of the dropped columns.
    """
    corr_threshold = 0.9
    corr_values = X.corr().to_numpy()
    # Keep only pairs (i, j) with i < j, then flag column j if any earlier
    # column i exceeds the threshold.
    above = np.triu(corr_values >= corr_threshold, k=1)
    drop_mask = above.any(axis=0)
    columns_dropped = X.columns[drop_mask]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y):
    """Backward elimination on OLS p-values, modifying X in place.

    X : pandas DataFrame of regressors; columns are dropped from it
    Y : response passed to statsmodels OLS

    An OLS model of Y on X is fitted repeatedly. While the largest p-value is
    above the 0.05 significance level, the column that owns it is dropped and
    the model is refitted; the loop runs at most once per original column.

    Returns an array with the names of the dropped columns, in drop order.
    """
    significance_level = 0.05
    fitted = None
    columns_dropped = np.array([])
    for _ in range(0, len(X.columns)):
        fitted = sm.OLS(Y, X).fit()
        worst_column = fitted.pvalues.idxmax()
        worst_p_value = fitted.pvalues.max()
        if worst_p_value > significance_level:
            X.drop(worst_column, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [worst_column])
            continue
        break
    # The summary of the last fit is built but not used or returned.
    fitted.summary()
    return columns_dropped


def compute_cost(W, X, Y):
    """Soft-margin SVM cost: 0.5 * ||W||^2 + C * mean hinge loss.

    W : weight vector
    X : sample matrix, one row per sample
    Y : labels aligned with the rows of X (presumably +1 / -1 - confirm
        against the caller)

    C is read from the module-level `regularization_strength`, which is not
    defined in the visible part of this file.
    """
    n_samples = X.shape[0]
    margins = 1 - Y * (np.dot(X, W))
    # max(0, margin): samples beyond the margin contribute nothing.
    hinge_terms = np.maximum(margins, 0)
    hinge_loss = regularization_strength * (np.sum(hinge_terms) / n_samples)

    return 1 / 2 * np.dot(W, W) + hinge_loss


def calculate_cost_gradient(W, X_batch, Y_batch):
    """Average gradient of the soft-margin SVM cost over a batch.

    W       : weight vector
    X_batch : one sample (1-D) or a batch (2-D, one row per sample)
    Y_batch : the matching label (any scalar) or 1-D array of labels

    For a sample inside the margin (1 - y * x.W > 0) the gradient is
    W - C * y * x, otherwise it is W; C is the module-level
    `regularization_strength` (not defined in the visible part of this file).

    The single-sample case used to be detected with
    `type(Y_batch) == np.float64`, so an int or Python float label raised
    TypeError; any scalar label is accepted now.
    """
    weights = np.asarray(W, dtype=float)
    labels = np.atleast_1d(np.asarray(Y_batch, dtype=float))
    samples = np.atleast_2d(np.asarray(X_batch, dtype=float))

    distance = 1 - (labels * np.dot(samples, weights))
    inside_margin = distance > 0
    # Sum of y_i * x_i over the samples that violate the margin.
    hinge_part = (labels[inside_margin, None] * samples[inside_margin]).sum(axis=0)

    # Every sample contributes W; violators additionally subtract C * y * x.
    return weights - (regularization_strength * hinge_part) / len(labels)


def sgd(features, outputs, max_epochs=5000, cost_threshold=0.01):
    """Train linear SVM weights with stochastic gradient descent.

    features       : 2-D array, one row per sample (rows are iterated
                     directly, so a numpy array is expected - confirm for
                     other containers)
    outputs        : labels, indexable by position
    max_epochs     : epochs run from 1 to max_epochs - 1
    cost_threshold : training stops when the cost changes by less than this
                     fraction of the previously checked cost

    The cost is checked on epochs 1, 2, 4, 8, ... and on the last epoch.
    `learning_rate` is read from module level (not defined in the visible
    part of this file).

    Fixes: `prev_cost` and `nth` are now updated after each check (they never
    changed, so the stop test compared against infinity and only epoch 1 was
    ever checked), and the weights are returned when the epoch budget runs
    out (the function used to fall through and return None).
    """
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights
           

# Script section: trains and evaluates scikit-learn's SGDClassifier with hinge
# loss; the from-scratch sgd() defined above is not called here.
# NOTE(review): X and Y are not defined anywhere in the visible part of this
# file - presumably the feature table and labels are loaded in another cell;
# confirm before running.
# Rescale the features with MinMaxScaler; X is then rebound to a new DataFrame
# built from the scaled array.
X_normalized = MinMaxScaler().fit_transform(X)
X = pd.DataFrame(X_normalized)

# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# train the model
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
clf.fit(X_train, y_train)

# testing the model
# (y_train_predicted is computed but not used below)
y_train_predicted = clf.predict(X_train)
y_test_predicted = clf.predict(X_test)

print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
print("precision on test dataset: {}".format(precision_score(y_test, y_test_predicted)))