# Machine Learning From Scratch

In [1]:
import sklearn.datasets as data
import cupy as np
import pandas as pd

data = data.load_iris()
y = data.target
X = data.data

X.shape

(150, 4)

## Introduction:

This notebook was desinged with two purposes in mind. First, I wanted to brush up on the concepts of the most common machine learning algorithms, and create them from scratch, including scoring metrics. Second, I wanted to provide a tutorial for others who are looking to build some intuition for the underlying mechanics of the algorithms involved. I will cover the following algorithms in this notebook:

1. Linear Regression (OLS)
2. Logistic Regression
3. Naieve Bayes Classifier
4. K-Nearest Neighbors
5. K-Means Clustering
6. Support Vector Machines
7. Decision Trees
8. Random Forest

### 1. Linear Regression
This model uses the ordinary least squares algorithm: $\hat{\beta} = (X^TX)^{-1}X^Ty$

In [2]:
class LinearRegression:
    #TODO: Extend to MLR?
    
    def __init__(self) -> None:
        self.coef_ = None
        self.fit_predictions = None
        self.residuals = None
        self.df = None
        self.rss = None
        self.tss = None
        self.mse = None
        self.mae = None
        self.rmse = None
        self.r2 = None
        self.adj_r2 = None
        self.bic = None
        self.aic = None
        self.fstat = None


    def fit(self, X, y, multiple = False, verbose=False):
        if multiple:
            pass
        # Add a column of ones at position zero to calculate the model intercept
        X = np.hstack((np.ones((X.shape[0],1)),X))
        # Calculate a = X^(T)X
        a_1 = X.T @ X
        # Calculate a^(-1)
        a_inv = np.linalg.inv(a_1)
        # calculate a_2 = a^(-1)X^(T) again
        a_2 = a_inv @ X.T
        # calculate the coefficient vector
        beta_hat = a_2 @ y
        self.coef_ = beta_hat
        
        # Make and store predictions based on fitted model training data
        preds = []
        for i in range(X.shape[0]):
            pred = self.coef_[0]
            for j in range(self.coef_.shape[0]-1):
                pred += self.coef_[j+1]*X[i,j+1]
            preds.append(pred)   
        
        self.fit_predictions = np.array(preds)  

        # Calculate model scores
        self.residuals = y-preds 
        n = X.shape[0] # number of observations
        p = X.shape[1] # number of predictors
        self.residual_df = n-p
        self.model_df = p
        self.rss = np.sum(self.residuals**2)
        self.tss = np.sum((y-(np.mean(y)))**2)
        self.mae = np.sum(np.abs(self.residuals))/n
        self.mse = self.rss/n
        self.rmse = np.sqrt(self.mse)
        self.r2 = 1-(self.rss/self.tss)
        self.adj_r2 = 1-((1-self.r2)*(n-1))/(n-p-1)  
        self.aic = (2*p)-(n*np.log(self.rss))      

        if verbose:
            #TODO: Build out model summary output, maybe it needs its own method
            pass

    def predict(self, X):
        #TODO: Should the fit method call this with the training data?
        preds = []
        for i in range(X.shape[0]):
            pred = 0
            for idx,coef in enumerate(lm.coef_):
                pred += coef*X[i,idx]
            preds.append(pred)   
        
        return np.array(preds)
        
        
        


In [9]:
# this implementation
lm = LinearRegression()

lm.fit(X,y)

print(lm.coef_)

from numpy.linalg import pinv

# create a random matrix:
A = np.random.normal(size=(5,2))

# add a column of zeros to it:
A = np.hstack((np.ones((A.shape[0],1)),A))

print(pinv(A))

[ 0.18649525 -0.11190585 -0.04007949  0.22864503  0.60925205]
[[ 0.19211862  0.07391778  0.49722661  0.11067328  0.12606371]
 [-0.07603363 -0.15970266  0.42881137 -0.06372103 -0.12935406]
 [ 0.52903089 -0.24298621  0.14166483 -0.57938575  0.15167624]]


In [4]:
# create a random matrix:
A = np.random.normal(size=(5,2))

# add a column of zeros to it:
print(np.hstack((np.ones((A.shape[0],1)),A)))

[[ 1.          2.03219578  0.16612443]
 [ 1.          0.33459659 -0.72356847]
 [ 1.          0.00639409 -0.26058593]
 [ 1.         -3.25596154  0.38935975]
 [ 1.          0.55300325  1.3364459 ]]


In [5]:
# from sklearn -> model doesnt match, close but not perfect...
import sklearn.linear_model as lr
lm = lr.LinearRegression()

lm.fit(X,y)
print(lm.intercept_)
print(lm.score(X,y))
print(np.mean(y))


0.18649524720625021
0.9303939218549564
1.0


In [6]:
#Check with statsmodels library
import statsmodels.api as sm

results = sm.OLS(y, X).fit()

print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.972
Model:                            OLS   Adj. R-squared (uncentered):              0.971
Method:                 Least Squares   F-statistic:                              1267.
Date:                Thu, 18 Nov 2021   Prob (F-statistic):                   3.17e-112
Time:                        23:02:19   Log-Likelihood:                          17.009
No. Observations:                 150   AIC:                                     -26.02
Df Residuals:                     146   BIC:                                     -13.98
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### 2. Logistic Regression

