# Machine Learning From Scratch

In [71]:
import sklearn.datasets as data 
import numpy as np
import pandas as pd

data = data.load_iris()
y = data.target
X = data.data

X.shape

(150, 4)

## Introduction:

This notebook was desinged with two purposes in mind. First, I wanted to brush up on the concepts of the most common machine learning algorithms, and create them from scratch, including scoring metrics. Second, I wanted to provide a tutorial for others who are looking to build some intuition for the underlying mechanics of the algorithms involved. I will cover the following algorithms in this notebook:

- Linear Regression (OLS)
- Logistic Regression
- Naieve Bayes Classifier
- K-Nearest Neighbors
- K-Means Clustering
- Support Vector Machines
- Decision Trees
- Random Forest

### Linear Regression
This model uses the ordinary least squares algorithm:
$$
\hat{\beta} = (X^TX)^{-1}X^Ty
$$

In [90]:
class LinearRegression:
    
    def __init__(self) -> None:
        self.coef_ = None
        self.fit_predictions = None
        self.residuals = None
        self.df = None
        self.rss = None
        self.tss = None
        self.mse = None
        self.mae = None
        self.rmse = None
        self.r2 = None
        self.adj_r2 = None
        self.bic = None
        self.aic = None
        self.fstat = None


    def fit(self, X, y, verbose=False):
        # Calculate a = X^(T)X
        a_1 = X.T @ X
        # Calculate a^(-1)
        a_inv = np.linalg.inv(a_1)
        # calculate b = a^(-1)X^(T) again
        a_2 = a_inv @ X.T
        # calculate the coefficient vector
        beta_hat = a_2 @ y
        self.coef_ = beta_hat
        
        # Make and store predictions based on fitted model training data
        preds = []
        for i in range(X.shape[0]):
            pred = 0
            for idx,coef in enumerate(lm.coef_):
                pred += coef*X[i,idx]
            preds.append(pred)   
        
        self.fit_predictions = np.array(preds)  

        # Calculate model scores
        self.residuals = y-preds 
        n = X.shape[0] # number of observations
        p = X.shape[1] # number of predictors
        self.residual_df = n-p
        self.model_df = p
        self.rss = np.sum(self.residuals**2)
        self.tss = np.sum((y-(np.mean(y)))**2)
        self.mae = np.sum(np.abs(self.residuals))/n
        self.mse = self.rss/n
        self.rmse = np.sqrt(self.mse)
        self.r2 = 1-(self.rss/self.tss)
        self.adj_r2 = 1-((1-self.r2)*(n-1))/(n-p-1)

        

        if verbose:
            #print the model output
            pass

    def predict(self, X):
        # Make and store predictions based on fitted model and testing data
        preds = []
        for i in range(X.shape[0]):
            pred = 0
            for idx,coef in enumerate(lm.coef_):
                pred += coef*X[i,idx]
            preds.append(pred)   
        
        return np.array(preds)
        
        
        


In [93]:

lm = LinearRegression()

lm.fit(X,y)

lm.r2

0.9299960156084729

In [36]:
preds = []
for i in range(X.shape[0]):
    pred = 0
    for idx,coef in enumerate(lm.coef_):
        pred += coef*X[i,idx]
    preds.append(pred)
preds = np.array(preds)

# print(preds.shape)
# print(preds)
# print(y)


print('MAE: ' + str(mae) + '\n',
        'MSE: ' + str(mse) + '\n',
        'RMSE: ' + str(rmse))


MAE: 0.06610507853286897
 MSE: 0.004500539282329266
 RMSE: 0.06708605877773166


In [92]:
import statsmodels.api as sm

results = sm.OLS(y, X).fit()

print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.972
Model:                            OLS   Adj. R-squared (uncentered):              0.971
Method:                 Least Squares   F-statistic:                              1267.
Date:                Sun, 12 Sep 2021   Prob (F-statistic):                   3.17e-112
Time:                        21:18:30   Log-Likelihood:                          17.009
No. Observations:                 150   AIC:                                     -26.02
Df Residuals:                     146   BIC:                                     -13.98
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------