# Dataset

[Advertising Dataset](https://www.kaggle.com/datasets/yasserh/advertising-sales-dataset)

In [110]:
import pandas as pd
import numpy as np

In [11]:
# load the dataset
dataset_filename = './dataset/advertising.csv'

advertising_df = pd.read_csv(dataset_filename)

advertising_df.head()

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [12]:
# drop the un-named column
advertising_df.drop(columns = ['Unnamed: 0'], axis = 1, inplace = True)

# X, y split

In [43]:
X = advertising_df.iloc[:,0:3] # convert to numpy array
y = advertising_df.iloc[:, 3] # convert to numpy array

# Train, Test split

In [154]:
# 80% train, 20% test

# X
X_train, X_test = np.split(X.sample(frac = 1), [int(0.8 * len(X))])
X_train, X_test = X_train.values, X_test.values

# y
y_train, y_test = np.split(y.sample(frac = 1), [int(0.8 * len(y))])
y_train, y_test = y_train.values.reshape(-1, 1), y_test.values.reshape(-1, 1)

# Linear Regressor

In [189]:
class LinearRegression:        
    def __init__(self, weights=[], alpha=0.01, max_iters=100, threshold=1e-6):
        self.weights = weights;
        self.alpha = alpha;
        self.max_iters = max_iters;
        self.threshold = threshold;

    def set_threshold(self, threshold):
        self.threshold = threshold

    def set_max_iters(self, max_iters):
        self.max_iters = max_iters

    def mean_squared_error(self, y_real, y_pred):
        y_real = np.array(y_real)
        y_pred = np.array(y_pred)
        
        squared_diff = np.square(y_real - y_pred)
        
        mse = np.mean(squared_diff)
        
        return mse

    def train(self, X, Y, alpha = 0.01, max_iters = None, print_loss_iter = 100):
        """
        - X: Training data (features).
        - Y: Target variable.
        - alpha: Learning rate (default = 0.01).
        - max_iters: Maximum number of iterations for training. If None, use stopping criteria (e.g., when the loss is constant for the last 3 epochs).
        - print_loss_iter: Print the loss every n iterations (default = 100).
        - If max_iters is not provided, stop when the change in loss falls below a defined threshold.
        """

        # setting the class attributes
        self.alpha = alpha
        self.max_iters = max_iters

        # pre-prend a column of 1's in X
        ones_col = (np.ones(len(X))).reshape(-1, 1)
        X = np.hstack((ones_col, X))

        # initialize weights
        self.weights = np.zeros(X.shape[1]).reshape(-1, 1)
        
        # multiply with weights to get prediction
        y_pred = np.dot(X, self.weights)

        # calculate total error
        tot_err = self.mean_squared_error(y_real=Y, y_pred=y_pred)
        print("Total error before: ", tot_err)
        
        # update weights based on gradient descent
        self.weights[0] -= (alpha * (2 * (np.mean(y_pred - Y))))

        for i in range(len(self.weights)):
            if i != 0: # first weight has been updated
                self.weights[i] -= (alpha * (2 * (np.mean(np.dot(X.T, (y_pred - Y)))))) 

        # multiply with weights to get prediction
        y_pred = np.dot(X, self.weights)

        print(y_pred)

        # calculate total error
        tot_err = self.mean_squared_error(y_real=Y, y_pred=y_pred)
        print("Total error after: ", tot_err)

In [190]:
lr = LinearRegression()
lr.train(X_train, y_train)

Total error before:  224.0041875
[[667619.6900775]
 [829487.6334825]
 [633048.1774125]
 [410050.4732025]
 [623890.1608125]
 [372731.5555575]
 [331291.5304425]
 [285959.3482725]
 [586342.2927525]
 [703793.8556475]
 [511933.4078775]
 [423100.6468575]
 [617479.5491925]
 [553373.4329925]
 [640374.5906925]
 [681585.6653925]
 [435692.9196825]
 [510101.8045575]
 [211321.5129825]
 [245435.1248175]
 [569399.9620425]
 [614503.1937975]
 [664643.3346825]
 [475530.2918925]
 [144467.9918025]
 [262606.4059425]
 [992729.2793775]
 [631674.4749225]
 [665101.2355125]
 [302901.6789825]
 [575352.6728325]
 [274282.8771075]
 [478506.6472875]
 [237650.8107075]
 [642893.0452575]
 [183160.6119375]
 [739739.0708025]
 [771792.1289025]
 [346631.2082475]
 [257111.5959825]
 [343425.9024375]
 [127525.6610925]
 [621371.7062475]
 [121115.0494725]
 [236506.0586325]
 [684333.0703725]
 [769273.6743375]
 [486290.9613975]
 [208574.1080025]
 [543757.5155625]
 [697612.1944425]
 [220021.6287525]
 [339991.6462125]
 [ 89519.8922