# Dataset

[Advertising Dataset](https://www.kaggle.com/datasets/yasserh/advertising-sales-dataset)

In [14]:
import pandas as pd
import numpy as np

In [15]:
# load the dataset
dataset_filename = './dataset/advertising.csv'

advertising_df = pd.read_csv(dataset_filename)

advertising_df.head()

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [16]:
# drop the un-named column
advertising_df.drop(columns = ['Unnamed: 0'], axis = 1, inplace = True)

# X, y split

In [17]:
X = advertising_df.iloc[:,0:3] # convert to numpy array
y = advertising_df.iloc[:, 3] # convert to numpy array

# Train, Test split

In [18]:
# Shuffle the dataset while maintaining the correspondence between X and y
shuffled_indices = np.random.permutation(len(X))

# Apply the shuffled indices to both X and y
X_shuffled = X.iloc[shuffled_indices].values
y_shuffled = y.iloc[shuffled_indices].values

# 80% train, 20% test

# X
X_train, X_test = np.split(X_shuffled, [int(0.8 * len(X_shuffled))])

# y
y_train, y_test = np.split(y_shuffled, [int(0.8 * len(y_shuffled))])

# Linear Regressor

In [19]:
def mean_squared_error(y_real, y_pred):
        y_real = np.array(y_real)
        y_pred = np.array(y_pred)
        
        squared_diff = np.square(y_real - y_pred)
        
        mse = np.mean(squared_diff)
        
        return mse

class LinearRegression:        
    def __init__(self, weights=[], alpha=0.01, max_iters=100, threshold=1e-6):
        self.weights = weights;
        self.alpha = alpha;
        self.max_iters = max_iters;
        self.threshold = threshold;

    def set_threshold(self, threshold):
        self.threshold = threshold

    def set_max_iters(self, max_iters):
        self.max_iters = max_iters

    def get_weights(self):
        return self.weights

    def train(self, X, Y, alpha = 0.01, max_iters = None, print_loss_iter = 100):
        """
        - X: Training data (features). (2d numpy array)
        - Y: Target variable. (2d numpy array)
        - alpha: Learning rate (default = 0.01).
        - max_iters: Maximum number of iterations for training. If None, use stopping criteria (e.g., when the loss is constant for the last 3 epochs).
        - print_loss_iter: Print the loss every n iterations (default = 100).
        - If max_iters is not provided, stop when the change in loss falls below a defined threshold.
        """
        prevLoss = float('inf')
        
        # setting the class attributes
        self.alpha = alpha
        self.max_iters = max_iters

        # pre-prend a column of 1's in X
        ones_col = (np.ones(len(X))).reshape(-1, 1)
        X = np.hstack((ones_col, X))

        # initialize weights
        self.weights = np.zeros(X.shape[1]).reshape(-1, 1)

        # if max_iters is not provided, fall back to the pre-defined threshold
        for num_iters in range(max_iters if max_iters is not None else 1_000_000):
            # multiply with weights to get prediction
            y_pred = np.dot(X, self.weights)
    
            # calculate total error
            tot_err = mean_squared_error(y_real=Y, y_pred=y_pred)
            
            # update weights based on gradient descent
            self.weights[0] -= (alpha * (2 * (np.mean(y_pred - Y))))
    
            for i in range(len(self.weights)):
                if i != 0: # first weight has been updated
                    self.weights[i] -= (alpha * (2 * (np.mean(np.dot(X.T, (y_pred - Y)))))) 
    
            # multiply with weights to get prediction
            y_pred = np.dot(X, self.weights)
            
            # calculate total error
            tot_err = mean_squared_error(y_real=Y, y_pred=y_pred)

            if num_iters % print_loss_iter == 0:
                print(f"Error on iteration {num_iters}: {tot_err}")
            
            # Check for convergence
            if max_iters is None and abs(prevLoss - tot_err) < self.threshold:
                print("Converged according to the predefined threshold")
                break

            prevLoss = tot_err

    def predict(self, X):
        # # pre-prend a column of 1's in X
        # ones_col = (np.ones(len(X))).reshape(-1, 1)
        # X = np.hstack((ones_col, X))
        
        return np.dot(X, self.weights[1:]) + self.weights[0]

# Training step

In [20]:
lr = LinearRegression()
lr.train(X_train, y_train, alpha=0.0000001, print_loss_iter=5)

Error on iteration 0: 122.9479329823928
Error on iteration 5: 64.18332650238372
Error on iteration 10: 63.78771440639339
Error on iteration 15: 63.785990783583756
Error on iteration 20: 63.786052242682096
Converged according to the predefined threshold


# Weights

In [21]:
print("Weights of the Linear Regression model that it learned from the data: \n", lr.get_weights()[1:])

Weights of the Linear Regression model that it learned from the data: 
 [[0.05734455]
 [0.05734455]
 [0.05734455]]


## Conslusion from weights:

Seeing the weights array we get to know that the first column (TV Ad Budget ($)) has the least significance in the feature set

# Predict on the test data

In [22]:
y_pred = lr.predict(X_test)

# Evaluate using MSE

In [23]:
mean_squared_error(y_test, y_pred)

np.float64(53.00771205799144)

# Test using sample values

In [24]:
lr.predict(X_test[0:5]) # predicted values

array([[15.46584155],
       [17.13456788],
       [ 5.21263643],
       [17.07148888],
       [16.79623505]])

In [25]:
y_test[0:5] # actual values

array([22.6, 18.9,  7. , 21.5, 14.8])