# **LECTURE - 1 : ANDREW NG**

### Importing Libraries

In [1]:
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np

### Creating a Dataset

For testing purposes, we will create a synthetic regression dataset.

In [2]:
# Build a large synthetic linear-regression dataset: 10,000,000 samples with
# 4 features, only 2 of which are informative (the returned coefficients for
# the other two are exactly 0). noise=0.1 and bias=1.2 are passed through to
# make_regression; random_state=42 keeps the data repeatable.
# coef=True makes the call also return the true coefficients as `coeff`.
# NOTE(review): 10M x 4 values is large (roughly 320 MB if the array is
# float64 -- confirm); lower n_samples for a quicker run.
X, y, coeff = make_regression(
    n_samples=10000000,
    n_features=4,
    n_informative=2,
    n_targets=1,
    noise=0.1,
    bias=1.2,
    random_state=42,
    coef=True
)

In [3]:
coeff

array([99.39714974,  0.        , 47.4977107 ,  0.        ])

### Plotting the Data using Plotly

**Contour Plot**

In [4]:
# Scatter of the first 20 samples over (feature 0, feature 1), overlaid with a
# contour of the target built from the same 20 points.
import plotly.graph_objects as go
import plotly.express as px

sample_x1 = X[:20, 0]
sample_x2 = X[:20, 1]
# Axis titles use a larger black font (size and colour only, not a bold weight)
axis_title_font = dict(size=18, color='black')

fig = go.Figure(data=[
    go.Scatter(x=sample_x1, y=sample_x2, mode='markers', marker=dict(size=8, color='black')),
    go.Contour(z=y[:20], x=sample_x1, y=sample_x2, colorscale='Viridis'),
])
# Title, axis labels and figure size in a single layout update
fig.update_layout(
    title='Regression Dataset Contour Plot',
    xaxis=dict(title=dict(text='X1', font=axis_title_font)),
    yaxis=dict(title=dict(text='X2', font=axis_title_font)),
    width=800,
    height=800,
)
fig.show()

**3-D Visualisation of Sample Data**

In [5]:
# Interactive 3-D scatter of the first 20 samples: features 0 and 1 on the
# horizontal axes, the target on the vertical axis.
# No GPU-specific setup happens in this cell; the figure is simply handed to Plotly.
axis_title_font = dict(size=18, color='black')

fig = px.scatter_3d(x=X[:20, 0], y=X[:20, 1], z=y[:20])
# Axis labels (larger black font), title and figure size in one layout update
fig.update_layout(
    scene=dict(
        xaxis=dict(title=dict(text='X1', font=axis_title_font)),
        yaxis=dict(title=dict(text='X2', font=axis_title_font)),
        zaxis=dict(title=dict(text='Y', font=axis_title_font)),
    ),
    title='Regression Dataset',
    width=800,
    height=800,
)
fig.show()

### Splitting into Test and Train

In [6]:
# Split Train and Test
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9000000, 4), (1000000, 4), (9000000,), (1000000,))

### Linear Regression Class

**Loss Used is MSE**

In [9]:
from Models.LinearRegressor import LinearRegressor

In [10]:
model = LinearRegressor()

### Train using Mini-Batch Gradient Descent

**Note:** $\text{batch size} = 1$ gives **Stochastic Gradient Descent**, and $\text{batch size} = m$ (the number of training samples) gives **Batch Gradient Descent**.

In [11]:
# Mini-batch gradient descent on the synthetic dataset.
n_iter = 20
batch_size = 1000
n_train = len(X_train)
# Seeded generator so the shuffles (and therefore the printed MSEs) are repeatable
rng = np.random.default_rng(42)
for epoch in range(1, n_iter + 1):  # `epoch`, not `iter`, so the builtin is not shadowed
    # Visit the samples in a fresh random order every epoch so that batches are
    # composed differently each time and the updates do not follow a fixed order.
    # Only the index order is shuffled: X_train / y_train themselves are left
    # untouched instead of being copied and overwritten once per epoch.
    order = rng.permutation(n_train)
    # One gradient step per mini-batch
    for start in range(0, n_train, batch_size):
        batch_idx = order[start:start + batch_size]
        model.fit(X_train[batch_idx], y_train[batch_idx], n_iter=1, lr=0.1)
    if epoch % 10 == 0:
        print(f"Iteration: {epoch}, MSE: {model.mse(model.predict(X_train), y_train)}")

Iteration: 10, MSE: 0.010003699127726882
Iteration: 20, MSE: 0.010003697644861831


In [12]:
# Test Model
model.mse(model.predict(X_test), y_test)

0.010001249126604

## Test on Real Data

**Import Dataset**

In [13]:
# TEST ON REAL DATA
from sklearn.datasets import fetch_california_housing

In [14]:
dataset = fetch_california_housing()

In [15]:
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['target'])

In [16]:
X.shape, y.shape

((20640, 8), (20640, 1))

In [17]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
y.head()

Unnamed: 0,target
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


### **Data Visualisation and Preprocessing**

In [19]:
# Find Correlation of Features with Target
Xy = pd.concat([X, y], axis=1)
correlation = Xy.corr()

In [20]:
# Horizontal bar chart of each feature's absolute correlation with the target,
# sorted in ascending order, with the value printed next to every bar.
import plotly.express as px

abs_target_corr = correlation['target'].abs().filter(X.columns).sort_values(ascending=True)
fig = px.bar(abs_target_corr, orientation='h', text_auto="0.2f")
# Value labels sit outside the bars and are allowed to overflow the plot area
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
# Title, axis labels, and no legend (there is only one series)
fig.update_layout(
    title='Feature Correlation with Target',
    xaxis_title='Correlation',
    yaxis_title='Feature',
    showlegend=False,
)
fig.show()

In [21]:
# Correlation between features
correlation_f = X.corr()

In [22]:
# Heatmap of the pairwise feature correlations; every cell shows its value
import plotly.express as px

fig = px.imshow(correlation_f, color_continuous_scale='Viridis', text_auto="0.2f")
# Title and figure size in one layout update
fig.update_layout(title='Inter-Feature Correlation Heatmap', width=500, height=500)
fig.show()

#### **Feature Selection**:

From the above correlation plot of features with the target, let's pick the features whose absolute correlation with the target is greater than 0.1.

In [23]:
# Keep the features whose absolute correlation with the target exceeds the threshold.
MIN_TARGET_CORR = 0.1
# Restrict to the feature rows first (drops the 'target' row itself) ...
target_corr = correlation['target'].abs().filter(X.columns)
# ... then build the mask from that same Series, so mask and data share one
# index and no implicit boolean re-alignment is needed.
features = target_corr[target_corr > MIN_TARGET_CORR].index

In [24]:
features

Index(['MedInc', 'HouseAge', 'AveRooms', 'Latitude'], dtype='object')

In [25]:
X = X[features]

In [26]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,Latitude
0,8.3252,41.0,6.984127,37.88
1,8.3014,21.0,6.238137,37.86
2,7.2574,52.0,8.288136,37.85
3,5.6431,52.0,5.817352,37.85
4,3.8462,52.0,6.281853,37.85


Let's check inter feature correlation again.

In [27]:
correlation_f = X.corr()

In [28]:
# Heatmap of the pairwise correlations between the remaining features; every cell shows its value
import plotly.express as px

fig = px.imshow(correlation_f, color_continuous_scale='Viridis', text_auto="0.2f")
# Title and figure size in one layout update
fig.update_layout(title='Inter-Feature Correlation Heatmap', width=500, height=500)
fig.show()

MedInc and AveRooms are highly correlated, so we will drop AveRooms.

In [29]:
# Drop AveRooms
X = X.drop('AveRooms', axis=1)

In [30]:
X.head()

Unnamed: 0,MedInc,HouseAge,Latitude
0,8.3252,41.0,37.88
1,8.3014,21.0,37.86
2,7.2574,52.0,37.85
3,5.6431,52.0,37.85
4,3.8462,52.0,37.85


In [31]:
X.shape, y.shape

((20640, 3), (20640, 1))

In [32]:
X = X.values
y = y.values

In [33]:
X, y

(array([[ 8.3252, 41.    , 37.88  ],
        [ 8.3014, 21.    , 37.86  ],
        [ 7.2574, 52.    , 37.85  ],
        ...,
        [ 1.7   , 17.    , 39.43  ],
        [ 1.8672, 18.    , 39.43  ],
        [ 2.3886, 16.    , 39.37  ]]),
 array([[4.526],
        [3.585],
        [3.521],
        ...,
        [0.923],
        [0.847],
        [0.894]]))

#### **Data Splitting**

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [35]:
X_train

array([[ 3.8372, 41.    , 33.91  ],
       [ 8.0069, 52.    , 37.73  ],
       [ 3.6712, 30.    , 37.46  ],
       ...,
       [ 2.9344, 36.    , 34.03  ],
       [ 5.7192, 15.    , 37.58  ],
       [ 2.5755, 52.    , 37.77  ]])

In [36]:
X_test

array([[ 1.6812, 25.    , 36.06  ],
       [ 2.5313, 30.    , 35.14  ],
       [ 3.4801, 52.    , 37.8   ],
       ...,
       [ 5.8578, 21.    , 34.28  ],
       [ 2.2554, 39.    , 34.04  ],
       [ 3.495 , 35.    , 34.08  ]])

#### **Feature Scaling**

In [37]:
# Normalisation of all features in range 0-1
from sklearn.preprocessing import MinMaxScaler

In [38]:
scaler = MinMaxScaler()

In [39]:
X_train = scaler.fit_transform(X_train)

In [40]:
X_test = scaler.transform(X_test)

In [41]:
# Checking Range of Y
y_train.min(), y_train.max()

(0.14999, 5.00001)

In [42]:
# Scaling Target to 0-1
y_scaler = MinMaxScaler()

In [43]:
y_train = y_scaler.fit_transform(y_train)

In [44]:
y_test = y_scaler.transform(y_test)

In [45]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18576, 3), (2064, 3), (18576, 1), (2064, 1))

In [46]:
scaler_x = scaler
scaler_y = y_scaler

In [47]:
scaler_x, scaler_y

(MinMaxScaler(), MinMaxScaler())

#### **Training our Model**

In [48]:
def train_mini_batch(X_train, y_train, model, n_iter=20, batch_size=1000,
                     lr=None, log_every=10, seed=None):
    """Train ``model`` with mini-batch gradient descent and return it.

    Every epoch the samples are reshuffled, then ``model.fit`` is called once
    per mini-batch with ``n_iter=1``. The caller's arrays are never modified:
    shuffling rebinds local names to shuffled copies.

    Parameters
    ----------
    X_train, y_train : numpy arrays with the same number of rows.
    model : object exposing ``fit(X, y, n_iter=...)``, ``predict(X)`` and
        ``mse(a, b)``; ``fit`` must also accept ``lr=...`` if ``lr`` is given.
    n_iter : number of epochs (full passes over the data).
    batch_size : samples per mini-batch; the last batch may be smaller.
    lr : optional learning rate forwarded to ``model.fit``. When ``None`` the
        argument is not passed at all, so the model's own default applies.
    log_every : print the training MSE every ``log_every`` epochs; ``0`` or
        ``None`` disables logging.
    seed : optional seed for a dedicated random generator. When ``None`` the
        global ``np.random`` state is used.

    Returns
    -------
    The same ``model`` instance, after training.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or the inputs differ in length.
    """
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train must have the same length, got {len(X_train)} and {len(y_train)}"
        )
    # A negative step would make range() empty and silently skip all training
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    fit_kwargs = {"n_iter": 1}
    if lr is not None:
        fit_kwargs["lr"] = lr

    # Both np.random and a Generator provide .permutation()
    rng = np.random.default_rng(seed) if seed is not None else np.random
    n_samples = len(X_train)

    for epoch in range(1, n_iter + 1):  # `epoch`, not `iter`, so the builtin is not shadowed
        # Reshuffle every epoch so batches are composed differently each time
        order = rng.permutation(n_samples)
        X_train = X_train[order]
        y_train = y_train[order]
        # One gradient step per mini-batch
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            model.fit(X_train[start:stop], y_train[start:stop], **fit_kwargs)
        if log_every and epoch % log_every == 0:
            print(f"Iteration: {epoch}, MSE: {model.mse(model.predict(X_train), y_train)}")
    return model

In [49]:
# Cross Validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [50]:
# Cross-validation over the folds produced by `kf`: a fresh LinearRegressor is
# trained on each training split and scored on the held-out split, with the MSE
# measured after mapping targets and predictions back through scaler_y.
models, mse = [], []
for train_index, test_index in kf.split(X_train):
    X_train_cv, y_train_cv = X_train[train_index], y_train[train_index]
    X_test_cv, y_test_cv = X_train[test_index], y_train[test_index]
    model = train_mini_batch(X_train_cv, y_train_cv, LinearRegressor())
    pred = model.predict(X_test_cv)
    fold_truth = scaler_y.inverse_transform(y_test_cv)
    fold_pred = scaler_y.inverse_transform(pred)
    mse.append(model.mse(fold_truth, fold_pred))
    models.append(model)

Iteration: 10, MSE: 0.07754317087721596
Iteration: 20, MSE: 0.044874686616774254
Iteration: 10, MSE: 0.04012092339200512
Iteration: 20, MSE: 0.03121222985802919
Iteration: 10, MSE: 0.028671637845621368
Iteration: 20, MSE: 0.02735429200013718
Iteration: 10, MSE: 0.02835956576723656
Iteration: 20, MSE: 0.02746159018449071
Iteration: 10, MSE: 0.0603126149734417
Iteration: 20, MSE: 0.04003824873468785


In [51]:
mse

[1.0253847442026383,
 0.7212624011832759,
 0.6392461976248697,
 0.6582098012576971,
 0.9325446605415635]

In [52]:
model_mse_array = np.array(mse)

The 3rd model has the lowest validation MSE, so it seems to be the best model.

#### **ENSEMBLE LEARNING**

We have $5$ regression models with varying mse, we can use them to create an ensemble model.

Let's take a weighted average of the predictions of all models to create an ensemble model.

If $mse_1, mse_2, mse_3, mse_4, mse_5$ are the MSEs of models $1$, $2$, $3$, $4$, $5$ respectively, then $w_i = \frac{1}{mse_i}$, where we round off $mse_i$ and $w_i$ to 2 decimal places for simplicity.

We will also normalise the weights such that $\sum_{i=1}^{5} w_i = 1$.

**Formula for Final Prediction:**

$y_{ensemble} = \dfrac{\sum_{i=1}^{5} w_i \, y_i}{\sum_{i=1}^{5} w_i} = \sum_{i=1}^{5} w_i \, y_i$ (the last equality holds because the weights are normalised)

where $y_i$ is the prediction of model i.

In [53]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18576, 3), (2064, 3), (18576, 1), (2064, 1))

In [54]:
def ensemble_predict(models, mse, X_test):
    """Predict with an inverse-MSE weighted average of several models.

    Each model's weight is ``1 / mse``. The MSE and the resulting weight are
    each rounded to 2 decimal places, and the weights are then normalised to
    sum to 1.

    Parameters
    ----------
    models : sequence of objects exposing ``predict(X)``.
    mse : sequence (list or numpy array) with one MSE per model, in the same
        order as ``models``.
    X_test : input passed unchanged to every model's ``predict``.

    Returns
    -------
    The weighted sum of the individual predictions; it has the same shape as
    a single model's prediction.

    Raises
    ------
    ValueError
        If ``models`` is empty, if ``models`` and ``mse`` differ in length,
        if any MSE is not positive after rounding to 2 decimals (this would
        otherwise cause a division by zero or inf/NaN weights), or if every
        weight rounds to zero.
    """
    if len(models) != len(mse):
        raise ValueError(
            f"models and mse must have the same length, got {len(models)} and {len(mse)}"
        )
    if len(models) == 0:
        raise ValueError("at least one model is required")

    rounded_mse = [round(float(m), 2) for m in mse]
    # `not (m > 0)` also rejects NaN, which a plain `m <= 0` would let through
    if any(not (m > 0) for m in rounded_mse):
        raise ValueError(
            f"every MSE must be positive after rounding to 2 decimals, got {rounded_mse}"
        )

    # Weight of a model = 1 / mse, rounded to 2 decimals
    raw_weights = [round(1 / m, 2) for m in rounded_mse]
    # Computed once rather than once per element
    total = sum(raw_weights)
    if not (total > 0):
        raise ValueError("all weights rounded to zero; cannot normalise")
    weights = [w / total for w in raw_weights]

    # Accumulate the weighted predictions model by model
    ensemble = None
    for model, weight in zip(models, weights):
        contribution = model.predict(X_test) * weight
        ensemble = contribution if ensemble is None else ensemble + contribution
    return ensemble

In [55]:
y_pred = ensemble_predict(models, model_mse_array, X_train)

In [56]:
# Inverse Transform Target
y_pred_orig = y_scaler.inverse_transform(y_pred)
y_train_orig = y_scaler.inverse_transform(y_train)

In [57]:
# Calculate MSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_train_orig, y_pred_orig)

In [58]:
print(f"Train MSE: {mse}")

Train MSE: 0.68623344740665


#### **TESTING**

In [59]:
# Test Model
y_pred = ensemble_predict(models, model_mse_array, X_test)

In [60]:
# Inverse Transform Target
y_pred_orig = y_scaler.inverse_transform(y_pred)
y_test_orig = y_scaler.inverse_transform(y_test)

In [61]:
# Calculate MSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test_orig, y_pred_orig)

In [62]:
print(f"Test MSE: {mse}")

Test MSE: 0.7040430580228944


#### **VISUALISING RESULTS** with respect to first two features

In [78]:
# Interactive 3-D view of the first 20 test points (actual vs. predicted target)
# together with the ensemble's prediction surface over the first two features.
# X_test holds the scaled features, so both horizontal axes are in scaled units.
GRID_POINTS = 10  # grid resolution per axis for the prediction surface
axis_title_font = dict(size=18, color='black')

fig = px.scatter_3d()
# Actual targets, in original units
fig.add_trace(go.Scatter3d(x=X_test[:20,0].flatten(), y=X_test[:20,1].flatten(), z=y_test_orig[:20].flatten(), mode='markers', marker=dict(size=8, color='red'), name='Y Test'))
# Ensemble predictions for the same points
fig.add_trace(go.Scatter3d(x=X_test[:20,0].flatten(), y=X_test[:20,1].flatten(), z=y_pred_orig[:20].flatten(), mode='markers', marker=dict(size=8, color='black'), name='Y Pred'))

# Prediction surface over a regular grid spanning the range of the first two features
x1_axis = np.linspace(X_test[:,0].min(), X_test[:,0].max(), GRID_POINTS)
x2_axis = np.linspace(X_test[:,1].min(), X_test[:,1].max(), GRID_POINTS)
x1_grid, x2_grid = np.meshgrid(x1_axis, x2_axis)
# The third feature is pinned to 0 for every grid point, so the surface is one
# slice of the model; the plotted predictions use each point's real third
# feature and therefore need not lie on it.
# Dedicated names (X_grid / y_grid) keep the notebook-level X and y intact.
X_grid = np.column_stack([x1_grid.ravel(), x2_grid.ravel(), np.zeros(x1_grid.size)])
y_grid = y_scaler.inverse_transform(ensemble_predict(models, model_mse_array, X_grid))
fig.add_trace(go.Surface(x=x1_grid, y=x2_grid, z=y_grid.reshape(x1_grid.shape), colorscale='Viridis', opacity=0.5, showscale=False, name='Regression Plane'))

# Axis labels (larger black font), title and figure size in one layout update
fig.update_layout(
    scene=dict(
        xaxis=dict(title=dict(text='X1', font=axis_title_font)),
        yaxis=dict(title=dict(text='X2', font=axis_title_font)),
        zaxis=dict(title=dict(text='Y', font=axis_title_font)),
    ),
    title='Regression Dataset',
    width=600,
    height=600,
)
fig.show()

The following improvements can be made to the project:

1. **Feature Engineering**: We can create new features from existing features to improve the model. This also includes adding polynomial features.
2. **Hyperparameter Tuning**: We can tune the hyperparameters of the model to improve the model.
3. **Outlier Detection and Removal**: We can remove outliers from the dataset to improve the model.
4. **Exponential Learning Rate Decay**: We can use exponential learning rate decay to improve the model.
5. **Adding Regularisation**: We can add regularisation to the model to improve the model.

### **CONCLUSION**

We have successfully implemented a Linear Regression Model from scratch and tested it on a real dataset. We also created an ensemble model from the individual models and tested it on the dataset.