**IMPORTING NECESSARY LIBRARIES**

In [34]:
import random
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

**CREATING DUMMY TRAINING DATA**

The dummy data contains one feature column (age of a person) and a target column (1 if they have insurance or 0 if they do not have insurance)

In [35]:
# Set the seed to ensure reproducibility
random.seed(42)

# Generate 50 random integers between 20 and 90 for training
train_age = [random.randint(20, 90) for _ in range(50)]
print(len(train_age))

# Creating target features
train_insurance = [1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 
             0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 
             1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 
             1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 
             0, 1, 1, 1, 0, 1, 1, 1, 1, 1]
print(len(train_insurance))

50
50


In [36]:
data = {
    'Age' : train_age,
    'Insurance' : train_insurance
}
columns = list(data.keys())
print(columns)

# converting to dataframe
train = pd.DataFrame(data, columns=columns)
print(train.shape)
train.head()

['Age', 'Insurance']
(50, 2)


Unnamed: 0,Age,Insurance
0,34,1
1,23,0
2,55,1
3,51,1
4,48,1


In [37]:
# converting to numpy array
training_data = train.to_numpy()
print(training_data.shape)
training_data[:5,:] # printing 5 rows

(50, 2)


array([[34,  1],
       [23,  0],
       [55,  1],
       [51,  1],
       [48,  1]])

**CREATING DUMMY TESTING DATA**

In [38]:
# testing data
x_test = np.array([49, 57, 90, 49, 32, 68, 19, 78, 66, 40])
y_test = np.array([1, 1, 0, 1, 1, 1, 0, 0, 1, 1])

**LINEAR REGRESSION IMPLEMENTATION**

In [39]:
# Preparing data for training linear regression model
x_train = training_data[:, 0].reshape(-1, 1)
print(x_train.shape, x_train[:5])

y_train = training_data[:, 1]
print(y_train.shape, y_train[:5])

(50, 1) [[34]
 [23]
 [55]
 [51]
 [48]]
(50,) [1 0 1 1 1]


In [40]:
# This function is used to find the best fit line using the training data
def fit(x_train, y_train):
    num = (x_train * y_train).mean() - x_train.mean() * y_train.mean()
    den = (x_train ** 2).mean() - x_train.mean() ** 2
    m = num / den
    c = y_train.mean() - m * x_train.mean()
    return m, c    


# This function predicts the value of 'y' corresponding to each 'x'
def predict(x, m, c):
    return m * x + c


# This function returns the score using the Y(actual) and Y(predited), coefficient of determination.
def score(y_truth, y_pred): 
    u = ((y_truth - y_pred)**2).sum()
    v = ((y_truth - y_truth.mean())**2).sum()
    return 1 - u/v


# This functions returns the cost
def cost (x, y, m , c):
    return ((y - m * x - c)**2).mean()

In [41]:
# training (finding value for m and c)
m, c = fit(x_train, y_train)

# Test data
y_pred_linear_regression = predict(x_test, m, c)
print("Test Score: ",score(x_test, y_pred_linear_regression))

# Train data
y_train_pred = predict(x_train, m, c)
print("Train Score:", score(y_train, y_train_pred))
print("M:", m)
print("C:", c)
print("Cost on training data:", cost(x_train,y_train, m, c ))


Test Score:  -7.0194023407521104
Train Score: -49.0
M: 0.0
C: 0.7
Cost on training data: 0.21


**LINEAR REGRESSION IMPLEMENTATION USING SCIKIT-LEARN LIBRARY**

In [42]:
# Create an instance of the LinearRegression model
reg = linear_model.LinearRegression()

# training model
reg.fit(x_train, y_train)

In [43]:
# Predictions
y_predict_reg = reg.predict(x_test.reshape(-1, 1))

predictions_reg = []
for i in y_predict_reg:
    predictions_reg.append(round(i))
print(predictions_reg)

[1, 1, 0, 1, 1, 1, 1, 0, 1, 1]


In [44]:
mse_self_implement = mean_squared_error(y_test, y_predict_reg)
 
print("MSE", mse_self_implement)
 
weights = reg.coef_           # Single weight in case of 1D or an array of weights corresponding to each feature 
intercept = reg.intercept_
print("Coeffecient for x is:", weights[0])
print("Intercept value is:", intercept)

MSE 0.2021143981417004
Coeffecient for x is: -0.00815668430308302
Intercept value is: 1.1117494236196308


**IMPLEMENTATION OF GRADIENT DESCENT ALGORITHM FOR SINGLE FEATURE**

In [45]:
# This function finds the new cost after each optimisation.
def cost(points, m, c):
    total_cost = 0
    M = len(points)
    for i in range(M):
        x = points[i, 0]
        y = points[i, 1]
        total_cost += (1/M)*((y - m*x - c)**2)
    return total_cost


# This function finds the new gradient at each step
def step_gradient(points, learning_rate, m , c):
    m_slope = 0
    c_slope = 0
    M = len(points)
    for i in range(M):
        x = points[i, 0]
        y = points[i, 1]
        m_slope += (-2/M)* (y - m * x - c)*x
        c_slope += (-2/M)* (y - m * x - c)
    new_m = m - learning_rate * m_slope
    new_c = c - learning_rate * c_slope
    return new_m, new_c


# The Gradient Descent Function
def gd(points, learning_rate, num_iterations):
    m = 0       # Intial random value taken as 0
    c = 0       # Intial random value taken as 0
    for i in range(num_iterations):
        m, c = step_gradient(points, learning_rate, m , c)
        print("{} \tm : {}, \tc : {}, \tcost : {}".format(i, m, c, cost(points, m, c)))
        # print(i, " Cost: ", cost(points, m, c))
    return m, c


# Training the linear regression model
def run():
    learning_rate = 0.0001
    num_iterations = 100
    m, c = gd(training_data, learning_rate, num_iterations)
    print("\nFinal m :", m)
    print("Final c :", c)
    return m,c


# Prediction
def predict(final_m, final_c, testing_data):
    y_pred = []
    for i in range(len(testing_data)):
        ans = final_m*testing_data[i] + final_c
        y_pred.append(round(ans))
    return y_pred

In [46]:
# training
m, c = run()

0 	m : 0.006380000000000001, 	c : 0.00014000000000000007, 	cost : 0.4137210987839998
1 	m : 0.00896953008, 	c : 0.00021555952000000008, 	cost : 0.3665394471107369
2 	m : 0.0100203857546944, 	c : 0.00026496003240832007, 	cost : 0.3587564998545511
3 	m : 0.010446643700037221, 	c : 0.00030374122582244376, 	cost : 0.3574657142889592
4 	m : 0.010619357221744013, 	c : 0.00033821116278170347, 	cost : 0.3572447146348444
5 	m : 0.010689148865279687, 	c : 0.00037093049003841957, 	cost : 0.3571999959337829
6 	m : 0.010717161196558115, 	c : 0.00040293865699654813, 	cost : 0.3571843254701956
7 	m : 0.010728213961886, 	c : 0.0004346576098246981, 	cost : 0.357173442077592
8 	m : 0.01073238229954896, 	c : 0.0004662586301435321, 	cost : 0.35716334797463345
9 	m : 0.010733756034246103, 	c : 0.000497811246721257, 	cost : 0.3571533844025508
10 	m : 0.010733995358184582, 	c : 0.0005293436835501641, 	cost : 0.35714344281033317
11 	m : 0.010733774197111069, 	c : 0.0005608673976772226, 	cost : 0.3571335053108

In [47]:
# predictions
y_prediction_gd = predict(m, c, x_test)
y_prediction_gd

[1, 1, 1, 1, 0, 1, 0, 1, 1, 0]

**USING INBUILT GRADIENT BOOSTER**

In [48]:
# Initialize the Gradient Boosting Regressor model
model = GradientBoostingRegressor()

# Fit the model to the training data
model.fit(x_train,y_train)

In [49]:
# predictions
y_pred_gradient_boosting_reg = model.predict(x_test.reshape(-1, 1))

prediction_gradient_boosting_reg = []
for i in y_pred_gradient_boosting_reg:
    prediction_gradient_boosting_reg.append(round(i))

print(prediction_gradient_boosting_reg)

[1, 1, 0, 1, 1, 1, 0, 0, 1, 1]
