In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
def print_all(*objects):
    for i in objects:
        print(f'obj: {i}')

def function(*, x, theta):
    y = np.zeros_like(x, dtype= float)
    y = np.sum(np.dot(x, theta), axis= 1)
    return y

def normalized(vector: np.ndarray) -> np.ndarray:
    mean = np.mean(vector)
    standard_deviation = np.std(vector)
    normalized_vector = (vector - mean) / standard_deviation
    return normalized_vector

def convert_data(*, input, order):
    ones = np.ones((len(input), order+1))
    for i in range(1, order + 1):
        ones[:, i] = input**i
    return ones

def true_value(x_normalized, y_normalized):
    XT_X_inv = np.linalg.inv(np.dot(x_normalized.T, x_normalized))
    XT_Y = np.dot(x_normalized.T, y_normalized)
    theta_true = np.dot(XT_X_inv, XT_Y)
    return theta_true

def inverse_normalized(theta_normalized: np.ndarray, mean_x, std_x, mean_y, std_y) -> np.ndarray:
    # Giải chuẩn hóa các hệ số theta
    theta = np.zeros_like(theta_normalized)
    theta[1:] = theta_normalized[1:] * (std_y / std_x)
    theta[0] = mean_y -  theta_normalized[0] * std_y
    return theta

In [3]:
class Linear_Regression_Multivariables:
    def __init__(self,) -> None:
        pass

    def predict(self, *, theta, normalized_input):
        y_pred = np.matmul(normalized_input, theta)
        return y_pred
    
    def compute_loss_function(self, *, y_true, y_pred):
        m = len(y_true)
        E = y_pred - y_true
        J = np.sum((E)**2)/ (2*m)
        return J
    
    def update_params(self, *, theta, lr, y_pred, y_true, normalized_input):
        m = len(y_true)
        E = y_pred - y_true
        dJ_dtheta = np.dot(normalized_input.T, E) / (m)
        theta_updated = theta - lr*dJ_dtheta
        return theta_updated
    
    def train(self, *, epoch, theta, normalized_input, y_true, lr):
        m = len(y_true)
        J_array = []
        for i in range(epoch):
            y_pred = self.predict(theta= theta, 
                                  normalized_input= normalized_input)
            J = self.compute_loss_function(y_true= y_true, 
                                           y_pred= y_pred)
            theta = self.update_params(theta= theta, lr= lr, y_pred= y_pred, 
                                       y_true= y_true, normalized_input= normalized_input)
            J_array.append(J)

        return J_array, theta




**Raw**

In [4]:
order = 2

In [5]:
df = pd.read_csv('ex3.csv')

x_cols = df.columns[:len(df.columns) - 1]
y_col = df.columns[-1]

print_all(x_cols, y_col)

obj: Index(['x'], dtype='object')
obj: y


In [6]:
x_value = df[x_cols].to_numpy().reshape(-1, )
y_value = df[y_col].to_numpy().reshape(-1, )

# print_all(x_value, y_value )

In [7]:
x_value = convert_data(input= x_value, order= order)
y_value = y_value.reshape(-1, 1)

In [8]:
lrm_order = Linear_Regression_Multivariables()
theta_init = np.random.randn(order + 1, 1)

In [9]:
theta_train = 0
for i in range(10000):
    pred = lrm_order.predict(theta= theta_init, normalized_input= x_value)
    cost = lrm_order.compute_loss_function(y_pred= pred, y_true= y_value)
    theta_init = lrm_order.update_params(theta= theta_init, lr= 0.0000000001, y_pred= pred,
                                y_true= y_value, normalized_input= x_value)
    theta_train = theta_init
    # print(theta_init)
    print(cost)

548635779.377887
546441169.248495
544255346.829404
542078276.9326545
539909924.5111877
537750254.6582803
535599232.6069844
533456823.72956574
531322993.5369481
529197707.6781568
527080931.9397663
524972632.245349
522872774.65492755
520781325.3644269
518698250.70513237
516623517.1431451
514557091.2788442
512498939.84634817
510449029.71297956
508407327.8787315
506373801.4757366
504348417.7677377
502331144.14956105
500321948.1465913
498320797.4142486
496327659.7374682
494342503.0301817
492365295.3348002
490396004.8217005
488434599.78871226
486481048.6606076
484535319.98859286
482597382.44980276
480667204.8467952
478744756.1070503
476830005.28246915
474922921.5488757
473023474.20552135
471131632.6745897
469247366.5007052
467370645.3504419
465501439.0118363
463639717.3938998
461785450.5261354
459938608.55805427
458099161.7586956
456267080.51614845
454442335.337074
452624896.84623176
450814735.7860063
449011823.015936
447216129.51224416
445427626.3673722
443646284.78951347
441872076.10215014

In [10]:
theta_train

array([[-1.52763887],
       [ 0.51529732],
       [ 6.59033374]])

In [11]:
y_range = function(x= x_value, theta= theta_train)

# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values from the 'x' column
        y=y_value.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Function**

In [12]:
lrm_order = Linear_Regression_Multivariables()

J_array, theta_train_2 = lrm_order.train(
    epoch= 10000,
    theta= theta_init,
    normalized_input= x_value,
    y_true= y_value, 
    lr= 0.0000000001
)

In [13]:
theta_train_2

array([[-1.5275945 ],
       [ 0.51565002],
       [ 6.59032934]])

In [14]:
y_range = function(x= x_value, theta= theta_train_2)

# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values from the 'x' column
        y=y_value.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Normalize**

In [15]:
order = 3
theta_init = np.random.randn(order + 1, 1)

x_value = df[x_cols].to_numpy().reshape(-1, )
y_value = df[y_col].to_numpy().reshape(-1, )

# print_all(x_value, y_value )
x_normalized = convert_data(input= x_value, order= order)
y_normalized = y_value.reshape(-1, 1)

print_all(x_normalized, y_normalized)

obj: [[1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 1.20000000e+00 1.44000000e+00 1.72800000e+00]
 [1.00000000e+00 2.40000000e+00 5.76000000e+00 1.38240000e+01]
 [1.00000000e+00 3.60000000e+00 1.29600000e+01 4.66560000e+01]
 [1.00000000e+00 4.80000000e+00 2.30400000e+01 1.10592000e+02]
 [1.00000000e+00 6.00000000e+00 3.60000000e+01 2.16000000e+02]
 [1.00000000e+00 7.20000000e+00 5.18400000e+01 3.73248000e+02]
 [1.00000000e+00 8.40000000e+00 7.05600000e+01 5.92704000e+02]
 [1.00000000e+00 9.60000000e+00 9.21600000e+01 8.84736000e+02]
 [1.00000000e+00 1.08000000e+01 1.16640000e+02 1.25971200e+03]
 [1.00000000e+00 1.20000000e+01 1.44000000e+02 1.72800000e+03]
 [1.00000000e+00 1.32000000e+01 1.74240000e+02 2.29996800e+03]
 [1.00000000e+00 1.44000000e+01 2.07360000e+02 2.98598400e+03]
 [1.00000000e+00 1.56000000e+01 2.43360000e+02 3.79641600e+03]
 [1.00000000e+00 1.68000000e+01 2.82240000e+02 4.74163200e+03]
 [1.00000000e+00 1.80000000e+01 3.24000000e+02 5.8

In [16]:
x_normalized[:, 1:] = np.apply_along_axis(func1d= normalized, arr= x_normalized[:, 1:], axis= 0)
y_normalized = np.apply_along_axis(func1d= normalized, arr= y_normalized.reshape(-1, 1), axis= 0)

print_all(x_normalized, y_normalized)

obj: [[ 1.         -1.71155244 -1.11063688 -0.87734359]
 [ 1.         -1.67031022 -1.11015612 -0.87733753]
 [ 1.         -1.62906799 -1.10871384 -0.87729508]
 [ 1.         -1.58782576 -1.10631004 -0.87717985]
 [ 1.         -1.54658353 -1.10294471 -0.87695546]
 [ 1.         -1.50534131 -1.09861787 -0.87658553]
 [ 1.         -1.46409908 -1.0933295  -0.87603366]
 [ 1.         -1.42285685 -1.08707962 -0.87526347]
 [ 1.         -1.38161462 -1.07986821 -0.87423857]
 [ 1.         -1.3403724  -1.07169528 -0.87292258]
 [ 1.         -1.29913017 -1.06256083 -0.8712791 ]
 [ 1.         -1.25788794 -1.05246486 -0.86927176]
 [ 1.         -1.21664571 -1.04140737 -0.86686415]
 [ 1.         -1.17540348 -1.02938836 -0.86401991]
 [ 1.         -1.13416126 -1.01640782 -0.86070263]
 [ 1.         -1.09291903 -1.00246577 -0.85687594]
 [ 1.         -1.0516768  -0.98756219 -0.85250344]
 [ 1.         -1.01043457 -0.9716971  -0.84754876]
 [ 1.         -0.96919235 -0.95487048 -0.84197549]
 [ 1.         -0.92795012 

In [17]:
J_array, theta_train_3 = lrm_order.train(
    epoch= 1000000,
    theta= theta_init,
    normalized_input= x_normalized,
    y_true= y_normalized, 
    lr= 0.001
)
theta_train_3

array([[1.66528693e-16],
       [1.41887080e-01],
       [6.22944601e-01],
       [2.41351741e-01]])

In [18]:
y_range = function(x= x_normalized, theta= theta_train_3)
y_range.shape

(84,)

In [19]:
# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_normalized[:, 1],  # X values from the 'x' column
        y=y_normalized.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_normalized[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Unnormalize**