In [25]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [26]:
def function(*, input: np.ndarray, theta: np.ndarray) -> np.ndarray:
    # Create zeros matrix 
    y = np.zeros_like(input, dtype= float)
    # Multiply matrix X with theta matrix
    y = np.sum(np.dot(input, theta), axis= 1)
    return y

def normalized(vector: np.ndarray) -> np.ndarray:
    mean = np.mean(vector)  # Mean of input vector
    standard_deviation = np.std(vector) # std of input vector
    normalized_vector = (vector - mean) / standard_deviation # linear transform vector by using gauss
    return normalized_vector

def convert_data(*, input: np.ndarray, order: int) -> np.ndarray:
    """
        Convert vector x into matrix x with multiple order
    """
    ones = np.ones((len(input), order+1))
    for i in range(1, order + 1):
        ones[:, i] = input**i
    return ones

def true_value(*, x_normalized: np.ndarray, y_normalized: np.ndarray) -> np.ndarray:
    """
        Compute true value of theta 
        theta =   (X.T*X)^-1 * (X.T * Y)
    """
    inv_XT_X = np.linalg.inv(np.dot(x_normalized.T, x_normalized))
    XT_Y = np.dot(x_normalized.T, y_normalized)
    theta_true = np.dot(inv_XT_X, XT_Y)
    return theta_true

def inverse_normalized(*, theta_normalized: np.ndarray, mean_x: float, 
                       std_x: float, mean_y: float, std_y:float) -> np.ndarray:
    # Giải chuẩn hóa các hệ số theta
    theta = np.zeros_like(theta_normalized)
    theta[1:] = theta_normalized[1:] * (std_y / std_x)
    theta[0] = mean_y -  theta_normalized[0] * std_y
    return theta

In [27]:
class Linear_Regression_Multivariables:
    def __init__(self,) -> None:
        pass

    def predict(self, *, theta: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        y_pred = np.matmul(normalized_input, theta)
        return y_pred
    
    def compute_loss_function(self, *, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        m = len(y_true)
        E = y_pred - y_true
        J = np.sum((E)**2)/ (2*m)
        return J
    
    def update_params(self, *, theta: np.ndarray, lr: float, y_pred: np.ndarray, 
                      y_true: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        m = len(y_true)
        E = y_pred - y_true
        dJ_dtheta = np.dot(normalized_input.T, E) / (m)
        theta_updated = theta - lr*dJ_dtheta
        return theta_updated
    
    def train(self, *, epoch: int, theta: np.ndarray, normalized_input: 
              np.ndarray, y_true: np.ndarray, lr: float) -> np.ndarray:
        m = len(y_true)
        J_array = []
        for i in range(epoch):
            y_pred = self.predict(theta= theta, 
                                  normalized_input= normalized_input)
            J = self.compute_loss_function(y_true= y_true, 
                                           y_pred= y_pred)
            theta = self.update_params(theta= theta, lr= lr, y_pred= y_pred, 
                                       y_true= y_true, normalized_input= normalized_input)
            J_array.append(J)

        return J_array, theta



### 1. Normalize data

In [28]:
# Read csv file ex2.csv
pd_ex2 = pd.read_csv('ex2.csv')

# Get collumns of file 
X_cols = pd_ex2.columns[:-1]
Y_col = pd_ex2.columns[-1]

In [29]:
# Get vector input and output
X = pd_ex2[X_cols].values
Y = pd_ex2[Y_col].values

In [30]:
# Normalize 
X = np.apply_along_axis(normalized, 0, X)
Y = np.apply_along_axis(normalized, 0, Y)

In [31]:
### 2. Training 


**Raw**

In [32]:
order = 2

In [33]:
df = pd.read_csv('ex3.csv')

x_cols = df.columns[:len(df.columns) - 1]
y_col = df.columns[-1]

In [34]:
x_value = df[x_cols].to_numpy().reshape(-1, )
y_value = df[y_col].to_numpy().reshape(-1, )

# print_all(x_value, y_value )

In [35]:
x_value = convert_data(input= x_value, order= order)
y_value = y_value.reshape(-1, 1)

In [36]:
lrm_order = Linear_Regression_Multivariables()
theta_init = np.random.randn(order + 1, 1)

In [37]:
theta_train = 0
for i in range(10000):
    pred = lrm_order.predict(theta= theta_init, normalized_input= x_value)
    cost = lrm_order.compute_loss_function(y_pred= pred, y_true= y_value)
    theta_init = lrm_order.update_params(theta= theta_init, lr= 0.0000000001, y_pred= pred,
                                y_true= y_value, normalized_input= x_value)
    theta_train = theta_init
    # print(theta_init)
    print(cost)

419399267.10943806
417722147.78065073
416051744.0131396
414388028.9162941
412730975.7071792
411080557.7101049
409436748.3561967
407799521.18296784
406168849.8338931
404544708.0579848
402927069.7093703
401315908.7468706
399711199.23358124
398112915.33645546
396521031.32588756
394935521.57529885
393356360.56072515
391783522.8604062
390216983.1543761
388656716.2240554
387102696.95184594
385554900.3207259
384013301.41384727
382477875.4141344
380948597.60388505
379425443.36437213
377908388.1754476
376397407.615147
374892477.35929775
373393573.1811259
371900670.9508674
370413746.63537914
368932776.2977519
367457736.09692526
365988602.28730404
364525351.21837515
363067959.3343281
361616403.1736747
360170659.3688718
358730704.6459455
357296515.8241159
355868069.8154238
354445343.62435997
353028314.3474938
351616959.1731056
350211255.38081867
348811180.34123355
347416711.5155645
346027826.4552758
344644502.8017206
343266718.2857814
341894450.7275109
340527678.0357755
339166378.2078992
337810529

In [38]:
theta_train

array([[-0.33474988],
       [ 1.4465607 ],
       [ 6.57851746]])

In [39]:
y_range = function(input= x_value, theta= theta_train)

# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values from the 'x' column
        y=y_value.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Function**

In [40]:
lrm_order = Linear_Regression_Multivariables()

J_array, theta_train_2 = lrm_order.train(
    epoch= 10000,
    theta= theta_init,
    normalized_input= x_value,
    y_true= y_value, 
    lr= 0.0000000001
)

In [41]:
theta_train_2

array([[-0.33471377],
       [ 1.44670996],
       [ 6.5785156 ]])

In [42]:
y_range = function(input= x_value, theta= theta_train_2)

# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values from the 'x' column
        y=y_value.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_value[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Normalize**

In [43]:
order = 3
theta_init = np.random.randn(order + 1, 1)

x_value = df[x_cols].to_numpy().reshape(-1, )
y_value = df[y_col].to_numpy().reshape(-1, )

# print_all(x_value, y_value )
x_normalized = convert_data(input= x_value, order= order)
y_normalized = y_value.reshape(-1, 1)

In [44]:
x_normalized[:, 1:] = np.apply_along_axis(func1d= normalized, arr= x_normalized[:, 1:], axis= 0)
y_normalized = np.apply_along_axis(func1d= normalized, arr= y_normalized.reshape(-1, 1), axis= 0)

In [45]:
J_array, theta_train_3 = lrm_order.train(
    epoch= 1000000,
    theta= theta_init,
    normalized_input= x_normalized,
    y_true= y_normalized, 
    lr= 0.001
)
theta_train_3

array([[1.38785393e-16],
       [1.81011722e-01],
       [5.25141254e-01],
       [3.02039232e-01]])

In [46]:
y_range = function(input= x_normalized, theta= theta_train_3)
y_range.shape

(84,)

In [47]:
# Create a new Plotly figure to visualize the data points and the regression line
fig = go.Figure()

# Add a scatter plot for the original data points from the CSV file
fig.add_trace(
    go.Scatter(
        x=x_normalized[:, 1],  # X values from the 'x' column
        y=y_normalized.reshape(-1,),  # Y values from the 'y' column
        mode='markers',  # Plot style as markers (points)
        marker=dict(symbol='x'),  # Marker style
        name='Data Points'  # Name for the legend
    )
)

fig.add_trace(
    go.Scatter(
        x=x_normalized[:, 1],  # X values for the regression line
        y=y_range.reshape(-1,),  # Y values for the regression line
        mode='lines',  # Plot style as a line
        name='Regression Line'  # Name for the legend
    )
)

**Unnormalize**