

The hypothesis for logistic regression utilizes the sigmoid function:

\begin{align*}
h_\theta(x) = g(a) = \frac{1}{1 + e^{-a}} \quad \text{where} \quad a = \theta^T x
\end{align*}


For logistic regression, the cost function (also known as the loss function) for a single training example $ (x^{(i)}, y^{(i)}) $ is defined as:
 
\begin{align*}
Cost\left(h_\theta(x^{(i)}), y^{(i)}\right) = 
\begin{cases}
- \log\left(h_\theta(x^{(i)})\right) & \text{if } y^{(i)} = 1, \\
- \log\left(1 - h_\theta(x^{(i)})\right) & \text{if } y^{(i)} = 0 \\
\end{cases}
\end{align*}

This can be compactly written as:

\begin{align*}
 Cost\left(h_\theta(x^{(i)}), y^{(i)}\right) = -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right)
\end{align*}

And the overall cost function is the average cost over all training examples:

\begin{align*}
J(\theta) = \frac{1}{m} \sum_{i=1}^{m} Cost\left(h_\theta(x^{(i)}), y^{(i)}\right).
\end{align*}

Substituting the compact form of the cost function:

$$
J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log\left(h_\theta(x^{(i)})\right) + (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right) \right].
$$

## Gradient of the Cost Function

To minimize $J(\theta)$, we compute its gradient with respect to the parameters $\theta$. The derivative of $J$ with respect to $\theta_j$ is:

$$
\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}.
$$

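In vector form, the gradient can be written as:

$$
\nabla_\theta J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)} = \frac{1}{m} X^T \left( h_\theta(X) - y \right),
$$

where $X$ is the $m \times (n+1)$ design matrix whose $i$-th row is $(x^{(i)})^T$, and $h_\theta(X)$ and $y$ are the column vectors of predictions and labels.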

## Gradient Descent Update Rule

Using gradient descent to update the parameters $\theta$, the update rule for each parameter $\theta_j$ is:

$$
\theta_j := \theta_j - \alpha \frac{\partial J(\theta)}{\partial \theta_j},
$$

where:

- $\alpha$ is the learning rate.
- $\frac{\partial J(\theta)}{\partial \theta_j}$ is the partial derivative of the cost function with respect to $\theta_j$.

In vectorized form, the update rule for all parameters simultaneously is:

$$
\theta := \theta - \alpha \nabla_\theta J(\theta).
$$

## Summary of Complete Formulas

Putting it all together, here are the complete formulas for logistic regression:

\begin{align*}
h_\theta(x) &= \frac{1}{1 + e^{-\theta^T x}} \\
J(\theta) &= -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log\left(h_\theta(x^{(i)})\right) + (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right) \right] \\
\nabla_\theta J(\theta) &= \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)} \\
\theta &:= \theta - \alpha \nabla_\theta J(\theta)
\end{align*}

## Additional Notes

- **Initialization**: Initialize the parameters $\theta$ (e.g., to zeros or small random values) before starting gradient descent.
- **Convergence**: Choose an appropriate learning rate $\alpha$ to ensure convergence. If $\alpha$ is too large, gradient descent may overshoot the minimum; if it is too small, convergence may be very slow.
- **Vectorization**: Implementing these operations in vectorized form (using matrices and vectors) can significantly speed up computations, especially for large datasets.


In [1]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd

In [2]:
# Load the data file (read without a header row) and convert it to a NumPy array.
ex5_data = pd.read_csv('ex5data.txt', header= None).to_numpy()

In [3]:
# Display the (rows, columns) shape of the loaded array.
ex5_data.shape

(118, 3)

In [4]:
# Split the array column-wise: the first two columns are the inputs,
# the third column is the label compared against 1 when colouring the points.
x1 = ex5_data[:, 0]
x2 = ex5_data[:, 1]
y = ex5_data[:, 2]

In [5]:
# Per-sample marker colour: 'red' where y == 1, 'blue' everywhere else.
colors = np.where(y == 1, 'red', 'blue')

In [6]:
# Scatter plot of the raw samples (x1 against x2), coloured by class label.
fig = go.Figure(
    data=[
        go.Scatter(
            x=x1,
            y=x2,
            mode='markers',
            marker={'color': colors, 'size': 10},
            name='Data Points',
        )
    ]
)

fig.update_xaxes(title='x1')
# Last expression returns the figure, so the notebook renders it.
fig.update_yaxes(title='x2')


In [7]:
# Polynomial feature mapping: one column per monomial x1^(order - i) * x2^i
# for order = 1..degree. The constant term is not generated here; the bias
# column is added later by the model's add_ones_columns().
degree = 6
num_samples = len(x1)
# Each order contributes (order + 1) monomials -> 27 columns for degree 6.
num_terms = sum(order + 1 for order in range(1, degree + 1))

n = 0
X_mapFeature = np.ones((num_samples, num_terms))

for order in range(1, degree + 1):
    for i in range(0, order + 1):
        x12 = x1**(order - i) * x2**(i)
        X_mapFeature[:, n] = x12
        n = n + 1

# Every column must have been filled exactly once.
assert n == num_terms

In [8]:
class Logistic_Regression_Multivariables:
    """Binary logistic regression trained with batch gradient descent.

    Inputs are standardized column-wise, a leading column of ones is added
    for the bias term, and the parameters are updated with the gradient of
    the mean cross-entropy loss.
    """

    def __init__(self, *, number_of_feature: int) -> None:
        """Store the number of input feature columns (bias column excluded)."""
        self.number_of_features = number_of_feature

    def normalize_vector(self, vector: np.ndarray) -> np.ndarray:
        """Standardize one vector to zero mean and unit standard deviation.

        A constant vector (std == 0) is only centred, avoiding a division by zero.
        """
        mean = np.mean(vector)
        std = np.std(vector)
        if std == 0:
            return vector - mean
        return (vector - mean) / std

    def normalize_input(self, *, X: np.ndarray) -> np.ndarray:
        """Standardize every column of ``X``.

        Args:
            X: array of samples; a 2-D input must have ``number_of_features`` columns.

        Returns:
            Array reshaped to ``(-1, number_of_features)`` whose columns have
            zero mean and unit standard deviation (constant columns become zeros).

        Raises:
            ValueError: if a 2-D ``X`` has a different number of columns than
                ``number_of_features`` (previously this could be silently
                reshaped into a scrambled matrix).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 2 and X.shape[1] != self.number_of_features:
            raise ValueError(
                f"X has {X.shape[1]} columns, expected {self.number_of_features}"
            )
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        # Dividing by 1 for constant columns reproduces the "centre only" rule
        # of normalize_vector.
        safe_std = np.where(std == 0, 1.0, std)
        norm_X = ((X - mean) / safe_std).reshape(-1, self.number_of_features)
        return norm_X

    def add_ones_columns(self, *, normalized_input: np.ndarray) -> np.ndarray:
        """Prepend a column of ones (bias term) to ``normalized_input``."""
        ones = np.ones(len(normalized_input)).reshape(-1, 1)
        x_add = np.hstack((ones, normalized_input))
        return x_add

    def predict(self, *, theta: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        """Return sigmoid(normalized_input @ theta).

        ``normalized_input`` is expected to already contain the bias column.
        """
        z = np.matmul(normalized_input, theta)
        # sigmoid(z) = exp(-log(1 + exp(-z))). logaddexp evaluates the log term
        # without overflowing for large |z|, unlike 1 / (1 + exp(-z)).
        y_pred = np.exp(-np.logaddexp(0.0, -z))
        return y_pred

    def compute_loss(self, *, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Mean binary cross-entropy between ``y_true`` and ``y_pred``.

        Raises:
            ValueError: if ``y_true`` cannot be reshaped to the shape of ``y_pred``.
        """
        m = len(y_true)
        # Align shapes explicitly: a 1-D y_true against an (m, 1) y_pred would
        # otherwise broadcast to (m, m) and inflate the loss.
        y_true = np.reshape(y_true, np.shape(y_pred))
        epsilon = 1e-15
        # Keep predictions away from 0 and 1 so the logarithms stay finite.
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        J = np.sum(- y_true*np.log(y_pred) - (1 - y_true)*np.log(1 - y_pred)) / (m)
        return J

    def update_params(self, *, theta: np.ndarray, lr: float, y_pred: np.ndarray,
                      y_true: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        """Perform one gradient-descent step and return the new parameters.

        The gradient is ``X^T (y_pred - y_true) / m``; ``theta`` is not modified
        in place.

        Raises:
            ValueError: if ``y_true`` cannot be reshaped to the shape of ``y_pred``.
        """
        m = len(y_true)
        # Same shape alignment as in compute_loss to prevent (m, m) broadcasting.
        E = y_pred - np.reshape(y_true, np.shape(y_pred))
        dJ_dtheta = np.dot(normalized_input.T, E) / (m)
        theta_updated = theta - lr*dJ_dtheta
        return theta_updated

    def train(self, *, epochs: int, theta: np.ndarray, input: np.ndarray,
              output: np.ndarray, lr: float, plot_graph: bool = False,
              color: list = None, time_delay: float = 0.0) -> tuple:
        """Run batch gradient descent for ``epochs`` iterations.

        Args:
            epochs: number of gradient-descent steps.
            theta: initial parameters, including the bias entry.
            input: raw (un-normalized) feature matrix.
            output: labels; reshaped internally to a column vector.
            lr: learning rate.
            plot_graph, color, time_delay: accepted for backward compatibility;
                not used by this method.

        Returns:
            ``(J_array, theta)``: the loss recorded before each update, with
            shape ``(epochs,)``, and the final parameters.
        """
        output = np.asarray(output).reshape(-1, 1)
        normalized_input = self.normalize_input(X= input)
        normalized_input_with_ones = self.add_ones_columns(normalized_input= normalized_input)

        # Preallocate the loss history instead of re-allocating it every epoch.
        J_array = np.zeros(epochs)

        for epoch in range(epochs):
            y_pred = self.predict(theta= theta, normalized_input= normalized_input_with_ones)

            J_array[epoch] = self.compute_loss(y_true= output, y_pred= y_pred)

            theta = self.update_params(theta= theta, lr= lr, y_pred= y_pred,
                                       y_true= output, normalized_input= normalized_input_with_ones)

        return J_array, theta

In [9]:
# Number of mapped feature columns (the bias column is not counted).
num_feature = X_mapFeature.shape[1]
Logistic_Regression = Logistic_Regression_Multivariables(number_of_feature= num_feature)

# Standardized features and the same matrix with a leading column of ones;
# later cells read normalized_input_with_ones to build the plotting grid.
normalized_input = Logistic_Regression.normalize_input(X= X_mapFeature)
normalized_input_with_ones = Logistic_Regression.add_ones_columns(normalized_input= normalized_input)


In [10]:
# NOTE(review): the seed below is disabled, so theta_init (and every result
# derived from it) differs between runs - confirm whether that is intended.
# np.random.seed(1)

# Random initial parameters as a column vector: one entry per feature plus the bias.
theta_init = np.random.rand(num_feature + 1, 1)
theta_init

array([[0.82373771],
       [0.42635565],
       [0.2641027 ],
       [0.11039775],
       [0.03286226],
       [0.42760521],
       [0.77312301],
       [0.82532883],
       [0.55130219],
       [0.73300235],
       [0.5132352 ],
       [0.8909225 ],
       [0.98637977],
       [0.14401891],
       [0.85792826],
       [0.0055092 ],
       [0.51188692],
       [0.82927031],
       [0.96850516],
       [0.51108968],
       [0.03707858],
       [0.69875215],
       [0.2447933 ],
       [0.79085206],
       [0.43660233],
       [0.12444577],
       [0.0361907 ],
       [0.37835309]])

In [11]:
# Train once per learning rate, always starting from the same theta_init,
# and compare the resulting loss curves.
learning_rate_ = [0.01, 0.001, 0.003, 0.3, 0.04, 0.1]
n_epochs = 1000

# Row 0 of J and column 0 of theta_ are all-zero placeholders. They are kept
# because later cells index theta_ with that one-position offset.
J_rows = [np.zeros((n_epochs, ))]
theta_cols = [np.zeros_like(theta_init)]
for lr in learning_rate_:
    # theta_arr / J_arr keep the values of the LAST learning rate after the loop;
    # theta_arr is read again when the decision surface is evaluated.
    J_arr, theta_arr = Logistic_Regression.train(epochs= n_epochs, theta= theta_init,
                                           input= X_mapFeature, output= y, lr= lr, plot_graph= False,
                                           color= colors, time_delay= 0.01)
    theta_cols.append(theta_arr)
    J_rows.append(J_arr)

# Stack once after the loop instead of re-allocating on every iteration.
theta_ = np.hstack(theta_cols)
J = np.vstack(J_rows)

fig = go.Figure()
# Skip the placeholder row so that each curve lines up with its learning rate.
for lr, J_curve in zip(learning_rate_, J[1:, :]):
    fig.add_trace(go.Scatter(x=np.arange(n_epochs), y=J_curve,
                            mode= 'lines', name= f'lr: {lr}'))
fig.update_xaxes(title= 'epochs')
fig.update_yaxes(title= 'J', tickangle= 0)
fig.show()

In [12]:
# Column 0 of theta_ is the all-zero placeholder created before the training
# loop, so column k holds the parameters trained with learning_rate_[k - 1];
# column 5 is therefore the run with learning rate 0.04.
# NOTE(review): the decision-surface cell further down reads theta_arr, not
# this vector - confirm which one was intended.
theta_visualized = theta_[:, 5]
theta_visualized

array([-0.41541377,  0.5528915 ,  1.05170244, -0.9681885 , -1.03526323,
       -1.03444017,  0.65194454,  0.3658805 ,  0.06043028,  0.57694847,
       -0.71228789,  0.27125688, -0.10936055, -0.42456112, -0.47211943,
       -0.45666575, -0.18105074,  0.62421825,  0.15740455, -0.14507925,
       -0.49528977, -0.4181033 , -0.18655733, -0.30449864,  0.1020953 ,
       -0.98821043, -0.35498956, -0.6648598 ])

In [13]:
# 100 evenly spaced values across the observed range of columns 1 and 2 of the
# bias-augmented, standardized feature matrix (column 0 is the ones column).
x_values, y_values = (
    np.linspace(normalized_input_with_ones[:, col].min(),
                normalized_input_with_ones[:, col].max(),
                100)
    for col in (1, 2)
)

# 100 x 100 coordinate matrices covering every (x, y) combination.
X, Y = np.meshgrid(x_values, y_values)


In [14]:
# Display the selected parameter vector again.
theta_visualized

array([-0.41541377,  0.5528915 ,  1.05170244, -0.9681885 , -1.03526323,
       -1.03444017,  0.65194454,  0.3658805 ,  0.06043028,  0.57694847,
       -0.71228789,  0.27125688, -0.10936055, -0.42456112, -0.47211943,
       -0.45666575, -0.18105074,  0.62421825,  0.15740455, -0.14507925,
       -0.49528977, -0.4181033 , -0.18655733, -0.30449864,  0.1020953 ,
       -0.98821043, -0.35498956, -0.6648598 ])

In [15]:
# Evaluate the trained decision function theta^T phi(x1, x2) on the grid.
#
# The model was trained on polynomial features that were standardized column
# by column, so each grid monomial must be standardized with the statistics of
# the matching training column before it is weighted by theta.
feature_mean = X_mapFeature.mean(axis=0)
feature_std = X_mapFeature.std(axis=0)
feature_std = np.where(feature_std == 0, 1.0, feature_std)  # constant columns are only centred

# X / Y span the standardized x1 / x2 columns; map them back to raw units so
# that the monomials are computed exactly as they were for the training data.
grid_x1 = X * x1.std() + x1.mean()
grid_x2 = Y * x2.std() + x2.mean()

# Parameters from the last training run; entry 0 is the bias.
theta_boundary = np.ravel(theta_arr)

# Start from the bias term alone (starting from ones would shift every value by +1).
X_mapFeature_data = np.full(X.shape, theta_boundary[0], dtype=float)

n = 0
for order in range(1, 7, 1):
    for i in range(0, order+1):
        x12 = grid_x1**(order - i) * grid_x2**(i)
        x12 = (x12 - feature_mean[n]) / feature_std[n]
        # theta_boundary[n + 1]: the +1 skips the bias entry.
        X_mapFeature_data = X_mapFeature_data + x12 * theta_boundary[n + 1]
        n = n + 1

In [16]:
# Display the shape of the evaluated grid (one value per grid point).
X_mapFeature_data.shape

(100, 100)

In [17]:
fig = go.Figure()

# Raw samples, coloured by class label.
fig.add_trace(go.Scatter(
    x=x1, 
    y=x2, 
    mode='markers', 
    marker=dict(color=colors, size=10),
    name='Data Points'
))

# Add a contour trace that draws only the zero level of the grid values,
# i.e. the decision boundary.
# x_values / y_values are in standardized units while the scatter above uses
# the raw x1 / x2 values, so map the grid axes back to raw units first.
fig.add_trace(go.Contour(
    z=X_mapFeature_data,
    x=x_values * x1.std() + x1.mean(),
    y=y_values * x2.std() + x2.mean(),
    contours=dict(
        start=0,
        end=0,  # start == end -> a single contour level at 0
        size=1,
        coloring='lines',
    ),
    line=dict(color='blue', width= 3),
    showscale=False,  # a colour bar carries no information for a single level
    name='Decision boundary',
))

fig.update_xaxes(title='x1')
# Last expression returns the figure, so the notebook renders it.
fig.update_yaxes(title='x2')