\documentclass{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{geometry}
\geometry{a4paper, margin=1in}

\title{Logistic Regression: Cost Function and Gradient Descent}
\author{}
\date{}

\begin{document}

\maketitle

\section{Hypothesis Function}

The hypothesis for logistic regression utilizes the sigmoid function:

\[
h_\theta(x) = \frac{1}{1 + e^{-a}} \quad \text{where} \quad a = \theta^T x
\]

\section{Cost Function}

For logistic regression, the cost function (also known as the loss function) for a single training example \((x^{(i)}, y^{(i)})\) is defined as:

\[
\text{cost}\left(h_\theta(x^{(i)}), y^{(i)}\right) = 
\begin{cases}
- \log\left(h_\theta(x^{(i)})\right) & \text{if } y^{(i)} = 1, \\
- \log\left(1 - h_\theta(x^{(i)})\right) & \text{if } y^{(i)} = 0.
\end{cases}
\]

This can be compactly written as:

\[
\text{cost}\left(h_\theta(x^{(i)}), y^{(i)}\right) = -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right).
\]

\section{Overall Cost Function \(J(\theta)\)}

The overall cost function is the average cost over all training examples:

\[
J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \text{cost}\left(h_\theta(x^{(i)}), y^{(i)}\right).
\]

Substituting the compact form of the cost function:

\[
J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log\left(h_\theta(x^{(i)})\right) + (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right) \right].
\]

\section{Gradient of the Cost Function}

To minimize \(J(\theta)\), we compute its gradient with respect to the parameters \(\theta\). The derivative of \(J\) with respect to \(\theta_j\) is:

\[
\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}.
\]

In vectorized form, the gradient can be written as:

\[
\nabla_\theta J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)}.
\]

\section{Gradient Descent Update Rule}

Using gradient descent to update the parameters \(\theta\), the update rule for each parameter \(\theta_j\) is:

\[
\theta_j := \theta_j - \alpha \frac{\partial J(\theta)}{\partial \theta_j},
\]

where:
\begin{itemize}
    \item \(\alpha\) is the learning rate.
    \item \(\frac{\partial J(\theta)}{\partial \theta_j}\) is the partial derivative of the cost function with respect to \(\theta_j\).
\end{itemize}

In vectorized form, the update rule for all parameters simultaneously is:

\[
\theta := \theta - \alpha \nabla_\theta J(\theta).
\]

\section{Summary of Complete Formulas}

Putting it all together, here are the complete and corrected formulas for logistic regression:

\begin{align*}
h_\theta(x) &= \frac{1}{1 + e^{-\theta^T x}} \\
J(\theta) &= -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log\left(h_\theta(x^{(i)})\right) + (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right) \right] \\
\nabla_\theta J(\theta) &= \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)} \\
\theta &:= \theta - \alpha \nabla_\theta J(\theta)
\end{align*}

\section{Additional Notes}

\begin{itemize}
    \item \textbf{Initialization}: Initialize the parameters \(\theta\) (e.g., to zeros or small random values) before starting gradient descent.
    \item \textbf{Convergence}: Choose an appropriate learning rate \(\alpha\) to ensure convergence. If \(\alpha\) is too large, gradient descent may overshoot the minimum; if it's too small, convergence may be very slow.
    \item \textbf{Vectorization}: Implementing these operations in a vectorized form (using matrices and vectors) can significantly speed up computations, especially for large datasets.
\end{itemize}

\end{document}


In [25]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd

In [26]:
ex5_data = pd.read_csv('ex5data.txt', header= None).to_numpy()

In [27]:
ex5_data.shape

(118, 3)

In [28]:
x1, x2, y = ex5_data[:, 0], ex5_data[:, 1], ex5_data[:, 2]  

In [29]:
colors = np.where(y == 1, 'red', 'blue')

In [30]:
fig = go.Figure()

# Plot data points
fig.add_trace(go.Scatter(
    x=x1, 
    y=x2, 
    mode='markers', 
    marker=dict(color=colors, size=10),
    name='Data Points'
))

fig.update_xaxes(title= 'x1')
fig.update_yaxes(title= 'x2')


In [31]:
n = 0
X_mapFeature = np.ones((118, 28))

for order in range(1, 7, 1):
    for i in range(0, order+1):
        x12 = x1**(order - i) * x2**(i)
        X_mapFeature[:, n] = x12
        n = n + 1

In [32]:
X_mapFeature[:5, :]

array([[ 5.12670000e-02,  6.99560000e-01,  2.62830529e-03,
         3.58643425e-02,  4.89384194e-01,  1.34745327e-04,
         1.83865725e-03,  2.50892595e-02,  3.42353606e-01,
         6.90798869e-06,  9.42624411e-05,  1.28625106e-03,
         1.75514423e-02,  2.39496889e-01,  3.54151856e-07,
         4.83255257e-06,  6.59422333e-05,  8.99809795e-04,
         1.22782870e-02,  1.67542444e-01,  1.81563032e-08,
         2.47750473e-07,  3.38066048e-06,  4.61305487e-05,
         6.29470940e-04,  8.58939846e-03,  1.17205992e-01,
         1.00000000e+00],
       [-9.27420000e-02,  6.84940000e-01,  8.60107856e-03,
        -6.35227055e-02,  4.69142804e-01, -7.97681228e-04,
         5.89122275e-03, -4.35092419e-02,  3.21334672e-01,
         7.39785525e-05, -5.46363780e-04,  4.03513411e-03,
        -2.98012201e-02,  2.20094970e-01, -6.86091891e-06,
         5.06708697e-05, -3.74226408e-04,  2.76382476e-03,
        -2.04120477e-02,  1.50751849e-01,  6.36295342e-07,
        -4.69931780e-06,  3.47

In [33]:
class Logistic_Regression_Multivariables:
    def __init__(self, *, number_of_feature: int) -> None:
        self.number_of_features = number_of_feature

    def normalize_vector(self, vector: np.ndarray) -> np.ndarray:
        mean = np.mean(vector)
        std = np.std(vector)
        if std == 0:
            return vector - mean
        return (vector - mean) / std
    
    def normalize_input(self, *, X: np.ndarray) -> tuple:
        norm_X = np.apply_along_axis(self.normalize_vector, arr=X, axis=0).reshape(-1, self.number_of_features)
        return norm_X

    def add_ones_columns(self, *, normalized_input: np.ndarray) -> np.ndarray:
        ones = np.ones(len(normalized_input)).reshape(-1, 1)
        x_add = np.hstack((ones, normalized_input))
        return x_add

    def predict(self, *, theta: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        y_pred = np.matmul(normalized_input, theta)
        y_pred = 1/(1 + np.exp(-y_pred))
        return y_pred
    
    def compute_loss(self, *, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        m = len(y_true)
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        J = np.sum(- y_true*np.log(y_pred) - (1 - y_true)*np.log(1 - y_pred)) / (m)
        return J
    
    def update_params(self, *, theta: np.ndarray, lr: float, y_pred: np.ndarray, 
                      y_true: np.ndarray, normalized_input: np.ndarray) -> np.ndarray:
        m = len(y_true)
        E = y_pred - y_true
        dJ_dtheta = np.dot(normalized_input.T, E) / (m)
        theta_updated = theta - lr*dJ_dtheta
        return theta_updated
    
    def train(self, *, epochs: int, theta: np.ndarray, input: np.ndarray, 
              output: np.ndarray, lr: float, plot_graph: False, color: list, time_delay: float) -> np.ndarray:
        output = output.reshape(-1, 1)
        normalized_input = self.normalize_input(X= input)
        normalized_input_with_ones = self.add_ones_columns(normalized_input= normalized_input)

        J_array = np.array([])
        
        for epoch in range(epochs):
            y_pred = self.predict(theta= theta, normalized_input= normalized_input_with_ones)

            J = self.compute_loss(y_true= output, y_pred= y_pred)
            J_array = np.append(arr= J_array, values= J)

            theta = self.update_params(theta= theta, lr= lr, y_pred= y_pred, 
                                       y_true= output, normalized_input= normalized_input_with_ones)

            
    
        return J_array, theta

In [34]:
num_feature = X_mapFeature.shape[1]
Logistic_Regression = Logistic_Regression_Multivariables(number_of_feature= num_feature)

normalized_input = Logistic_Regression.normalize_input(X= X_mapFeature)
normalized_input_with_ones = Logistic_Regression.add_ones_columns(normalized_input= normalized_input)


In [35]:
# np.random.seed(1)

theta_init = np.random.rand(num_feature + 1, 1)
theta_init

array([[0.88731166],
       [0.30285223],
       [0.01600103],
       [0.89896328],
       [0.58893361],
       [0.55514667],
       [0.96768268],
       [0.35596477],
       [0.69275576],
       [0.31835355],
       [0.43741072],
       [0.130016  ],
       [0.97675049],
       [0.50832254],
       [0.45744178],
       [0.42122006],
       [0.49595314],
       [0.40898115],
       [0.23652172],
       [0.71053889],
       [0.55334264],
       [0.25438248],
       [0.77371139],
       [0.7983268 ],
       [0.46883848],
       [0.29172161],
       [0.93874773],
       [0.678779  ],
       [0.19352278]])

In [36]:
learning_rate_ = [0.01, 0.001, 0.003, 0.3, 0.04, 0.1]
J = np.zeros((1000, ))
theta_ = np.zeros_like(theta_init)
for i in learning_rate_:
    
    J_arr, theta_arr = Logistic_Regression.train(epochs= 1000, theta= theta_init, 
                                           input= X_mapFeature, output= y, lr= i, plot_graph= False,
                                           color= colors, time_delay= 0.01)
    theta_ = np.hstack([theta_, theta_arr])
    J = np.vstack([J, J_arr])

fig = go.Figure()
for i in range(len(J[1:, :])):
    fig.add_trace(go.Scatter(x=np.arange(1000), y=J[(1+i), :],
                            mode= 'lines', name= f'lr: {learning_rate_[i]}'))
fig.update_xaxes(title= 'epochs')
fig.update_yaxes(title= 'J', tickangle= 0)
fig.show()

In [37]:
theta_visualized = theta_[:, 5]
theta_visualized

array([-0.41275321,  0.42752804,  1.11094852, -0.52835856, -0.71486532,
       -0.75868335,  0.81505593,  0.24773446,  0.03895017,  0.23457573,
       -0.95963302, -0.43137006, -0.10138158, -0.43642229, -0.86771553,
       -0.10247346, -0.02782992,  0.28498295, -0.24987298, -0.34596624,
       -0.06469703, -0.94463019,  0.36358311, -0.30553666,  0.20109533,
       -0.66042173,  0.0554791 , -0.46981254,  0.19352278])

In [38]:
x_values = np.linspace(normalized_input_with_ones[:, 1].min(), normalized_input_with_ones[:, 1].max(), 100)
y_values = np.linspace(normalized_input_with_ones[:, 2].min(), normalized_input_with_ones[:, 2].max(), 100)

X, Y = np.meshgrid(x_values, y_values)


In [39]:
theta_visualized

array([-0.41275321,  0.42752804,  1.11094852, -0.52835856, -0.71486532,
       -0.75868335,  0.81505593,  0.24773446,  0.03895017,  0.23457573,
       -0.95963302, -0.43137006, -0.10138158, -0.43642229, -0.86771553,
       -0.10247346, -0.02782992,  0.28498295, -0.24987298, -0.34596624,
       -0.06469703, -0.94463019,  0.36358311, -0.30553666,  0.20109533,
       -0.66042173,  0.0554791 , -0.46981254,  0.19352278])

In [40]:
n = 0
X_mapFeature_data = np.ones((100, 100))

for order in range(0, 7, 1):
    for i in range(0, order+1):
        x12 = X**(order - i) * Y**(i) * theta_arr[n]
        X_mapFeature_data = X_mapFeature_data + x12
        n = n + 1

In [41]:
X_mapFeature_data.shape

(100, 100)

In [42]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=x1, 
    y=x2, 
    mode='markers', 
    marker=dict(color=colors, size=10),
    name='Data Points'
))

# Thêm đồ thị đường đồng mức (Contour) để hiển thị các điểm mức 0
fig.add_trace(go.Contour(
    z=X_mapFeature_data,
    x=x_values,
    y=y_values,
    contours=dict(
        start=1,
        end=0,
        size=2,
        coloring='lines',
    ),
    line=dict(color='blue', width= 3),
))