# Activation function

\begin{align*}
Binary \space Step \space Function (x) = 
\begin{cases}
1 \quad if \quad \sum_{i=1}^{m} w_i x_i + b \ge threshold \\
0 \quad if \quad \sum_{i=1}^{m} w_i x_i + b \lt threshold \\
\end{cases}
\end{align*}

\begin{align*}
ReLU (x) = 
\begin{cases}
0 \quad if \quad x \lt 0 \\
1 \quad if \quad x \ge  x
\end{cases}
\end{align*}

\begin{align*}
Sigmoid (x) = 

\frac{1}{1 + e^{-x}} \in [0, 1]

\end{align*}

\begin{align*}
Tanh (x) = 

\frac{ e^{x} - e^{-x}  }{ e^{x} + e^{-x}} \in [-1, 1]

\end{align*}

# Derivative / Gradient

## Derivative

$$
f'(x) = lim_{\triangle x \rightarrow 0} \frac { f(x+\triangle) - f(x) } {\triangle x} \\ 
y' = lim_{\triangle x \rightarrow 0} \frac { \triangle y } {\triangle x} \\
\frac {dy} {dx} = lim_{\triangle x \rightarrow 0} \frac { \triangle y } {\triangle x}
$$

## Gradient

$$
\nabla f = [ \frac{\partial f}{\partial x} , \frac{\partial f}{\partial y} , \frac{\partial f}{\partial z} ]  \\ 
\nabla f = [ \frac{\partial f}{\partial x}i + \frac{\partial f}{\partial y}j + \frac{\partial f}{\partial z}k ]
$$

## Direction of increase of function

$$
D_{\vec{b}}f = \nabla f.\vec{b} = \|  \nabla f \| \| b \| \cos{\theta}
$$

## Gradient Decent

$$
\vec{x_0} = (x_0, y_0) \\
\vec{x_{n+1}} = \vec{x_n} - \eta \nabla f \vec{x_n}
$$

$w_{j, i}$ là trọng số kết nối từ ngõ vào neural thứ i đến neural thứ j ở lớp sau nó
$$
a_j = \sum_{i=1}^{n}x_i w_{j,i} + b_j \\
o_j = \sigma (a_j) = \frac{1}{1 + e^{-a_j}} \\
$$

# Cost function 

$$
MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - (ax_i+b)^2) \\
$$

# Update

$$
J = Costfunction() \\
w = w - \eta\frac{\partial J}{\partial w}
$$

# Train new neural network with gradient descent

$$
a_j = \sum_{i=1}^{20}x_i w_{j, i} + b_j \\
o_j = \sigma(a_j)=\frac{1}{1+e^{-a_j}}  \\
J = \frac{1}{5} \sum_{t=1}^{5} \sum_{k=1}^{5} (y_k^t - o_k^t)^2 \space trong \ đó \ k: \ là \ số \ ngõ \ ra, \ t \ là \ số \ mẫu \
$$

In [1]:
import numpy as np  
import matplotlib.pyplot as plt 

In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Character matrix and target labels
char = np.matrix([
    [0, 0, 1, 0,
     0, 1, 1, 0,
     0, 0, 1, 0,
     0, 0, 1, 0,
     1, 1, 1, 1], 
    [1, 1, 1, 1,
     0, 0, 0, 1,
     1, 1, 1, 1,
     1, 0, 0, 0,
     1, 1, 1, 1],
    [1, 1, 1, 1,
     0, 0, 0, 1,
     1, 1, 1, 1,
     0, 0, 0, 1,
     1, 1, 1, 1],
    [1, 0, 0, 1,
     1, 0, 0, 1,
     1, 1, 1, 1,
     0, 0, 0, 1,
     0, 0, 0, 1],
    [1, 1, 1, 1,
     1, 0, 0, 0,
     1, 1, 1, 1,
     0, 0, 0, 1,
     1, 1, 1, 1]
], dtype=np.int8)

target = np.matrix([
    [1, 0, 0, 0, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 0, 1, 0],
    [0, 0, 0, 0, 1],
])


In [3]:

# Function for sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [None]:

# Hyperparameters
alpha = 0.1  # Learning rate
epochs = 100  # Number of epochs

# Weight initialization
w = np.matrix(np.random.uniform(-0.1, 0.1, (5, 20)))

# To track cost function (J) values over epochs
J = np.zeros(epochs)

for epoch in range(epochs):
    dJ_dw = np.zeros_like(w)  # Initialize gradient to zero for each epoch
    total_cost = 0  # Initialize total cost for each epoch
    
    for sample in range(5):  # Loop over all samples
        X = char[sample, :]  # Input character sample
        y = target[sample, :]  # Corresponding target
        
        # Forward pass
        y_pred = sigmoid(np.dot(X, w.T))  # Predicted output
        
        # Compute error and gradients
        error = y - y_pred
        delta = np.multiply(np.multiply(error, y_pred), (1 - y_pred))  # Derivative of sigmoid
        dJ_dw += delta.T * X  # Accumulate gradients
        
        # Cost (Mean Squared Error for this sample)
        total_cost += np.mean(np.power(error, 2))
    
    # Update weights after processing all samples
    w += alpha * dJ_dw / 5  # Gradient step, averaged over the samples
    
    # Store the cost for this epoch
    J[epoch] = total_cost / 5

# Plot the cost function over epochs
plt.plot(J)
plt.ylabel('Cost Function (J)')
plt.xlabel('Epoch')
plt.title('Cost Function Over Epochs')
plt.show()


# Backpropagation and gradient descent

In [5]:
import numpy as np 
import pandas as pd
from abc import abstractmethod

In [6]:
train_data = np.loadtxt('mnist_train.csv', delimiter= ',')
test_data = np.loadtxt('mnist_test.csv', delimiter= ',')

In [7]:
X_train = train_data[:, 1:]
X_test = test_data[:, 1:]

y_train = train_data[:, 0]
y_test = test_data[:, 0]

# My 

In [61]:
import numpy as np
np.random.seed(42)


class ActivationFunction:
    @staticmethod
    def relu(x):
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        return np.where(x > 0, 1, 0)

    @staticmethod
    def softmax(x):
        # Đảm bảo tính ổn định số học
        exp_values = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_values / np.sum(exp_values, axis=1, keepdims=True)

class NeuralNetwork:
    def __init__(self, *, input_size: int, hidden_layer: list,
                  output_size: int, learning_rate: float, optimizer: list,
                    activation_functions: list, input: np.array, target= np.array) -> None:
        self.input_size = input_size # input
        self.output_size = output_size # output
        self.hidden_layer = hidden_layer # hidden layer
        self.learning_rate = learning_rate # learning rate
        self.activation_functions = activation_functions # list activation function
        self.optimizer = optimizer # optimizer
        self.input = input # input
        self.target = target # target
        self.len = len(input) # len input (m)

        # size of layer
        self.layer_size = [self.input_size] + list(self.hidden_layer) + [self.output_size]
        print(f'layer_size: \n {self.layer_size}, len: {len(self.layer_size)}')

        # output of layer after feedforward
        self.list_output_in_layer= [0]*(len(self.layer_size) - 1 )

        # init weight and bias
        self.matrices_weights = [0]*(len(self.layer_size) - 1 )
        self.matrices_biases = [0]*(len(self.layer_size) - 1 )
        for idx in range(len(self.layer_size) - 1):
            # init weight 
            # matrix_weight = np.random.randn(self.layer_size[idx], self.layer_size[idx+1]) / 10
            matrix_weight = np.ones((self.layer_size[idx], self.layer_size[idx+1]))
            self.matrices_weights[idx] = matrix_weight
            print(f'matrix_weight {idx}: \n {matrix_weight}, shape: {matrix_weight.shape}')

            # init bias
            # matrix_bias = np.random.randn(1, self.layer_size[idx + 1]) / 10
            matrix_bias = np.ones((1, self.layer_size[idx + 1])) / 10
            self.matrices_biases[idx] = matrix_bias
            print(f'matrix_bias {idx}: \n {matrix_bias}, shape: {matrix_bias.shape}')

    def feed_forward(self):
        """Performs feedforward calculation for the network."""
        input_array = self.input
        
        for idx, (weight, bias) in enumerate(zip(self.matrices_weights, self.matrices_biases)):
            # print(f'optimizer at layer {idx}= {self.layer_activation_fn[idx]}')
            # print(idx, (weight, bias))
            input_array = input_array @ weight + bias  # Matrix multiplication and bias addition
            vector_activated = input_array

            self.list_output_in_layer[idx] = vector_activated
            print(f'vector_activated {idx}: \n {vector_activated}, shape: {vector_activated.shape}')

        return vector_activated
    
    def cal_error_layer(self):
        self.error_layer[-1] = (self.target - self.list_vectors_layer[-1])*self.list_vectors_layer[-1]*(1 - self.list_vectors_layer[-1])
        # print(self.error_layer[-1])

        for idx, _ in enumerate(reversed(self.matrices_weights)):
            reverse_i = len(self.matrices_weights) - 1 - idx
            if reverse_i == len(self.matrices_weights) - 1:
                self.error_layer[-1] = (self.target - self.list_vectors_layer[-1])*self.list_vectors_layer[-1]*(1 - self.list_vectors_layer[-1])
            else:
                self.error_layer[reverse_i] = self.error_layer[reverse_i + 1] @ self.matrices_weights[reverse_i + 1].T
                # print(reverse_i + 1)

                # print(self.error_layer[reverse_i + 1].shape)
                # print(self.matrices_weights[reverse_i + 1].T.shape)

    def back_prop(self):

        for idx, matrix in enumerate(reversed(self.matrices_weights)):
            reverse_i = len(self.matrices_weights) - 1 - idx
            print(reverse_i)

            len_ = self.error_layer[reverse_i].shape[1]
            print(len_)

            predict_ = self.list_vectors_layer[reverse_i].reshape(-1, 1).repeat(len_, 1)
            print(predict_)

            self.matrices_weights[reverse_i] += self.learning_rate * self.error_layer[reverse_i] * predict_ 


In [62]:
layer_activation_fn = ['ReLu', 'ReLu', 'Softmax']
neural = NeuralNetwork(input_size= 2, hidden_layer= (1,), output_size= 1, learning_rate= 2, optimizer= 'str',
                       activation_functions= layer_activation_fn, input= 2*np.ones((1, 2)), target= np.zeros((1, 2)))

layer_size: 
 [2, 1, 1], len: 3
matrix_weight 0: 
 [[1.]
 [1.]], shape: (2, 1)
matrix_bias 0: 
 [[0.1]], shape: (1, 1)
matrix_weight 1: 
 [[1.]], shape: (1, 1)
matrix_bias 1: 
 [[0.1]], shape: (1, 1)


In [63]:
neural.feed_forward()

vector_activated 0: 
 [[4.1]], shape: (1, 1)
vector_activated 1: 
 [[4.2]], shape: (1, 1)


array([[4.2]])

# Chat GPT 

In [1]:
import numpy as np
np.random.seed(42)

class ActivationFunction:
    @staticmethod
    def relu(x):
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        return np.where(x > 0, 1, 0)

    @staticmethod
    def softmax(x):
        # Đảm bảo tính ổn định số học
        exp_values = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_values / np.sum(exp_values, axis=1, keepdims=True)

class NeuralNetwork:
    def __init__(self, input_size, hidden_layers, output_size, learning_rate, activation_functions, inputs, targets):
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.activation_functions = activation_functions
        self.inputs = inputs
        self.targets = targets
        self.num_samples = len(inputs)

        # Xây dựng kiến trúc mạng
        self.layer_sizes = [self.input_size] + list(self.hidden_layers) + [self.output_size]

        # Khởi tạo trọng số và bias
        self.weights = []
        self.biases = []
        for idx in range(len(self.layer_sizes) - 1):
            weight_matrix = np.random.randn(self.layer_sizes[idx], self.layer_sizes[idx + 1]) / np.sqrt(self.layer_sizes[idx])
            bias_vector = np.zeros((1, self.layer_sizes[idx + 1]))
            self.weights.append(weight_matrix)
            self.biases.append(bias_vector)

    def feed_forward(self):
        """Thực hiện tính toán feedforward."""
        self.activations = [self.inputs]
        self.pre_activations = []
        for idx in range(len(self.weights)):
            z = np.dot(self.activations[-1], self.weights[idx]) + self.biases[idx]
            self.pre_activations.append(z)
            # Áp dụng hàm kích hoạt
            if self.activation_functions[idx] == 'relu':
                a = ActivationFunction.relu(z)
            elif self.activation_functions[idx] == 'softmax':
                a = ActivationFunction.softmax(z)
            else:
                raise ValueError(f"Hàm kích hoạt không được hỗ trợ: {self.activation_functions[idx]}")
            self.activations.append(a)
        return self.activations[-1]

    def back_propagate(self):
        """Thực hiện backpropagation và cập nhật trọng số và bias."""
        deltas = [0] * len(self.weights)
        # Tính delta cho lớp output
        if self.activation_functions[-1] == 'softmax':
            delta = self.activations[-1] - self.targets
        else:
            delta = (self.activations[-1] - self.targets) * ActivationFunction.relu_derivative(self.pre_activations[-1])
        deltas[-1] = delta

        # Lan truyền ngược lỗi
        for idx in reversed(range(len(deltas) - 1)):
            if self.activation_functions[idx] == 'relu':
                derivative = ActivationFunction.relu_derivative(self.pre_activations[idx])
            else:
                derivative = 1  # Giả định hàm kích hoạt tuyến tính
            delta = np.dot(deltas[idx + 1], self.weights[idx + 1].T) * derivative
            deltas[idx] = delta

        # Cập nhật trọng số và bias
        for idx in range(len(self.weights)):
            weight_gradient = np.dot(self.activations[idx].T, deltas[idx]) / self.num_samples
            bias_gradient = np.sum(deltas[idx], axis=0, keepdims=True) / self.num_samples
            self.weights[idx] -= self.learning_rate * weight_gradient
            self.biases[idx] -= self.learning_rate * bias_gradient

    def train(self, epochs):
        for epoch in range(epochs):
            output = self.feed_forward()
            loss = self.compute_loss(output, self.targets)
            self.back_propagate()
            print(f"Epoch {epoch + 1}, Loss: {loss}")

    def compute_loss(self, predictions, targets):
        if self.activation_functions[-1] == 'softmax':
            # Sử dụng hàm mất mát cross-entropy
            epsilon = 1e-12
            predictions = np.clip(predictions, epsilon, 1. - epsilon)
            N = predictions.shape[0]
            ce_loss = -np.sum(targets * np.log(predictions + 1e-9)) / N
            return ce_loss
        else:
            # Sử dụng MSE cho các hàm kích hoạt khác
            mse_loss = np.mean((predictions - targets) ** 2)
            return mse_loss
        
# Định nghĩa các tham số mạng
input_size = 784  # Kích thước đầu vào (ví dụ, ảnh MNIST)
hidden_layers = [128, 64]  # Các lớp ẩn
output_size = 10  # Số lớp đầu ra (số lớp phân loại)
learning_rate = 0.01
activation_functions = ['relu', 'relu', 'softmax']

train_data = np.loadtxt('mnist_train.csv', delimiter= ',')

# Tạo dữ liệu đầu vào và mục tiêu
inputs = train_data[:, 1:]  # Dữ liệu đầu vào mẫu
targets = train_data[:, 0]
# Gán nhãn mục tiêu (one-hot encoding)
for i in range(1000):
    targets[i, np.random.randint(0, output_size)] = 1

# Khởi tạo và huấn luyện mạng
nn = NeuralNetwork(input_size, hidden_layers, output_size, learning_rate, activation_functions, inputs, targets)
nn.train(epochs=10000)



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Define the ActivationFunc and NeuralNetwork classes
class ActivationFunc:
    def __init__(self, input) -> None:
        self.input = input

    def ReLu(self):
        # return np.maximum(0, self.input)
        return self.input
    
    def ReLu_derivative(self):
        return np.where(self.input > 0, 1, 0)

    def SoftMax(self):
        exp_values = np.exp(self.input - np.max(self.input, axis=1, keepdims=True))
        sum_exp = np.sum(exp_values, axis=1, keepdims=True)
        return exp_values / sum_exp

class NeuralNetwork:
    def __init__(self, *, number_neural_input: int, hidden_layer: tuple,
                  number_neural_output: int, learning_rate: float, optimizer: list,
                  layer_activation_fn: list, input: np.array, target= np.array) -> None:
        self.number_neural_input = number_neural_input
        self.hidden_layer = hidden_layer
        self.number_neural_output = number_neural_output
        self.learning_rate = learning_rate
        self.layer_activation_fn = layer_activation_fn
        self.optimizer = optimizer
        self.input = input
        self.target = target
        self.len = len(input)

    def __call__(self):
        self.array = np.insert(self.hidden_layer, 0, self.number_neural_input)
        self.array = np.append(self.array, self.number_neural_output)
        self.error_layer = [0] * (len(self.array) - 1)
        self.list_vectors_layer = [0] * len(self.array)
        self.list_vectors_layer[0] = self.input
        self.matrices_weights = [0] * (len(self.array) - 1)
        self.matrices_biases = [0] * (len(self.array) - 1)

        self.create_matrices_bias_layer()
        self.create_matrices_weight_layer()

    def create_matrices_weight_layer(self):
        for idx, value in enumerate(self.array):
            if idx + 1 > len(self.array) - 1:
                break
            else:
                matrix_weight = np.random.randn(self.array[idx], self.array[idx+1]) / 10
                self.matrices_weights[idx] = matrix_weight

    def create_matrices_bias_layer(self):
        for idx, num_bias in enumerate(self.array[1:]):
            matrix_bias = np.random.randn(1, num_bias) / 10
            self.matrices_biases[idx] = matrix_bias

    def feed_forward(self):
        vector = self.input.reshape(self.len, -1)
        for idx, (weight, bias) in enumerate(zip(self.matrices_weights, self.matrices_biases)):
            vector = vector @ weight + bias
            activation_func = ActivationFunc(vector)
            if self.layer_activation_fn[idx] == 'ReLu':
                vector_activated = activation_func.ReLu()
            elif self.layer_activation_fn[idx] == 'SoftMax':
                vector_activated = activation_func.SoftMax()
            else:
                raise ValueError(f"Unsupported activation function: {self.layer_activation_fn[idx]}")
            self.list_vectors_layer[idx + 1] = vector_activated
            vector = vector_activated
        return vector_activated
    
    def test(self, input):
        vector = input.reshape(self.len, -1)
        for idx, (weight, bias) in enumerate(zip(self.matrices_weights, self.matrices_biases)):
            vector = vector @ weight + bias
            activation_func = ActivationFunc(vector)
            if self.layer_activation_fn[idx] == 'ReLu':
                vector_activated = activation_func.ReLu()
            elif self.layer_activation_fn[idx] == 'SoftMax':
                vector_activated = activation_func.SoftMax()
            else:
                raise ValueError(f"Unsupported activation function: {self.layer_activation_fn[idx]}")
            self.list_vectors_layer[idx + 1] = vector_activated
            vector = vector_activated
        return vector_activated
    
    def cal_error_layer(self):
        self.error_layer[-1] = self.list_vectors_layer[-1] - self.target
        for idx in reversed(range(len(self.error_layer) - 1)):
            activation_input = self.list_vectors_layer[idx + 1]
            activation_func = ActivationFunc(activation_input)
            if self.layer_activation_fn[idx] == 'ReLu':
                activation_derivative = activation_func.ReLu_derivative()
            else:
                activation_derivative = 1
            weight_next = self.matrices_weights[idx + 1]
            error_next = self.error_layer[idx + 1]
            self.error_layer[idx] = (error_next @ weight_next.T) * activation_derivative

    def back_prop(self):
        for idx in reversed(range(len(self.matrices_weights))):
            activation_prev = self.list_vectors_layer[idx]
            error_current = self.error_layer[idx]
            gradient_weights = activation_prev.T @ error_current
            gradient_biases = np.sum(error_current, axis=0, keepdims=True)
            self.matrices_weights[idx] -= self.learning_rate * gradient_weights
            self.matrices_biases[idx] -= self.learning_rate * gradient_biases

    def compute_loss(self, predictions, targets):
        if self.layer_activation_fn[-1] == 'SoftMax':
            epsilon = 1e-12
            predictions = np.clip(predictions, epsilon, 1. - epsilon)
            N = predictions.shape[0]
            ce_loss = -np.sum(targets * np.log(predictions)) / N
            return ce_loss
        else:
            mse_loss = np.mean((predictions - targets) ** 2)
            return mse_loss

    def train(self, epochs):
        for epoch in range(epochs):
            output = self.feed_forward()
            loss = self.compute_loss(output, self.target)
            self.cal_error_layer()
            self.back_prop()
            if epoch % 100 == 0:
                print(f"Epoch {epoch + 1}, Loss: {loss}")

# Load the sample data
pd_dataframe = pd.read_csv('ex1.csv')
input_data = pd_dataframe['x'].to_numpy().reshape(1, -1)
target_data = pd_dataframe['y'].to_numpy().reshape(1, -1)

# Define network parameters
number_neural_input = 100
hidden_layer = (128, 64)
number_neural_output = 100
learning_rate = 0.01
optimizer = []
layer_activation_fn = ['ReLu', 'ReLu', 'ReLu']

# Initialize and train the neural network
nn = NeuralNetwork(
    number_neural_input=number_neural_input,
    hidden_layer=hidden_layer,
    number_neural_output=number_neural_output,
    learning_rate=learning_rate,
    optimizer=optimizer,
    layer_activation_fn=layer_activation_fn,
    input=input_data,
    target=target_data
)

# Initialize network parameters
nn()

# Train the model
nn.train(epochs=500)

# Generate the plot
fig = go.Figure()

# Add scatter plot for the original data points
fig.add_trace(
    go.Scatter(
        x=pd_dataframe['x'],  
        y=pd_dataframe['y'],  
        mode='markers',  
        marker=dict(symbol='x'),  
        name='Data Points'
    )
)

# Generate a range of x values and predict using the trained model
x_range = np.linspace(1, 100, 100)
y_range = nn.test(x_range.reshape(1, -1)).reshape(-1,)

# Add regression line
fig.add_trace(
    go.Scatter(
        x=x_range,  
        y=y_range,  
        mode='lines',  
        name='Regression Line'
    )
)

# Show plot
fig.show()


Epoch 1, Loss: 97576.74786855133
Epoch 101, Loss: nan
Epoch 201, Loss: nan
Epoch 301, Loss: nan
Epoch 401, Loss: nan


  vector = vector @ weight + bias
  vector = vector @ weight + bias
