In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Load the dataset; the next cell uses its 'x' column as the input and its 'y' column as the target
df = pd.read_csv('siCoData.csv')

In [12]:
# Fix NumPy's global RNG so the random weight initialisation below is repeatable
np.random.seed(42)

# Pull the feature and the target out of the frame as (N, 1) column vectors
x = df['x'].to_numpy().reshape(-1, 1)
y = df['y'].to_numpy().reshape(-1, 1)

# Rescale the feature with a default-configured MinMaxScaler
# TODO: decide whether the feature should be scaled to [-1, 1] instead
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

# Number of training points (rows)
total = x.shape[0]

# Prepend a constant-1 column so the first row of w1 acts as the intercept (bias) weights
x = np.hstack((np.ones((total, 1)), x))

# Gradient-descent settings
learning_rate = 0.1      # alpha: step size of every weight update
tolerance = 0.0001       # training stops once Ein falls below this threshold
max_iterations = 1000    # upper bound on the number of passes over the data

# Network shape: inputs -> tanh hidden layer -> single linear output
input_layer = x.shape[1]   # 2 inputs: the bias column and x
hidden_layer = 2           # tanh neurons in the hidden layer
output_layer = 1           # one linear output

# Random starting weights; w1 is drawn before w2, so the seed fixes both
w1 = np.random.rand(input_layer, hidden_layer)        # input -> hidden
w2 = np.random.rand(hidden_layer + 1, output_layer)   # hidden (+ its bias unit) -> output

# Lowest Ein seen so far; starts at infinity so any finite Ein replaces it
min_ein = float('inf')

# Backpropagation with stochastic gradient descent: the weights are updated
# after every single training point, and one outer pass is one epoch.
for i in range(max_iterations):
    # Squared errors accumulated over this epoch
    total_error = 0

    # TODO: decide whether the points should be visited in a freshly shuffled
    # order every epoch (e.g. by shuffling np.arange(total)); for now they are
    # visited in their stored order.
    for j in range(total):
        # j-th training point as a (1, input_layer) row, and its target
        xj = x[j].reshape(1, -1)
        yj = y[j]

        # ---- Forward pass ----
        # Rows are samples, so the textbook w^T x is written as x @ W
        s1 = np.dot(xj, w1)
        theta_s1 = np.tanh(s1)
        # Prepend the hidden layer's bias unit before feeding the output layer
        theta_s1_bias = np.hstack([np.ones((1, 1)), theta_s1])
        # Linear output: the prediction is s2 itself
        s2 = np.dot(theta_s1_bias, w2)

        # Residual of the prediction; its square is this point's error e
        error = s2 - yj
        total_error += error**2

        # ---- Backward pass ----
        # de/ds2 = 2 * (s2 - y); shared by both weight gradients
        output_delta = 2 * error

        # Output layer: de/dw2 = theta_s1_bias^T * de/ds2 (the bias unit is
        # included because every output weight needs a gradient)
        dw2 = np.dot(theta_s1_bias.T, output_delta)

        # Hidden layer: de/ds1 = de/ds2 * w2 (bias row skipped) * (1 - tanh(s1)^2);
        # theta_s1 already holds tanh(s1)
        delta_hidden = np.dot(output_delta, w2[1:].T) * (1 - theta_s1**2)

        # Input-to-hidden weights: de/dw1 = x^T * de/ds1
        dw1 = np.dot(xj.T, delta_hidden)

        # Take one in-place gradient step on both weight matrices
        w1 -= learning_rate * dw1
        w2 -= learning_rate * dw2

    # In-sample error of this epoch: mean of the squared errors collected above
    ein = total_error / total

    # Remember the lowest Ein and the 1-based epoch that produced it
    if ein < min_ein:
        min_ein, iteration = ein, i + 1

    # Stop early once the error is below the tolerance
    if ein < tolerance:
        break

# Report the lowest in-sample error and the epoch that reached it, followed by
# the weights the network holds once training has stopped
print(f'Minimum Ein: {min_ein.item():.6f} at {iteration} iterations out of {max_iterations}')
print('w1 (input to hidden):', w1, sep='\n')
print('w2 (hidden to output):', w2, sep='\n')

Minimum Ein: 0.051814 at 755 iterations out of 1000
w1 (input to hidden):
[[-3.46286813 -0.97876014]
 [ 6.84086611  1.96227013]]
w2 (hidden to output):
[[-0.21596718]
 [ 1.55272521]
 [-1.32282178]]


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the samples and turn the 'x' / 'y' columns into (n_samples, 1) column vectors
data = pd.read_csv('siCoData.csv')
X = data['x'].to_numpy().reshape(-1, 1)
y = data['y'].to_numpy().reshape(-1, 1)

# Rescale the input feature with a default-configured MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Prepend a column of ones so the first row of W1 serves as the bias weights
bias_column = np.ones((len(X), 1))
X_with_bias = np.hstack((bias_column, X))  # shape (n_samples, 2)

# Define activation and derivative
def tanh(x):
    """Hyperbolic-tangent activation, applied element-wise through ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)**2``, element-wise."""
    activated = np.tanh(x)
    return 1 - activated ** 2

# Seed NumPy's global RNG, then draw the starting weights (W1 first, then W2)
np.random.seed(42)
n_inputs = X_with_bias.shape[1]   # bias column + x
n_hidden = 2                      # tanh units in the hidden layer
W1 = np.random.rand(n_inputs, n_hidden)   # (2, 2): input -> hidden
W2 = np.random.rand(n_hidden + 1, 1)      # (3, 1): 2 hidden units + hidden bias -> output

# Training settings
learning_rate = 0.1   # step size of each weight update
max_iterate = 1000    # maximum number of passes over the data
tolerance = 0.0001    # stop once Ein drops below this threshold

# Book-keeping for the best epoch seen so far
min_Ein = float('inf')
best_iter = 0
best_W1 = None
best_W2 = None

# Train the network (bias + x -> 2 tanh units -> 1 linear output) with
# per-sample (stochastic) gradient descent, remembering the epoch whose
# in-sample error Ein was lowest together with the weights at the end of
# that epoch.
n_samples = len(X_with_bias)

for iterate in range(max_iterate):
    # Sum of squared errors over this epoch, kept as a plain Python float so
    # that Ein and min_Ein are scalars rather than 1-element arrays
    total_error = 0.0

    for i in range(n_samples):
        x_i = X_with_bias[i].reshape(1, -1)  # (1, 2)
        y_i = y[i]                           # (1,) array holding the target

        # Forward pass
        s1 = np.dot(x_i, W1)                # (1, 2)
        theta_s1 = tanh(s1)                 # (1, 2)

        # Add bias to hidden layer
        theta_s1_bias = np.hstack([np.ones((1, 1)), theta_s1])  # (1, 3)

        s2 = np.dot(theta_s1_bias, W2)      # (1, 1)
        y_hat = s2                          # linear output unit

        error = y_hat - y_i                 # (1, 1)
        total_error += (error ** 2).item()

        # Backward pass (delta_hidden must use W2 before it is updated below)
        dW2 = np.dot(theta_s1_bias.T, 2 * error)                # (3, 1)
        delta_hidden = np.dot(2 * error, W2[1:].T) * tanh_derivative(s1)  # (1, 2)
        dW1 = np.dot(x_i.T, delta_hidden)                       # (2, 2)

        # Update weights; these are in-place updates of the W1 / W2 arrays
        W2 -= learning_rate * dW2
        W1 -= learning_rate * dW1

    # Mean squared error of the epoch, measured on each sample just before
    # that sample's weight update
    Ein = total_error / n_samples

    if Ein < min_Ein:
        min_Ein = Ein
        best_iter = iterate  # zero-based epoch index
        # Snapshot the weights: W1 / W2 keep being modified in place, so
        # storing a bare reference would always end up showing the final
        # weights instead of the ones from the best epoch
        best_W1 = W1.copy()
        best_W2 = W2.copy()

    # Stop early once the error is below the tolerance
    if Ein < tolerance:
        break

# Print the lowest Ein, the zero-based epoch index at which it was recorded,
# and the weight matrices stored in best_W1 / best_W2
print(f"Minimum Ein: {min_Ein}, at iteration: {best_iter}")
print("Best W1 (input to hidden):", best_W1, sep="\n")
print("Best W2 (hidden to output):", best_W2, sep="\n")