# **PART A**

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
import torch



In [2]:
# load dataset (CSV fetched over HTTP straight into a DataFrame)
DATASET_URL = (
    'https://raw.githubusercontent.com/aaubs/ds-master/main/data/'
    'Swedish_Auto_Insurance_dataset.csv'
)
data = pd.read_csv(DATASET_URL)

In [5]:
# Using sklearn: fit the min-max scaler on the raw frame, then apply it
scaler = MinMaxScaler().fit(data)
data_scaled = scaler.transform(data)

# The scaler returns a bare array, so re-attach the original column names
data_s = pd.DataFrame(data=data_scaled, columns=data.columns)

In [7]:
# Hyperparameters
epochs = 1                # number of passes the training loop makes over the data
learning_rate = 3.7e-4    # factor applied to the gradient to obtain each weight step

In [11]:
# Initial parameter
# Starting value of the model's single weight; the training loop overwrites
# this name with each updated weight.
w = 3.8

In [10]:
# 1. Creating a Neural Network
# 1.1 Structure (Architecture) of NN
def nn(x, w):
    """Single-weight linear model without bias.

    Args:
        x: input value.
        w: weight.

    Returns:
        The prediction ``x * w``.
    """
    prediction = x * w
    return prediction

In [20]:
# 1.2 Loss Function
def loss(y_hat, t):
    """Squared error between a prediction and its target.

    No averaging happens here: the value returned is ``(t - y_hat) ** 2``
    for exactly the inputs given.

    Args:
        y_hat: predicted value.
        t: target value.

    Returns:
        The squared difference between ``t`` and ``y_hat``.
    """
    residual = t - y_hat
    return residual ** 2

In [15]:
# 1.3 Gradient of the loss w.r.t weight
def gradient(w, x, t):
    """Derivative of the squared error ``(t - x*w)**2`` with respect to ``w``.

    Args:
        w: current weight.
        x: input value.
        t: target value.

    Returns:
        ``2 * x * (x*w - t)``.
    """
    error = x * w - t   # prediction minus target
    return 2 * x * error

In [16]:
def delta_w(w_i, x, t, learning_rate):
    """Size of the weight step for one sample.

    Args:
        w_i: current weight.
        x: input value.
        t: target value.
        learning_rate: factor applied to the gradient.

    Returns:
        ``learning_rate * gradient(w_i, x, t)``.
    """
    grad = gradient(w_i, x, t)
    return learning_rate * grad

In [17]:
# Store weight history, seeded with the weight as it stands before training
w_his = [w]

In [21]:
# Training loop (SGD): the weight is updated after every individual sample.
# NOTE: `w` and `w_his` are notebook-level state created in earlier cells, so
# re-running this cell continues from the current `w`, not the initial value.
n_samples = len(data_s['X'])

for epoch in range(epochs):
    total_loss = 0

    # Walk the raw column values positionally so the loop does not depend on
    # the DataFrame's index labels being exactly 0..n-1.
    for i, (x_i, t_i) in enumerate(
        zip(data_s['X'].to_numpy(), data_s['Y'].to_numpy())
    ):
        # 2. Forward Pass
        y_hat = nn(x_i, w)

        # 3. Loss Evaluation (uses the weight from before this sample's update)
        loss_val = loss(y_hat, t_i)

        # 4. Gradient Calculation
        # The gradient is evaluated once and reused for the step; this gives
        # the same value as delta_w(w, x_i, t_i, learning_rate).
        grad_value = gradient(w, x_i, t_i)
        dw = learning_rate * grad_value

        # 5. Backpropagation / Weight Update (SGD)
        w = w - dw
        w_his.append(w)

        total_loss += loss_val

        print(
            f"Epoch {epoch+1}, Sample {i+1}: "
            f"Gradient={grad_value:.4f}, Δw={dw:.4f}, "
            f"w={w:.4f}, Loss={loss_val:.4f}"
        )

    # Mean of the per-sample squared errors seen during this epoch
    avg_loss = total_loss / n_samples
    print(f"Epoch {epoch+1}: Average Loss = {avg_loss:.4f}\n")


Epoch 1, Sample 1: Gradient=4.1458, Δw=0.0015, w=3.7985, Loss=5.6645
Epoch 1, Sample 2: Gradient=0.1448, Δw=0.0001, w=3.7984, Loss=0.2233
Epoch 1, Sample 3: Gradient=0.0757, Δw=0.0000, w=3.7984, Loss=0.1303
Epoch 1, Sample 4: Gradient=5.5968, Δw=0.0021, w=3.7963, Loss=7.8310
Epoch 1, Sample 5: Gradient=0.6076, Δw=0.0002, w=3.7961, Loss=0.8870
Epoch 1, Sample 6: Gradient=1.2321, Δw=0.0005, w=3.7956, Loss=1.7961
Epoch 1, Sample 7: Gradient=0.2112, Δw=0.0001, w=3.7956, Loss=0.3241
Epoch 1, Sample 8: Gradient=0.0553, Δw=0.0000, w=3.7955, Loss=0.0600
Epoch 1, Sample 9: Gradient=0.6318, Δw=0.0002, w=3.7953, Loss=0.7578
Epoch 1, Sample 10: Gradient=0.0244, Δw=0.0000, w=3.7953, Loss=0.0229
Epoch 1, Sample 11: Gradient=0.0083, Δw=0.0000, w=3.7953, Loss=0.0107
Epoch 1, Sample 12: Gradient=0.6825, Δw=0.0003, w=3.7950, Loss=0.7771
Epoch 1, Sample 13: Gradient=0.0499, Δw=0.0000, w=3.7950, Loss=0.0790
Epoch 1, Sample 14: Gradient=0.2263, Δw=0.0001, w=3.7949, Loss=0.3722
Epoch 1, Sample 15: Gradient=

# **PART B**

Sentence 1: "the bat flew"     
Sentence 2: "the bat swung"   

In [43]:
# Sentence 1: animal meaning
# Hand-picked 2-D embedding for each word of "the bat flew"
embeddings_bat_animal = dict(
    the=np.array([0.1, 0.1]),
    bat=np.array([0.5, 0.5]),
    flew=np.array([0.1, 0.9]),
)

# Sentence 2: sports meaning
# Hand-picked 2-D embedding for each word of "the bat swung"
embeddings_bat_sports = dict(
    the=np.array([0.1, 0.1]),
    bat=np.array([0.5, 0.5]),
    swung=np.array([0.9, 0.1]),
)


In [44]:
# Sentence 1 (animal meaning)
sentence_1 = ["the", "bat", "flew"]

# Initialize matrices for Q, K, V
# No learned projections are used: each matrix is its own copy of the stacked
# token embeddings, one row per word in sentence order.
Q_1 = np.array([embeddings_bat_animal[token] for token in sentence_1])
K_1 = Q_1.copy()
V_1 = Q_1.copy()

# Step 1: Similarity - Calculate dot products for Q and K (attention scores)
# Entry [i, j] is the dot product of token i's query with token j's key.
attention_scores_1 = Q_1 @ K_1.T

# Step 2: Weights - Apply softmax to attention scores
def softmax(x):
    """Softmax of a 1-D score vector.

    The largest score is subtracted before exponentiating, which leaves the
    result unchanged but avoids overflow for large inputs.

    Args:
        x: array of scores (called with one row at a time in this notebook).

    Returns:
        Array of the same shape; for 1-D input the entries sum to 1.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)

# Normalize each row of scores independently, so every token gets its own
# set of attention weights over the sentence
attention_weights_1 = np.array([softmax(row) for row in attention_scores_1])

# Step 3: Contextualized vectors
# Each output row is the attention-weighted mix of the value vectors.
attention_output_1 = attention_weights_1 @ V_1


In [50]:
# Sentence 2 (sports meaning)
# Kept under its own name so the sentence 1 token list is not overwritten.
sentence_2 = ["the", "bat", "swung"]

# Initialize matrices for Q, K, V
# No learned projections: Q, K and V are all the stacked token embeddings,
# one row per word in sentence order.
Q_2 = np.array([embeddings_bat_sports[word] for word in sentence_2])
K_2 = np.array([embeddings_bat_sports[word] for word in sentence_2])
V_2 = np.array([embeddings_bat_sports[word] for word in sentence_2])

# Step 1: Similarity - Calculate dot products for Q and K (attention scores)
# Must use this sentence's own Q/K, not the sentence 1 matrices.
attention_scores_2 = Q_2.dot(K_2.T)

# Step 2: Weights - Apply softmax to attention scores
# softmax() is reused from the sentence 1 cell rather than being redefined;
# it is applied row by row so each token's weights sum to 1.
attention_weights_2 = np.apply_along_axis(softmax, 1, attention_scores_2)

# Step 3: Contextualized vectors
# Weighted mix of this sentence's value vectors (V_2), so the context word
# "swung" is what shapes the result.
attention_output_2 = attention_weights_2.dot(V_2)

In [32]:
# Display the query matrix built from the "the bat flew" embeddings (one row per word)
Q_1

array([[0.1, 0.1],
       [0.5, 0.5],
       [0.1, 0.9]])

In [51]:
# Display the query matrix built from the "the bat swung" embeddings (one row per word)
Q_2

array([[0.1, 0.1],
       [0.5, 0.5],
       [0.9, 0.1]])

In [33]:
# Display the pairwise query-key dot products for sentence 1
attention_scores_1

array([[0.02, 0.1 , 0.1 ],
       [0.1 , 0.5 , 0.5 ],
       [0.1 , 0.5 , 0.82]])

In [52]:
# Display the score matrix stored under the sentence 2 name
attention_scores_2

array([[0.02, 0.1 , 0.1 ],
       [0.1 , 0.5 , 0.5 ],
       [0.1 , 0.5 , 0.82]])

In [34]:
# Display the row-wise softmax of the sentence 1 scores
attention_weights_1

array([[0.3157987 , 0.34210065, 0.34210065],
       [0.25102611, 0.37448695, 0.37448695],
       [0.21996112, 0.32814344, 0.45189544]])

In [53]:
# Display the weight matrix stored under the sentence 2 name
attention_weights_2

array([[0.3157987 , 0.34210065, 0.34210065],
       [0.25102611, 0.37448695, 0.37448695],
       [0.21996112, 0.32814344, 0.45189544]])

In [35]:
# Display the contextualized vectors for sentence 1 (attention weights times V_1)
attention_output_1

array([[0.23684026, 0.51052078],
       [0.24979478, 0.54938434],
       [0.23125737, 0.59277373]])

In [54]:
# Display the output matrix stored under the sentence 2 name
attention_output_2

array([[0.51052078, 0.23684026],
       [0.54938434, 0.24979478],
       [0.59277373, 0.23125737]])

In [55]:
# "bat" is the second word of both sentences, so its vector is row 1 of each output
bat_1 = attention_output_1[1, :]   # bat in animal context
bat_2 = attention_output_2[1, :]   # bat in sports context


In [57]:
# Euclidean distance between the two contextualized "bat" vectors
bat_gap = bat_1 - bat_2
distance = np.linalg.norm(bat_gap)
distance

np.float64(0.4236836148453855)