# Q1 

In [1]:
import pandas as pd

# Define the gradient descent function
def gradient_descent(start_x, learning_rate, tolerance=0.01, max_iterations=100):
    x = start_x
    iterations = []
    
    for i in range(max_iterations):
        # Calculate gradient f'(x) = 2x - 2
        gradient = 2 * x - 2
        # Update x
        x_new = x - learning_rate * gradient
        
        # Store iteration details
        iterations.append((i + 1, x, x_new, gradient))
        
        # Check if x is within the desired range (0.8, 1.2)
        if 0.8 <= x_new <= 1.2:
            break
        
        # Update x
        x = x_new
    
    return iterations

# Initial parameters for both cases
start_x = 4
learning_rate_A = 0.4
learning_rate_B = 0.7

# Run gradient descent for both cases
iterations_A = gradient_descent(start_x, learning_rate_A)
iterations_B = gradient_descent(start_x, learning_rate_B)

# Creating DataFrames to display iterations for each case
df_A = pd.DataFrame(iterations_A, columns=["Iteration", "x_old", "x_new", "Gradient"])
df_B = pd.DataFrame(iterations_B, columns=["Iteration", "x_old", "x_new", "Gradient"])

# Display the results
print("Gradient Descent Iterations for Learning Rate 0.4")
display(df_A)

print("\nGradient Descent Iterations for Learning Rate 0.7")
display(df_B)


Gradient Descent Iterations for Learning Rate 0.4


Unnamed: 0,Iteration,x_old,x_new,Gradient
0,1,4.0,1.6,6.0
1,2,1.6,1.12,1.2



Gradient Descent Iterations for Learning Rate 0.7


Unnamed: 0,Iteration,x_old,x_new,Gradient
0,1,4.0,-0.2,6.0
1,2,-0.2,1.48,-2.4
2,3,1.48,0.808,0.96


# Q2

In [8]:
import numpy as np

# Activation functions
def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Input values
I1 = 10  # Blood Pressure
I2 = 20  # BMI

# Weights and biases for hidden layer 1
W_H11 = [0.2, 0.8]
b_H11 = -0.4

W_H12 = [0.1, 0.1]
b_H12 = 0.1

# Hidden layer 1 computations
H11 = relu(W_H11[0] * I1 + W_H11[1] * I2 + b_H11)
H12 = relu(W_H12[0] * I1 + W_H12[1] * I2 + b_H12)

print(f"H11: {H11}")
print(f"H12: {H12}")

# Weights and biases for hidden layer 2
W_H21 = [0.7, 0.2]
b_H21 = -0.5

W_H22 = [0.4, 0.5]
b_H22 = 0.5

# Hidden layer 2 computations
H21 = relu(W_H21[0] * H11 + W_H21[1] * H12 + b_H21)
H22 = relu(W_H22[0] * H11 + W_H22[1] * H12 + b_H22)

print(f"H21: {H21}")
print(f"H22: {H22}")

# Weights and biases for output layer
W_O1 = [-0.5, 0.5]

# Output layer computation
O1 = sigmoid(W_O1[0] * H21 + W_O1[1] * H22)

print(f"O1 (Sigmoid Output): {O1}")

# Classification based on threshold
threshold = 0.5
prediction = 1 if O1 >= threshold else 0

print(f"Predicted Class: {'Heart_Attack (1)' if prediction == 1 else 'No_Heart_Attack (0)'}")


H11: 17.6
H12: 3.1
H21: 12.440000000000001
H22: 9.090000000000002
O1 (Sigmoid Output): 0.15775868701896995
Predicted Class: No_Heart_Attack (0)


# Q3

In [5]:
from collections import Counter, defaultdict

# Initial training data
training_data = ["low", "lower", "high", "higher"]

# Preprocess training data to tokenize words as lists of characters with a special end-of-word token
def preprocess_data(data):
    return [list(word) + ['</w>'] for word in data]

# Count pairs in a given vocabulary
def get_pair_frequencies(vocab):
    pairs = defaultdict(int)
    for word in vocab:
        for i in range(len(word) - 1):
            pairs[(word[i], word[i + 1])] += 1
    return pairs

# Merge the most frequent pair in the vocabulary
def merge_pair(vocab, pair):
    new_vocab = []
    bigram = "".join(pair)
    for word in vocab:
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                new_word.append(bigram)  # Merge pair
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_vocab.append(new_word)
    return new_vocab

# Main BPE algorithm with a min-frequency threshold
def byte_pair_encoding(data, min_frequency=2):
    vocab = preprocess_data(data)
    iterations = []
    
    while True:
        # Get frequency of each pair
        pair_freq = get_pair_frequencies(vocab)
        
        # Find the most common pair with frequency >= min_frequency
        most_frequent_pair = None
        max_freq = 0
        for pair, freq in pair_freq.items():
            if freq > max_freq and freq >= min_frequency:
                most_frequent_pair = pair
                max_freq = freq
        
        # If no pairs meet the frequency requirement, break
        if most_frequent_pair is None:
            break
        
        # Merge the most frequent pair
        vocab = merge_pair(vocab, most_frequent_pair)
        
        # Save the iteration state
        iterations.append({
            "pair": most_frequent_pair,
            "frequency": max_freq,
            "vocab_state": ["".join(word) for word in vocab]
        })
    
    return iterations

# Run BPE on the training data with min frequency 2
bpe_iterations = byte_pair_encoding(training_data, min_frequency=2)

# Display the iterations
for i, iteration in enumerate(bpe_iterations):
    print(f"Iteration {i + 1}:")
    print(f"  Most Frequent Pair: {iteration['pair']} (Frequency: {iteration['frequency']})")
    print(f"  Vocabulary State: {iteration['vocab_state']}")
    print()


Iteration 1:
  Most Frequent Pair: ('l', 'o') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 2:
  Most Frequent Pair: ('lo', 'w') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 3:
  Most Frequent Pair: ('e', 'r') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 4:
  Most Frequent Pair: ('er', '</w>') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 5:
  Most Frequent Pair: ('h', 'i') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 6:
  Most Frequent Pair: ('hi', 'g') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']

Iteration 7:
  Most Frequent Pair: ('hig', 'h') (Frequency: 2)
  Vocabulary State: ['low</w>', 'lower</w>', 'high</w>', 'higher</w>']



# Q4

Suppose that word embeddings (with the same dimensions) were created using an approach which preserves relationships between embeddings precisely.

A) If the following equations are correct calculate the embedding of “brothers”:

E(students) - E(student) = [0,0,2]

E(father) - E(mother) = [0,1,0]

E(sister)=[-2,1,0]

 

E(brothers)= ?

 

B) Suppose that the following sentence is processed using self-attention mechanism and the context is already added to embeddings of words. What is the embedding of “sibling”  after adding the context by the self-attention? 

Sentence: “I have one male sibling”

 

E(sibling)= ?

In [6]:
import numpy as np

# Given information:
E_students_minus_E_student = np.array([0, 0, 2])
E_father_minus_E_mother = np.array([0, 1, 0])
E_sister = np.array([-2, 1, 0])

# Part A: Calculating E(brothers)
# Assuming E(brothers) can be calculated similarly to the plural relationship:
# E(brothers) = E(father) + (E(students) - E(student))
# Since we don't have E(father) directly, but we have E(father) - E(mother), 
# we'll need to express E(brothers) in terms of E(mother).

# Let E_mother be the base, then:
# E_father = E_mother + E(father - mother)
# E_brothers = E_father + (E(students) - E(student))
# Substituting E_father:
E_mother = np.array([0, 0, 0])  # We'll assume E_mother as [0,0,0] if no explicit embedding is given
E_father = E_mother + E_father_minus_E_mother
E_brothers = E_father + E_students_minus_E_student

print("Embedding of brothers (E(brothers)):", E_brothers)

# Part B: Embedding of "sibling" after adding context by self-attention
# Assuming the context added for the word "male" to "sibling" aligns sibling towards male attributes.
# Since we lack specific contextual vectors, let's assume sibling's embedding is similar to an average of male-related embeddings
# in this simplified context. 

# Hypothetically, let's say the context embedding shift for "male sibling" is [1, 0, 0] to indicate gender aspect.
# E(sibling) after self-attention would be:
E_sibling = E_brothers + np.array([1, 0, 0])  # Adding context indicating "male"

print("Embedding of sibling with context (E(sibling)):", E_sibling)


Embedding of brothers (E(brothers)): [0 1 2]
Embedding of sibling with context (E(sibling)): [1 1 2]


# Q5

**Use Greedy and Beam Search (beam width=3) as decoding strategies for a sentence which starts with “I” and the following probabilities. Show steps of both decoding strategies and the resulting sentences.**

In [7]:
import numpy as np

# Define the probability dictionary for each word and its possible next words
probabilities = {
    "I": {
        "have": 0.20,
        "buy": 0.25,
        "had": 0.15,
        "got": 0.10
    },
    "have": {
        "this": 0.10,
        "that": 0.15,
        "it": 0.20,
        "not": 0.05
    },
    "buy": {
        "it": 0.10,
        "nothing": 0.05,
        "anything": 0.01,
        "something": 0.05
    },
    "had": {
        "this": 0.10,
        "that": 0.10,
        "it": 0.20,
        "not": 0.02
    },
    "got": {
        "nothing": 0.05,
        "it": 0.20,
        "something": 0.10,
        "this": 0.10
    }
}

# Greedy Search
def greedy_search(start_word, probabilities):
    sentence = [start_word]
    current_word = start_word
    
    while current_word in probabilities:
        # Select the next word with the highest probability
        next_word = max(probabilities[current_word], key=probabilities[current_word].get)
        sentence.append(next_word)
        current_word = next_word
    
    return sentence

# Beam Search
def beam_search(start_word, probabilities, beam_width=3):
    beams = [(start_word, 1.0)]  # Each beam is a tuple (sentence, probability)
    
    for _ in range(2):  # Limit to two steps (Step1 and Step2) for this example
        new_beams = []
        
        for sentence, prob in beams:
            last_word = sentence.split()[-1]
            
            if last_word in probabilities:
                # Expand each beam by the possible next words
                for next_word, next_prob in probabilities[last_word].items():
                    new_sentence = sentence + " " + next_word
                    new_prob = prob * next_prob
                    new_beams.append((new_sentence, new_prob))
        
        # Sort beams by probability and keep the top-k (beam width)
        new_beams = sorted(new_beams, key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_width]
    
    return beams

# Perform Greedy Search and Beam Search
greedy_result = greedy_search("I", probabilities)
beam_result = beam_search("I", probabilities, beam_width=3)

# Display Results
print("Greedy Search Result:")
print("Sentence:", " ".join(greedy_result))
print("\nBeam Search Results (Top 3):")
for sentence, prob in beam_result:
    print(f"Sentence: {sentence}, Probability: {prob}")


Greedy Search Result:
Sentence: I buy it

Beam Search Results (Top 3):
Sentence: I have it, Probability: 0.04000000000000001
Sentence: I have that, Probability: 0.03
Sentence: I had it, Probability: 0.03
