In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [23]:
# Load the lightweight BERT model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")
model = AutoModel.from_pretrained("prajjwal1/bert-small")
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 512, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-3): 4 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


# Task 1

In [7]:

# Dictionary to cache computed word embeddings
embedding_cache = {}

def get_embedding(word):
    """
    Given a word, computes its embedding using the BERT model.
    If the word tokenizes into multiple tokens, their embeddings are averaged.
    The result is cached for efficiency.
    """
    # Use lower-case since we are using an uncased model.
    word = word.lower()
    if word in embedding_cache:
        return embedding_cache[word]
    with torch.no_grad():
        # Tokenize without adding special tokens so that only the word’s sub–tokens are processed
        inputs = tokenizer(word, add_special_tokens=False, return_tensors="pt")
        outputs = model(**inputs)
        # outputs.last_hidden_state has shape (1, sequence_length, hidden_size)
        token_embeds = outputs.last_hidden_state.squeeze(0)  # shape: (sequence_length, hidden_size)
        # If the word is split into multiple tokens, average the token embeddings
        if token_embeds.dim() == 1:
            embed = token_embeds
        else:
            embed = token_embeds.mean(dim=0)
        embedding_cache[word] = embed
        return embed

def read_analogy_file(file_path):
    """
    Reads the analogy file and splits it into groups.
    Each group starts with a line beginning with a colon (":") specifying the group name.
    The following lines (until the next group header) are assumed to be analogies
    in the format: a b c d
    """
    groups = {}
    current_group = None
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # If the line starts with ":" it indicates a new group.
            if line.startswith(":"):
                group_name = line[1:].strip()
                current_group = group_name
                groups[current_group] = []
            else:
                # Each line should contain exactly 4 words
                tokens = line.split()
                if len(tokens) == 4 and current_group is not None:
                    groups[current_group].append(tokens)
    return groups

def evaluate_group(analogy_list):
    """
    For a given list of analogies (each a list of 4 words [a, b, c, d]),
    compute prediction accuracy for varying cutoff values k based on two measures:
      - Cosine similarity: higher is better.
      - L2 distance: lower is better.
    
    The candidate set is defined as all unique words that appear in the second (b)
    and fourth (d) positions among the analogies in the group.
    Returns the list of ks and two dictionaries mapping each k to the accuracy percentage.
    """
    # Build candidate set: unique words from the second and fourth positions in the group
    candidates = set()
    for tokens in analogy_list:
        candidates.add(tokens[1].lower())
        candidates.add(tokens[3].lower())
    candidates = list(candidates)

    ks = [1, 2, 5, 10, 20]
    cosine_correct = {k: 0 for k in ks}
    l2_correct = {k: 0 for k in ks}
    total = len(analogy_list)
    
    for tokens in analogy_list:
        # Unpack the analogy: a is to b as c is to d.
        a, b, c, d = [word.lower() for word in tokens]
        # Compute the reference difference vector from the known pair (a and b)
        ref_diff = get_embedding(a) - get_embedding(b)
        
        # For each candidate, compute the difference vector with respect to c
        cos_scores = {}
        l2_scores = {}
        for cand in candidates:
            candidate_diff = get_embedding(c) - get_embedding(cand)
            # For cosine similarity: higher is better
            cos_sim = F.cosine_similarity(ref_diff, candidate_diff, dim=0)
            # For L2 distance: lower is better
            l2_distance = torch.norm(ref_diff - candidate_diff, p=2)
            cos_scores[cand] = cos_sim.item()
            l2_scores[cand] = l2_distance.item()
        
        # Sort candidates by cosine similarity (descending order) and by L2 distance (ascending order)
        sorted_cos = sorted(cos_scores.items(), key=lambda x: x[1], reverse=True)
        sorted_l2 = sorted(l2_scores.items(), key=lambda x: x[1])
        
        # For each cutoff k, check if the true answer d is among the top k candidates
        for k in ks:
            top_k_cos = [item[0] for item in sorted_cos[:k]]
            top_k_l2 = [item[0] for item in sorted_l2[:k]]
            if d in top_k_cos:
                cosine_correct[k] += 1
            if d in top_k_l2:
                l2_correct[k] += 1
                
    # Calculate accuracy percentages
    cosine_acc = {k: (cosine_correct[k] / total) * 100 for k in ks}
    l2_acc = {k: (l2_correct[k] / total) * 100 for k in ks}
    
    return ks, cosine_acc, l2_acc

def print_table(group_name, ks, cosine_acc, l2_acc):
    """
    Prints a formatted results table for a given group.
    """
    print("Group:", group_name)
    print("{:<5} {:<30} {:<30}".format("k", "Accuracy (Cosine Similarity)", "Accuracy (L2 Distance)"))
    for k in ks:
        print("{:<5} {:<30.2f} {:<30.2f}".format(k, cosine_acc[k], l2_acc[k]))
    print("\n" + "-"*70 + "\n")
    
if __name__ == "__main__":
    # Read the analogy dataset from the local file
    file_path = "./data/task_1_data.txt"
    groups = read_analogy_file(file_path)
    
    # Choose three groups; one of them must be a 'capital' group.
    selected_groups = {}
    capital_group_key = None
    
    # Look for a group name containing 'capital' (case-insensitive)
    for key in groups.keys():
        if "capital" in key.lower():
            capital_group_key = key
            break
    if capital_group_key is None:
        raise ValueError("No capital-related group found in the dataset!")
    
    selected_groups[capital_group_key] = groups[capital_group_key]
    
    # Add any two other groups (excluding the already selected capital group)
    count = 1
    for key in groups.keys():
        if key == capital_group_key:
            continue
        if count >= 3:
            break
        selected_groups[key] = groups[key]
        count += 1
    
    # Evaluate each selected group and output the results table
    for group_name, analogy_list in selected_groups.items():
        ks, cosine_acc, l2_acc = evaluate_group(analogy_list)
        print_table(group_name, ks, cosine_acc, l2_acc)


Group: capital-common-countries
k     Accuracy (Cosine Similarity)   Accuracy (L2 Distance)        
1     64.82                          98.02                         
2     76.48                          99.01                         
5     86.17                          100.00                        
10    94.47                          100.00                        
20    99.21                          100.00                        

----------------------------------------------------------------------

Group: capital-world
k     Accuracy (Cosine Similarity)   Accuracy (L2 Distance)        
1     16.78                          51.99                         
2     21.68                          59.86                         
5     31.12                          71.24                         
10    40.52                          78.69                         
20    52.83                          85.08                         

---------------------------------------------------------

# Task 2

In [24]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_sentence_embedding(text):
    """
    Given a review text, compute its sentence embedding using the BERT model.
    We use the [CLS] token's output as the sentence representation.
    """
    # Tokenize and encode the text. We enable truncation and padding.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    # Move inputs to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        # Get the [CLS] token embedding: first token of the sequence.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
    
    return cls_embedding.squeeze(0).cpu().numpy()

def compute_embeddings(texts):
    """
    Given a list of texts, compute the BERT embeddings.
    (This function processes texts one by one; for many samples consider batching.)
    """
    embeddings = []
    for text in texts:
        emb = get_sentence_embedding(text)
        embeddings.append(emb)
    return np.vstack(embeddings)


# Load the dataset. Adjust the file path and column names as needed.
file_path = "./data/amazon_reviews.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Read CSV into a pandas DataFrame.
data = pd.read_csv(file_path)

# Check that the expected columns exist.
if "reviewText" not in data.columns or "overall" not in data.columns:
    raise ValueError("The CSV file must contain 'reviewText' and 'overall' columns.")

# Extract text and labels.
texts = data["reviewText"].astype(str).tolist()
labels = data["overall"].values  # original labels in range 1-5

# Convert ratings to zero-indexed labels: 1 becomes 0, 2 becomes 1, etc.
labels = labels - 1

# Split the data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

print("Computing embeddings for training samples...")
train_embeddings = compute_embeddings(X_train)
print("Computing embeddings for test samples...")
test_embeddings = compute_embeddings(X_test)

# -----------------------------
# Model 1: Logistic Regression
# -----------------------------
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_embeddings, y_train)
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr))

# -----------------------------
# Model 2: XGBoost Classifier
# -----------------------------
print("\nTraining XGBoost Classifier...")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(train_embeddings, y_train)
y_pred_xgb = xgb_model.predict(test_embeddings)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy: {:.2f}%".format(xgb_acc * 100))
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))


Computing embeddings for training samples...
Computing embeddings for test samples...

Training Logistic Regression...
Logistic Regression Accuracy: 77.62%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.45      0.48      0.47        50
         1.0       0.08      0.05      0.06        20
         2.0       0.17      0.16      0.17        25
         3.0       0.20      0.10      0.14       106
         4.0       0.86      0.92      0.89       782

    accuracy                           0.78       983
   macro avg       0.35      0.34      0.34       983
weighted avg       0.74      0.78      0.75       983


Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 79.96%
Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       0.57      0.24      0.34        50
         1.0       0.00      0.00      0.00        20
         2.0       0.00      0.00      0.00        25
         3.0       0.00      0.00      0.00       106
         4.0       0.81      0.99      0.89       782

    accuracy                           0.80       983
   macro avg       0.28      0.25      0.25       983
weighted avg       0.67      0.80      0.73       983



In [None]:
# -----------------------------
# Model 1: Logistic Regression
# -----------------------------
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_embeddings, y_train)

# Evaluate on training set
y_pred_train_lr = lr_model.predict(train_embeddings)
train_acc_lr = accuracy_score(y_train, y_pred_train_lr)
print("\nLogistic Regression - Training Accuracy: {:.2f}%".format(train_acc_lr * 100))
print("Classification Report (Logistic Regression on Training Set):")
print(classification_report(y_train, y_pred_train_lr))

# Evaluate on test set
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("\nLogistic Regression - Test Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression on Test Set):")
print(classification_report(y_test, y_pred_lr))

# -----------------------------
# Model 2: XGBoost Classifier
# -----------------------------
print("\nTraining XGBoost Classifier...")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(train_embeddings, y_train)

# Evaluate on training set
y_pred_train_xgb = xgb_model.predict(train_embeddings)
train_acc_xgb = accuracy_score(y_train, y_pred_train_xgb)
print("\nXGBoost - Training Accuracy: {:.2f}%".format(train_acc_xgb * 100))
print("Classification Report (XGBoost on Training Set):")
print(classification_report(y_train, y_pred_train_xgb))

# Evaluate on test set
y_pred_xgb = xgb_model.predict(test_embeddings)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("\nXGBoost - Test Accuracy: {:.2f}%".format(xgb_acc * 100))
print("Classification Report (XGBoost on Test Set):")
print(classification_report(y_test, y_pred_xgb))


Training Logistic Regression...

Logistic Regression - Training Accuracy: 89.67%
Classification Report (Logistic Regression on Training Set):
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95       194
         1.0       0.98      0.93      0.96        60
         2.0       0.95      0.66      0.78       117
         3.0       0.69      0.31      0.42       421
         4.0       0.90      0.98      0.94      3140

    accuracy                           0.90      3932
   macro avg       0.89      0.77      0.81      3932
weighted avg       0.88      0.90      0.88      3932


Logistic Regression - Test Accuracy: 77.62%
Classification Report (Logistic Regression on Test Set):
              precision    recall  f1-score   support

         0.0       0.45      0.48      0.47        50
         1.0       0.08      0.05      0.06        20
         2.0       0.17      0.16      0.17        25
         3.0       0.20      0.10      0.14      

Parameters: { "use_label_encoder" } are not used.




XGBoost - Training Accuracy: 99.97%
Classification Report (XGBoost on Training Set):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00        60
         2.0       1.00      1.00      1.00       117
         3.0       1.00      1.00      1.00       421
         4.0       1.00      1.00      1.00      3140

    accuracy                           1.00      3932
   macro avg       1.00      1.00      1.00      3932
weighted avg       1.00      1.00      1.00      3932


XGBoost - Test Accuracy: 79.96%
Classification Report (XGBoost on Test Set):
              precision    recall  f1-score   support

         0.0       0.57      0.24      0.34        50
         1.0       0.00      0.00      0.00        20
         2.0       0.00      0.00      0.00        25
         3.0       0.00      0.00      0.00       106
         4.0       0.81      0.99      0.89       782

    accuracy         

In [32]:
# -----------------------------
# Model 1: Logistic Regression
# -----------------------------
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_embeddings, y_train)

# Evaluate on training set
y_pred_train_lr = lr_model.predict(train_embeddings)
train_acc_lr = accuracy_score(y_train, y_pred_train_lr)
print("\nLogistic Regression - Training Accuracy: {:.2f}%".format(train_acc_lr * 100))
print("Classification Report (Logistic Regression on Training Set):")
print(classification_report(y_train, y_pred_train_lr))

# Evaluate on test set
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("\nLogistic Regression - Test Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression on Test Set):")
print(classification_report(y_test, y_pred_lr))

# -----------------------------
# Model 2: XGBoost Classifier
# -----------------------------
print("\nTraining XGBoost Classifier...")
# Updated XGBoost model with tuned hyperparameters for better generalization.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    max_depth=3,           # Limit tree depth to prevent overfitting.
    learning_rate=0.1,     # Moderate learning rate.
    n_estimators=50,      # Limit number of trees.
    subsample=0.8,         # Use 80% of samples for each tree.
    colsample_bytree=0.8,  # Use 80% of features for each tree.
    reg_alpha=0.1,         # L1 regularization.
    reg_lambda=1.0         # L2 regularization.
)

# Train XGBoost with early stopping to mitigate overfitting.
xgb_model.fit(
    train_embeddings, 
    y_train, 
    eval_set=[(train_embeddings, y_train), (test_embeddings, y_test)],
)

# Evaluate on training set
y_pred_train_xgb = xgb_model.predict(train_embeddings)
train_acc_xgb = accuracy_score(y_train, y_pred_train_xgb)
print("\nXGBoost - Training Accuracy: {:.2f}%".format(train_acc_xgb * 100))
print("Classification Report (XGBoost on Training Set):")
print(classification_report(y_train, y_pred_train_xgb))

# Evaluate on test set
y_pred_xgb = xgb_model.predict(test_embeddings)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("\nXGBoost - Test Accuracy: {:.2f}%".format(xgb_acc * 100))
print("Classification Report (XGBoost on Test Set):")
print(classification_report(y_test, y_pred_xgb))


Training Logistic Regression...

Logistic Regression - Training Accuracy: 89.67%
Classification Report (Logistic Regression on Training Set):
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95       194
         1.0       0.98      0.93      0.96        60
         2.0       0.95      0.66      0.78       117
         3.0       0.69      0.31      0.42       421
         4.0       0.90      0.98      0.94      3140

    accuracy                           0.90      3932
   macro avg       0.89      0.77      0.81      3932
weighted avg       0.88      0.90      0.88      3932


Logistic Regression - Test Accuracy: 77.62%
Classification Report (Logistic Regression on Test Set):
              precision    recall  f1-score   support

         0.0       0.45      0.48      0.47        50
         1.0       0.08      0.05      0.06        20
         2.0       0.17      0.16      0.17        25
         3.0       0.20      0.10      0.14      

Parameters: { "use_label_encoder" } are not used.



[7]	validation_0-mlogloss:0.92974	validation_1-mlogloss:0.96387
[8]	validation_0-mlogloss:0.88853	validation_1-mlogloss:0.92721
[9]	validation_0-mlogloss:0.85155	validation_1-mlogloss:0.89446
[10]	validation_0-mlogloss:0.81848	validation_1-mlogloss:0.86589
[11]	validation_0-mlogloss:0.78927	validation_1-mlogloss:0.84063
[12]	validation_0-mlogloss:0.76276	validation_1-mlogloss:0.81903
[13]	validation_0-mlogloss:0.73885	validation_1-mlogloss:0.79888
[14]	validation_0-mlogloss:0.71742	validation_1-mlogloss:0.78200
[15]	validation_0-mlogloss:0.69762	validation_1-mlogloss:0.76591
[16]	validation_0-mlogloss:0.68049	validation_1-mlogloss:0.75197
[17]	validation_0-mlogloss:0.66400	validation_1-mlogloss:0.73928
[18]	validation_0-mlogloss:0.64930	validation_1-mlogloss:0.72823
[19]	validation_0-mlogloss:0.63493	validation_1-mlogloss:0.71813
[20]	validation_0-mlogloss:0.62203	validation_1-mlogloss:0.70908
[21]	validation_0-mlogloss:0.61020	validation_1-mlogloss:0.70030
[22]	validation_0-mlogloss:0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
# -----------------------------
# Model 1: Logistic Regression
# -----------------------------
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, class_weight="balanced")
lr_model.fit(train_embeddings, y_train)
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr))



Training Logistic Regression...
Logistic Regression Accuracy: 61.55%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.33      0.44      0.38        50
         1.0       0.15      0.25      0.19        20
         2.0       0.13      0.32      0.19        25
         3.0       0.18      0.41      0.25       106
         4.0       0.90      0.67      0.77       782

    accuracy                           0.62       983
   macro avg       0.34      0.42      0.35       983
weighted avg       0.76      0.62      0.67       983

