# Project #2 – Transformers for NLP

In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [23]:
# Load the lightweight BERT model and its tokenizer.
# A single checkpoint name feeds both loaders, so the tokenizer and the model
# cannot drift apart when a different checkpoint is tried.
MODEL_NAME = "prajjwal1/bert-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Inference only: eval() switches the module to evaluation mode (dropout off).
# It returns the model, so as the last expression of the cell it also displays
# the architecture.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 512, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-3): 4 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


## Task 1

In [None]:
# Memoises word -> embedding so each distinct word is run through the model once.
embedding_cache = {}


def get_embedding(word):
    """Return a single vector for `word`, averaged over its sub-word tokens.

    The word is lower-cased, tokenised without special tokens, passed through
    the module-level `model`, and the rows of the last hidden state are
    averaged. Results are memoised in `embedding_cache` under the lower-cased
    word.

    Args:
        word: The word to embed (case-insensitive).

    Returns:
        A 1-D torch tensor holding the mean token embedding.

    Raises:
        ValueError: If the tokenizer produces no tokens for `word` (for
            example an empty string), since a mean over zero tokens is
            undefined.
    """
    word = word.lower()
    if word in embedding_cache:
        return embedding_cache[word]

    with torch.no_grad():
        inputs = tokenizer(word, add_special_tokens=False, return_tensors="pt")
        if inputs["input_ids"].numel() == 0:
            raise ValueError(f"Word {word!r} produced no tokens; cannot embed it.")
        # Keep the inputs on whatever device the model currently lives on, so
        # the lookup still works after the model has been moved to a GPU.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        outputs = model(**inputs)
        # Drop the batch dimension; what remains is one row per sub-word token.
        token_embeds = outputs.last_hidden_state.squeeze(0)
        if token_embeds.dim() == 1:
            embed = token_embeds
        else:
            embed = token_embeds.mean(dim=0)

    embedding_cache[word] = embed
    return embed

def read_analogy_file(file_path):
    """Parse an analogy file into a mapping of group name -> list of analogies.

    A line starting with ':' opens a new group named by the rest of the line
    (stripped). Every other non-blank line is split on whitespace and kept
    only when it has exactly four tokens and a group is already open.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each group name to a list of 4-token lists, in file order.
    """
    sections = {}
    active = None
    with open(file_path, 'r') as handle:
        for raw in handle:
            entry = raw.strip()
            if entry == "":
                continue
            if entry[0] == ":":
                # Header line: start (or restart) the named group.
                active = entry[1:].strip()
                sections[active] = []
                continue
            words = entry.split()
            # Skip malformed rows and rows that appear before any header.
            if active is None or len(words) != 4:
                continue
            sections[active].append(words)
    return sections

def evaluate_group(analogy_list):
    """Score top-k analogy accuracy for one group using embedding offsets.

    For every analogy (a, b, c, d) the offset ``emb(a) - emb(b)`` is compared
    with ``emb(c) - emb(candidate)`` for each candidate word. Candidates are
    the second and fourth words of every analogy in the group. Candidates are
    ranked once by cosine similarity (higher is better) and once by L2
    distance between the two offsets (lower is better). An analogy counts as
    correct at k when d is among the k best candidates.

    Args:
        analogy_list: List of 4-token analogies ``[a, b, c, d]``.

    Returns:
        Tuple ``(ks, cosine_acc, l2_acc)`` where ``ks`` is the list of cut-offs
        and the two dicts map each k to an accuracy percentage. An empty
        ``analogy_list`` yields 0.0 for every k.
    """
    ks = [1, 2, 5, 10, 20]
    total = len(analogy_list)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return ks, {k: 0.0 for k in ks}, {k: 0.0 for k in ks}

    # The pool must exist before words are added to it.
    candidates = set()
    for tokens in analogy_list:
        candidates.add(tokens[1].lower())
        candidates.add(tokens[3].lower())
    # Sorted so that tie-breaking between equal scores is reproducible.
    candidates = sorted(candidates)

    cosine_correct = {k: 0 for k in ks}
    l2_correct = {k: 0 for k in ks}

    for tokens in analogy_list:
        a, b, c, d = [word.lower() for word in tokens]
        ref_diff = get_embedding(a) - get_embedding(b)
        c_embed = get_embedding(c)  # loop-invariant for the candidate scan

        cos_scores = {}
        l2_scores = {}
        for cand in candidates:
            candidate_diff = c_embed - get_embedding(cand)
            cos_sim = F.cosine_similarity(ref_diff, candidate_diff, dim=0)
            l2_distance = torch.norm(ref_diff - candidate_diff, p=2)
            cos_scores[cand] = cos_sim.item()
            l2_scores[cand] = l2_distance.item()

        # Best-first rankings: highest similarity, lowest distance.
        ranked_cos = [name for name, _ in
                      sorted(cos_scores.items(), key=lambda x: x[1], reverse=True)]
        ranked_l2 = [name for name, _ in
                     sorted(l2_scores.items(), key=lambda x: x[1])]

        for k in ks:
            if d in ranked_cos[:k]:
                cosine_correct[k] += 1
            if d in ranked_l2[:k]:
                l2_correct[k] += 1

    cosine_acc = {k: (cosine_correct[k] / total) * 100 for k in ks}
    l2_acc = {k: (l2_correct[k] / total) * 100 for k in ks}

    return ks, cosine_acc, l2_acc

def print_table(group_name, ks, cosine_acc, l2_acc):
    """Print a fixed-width accuracy table for one analogy group.

    Args:
        group_name: Label printed on the first line.
        ks: Iterable of cut-offs; one row is printed per k, in order.
        cosine_acc: Mapping k -> accuracy under cosine similarity.
        l2_acc: Mapping k -> accuracy under L2 distance.
    """
    header_fmt = "{:<5} {:<30} {:<30}"
    row_fmt = "{:<5} {:<30.2f} {:<30.2f}"

    print("Group:", group_name)
    print(header_fmt.format("k", "Accuracy (Cosine Similarity)", "Accuracy (L2 Distance)"))
    # Rows are formatted lazily, one per cut-off, with accuracies at 2 decimals.
    for row in (row_fmt.format(k, cosine_acc[k], l2_acc[k]) for k in ks):
        print(row)

    separator = "-" * 70
    print("\n" + separator + "\n")
    
if __name__ == "__main__":
    file_path = "./data/task_1_data.txt"
    groups = read_analogy_file(file_path)

    # First group, in file order, whose name mentions "capital".
    capital_group_key = next(
        (name for name in groups if "capital" in name.lower()),
        None,
    )
    if capital_group_key is None:
        raise ValueError("No capital-related group found in the dataset!")

    # The capital group leads, followed by the first two remaining groups,
    # for at most three groups in total.
    selected_groups = {capital_group_key: groups[capital_group_key]}
    remaining = [name for name in groups if name != capital_group_key]
    for key in remaining[:2]:
        selected_groups[key] = groups[key]

    # Evaluate and report each selected group in insertion order.
    for group_name, analogy_list in selected_groups.items():
        ks, cosine_acc, l2_acc = evaluate_group(analogy_list)
        print_table(group_name, ks, cosine_acc, l2_acc)


Group: capital-common-countries
k     Accuracy (Cosine Similarity)   Accuracy (L2 Distance)        
1     16.80                          89.13                         
2     38.34                          96.84                         
5     59.88                          98.81                         
10    80.24                          100.00                        
20    96.64                          100.00                        

----------------------------------------------------------------------

Group: capital-world
k     Accuracy (Cosine Similarity)   Accuracy (L2 Distance)        
1     5.04                           34.88                         
2     11.45                          52.14                         
5     18.35                          64.83                         
10    26.02                          73.05                         
20    37.44                          81.61                         

---------------------------------------------------------

#### **Note: the numbers may differ slightly from those in the report, because we switched between bert-tiny, bert-mini and bert-small during testing.**

## Task 2

### Initial implementation: Logistic Regression and XGBoost

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_sentence_embedding(text):
    """Embed `text` as the hidden state of its first token.

    The text is tokenised (truncated to at most 256 tokens), run through the
    module-level `model` without gradient tracking, and the last-layer hidden
    state at sequence position 0 is returned.

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array holding the first-token embedding.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    # Send the inputs to the device the model is actually on. Using a separately
    # chosen device fails with a device-mismatch error whenever the model has
    # not been moved there as well.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # First token of the sequence (presumably [CLS], given the default
        # special tokens added by the tokenizer).
        cls_embedding = outputs.last_hidden_state[:, 0, :]

    # Drop the batch dimension and hand back a CPU NumPy array.
    return cls_embedding.squeeze(0).cpu().numpy()

def compute_embeddings(texts):
    """Embed every text and stack the vectors into one 2-D array.

    Args:
        texts: Iterable of strings; each is embedded individually with
            `get_sentence_embedding`.

    Returns:
        NumPy array with one row per input text, in input order.
    """
    return np.vstack([get_sentence_embedding(text) for text in texts])



# --- Load the review data ---------------------------------------------------
file_path = "./data/amazon_reviews.csv"
data = pd.read_csv(file_path)
# astype(str) guarantees every entry is a string before tokenisation.
texts = data["reviewText"].astype(str).tolist()
labels = data["overall"].values  # original labels in range 1-5

# Convert ratings to zero-indexed labels for XGBoost to handle them properly
labels = labels - 1

# --- 80/20 train/test split with a fixed seed for reproducibility ------------
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# --- Embed each review once; both classifiers share these features ----------
print("Computing embeddings for training samples...")
train_embeddings = compute_embeddings(X_train)
print("Computing embeddings for test samples...")
test_embeddings = compute_embeddings(X_test)


# Model 1: Logistic Regression
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_embeddings, y_train)
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression):")
# zero_division=0 reports 0 (without a warning) for classes never predicted.
print(classification_report(y_test, y_pred_lr, zero_division=0))


# Model 2: XGBoost Classifier
print("\nTraining XGBoost Classifier...")
# `use_label_encoder` is no longer accepted by XGBoost (it only triggers a
# "Parameters ... are not used" warning), so it is omitted.
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(train_embeddings, y_train)
y_pred_xgb = xgb_model.predict(test_embeddings)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy: {:.2f}%".format(xgb_acc * 100))
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb, zero_division=0))


Computing embeddings for training samples...
Computing embeddings for test samples...

Training Logistic Regression...
Logistic Regression Accuracy: 77.62%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.45      0.48      0.47        50
         1.0       0.08      0.05      0.06        20
         2.0       0.17      0.16      0.17        25
         3.0       0.20      0.10      0.14       106
         4.0       0.86      0.92      0.89       782

    accuracy                           0.78       983
   macro avg       0.35      0.34      0.34       983
weighted avg       0.74      0.78      0.75       983


Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 79.96%
Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       0.57      0.24      0.34        50
         1.0       0.00      0.00      0.00        20
         2.0       0.00      0.00      0.00        25
         3.0       0.00      0.00      0.00       106
         4.0       0.81      0.99      0.89       782

    accuracy                           0.80       983
   macro avg       0.28      0.25      0.25       983
weighted avg       0.67      0.80      0.73       983



### Experiment: Tweaked Logistic Regression to address class imbalance

In [None]:
# Re-train Logistic Regression on the same embeddings, this time with
# class_weight="balanced" to counter the class imbalance.
# NOTE: this rebinds lr_model, y_pred_lr and lr_acc, replacing the results of
# the initial (unweighted) Logistic Regression run.
print("\nTraining Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, class_weight="balanced")
lr_model.fit(train_embeddings, y_train)
# Evaluate on the held-out test embeddings.
y_pred_lr = lr_model.predict(test_embeddings)
lr_acc = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy: {:.2f}%".format(lr_acc * 100))
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr))



Training Logistic Regression...
Logistic Regression Accuracy: 61.55%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.33      0.44      0.38        50
         1.0       0.15      0.25      0.19        20
         2.0       0.13      0.32      0.19        25
         3.0       0.18      0.41      0.25       106
         4.0       0.90      0.67      0.77       782

    accuracy                           0.62       983
   macro avg       0.34      0.42      0.35       983
weighted avg       0.76      0.62      0.67       983



### Experiment: Tweaked XGBoost to make it generalize better (reduce overfitting)

In [None]:
# tuned hyperparameters for better generalization
print("Training XGBoost Classifier...")
# `use_label_encoder` is no longer accepted by XGBoost (it only triggers a
# "Parameters ... are not used" warning), so it is omitted.
xgb_model = XGBClassifier(
    eval_metric='mlogloss',
    max_depth=3,            # shallow trees
    learning_rate=0.1,      # shrinkage applied to each boosting step
    n_estimators=50,        # number of boosting rounds
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of features sampled per tree
    reg_alpha=0.1,          # L1 regularisation
    reg_lambda=1.0          # L2 regularisation
)
# eval_set only drives the per-round mlogloss log (train first, test second);
# no early stopping is configured, so it does not affect the fitted model.
xgb_model.fit(
    train_embeddings,
    y_train,
    eval_set=[(train_embeddings, y_train), (test_embeddings, y_test)],
)


# Evaluate on training set
y_pred_train_xgb = xgb_model.predict(train_embeddings)
train_acc_xgb = accuracy_score(y_train, y_pred_train_xgb)
print("\nXGBoost - Training Accuracy: {:.2f}%".format(train_acc_xgb * 100))
print("Classification Report (XGBoost on Training Set):")
# zero_division=0 reports 0 (without a warning) for classes never predicted.
print(classification_report(y_train, y_pred_train_xgb, zero_division=0))

# Evaluate on test set
y_pred_xgb = xgb_model.predict(test_embeddings)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("\nXGBoost - Test Accuracy: {:.2f}%".format(xgb_acc * 100))
print("Classification Report (XGBoost on Test Set):")
print(classification_report(y_test, y_pred_xgb, zero_division=0))


Training XGBoost Classifier...
[0]	validation_0-mlogloss:1.46149	validation_1-mlogloss:1.46535
[1]	validation_0-mlogloss:1.34245	validation_1-mlogloss:1.35067
[2]	validation_0-mlogloss:1.24347	validation_1-mlogloss:1.25597
[3]	validation_0-mlogloss:1.16051	validation_1-mlogloss:1.17752
[4]	validation_0-mlogloss:1.09013	validation_1-mlogloss:1.11197
[5]	validation_0-mlogloss:1.02931	validation_1-mlogloss:1.05592
[6]	validation_0-mlogloss:0.97626	validation_1-mlogloss:1.00674
[7]	validation_0-mlogloss:0.92974	validation_1-mlogloss:0.96387
[8]	validation_0-mlogloss:0.88853	validation_1-mlogloss:0.92721
[9]	validation_0-mlogloss:0.85155	validation_1-mlogloss:0.89446


Parameters: { "use_label_encoder" } are not used.



[10]	validation_0-mlogloss:0.81848	validation_1-mlogloss:0.86589
[11]	validation_0-mlogloss:0.78927	validation_1-mlogloss:0.84063
[12]	validation_0-mlogloss:0.76276	validation_1-mlogloss:0.81903
[13]	validation_0-mlogloss:0.73885	validation_1-mlogloss:0.79888
[14]	validation_0-mlogloss:0.71742	validation_1-mlogloss:0.78200
[15]	validation_0-mlogloss:0.69762	validation_1-mlogloss:0.76591
[16]	validation_0-mlogloss:0.68049	validation_1-mlogloss:0.75197
[17]	validation_0-mlogloss:0.66400	validation_1-mlogloss:0.73928
[18]	validation_0-mlogloss:0.64930	validation_1-mlogloss:0.72823
[19]	validation_0-mlogloss:0.63493	validation_1-mlogloss:0.71813
[20]	validation_0-mlogloss:0.62203	validation_1-mlogloss:0.70908
[21]	validation_0-mlogloss:0.61020	validation_1-mlogloss:0.70030
[22]	validation_0-mlogloss:0.59833	validation_1-mlogloss:0.69233
[23]	validation_0-mlogloss:0.58791	validation_1-mlogloss:0.68599
[24]	validation_0-mlogloss:0.57820	validation_1-mlogloss:0.68002
[25]	validation_0-mloglos

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
