In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
from datasets import load_dataset
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import spacy
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset
import pandas as pd

In [None]:
import torch

# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using GPU")
else:
    device = torch.device('cpu')
    print("Using CPU")


Using GPU


## Task 1: Phrase Similarity Classification

Given a pair of phrases, classify whether or not they are similar.

### Loading Dataset from Huggingface

The dataset is loaded from the Huggingface library.


In [None]:
phrase_train = load_dataset("PiC/phrase_similarity",split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
phrase_train.shape

(7004, 6)

In [None]:
phrase_val = load_dataset("PiC/phrase_similarity",split="validation")
phrase_test = load_dataset("PiC/phrase_similarity",split="test")

In [None]:
phrase_val.shape, phrase_test.shape

((1000, 6), (2000, 6))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Representation Using Average Word Embeddings

Phrase representation will be achieved through the computation of average word embeddings.


downloaded Glove text file from https://nlp.stanford.edu/projects/glove/

Converts GloVe embeddings from the original format to Word2Vec format and loads them for use with `KeyedVectors` in `gensim`.


In [None]:
glove_input_file = '/content/drive/My Drive/glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)


  glove2word2vec(glove_input_file, word2vec_output_file)


### Computing Average Word Embeddings

Define a function to compute the average word embedding for a given phrase using the GloVe model.

In [None]:
def get_average_embedding(phrase, model):
    embeddings = [model[word] for word in phrase.split() if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [None]:
X_train_glove = np.array([cosine_similarity([get_average_embedding(pair['phrase1'], glove)],
                                            [get_average_embedding(pair['phrase2'], glove)])[0][0]
                          for pair in phrase_train])
y_train = np.array([pair['label'] for pair in phrase_train])

In [None]:
clf_glove = LogisticRegression().fit(X_train_glove.reshape(-1, 1), y_train)

In [None]:
X_test_glove = np.array([cosine_similarity([get_average_embedding(pair['phrase1'], glove)],
                                           [get_average_embedding(pair['phrase2'], glove)])[0][0]
                         for pair in phrase_test])
y_test = np.array([pair['label'] for pair in phrase_test])

In [None]:
y_pred_glove = clf_glove.predict(X_test_glove.reshape(-1, 1))
print("GloVe Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_glove)}")
print(f"F1-Score: {f1_score(y_test, y_pred_glove)}")

GloVe Results
Accuracy: 0.4795
F1-Score: 0.47076766649720386


### Weighted Average Word Embeddings Using TF-IDF

In addition to simple averaging, we will implement a method that uses TF-IDF weights to compute weighted average word embeddings, thereby enhancing the representation quality.


In [None]:
tfidf = TfidfVectorizer()
tfidf.fit([" ".join([pair['phrase1'], pair['phrase2']]) for pair in phrase_train])

In [None]:
def get_weighted_average_embedding(phrase, model, tfidf_vectorizer):
    weights = tfidf_vectorizer.transform([phrase]).toarray()[0]
    words = phrase.split()
    embeddings = [model[word] * weights[idx] for idx, word in enumerate(words) if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [None]:
X_train_glove = np.array([cosine_similarity([get_weighted_average_embedding(pair['phrase1'], glove,tfidf)],
                                            [get_weighted_average_embedding(pair['phrase2'], glove,tfidf)])[0][0]
                          for pair in phrase_train])
y_train = np.array([pair['label'] for pair in phrase_train])

In [None]:
clf_glove = LogisticRegression().fit(X_train_glove.reshape(-1, 1), y_train)

In [None]:
X_test_glove = np.array([cosine_similarity([get_weighted_average_embedding(pair['phrase1'], glove,tfidf)],
                                           [get_weighted_average_embedding(pair['phrase2'], glove,tfidf)])[0][0]
                         for pair in phrase_test])
y_test = np.array([pair['label'] for pair in phrase_test])

In [None]:
y_pred_glove = clf_glove.predict(X_test_glove.reshape(-1, 1))
print("TF-IDF Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_glove)}")
print(f"F1-Score: {f1_score(y_test, y_pred_glove)}")

TF-IDF Results
Accuracy: 0.5
F1-Score: 0.0


### Refinement and Improvement

An improved version of the initial model will be developed, incorporating more sophisticated techniques for phrase similarity classification.


In [None]:
# Load spacy model for POS tagging
nlp = spacy.load('en_core_web_sm')


# Additional feature functions
def get_pos_similarity(phrase1, phrase2):
    doc1 = nlp(phrase1)
    doc2 = nlp(phrase2)
    pos1 = [token.pos_ for token in doc1]
    pos2 = [token.pos_ for token in doc2]
    return len(set(pos1).intersection(set(pos2))) / len(set(pos1).union(set(pos2)))

def extract_features(pair, model, tfidf_vectorizer):
    phrase1 = pair['phrase1']
    phrase2 = pair['phrase2']

    # Average embeddings
    avg_embedding1 = np.mean([model[word] for word in phrase1.split() if word in model], axis=0)
    avg_embedding2 = np.mean([model[word] for word in phrase2.split() if word in model], axis=0)

    # TF-IDF weighted embeddings
    weighted_embedding1 = get_weighted_average_embedding(phrase1, model, tfidf_vectorizer)
    weighted_embedding2 = get_weighted_average_embedding(phrase2, model, tfidf_vectorizer)

    # Cosine Similarity
    cosine_sim = cosine_similarity([weighted_embedding1], [weighted_embedding2])[0][0]

    # Euclidean Distance
    euclidean_dist = euclidean_distances([weighted_embedding1], [weighted_embedding2])[0][0]

    # Length Difference
    length_diff = abs(len(phrase1.split()) - len(phrase2.split()))

    # POS Tag Similarity
    pos_sim = get_pos_similarity(phrase1, phrase2)

    # Combine all features into a single feature vector
    return np.array([cosine_sim, euclidean_dist, length_diff, pos_sim])

# Create feature matrix for training
X_train = np.array([extract_features(pair, glove, tfidf) for pair in phrase_train])
y_train = np.array([pair['label'] for pair in phrase_train])

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Train an advanced model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Create feature matrix for testing
X_test = np.array([extract_features(pair, glove, tfidf) for pair in phrase_test])
y_test = np.array([pair['label'] for pair in phrase_test])

# Standardize test features
X_test = scaler.transform(X_test)

# Make predictions and evaluate
y_pred = clf.predict(X_test)




  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

Random Forest with Enhanced Features Results
Accuracy: 0.482
F1-Score: 0.5350089766606823


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [None]:
print("Random Forest with Enhanced Features Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1-Score: {f1_score(y_test, y_pred)}")

Random Forest with Enhanced Features Results
Accuracy: 0.482
F1-Score: 0.5350089766606823


# Sentence Similarity

In [None]:
sen_train = load_dataset("google-research-datasets/paws", "labeled_final",split="train")
sen_validation = load_dataset("google-research-datasets/paws", "labeled_final",split="validation")
sen_test = load_dataset("google-research-datasets/paws", "labeled_final",split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Average Word Embedding

Initially, sentence embeddings will be calculated by averaging the word embeddings of individual words within the sentence.


In [None]:
sen_train1 = load_dataset("google-research-datasets/paws", "labeled_final",split="train")
sen_validation1 = load_dataset("google-research-datasets/paws", "labeled_final",split="validation")
sen_test1 = load_dataset("google-research-datasets/paws", "labeled_final",split="test")

In [None]:
glove_input_file = '/content/drive/My Drive/glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


In [None]:
def get_average_embedding(sen, model):
    embeddings = [model[word] for word in sen.split() if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [None]:
X_train_glove = np.array([cosine_similarity([get_average_embedding(pair['sentence1'], glove)],
                                            [get_average_embedding(pair['sentence2'], glove)])[0][0]
                          for pair in sen_train1])
y_train = np.array([pair['label'] for pair in sen_train1])

In [None]:
clf_glove = LogisticRegression().fit(X_train_glove.reshape(-1, 1), y_train)

In [None]:
X_test_glove = np.array([cosine_similarity([get_average_embedding(pair['sentence1'], glove)],
                                           [get_average_embedding(pair['sentence2'], glove)])[0][0]
                         for pair in sen_test1])
y_test = np.array([pair['label'] for pair in sen_test1])

In [None]:
y_pred_glove = clf_glove.predict(X_test_glove.reshape(-1, 1))
print("GloVe Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_glove)}")
print(f"F1-Score: {f1_score(y_test, y_pred_glove)}")

GloVe Results
Accuracy: 0.558
F1-Score: 0.0


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Check class distribution
class_distribution = np.bincount(y_train)
print(f"Class distribution: {class_distribution}")

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Train Logistic Regression with class weights
clf_glove = LogisticRegression(class_weight=class_weights_dict, max_iter=1000)
clf_glove.fit(X_train_glove.reshape(-1, 1), y_train)

# Predict and evaluate
y_pred_glove = clf_glove.predict(X_test_glove.reshape(-1, 1))
print("GloVe Results with Class Weighting")
print(f"Accuracy: {accuracy_score(y_test, y_pred_glove)}")
print(f"F1-Score: {f1_score(y_test, y_pred_glove)}")




Class distribution: [27572 21829]
GloVe Results with Class Weighting
Accuracy: 0.531
F1-Score: 0.3790135716650116


In [None]:
# Try an SVM model
clf_svm = SVC(kernel='linear', class_weight=class_weights_dict)
clf_svm.fit(X_train_glove.reshape(-1, 1), y_train)

# Predict and evaluate SVM
y_pred_svm = clf_svm.predict(X_test_glove.reshape(-1, 1))
print("SVM Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print(f"F1-Score: {f1_score(y_test, y_pred_svm)}")

SVM Results
Accuracy: 0.55775
F1-Score: 0.005061867266591676


### Application of TF-IDF Weighting

To improve the sentence embeddings, TF-IDF weighting will be applied to the average word embeddings, thereby refining the representation quality.

In [None]:
tfidf = TfidfVectorizer()
tfidf.fit([" ".join([pair['sentence1'], pair['sentence2']]) for pair in sen_train1])

In [None]:
def get_weighted_average_embedding(sen, model, tfidf_vectorizer):
    weights = tfidf_vectorizer.transform([sen]).toarray()[0]
    words = sen.split()
    embeddings = [model[word] * weights[idx] for idx, word in enumerate(words) if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [None]:
X_train_glove1 = np.array([cosine_similarity([get_weighted_average_embedding(pair['sentence1'], glove,tfidf)],
                                            [get_weighted_average_embedding(pair['sentence2'], glove,tfidf)])[0][0]
                          for pair in sen_train1])
y_train1 = np.array([pair['label'] for pair in sen_train1])

In [None]:
clf_glove1 = LogisticRegression().fit(X_train_glove.reshape(-1, 1), y_train)

In [None]:
X_test_glove1 = np.array([cosine_similarity([get_weighted_average_embedding(pair['sentence1'], glove,tfidf)],
                                           [get_weighted_average_embedding(pair['sentence2'], glove,tfidf)])[0][0]
                         for pair in sen_test1])
y_test1 = np.array([pair['label'] for pair in sen_test1])

In [None]:
y_pred_glove1 = clf_glove1.predict(X_test_glove1.reshape(-1, 1))
print("TF-IDF Results")
print(f"Accuracy: {accuracy_score(y_test1, y_pred_glove1)}")
print(f"F1-Score: {f1_score(y_test1, y_pred_glove1)}")

TF-IDF Results
Accuracy: 0.558
F1-Score: 0.0


In [None]:

# Check class distribution
class_distribution = np.bincount(y_train1)
print(f"Class distribution: {class_distribution}")

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train1), y=y_train1)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Train Logistic Regression with class weights
clf_glove1 = LogisticRegression(class_weight=class_weights_dict, max_iter=1000)
clf_glove1.fit(X_train_glove1.reshape(-1, 1), y_train1)

# Predict and evaluate
y_pred_glove1 = clf_glove1.predict(X_test_glove1.reshape(-1, 1))
print("GloVe Results with Class Weighting")
print(f"Accuracy: {accuracy_score(y_test1, y_pred_glove1)}")
print(f"F1-Score: {f1_score(y_test1, y_pred_glove1)}")




Class distribution: [27572 21829]
GloVe Results with Class Weighting
Accuracy: 0.556375
F1-Score: 0.018257261410788383


In [None]:
# Try an SVM model
clf_svm1 = SVC(kernel='linear', class_weight=class_weights_dict)
clf_svm1.fit(X_train_glove1.reshape(-1, 1), y_train1)

# Predict and evaluate SVM
y_pred_svm1 = clf_svm1.predict(X_test_glove1.reshape(-1, 1))
print("SVM Results")
print(f"Accuracy: {accuracy_score(y_test1, y_pred_svm1)}")
print(f"F1-Score: {f1_score(y_test1, y_pred_svm1)}")

SVM Results
Accuracy: 0.55775
F1-Score: 0.015033407572383074


# Bonus Task
Transformers are all the rage right now (backbone of most of the LLMs
you might have used). Can you fine-tune a pre-trained transformer based
models (BERT, Roberta, etc) to solve Phrase and Sentence Similarity
Tasks described above? You are free to use any resource out there

In [None]:
sen_train = sen_train.select(range(8000))
sen_validation = sen_validation.select(range(2000))
sen_test = sen_test.select(range(2000))

In [None]:
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the sentences
def preprocess_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')

# Apply preprocessing to all splits
encoded_train = sen_train.map(preprocess_function, batched=True)
encoded_dev = sen_validation.map(preprocess_function, batched=True)
encoded_test = sen_test.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
encoded_train = encoded_train.remove_columns(['sentence1', 'sentence2'])
encoded_dev = encoded_dev.remove_columns(['sentence1', 'sentence2'])
encoded_test = encoded_test.remove_columns(['sentence1', 'sentence2'])

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)  # Get the index of the highest probability
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load a pretrained model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

model.to(device)
# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,

)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_dev,
    compute_metrics=compute_metrics,
)



In [None]:
# Train the model
trainer.train()

In [None]:
results = trainer.evaluate(eval_dataset=encoded_test)

# Print results
print("Results on the test set:")
for key, value in results.items():
    print(f"{key}: {value}")

Results on the test set:
eval_loss: 0.6892745494842529
eval_accuracy: 0.5755
eval_f1: 0.5219813127413127
eval_precision: 0.5701906049679487
eval_recall: 0.5755
eval_runtime: 41.3311
eval_samples_per_second: 48.39
eval_steps_per_second: 12.097


In [None]:
output_dir='/content/drive/MyDrive/Model_Trained'

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/Model_Trained/tokenizer_config.json',
 '/content/drive/MyDrive/Model_Trained/special_tokens_map.json',
 '/content/drive/MyDrive/Model_Trained/vocab.txt',
 '/content/drive/MyDrive/Model_Trained/added_tokens.json',
 '/content/drive/MyDrive/Model_Trained/tokenizer.json')