https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

https://www.sbert.net/examples/applications/semantic-search/README.html

https://www.kaggle.com/datasets/mathurinache/samanantar

https://www.kaggle.com/datasets/eshuenglish/semantic-similarity?select=cw2_train.csv

In [1]:
from googletrans import Translator
t = Translator()

import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import TFAutoModel

from sentence_transformers import SentenceTransformer, models
from sentence_transformers.datasets import ParallelSentencesDataset
from sentence_transformers import SentencesDataset, losses, evaluation, readers

In [None]:
class TFSentenceTransformer(tf.keras.layers.Layer):
    """Keras layer that maps tokenized inputs to one embedding per sentence.

    A TensorFlow transformer loaded through ``TFAutoModel`` produces token
    embeddings; these are mean-pooled using the attention mask and, by
    default, L2-normalized.
    """

    def __init__(self, model_name_or_path, **kwargs):
        """Load the transformer; ``kwargs`` are forwarded to ``from_pretrained``."""
        super().__init__()
        self.model = TFAutoModel.from_pretrained(model_name_or_path, **kwargs)

    def call(self, inputs, normalize=True):
        """Embed ``inputs`` (must contain an ``"attention_mask"`` entry).

        Returns the mean-pooled embeddings, L2-normalized when ``normalize``
        is true.
        """
        outputs = self.model(inputs)
        pooled = self.mean_pooling(outputs, inputs["attention_mask"])
        return self.normalize(pooled) if normalize else pooled

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over axis 1, weighted by the mask."""
        # The first element of the model output holds the token embeddings.
        token_embeddings = model_output[0]

        # Stretch the mask to the embeddings' shape so masked positions
        # contribute zero to the sum.
        mask = tf.expand_dims(attention_mask, -1)
        mask = tf.broadcast_to(mask, tf.shape(token_embeddings))
        mask = tf.cast(mask, tf.float32)

        summed = tf.math.reduce_sum(token_embeddings * mask, axis=1)
        # Lower-bound the divisor so an all-zero mask cannot divide by zero.
        counts = tf.clip_by_value(
            tf.math.reduce_sum(mask, axis=1), 1e-9, tf.float32.max
        )
        return summed / counts

    def normalize(self, embeddings):
        """Scale each row of ``embeddings`` to unit L2 norm (along axis 1)."""
        unit_embeddings, _ = tf.linalg.normalize(embeddings, 2, axis=1)
        return unit_embeddings

## Create teacher and student model

In [3]:
###### CREATE MODEL ######
# Token limit applied to the student transformer below; longer inputs are
# truncated by the Transformer module.
max_seq_length = 128
train_batch_size = 64

# Load teacher model
print("Load teacher model")
teacher_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Create student model
print("Create student model")
# Pass max_seq_length explicitly: it was defined above but previously never
# handed to the model, so the limit had no effect.
word_embedding_model = models.Transformer(
    "xlm-roberta-base",
    max_seq_length=max_seq_length,
)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)

# Student = transformer followed by mean pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Load teacher model
Create student model


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load datasets

In [4]:
# Read the train / dev / test CSV splits into DataFrames.
# NOTE(review): this rebinds `t`, which the imports cell had set to a
# googletrans Translator instance.
t, d, v = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/english-hindi/train.csv',
        '../datasets/english-hindi/dev.csv',
        '../datasets/english-hindi/test.csv',
    )
)

In [5]:
# Pull the dev sentence pairs and their similarity scores out of `d` as arrays.
dev_src, dev_trg, dev_scores = (
    d[column].values for column in ("Sent1", "Sent2", "SimScore")
)

# Cell output: the shapes of the three arrays.
dev_src.shape, dev_trg.shape, dev_scores.shape

((2300,), (2300,), (2300,))

In [6]:
# Pull the test sentence pairs out of `v` as arrays.
test_src, test_trg = (v[column].values for column in ("Sent1", "Sent2"))

# Cell output: the shapes of the two arrays.
test_src.shape, test_trg.shape

((3000,), (3000,))

In [7]:
# DataLoader is used below but was not imported anywhere in the notebook,
# which made this cell fail with a NameError.
from torch.utils.data import DataLoader

###### Load train sets ######

# Parallel sentences are read from a tab-separated text file; the dataset is
# given both the student and the teacher model.
train_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_reader.load_data('../datasets/english-hindi/train.txt')
train_dataloader = DataLoader(train_reader, shuffle=True, batch_size=train_batch_size)
# Mean-squared-error loss attached to the student model.
train_loss = losses.MSELoss(model=model)


###### Load dev sets ######

# Evaluators are collected in order; the training cell picks its main score
# from this list.
evaluators = []
# sts_reader = readers.STSDataReader('../datasets/english-hindi/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
# dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.txt'), model=model)
# dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_src, dev_trg, dev_scores, name='dev'
)
evaluators.append(evaluator_sts)


###### Load test sets ######

# test_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# test_reader.load_data('../datasets/english-hindi/test.txt')
# test_dataloader = DataLoader(test_reader, shuffle=False, batch_size=train_batch_size)
# MSE evaluator over the test sentence pairs, using the teacher model as reference.
test_mse = evaluation.MSEEvaluator(
    test_src, test_trg, name='test',
    teacher_model=teacher_model
)
evaluators.append(test_mse)

## Train model

<!-- 
output_path = "output/model-" + datetime.now().strftime("%Y-%m-%d")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]),
    epochs=20,
    evaluation_steps=1000,
    warmup_steps=10000,
    scheduler='warmupconstant',
    output_path=output_path,
    save_best_model=True,
    optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
)
 -->

In [8]:
# Checkpoints go to a directory named after today's date.
output_path = "./output/model-" + datetime.now().strftime("%Y-%m-%d")

# Run every evaluator in sequence; the score of the last one in the list is
# used as the main score.
sequential_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=lambda scores: scores[-1],
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=sequential_evaluator,
    epochs=20,
    evaluation_steps=1000,
    warmup_steps=1000,
    scheduler='warmupconstant',
    output_path=output_path,
    save_best_model=True,
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/403 [00:00<?, ?it/s]

  labels = torch.tensor(labels)


Iteration:   0%|          | 0/403 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 1 preprocess text

1. Converting to lowercase
2. Converting digits to words
3. Remove punctuation and whitespace
4. Removing default stopwords
5. Lemmatization

## 1.1 Converting to lowercase
Happy > happy

In [None]:
def text_lowercase(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered

## 1.2 Converting digits to words
3 > three

In [None]:
def convert_number(text):
    """Spell out every all-digit token of *text* as words.

    The text is split on whitespace; tokens for which ``str.isdigit`` is true
    are passed through ``inflect``'s ``number_to_words``, all other tokens are
    kept unchanged. Tokens are re-joined with single spaces.
    """
    # NOTE(review): `inflect` is not imported in the visible notebook cells.
    engine = inflect.engine()
    return ' '.join(
        engine.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    )

## 1.3 Remove punctuation and whitespace
itching   ! > itching

In [None]:
import string

# Built once instead of on every call: '_' is mapped to a space, every other
# character in string.punctuation is deleted.
_PUNCTUATION_TABLE = str.maketrans('_', ' ', string.punctuation.replace('_', ''))


def remove_punctuation(text):
    """Return *text* with punctuation stripped.

    Underscores are replaced by a space (so ``snake_case`` becomes two words);
    all other ``string.punctuation`` characters are removed. Whitespace is
    left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
def remove_whitespace(text):
    """Collapse whitespace runs to single spaces and trim both ends."""
    words = text.split()
    return " ".join(words)

## 1.4 Removing default stopwords

In [None]:
def remove_stopwords(text):
    """Tokenize *text* and drop tokens found in the English stopword list.

    The remaining tokens are joined with single spaces.
    """
    # NOTE(review): `stopwords` and `word_tokenize` are not imported in the
    # visible notebook cells.
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = (
        token for token in word_tokenize(text) if token not in english_stopwords
    )
    return " ".join(kept_tokens)

## 1.5 Lemmatization
itching > itch

In [None]:
def lemmatize_word(text):
    """Tokenize *text* and lemmatize every token as a verb (``pos='v'``).

    The lemmas are joined with single spaces.
    """
    # NOTE(review): `WordNetLemmatizer` and `word_tokenize` are not imported
    # in the visible notebook cells.
    wordnet = WordNetLemmatizer()
    return " ".join(
        wordnet.lemmatize(token, pos='v') for token in word_tokenize(text)
    )

In [None]:
def preprocess_pipe(text):
    """Run *text* through every preprocessing step and return the result.

    Steps are applied in this order: lowercase, digits to words, punctuation
    removal, whitespace normalization, stopword removal, lemmatization.
    """
    steps = (
        text_lowercase,
        convert_number,
        remove_punctuation,
        remove_whitespace,
        remove_stopwords,
        lemmatize_word,
    )
    for step in steps:
        text = step(text)
    return text

## Create dataset

In [None]:
# Translator owned by eng2hi, created on first use. The global `t` cannot be
# relied on: it starts as a Translator but is later rebound to a DataFrame by
# the dataset-loading cell, which made `t.translate(...)` fail.
_translator = None


def eng2hi(sentence):
    """Translate *sentence* to Hindi with googletrans and return the text.

    Uses ``dest="hi"``; the source language is left to the translator.
    """
    global _translator
    if _translator is None:
        _translator = Translator()
    return _translator.translate(sentence, dest="hi").text

In [None]:
# Read the cw2 train and dev CSVs, using the first column as the index.
# Note that the dev file is the one bound to `test_df`.
train_df, test_df = (
    pd.read_csv(source_csv, index_col=0)
    for source_csv in (
        '../datasets/english-hindi/cw2_train.csv',
        '../datasets/english-hindi/cw2_dev.csv',
    )
)

### Converting Sent1 to Hindi for the train and dev datasets

In [None]:
# Translate the first 5000 training Sent1 values, with a progress bar.
res = []
res.extend(eng2hi(sent) for sent in tqdm(train_df['Sent1'].values[:5000]))

In [None]:
# Translate the remaining training Sent1 values (index 5000 onward) and
# append them to the existing `res` list.
res.extend(eng2hi(sent) for sent in tqdm(train_df['Sent1'].values[5000:]))

In [None]:
# Cell output: how many translations have been collected in `res`.
len(res)

In [None]:
# Replace the Sent1 column of train_df with the collected translations.
train_df['Sent1'] = res

In [None]:
# Cell output: display train_df for inspection.
train_df

In [None]:
# Start a fresh list and translate every Sent1 value of test_df, with a
# progress bar.
res = []
res.extend(eng2hi(sent) for sent in tqdm(test_df['Sent1'].values))

In [None]:
# Cell output: how many translations have been collected in `res`.
len(res)

In [None]:
# Replace the Sent1 column of test_df with the collected translations.
test_df['Sent1'] = res

In [None]:
# Cell output: display test_df for inspection.
test_df

### Converting the datasets to tab-separated text files for ParallelSentencesDataset

In [None]:
# Split train_df by position: the first 2300 rows go to df_1, the rest to df_2.
df_1, df_2 = train_df.iloc[:2300, :], train_df.iloc[2300:, :]

In [None]:
# Keep only Sent2, Sent1 and SimScore, in that column order, for all three frames.
df_1, df_2, test_df = (
    frame[['Sent2', 'Sent1', 'SimScore']] for frame in (df_1, df_2, test_df)
)

In [None]:
# Write df_1 as tab-separated text with no header row and no index column.
df_1.to_csv(
    '../datasets/english-hindi/dev.txt',
    sep='\t',
    header=None,
    index=False,
)

In [None]:
# train.txt is read by ParallelSentencesDataset.load_data, which treats every
# tab-separated field on a line as a sentence. Write only the two sentence
# columns so the numeric SimScore is not ingested as a third "sentence".
df_2[['Sent2', 'Sent1']].to_csv(
    '../datasets/english-hindi/train.txt',
    header=None,
    index=False,
    sep='\t',
)

In [None]:
# Write test_df as tab-separated text with no header row and no index column.
test_df.to_csv(
    '../datasets/english-hindi/test.txt',
    sep='\t',
    header=None,
    index=False,
)