https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

https://www.sbert.net/examples/applications/semantic-search/README.html

https://www.kaggle.com/datasets/mathurinache/samanantar

https://www.kaggle.com/datasets/eshuenglish/semantic-similarity?select=cw2_train.csv

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime

import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd
import zipfile
import io

In [None]:
# Send INFO-and-above records through the LoggingHandler imported from
# sentence_transformers, using a timestamped "<time> - <message>" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Logger used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Student: multilingual base model that is trained to imitate the teacher.
student_model_name = 'xlm-roberta-base'

# Teacher: monolingual model whose embedding space we want to extend to
# additional languages.
teacher_model_name = 'paraphrase-distilroberta-base-v2'


In [None]:
# --- Input sizes and batching ---
# Student model max. input length (number of word pieces)
max_seq_length = 128
# Batch size for training
train_batch_size = 64
# Batch size at inference
inference_batch_size = 64

# --- Training data limits ---
# Maximum number of parallel sentences for training
max_sentences_per_language = 500000
# Maximum length (characters) for parallel training sentences
train_max_sentence_length = 250

# --- Optimisation schedule ---
# Train for this many epochs
num_epochs = 5
# Warmup steps
num_warmup_steps = 10000

# --- Evaluation ---
# Evaluate performance after every this-many training steps
num_evaluation_steps = 1000
# Number of parallel sentences to be used for development
dev_sentences = 1000

In [None]:
# Directory holding the English-Hindi dataset files.
parallel_sentences_folder = "../datasets/english-hindi/"

# Make sure the directory exists; exist_ok=True keeps re-runs from failing.
os.makedirs(name=parallel_sentences_folder, exist_ok=True)

In [None]:
######## Extend the teacher model to additional languages ########

# Teacher: the model whose sentence embeddings the student will imitate.
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)

# Student: a transformer encoder followed by a pooling layer.
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(
    student_model_name,
    max_seq_length=max_seq_length,
)

# Pooling turns the per-token vectors into one fixed-size sentence vector;
# only the embedding dimension is passed, so the library's default pooling
# mode applies.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
###### Build the teacher/student training set ######
train_data = ParallelSentencesDataset(
    student_model=student_model,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
    use_embedding_cache=True,
)

# NOTE(review): ParallelSentencesDataset.load_data is documented as reading
# tab-separated columns; confirm train.csv is actually tab-separated and has
# no header row (the other pipeline in this notebook loads train.txt).
train_data.load_data(
    '../datasets/english-hindi/train.csv',
    max_sentence_length=train_max_sentence_length,
    max_sentences=max_sentences_per_language,
)

# Shuffled mini-batches plus the MSE objective between student and teacher
# embeddings.
train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
train_loss = losses.MSELoss(model=student_model)

In [None]:
#### Evaluators for cross-lingual performance ####
# NOTE(review): both evaluators below are named 'dev' although the sentences
# come from test.csv; confirm the naming is intentional.
v = pd.read_csv('../datasets/english-hindi/test.csv')
src_sentences = v['Sent2'].values
trg_sentences = v['Sent1'].values

# Mean Squared Error (MSE) between the teacher embeddings of the source
# sentences and the student embeddings of the target sentences.
dev_mse = evaluation.MSEEvaluator(
    src_sentences,
    trg_sentences,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
    name='dev',
)

# TranslationEvaluator embeds all parallel sentences and then checks whether
# the embedding of source[i] is closest to target[i] among all available
# target sentences.
dev_trans_acc = evaluation.TranslationEvaluator(
    src_sentences,
    trg_sentences,
    batch_size=inference_batch_size,
    name='dev',
)

# Evaluator objects that are called periodically during training.
evaluators = [dev_mse, dev_trans_acc]

In [None]:
# Scored sentence pairs used by the similarity evaluator.
d = pd.read_csv('../datasets/english-hindi/dev.csv')
dev_scores = d['SimScore'].values
dev_src = d['Sent2'].values
dev_trg = d['Sent1'].values

# Notebook display: the three arrays should have matching lengths.
(dev_src.shape, dev_trg.shape, dev_scores.shape)

In [None]:
# Similarity evaluator over the scored dev.csv pairs; it is reported under
# the name 'test' and appended to the periodic evaluator list.
test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=dev_src,
    sentences2=dev_trg,
    scores=dev_scores,
    name='test',
    batch_size=inference_batch_size,
    show_progress_bar=False,
)
evaluators.append(test_evaluator)

In [None]:
# Each run gets its own timestamped output directory.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = "output/make-multilingual-" + run_timestamp

# Run all evaluators in sequence; the main score is the mean of their scores.
combined_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=np.mean,
)

# Train the student model
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=combined_evaluator,
    epochs=num_epochs,
    evaluation_steps=num_evaluation_steps,
    warmup_steps=num_warmup_steps,
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
    output_path=output_path,
    save_best_model=True,
)


## Create teacher and student model

In [None]:
###### CREATE MODEL ######
# Maximum student input length (word pieces) and training batch size.
max_seq_length = 128
train_batch_size = 64

# Load teacher model
print("Load teacher model")
teacher_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Create student model.
# max_seq_length is passed explicitly: it was declared above but never handed
# to the Transformer module, so the declared limit was silently ignored.
print("Create student model")
word_embedding_model = models.Transformer(
    "xlm-roberta-base",
    max_seq_length=max_seq_length,
)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)

# Student = transformer encoder + mean pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

## Load datasets

In [None]:
# Load the train / dev / test splits as DataFrames.
# NOTE(review): the name `t` is also what eng2hi() calls .translate() on;
# confirm which object `t` is meant to be when that function runs.
t, d, v = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/english-hindi/train.csv',
        '../datasets/english-hindi/dev.csv',
        '../datasets/english-hindi/test.csv',
    )
]

In [None]:
# Dev split: sentence pairs plus their similarity scores, as NumPy arrays.
dev_scores = d['SimScore'].values
dev_src = d['Sent1'].values
dev_trg = d['Sent2'].values

# Notebook display: the three arrays should have matching lengths.
(dev_src.shape, dev_trg.shape, dev_scores.shape)

In [None]:
# Test split for the MSE evaluator, which encodes the *source* sentences with
# the teacher model. The teacher is monolingual, so the source must be the
# English column. Elsewhere in this notebook Sent1 is the column translated to
# Hindi and Sent2 is used as the source, so the same orientation is used here.
# NOTE(review): assumes test.csv follows that column convention - confirm.
test_src = v.Sent2.values
test_trg = v.Sent1.values

# Notebook display: both arrays should have the same length.
test_src.shape, test_trg.shape

In [None]:
###### Training data ######

# Teacher/student training pairs read from train.txt.
train_reader = ParallelSentencesDataset(
    student_model=model,
    teacher_model=teacher_model,
)
train_reader.load_data('../datasets/english-hindi/train.txt')
train_dataloader = DataLoader(
    train_reader,
    batch_size=train_batch_size,
    shuffle=True,
)
train_loss = losses.MSELoss(model=model)


###### Dev evaluator ######

# Earlier reader-based variant, kept for reference:
# sts_reader = readers.STSDataReader('../datasets/english-hindi/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
# dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.txt'), model=model)
# dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=dev_src,
    sentences2=dev_trg,
    scores=dev_scores,
    name='dev',
)


###### Test evaluator ######

# Earlier dataset-based variant, kept for reference:
# test_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# test_reader.load_data('../datasets/english-hindi/test.txt')
# test_dataloader = DataLoader(test_reader, shuffle=False, batch_size=train_batch_size)
test_mse = evaluation.MSEEvaluator(
    source_sentences=test_src,
    target_sentences=test_trg,
    teacher_model=teacher_model,
    name='test',
)

# Order matters: the training cell selects the last score as the main score.
evaluators = [evaluator_sts, test_mse]

## Train model

<!-- 
output_path = "output/model-" + datetime.now().strftime("%Y-%m-%d")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]),
    epochs=20,
    evaluation_steps=1000,
    warmup_steps=10000,
    scheduler='warmupconstant',
    output_path=output_path,
    save_best_model=True,
    optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
)
 -->

In [None]:
# Output directory is keyed by the run date only.
run_date = datetime.now().strftime("%Y-%m-%d")
output_path = "./output/model-" + run_date

# Run every evaluator; the main score is the score of the last one in the
# list.
pick_last_score = lambda scores: scores[-1]
sequential_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=pick_last_score,
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=sequential_evaluator,
    epochs=20,
    scheduler='warmupconstant',
    warmup_steps=1000,
    evaluation_steps=1000,
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
    output_path=output_path,
    save_best_model=True,
)

# 1 preprocess text

1. Converting to lowercase
2. Converting digits to words
3. Remove punctuation and whitespace
4. Removing default stopwords
5. Lemmatization

## 1.1 Converting to lowercase
Happy > happy

In [None]:
def text_lowercase(text):
    """Return a lowercased copy of ``text`` (e.g. ``"Happy"`` -> ``"happy"``)."""
    lowered = text.lower()
    return lowered

## 1.2 Converting digits to words
3 > three

In [None]:
def convert_number(text):
    """Spell out digit-only tokens as words (e.g. ``3`` -> ``three``).

    The text is split on whitespace; every token for which ``str.isdigit()``
    is true is replaced by ``inflect``'s ``number_to_words`` output, and all
    other tokens are kept as they are. Tokens are re-joined with single
    spaces, so runs of whitespace in the input are collapsed.
    """
    engine = inflect.engine()
    converted = [
        engine.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

## 1.3 Remove punctuation and whitespace
itching   ! > itching

In [None]:
def remove_punctuation(text):
    """Strip ASCII punctuation from ``text``.

    Underscores are first turned into spaces so that joined words such as
    ``snake_case`` stay separated; every remaining character listed in
    ``string.punctuation`` is then deleted. Whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation characters.
    """
    # Imported here because the notebook's import cell never brings in
    # ``string``; without it this function raises NameError.
    import string

    text = text.replace('_', ' ')
    # Translation table that maps nothing and deletes all punctuation.
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [None]:
def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

## 1.4 Removing default stopwords

In [None]:
def remove_stopwords(text):
    """Drop tokens found in the English stop-word list.

    The text is tokenised with ``word_tokenize``; tokens that appear in
    ``stopwords.words("english")`` are discarded and the rest are re-joined
    with single spaces. Matching is exact, so the input is expected to be
    lowercased beforehand if case-insensitive filtering is wanted.
    """
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = []
    for token in word_tokenize(text):
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

## 1.5 Lemmatization
itching > itch

In [None]:
def lemmatize_word(text):
    """Lemmatize every token of ``text`` as a verb (e.g. ``itching`` -> ``itch``).

    The text is tokenised with ``word_tokenize``, each token is passed to
    ``WordNetLemmatizer.lemmatize`` with ``pos='v'``, and the results are
    joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)
    )

In [None]:
def preprocess_pipe(text):
    """Run the full preprocessing chain on ``text`` and return the result.

    The steps are applied in this order: lowercase, digits to words,
    punctuation removal, whitespace normalisation, stop-word removal,
    lemmatization. Each step receives the output of the previous one.
    """
    steps = (
        text_lowercase,
        convert_number,
        remove_punctuation,
        remove_whitespace,
        remove_stopwords,
        lemmatize_word,
    )
    for step in steps:
        text = step(text)
    return text

## Create dataset

In [None]:
def eng2hi(sentence):
    """Translate ``sentence`` to Hindi and return the translated text.

    Calls ``t.translate(sentence, dest="hi")`` on the module-level object
    ``t`` and returns the ``.text`` attribute of the result.

    NOTE(review): ``t`` is presumably a translator client created elsewhere;
    no such object is defined in this notebook (``t`` is also assigned a
    DataFrame in the dataset-loading cell) - confirm before running.
    """
    translation = t.translate(sentence, dest="hi")
    return translation.text

In [None]:
# Raw similarity datasets; the first CSV column is used as the index.
# The cw2_dev.csv file is held in `test_df`.
train_df, test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../datasets/english-hindi/cw2_train.csv',
        '../datasets/english-hindi/cw2_dev.csv',
    )
]

### Converting Sent1 to Hindi for the train and dev datasets

In [None]:
# Translate the first 5000 training sentences. Results are appended one by
# one so that whatever was translated so far stays in `res` if a call fails.
res = []
first_batch = train_df['Sent1'].values[:5000]
for english_sentence in tqdm(first_batch):
    hindi_sentence = eng2hi(english_sentence)
    res.append(hindi_sentence)

In [None]:
# Translate the remaining training sentences (row 5000 onwards), continuing
# to append to the same `res` list.
remaining_batch = train_df['Sent1'].values[5000:]
for english_sentence in tqdm(remaining_batch):
    hindi_sentence = eng2hi(english_sentence)
    res.append(hindi_sentence)

In [None]:
# Notebook display: number of translated sentences collected so far.
len(res)

In [None]:
# Overwrite the Sent1 column with the translations collected in `res`.
train_df['Sent1'] = res

In [None]:
# Notebook display: inspect the training frame after the column overwrite.
train_df

In [None]:
# Translate every Sent1 entry of `test_df`, starting from a fresh `res` list.
# Results are appended one by one so partial progress survives a failed call.
res = []
test_sentences = test_df['Sent1'].values
for english_sentence in tqdm(test_sentences):
    hindi_sentence = eng2hi(english_sentence)
    res.append(hindi_sentence)

In [None]:
# Notebook display: number of translated sentences collected for `test_df`.
len(res)

In [None]:
# Overwrite the Sent1 column with the translations collected in `res`.
test_df['Sent1'] = res

In [None]:
# Notebook display: inspect the frame after the column overwrite.
test_df

### Converting the dataset to tab-separated text files for ParallelSentencesDataset

In [None]:
# Positional split of the translated training frame: the first 2300 rows go
# to df_1 (later written to dev.txt), the rest to df_2 (written to train.txt).
df_1, df_2 = train_df.iloc[:2300, :], train_df.iloc[2300:, :]

In [None]:
# Put Sent2 first, then Sent1, then the similarity score, in every split.
export_columns = ['Sent2', 'Sent1', 'SimScore']
df_1 = df_1[export_columns]
df_2 = df_2[export_columns]
test_df = test_df[export_columns]

In [None]:
# Write the dev split as a tab-separated file without header or index column.
df_1.to_csv('../datasets/english-hindi/dev.txt', header=None, index=False, sep='\t')

In [None]:
# Write the training split as a tab-separated file without header or index.
# Only the two sentence columns are exported: ParallelSentencesDataset treats
# every tab-separated column as a sentence paired with the first column, so a
# SimScore column would be ingested as a third "sentence".
df_2[['Sent2', 'Sent1']].to_csv('../datasets/english-hindi/train.txt', header=None, index=False, sep='\t')

In [None]:
# Write the test split as a tab-separated file without header or index column.
test_df.to_csv('../datasets/english-hindi/test.txt', header=None, index=False, sep='\t')