In [1]:
import torch
import os
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [2]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:21"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
import re

In [4]:
DATA_PATH = "data/"
topics = pd.read_csv(DATA_PATH + "topics.csv")
content = pd.read_csv(DATA_PATH + "content.csv")
correlations = pd.read_csv('data/kfold_correlations.csv')
correlations = correlations[correlations.fold != 0]

In [5]:
correlations

Unnamed: 0,topic_id,content_ids,fold
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,1
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,1
2,t_00069b63a70a,c_11a1dc0bfb99,4
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,2
5,t_0008a1bd84ba,c_7ff92a954a3d c_8790b074383e,3
...,...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87,1
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...,4
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5,2
61515,t_fffe14f1be1e,c_cece166bad6a,2


In [6]:
correlations["content_id"] = correlations["content_ids"].str.split(" ")
corr = correlations.explode("content_id").drop(columns=["content_ids"])

In [7]:
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [8]:
corr = corr.merge(topics, how="left", on="topic_id")
corr = corr.merge(content, how="left", on="content_id")
corr.head()

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",video,,bg,,
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,video,,bg,,
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,video,,bg,,
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,,bg,,
4,t_00068291e9a4,1,c_639ea2ef9c95,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,exercise,,pt,,


In [9]:
#corr = corr.drop_duplicates(subset = ['topic_title','content_title'])

In [10]:
corr['topic_title'].fillna("", inplace = True)
corr['content_title'].fillna("", inplace = True)

corr['topic_description'].fillna("no description", inplace = True)
corr['content_description'].fillna("no description", inplace = True)

corr['content_text'].fillna("", inplace = True)
corr['content_license'].fillna("", inplace = True)

In [11]:
def white_spaces(x):
    return re.sub(' +', ' ', x)

corr['text1'] =  '[CLS] ' + corr['topic_title'] + ' <|=t_sep=|> '  + corr['topic_description'] + ' [SEP]'
corr['text2'] = '[CLS] ' + corr['content_title'] + ' <|=t_sep=|> '  + corr['topic_description'] + ' <|=t_sep=|> ' + corr['content_text'] + ' <|=t_sep=|> ' + corr['content_kind'] + ' [SEP]'
#corr['text1'] = corr['text1'].progress_apply(lambda x: white_spaces(x))
#corr['text2'] = corr['text2'].progress_apply(lambda x: white_spaces(x))

In [12]:
corr

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license,text1,text2
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",video,,bg,,,[CLS] Откриването на резисторите <|=t_sep=|> И...,[CLS] Молив като резистор <|=t_sep=|> Изследва...
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,video,,bg,,,[CLS] Откриването на резисторите <|=t_sep=|> И...,[CLS] Да чуем променливото съпротивление <|=t_...
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,video,,bg,,,[CLS] Откриването на резисторите <|=t_sep=|> И...,[CLS] Променлив резистор (реостат) с графит от...
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,,bg,,,[CLS] Откриването на резисторите <|=t_sep=|> И...,[CLS] Последователно свързване на галваничен е...
4,t_00068291e9a4,1,c_639ea2ef9c95,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,exercise,,pt,,,[CLS] Entradas e saídas de uma função <|=t_sep...,[CLS] Dados e resultados de funções: gráficos ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223824,t_fff9e5407d13,4,c_d64037a72376,NA_U06 - El periódico,no description,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True,Introducción: El periódico,no description,html5,,es,,,[CLS] NA_U06 - El periódico <|=t_sep=|> no des...,[CLS] Introducción: El periódico <|=t_sep=|> n...
223825,t_fffbe1d5d43c,2,c_46f852a49c08,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True,Proof: Right triangles inscribed in circles -d...,Proof showing that a triangle inscribed in a c...,video,,sw,,,[CLS] Inscribed shapes problem solving <|=t_se...,[CLS] Proof: Right triangles inscribed in circ...
223826,t_fffbe1d5d43c,2,c_6659207b25d5,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True,Area of inscribed equilateral triangle -dubbed...,A worked example of finding the area of an equ...,video,,sw,,,[CLS] Inscribed shapes problem solving <|=t_se...,[CLS] Area of inscribed equilateral triangle -...
223827,t_fffe14f1be1e,2,c_cece166bad6a,Lección 7,no description,6e90a7,aligned,6,es,t_d448c707984d,True,Juego con las palabras,no description,document,,es,,,[CLS] Lección 7 <|=t_sep=|> no description [SEP],[CLS] Juego con las palabras <|=t_sep=|> no de...


In [13]:
corr["set"] = corr[["text1", "text2"]].values.tolist()
train_df = pd.DataFrame(corr["set"])
#target = corr['target']

In [14]:
dataset = Dataset.from_pandas(train_df)

In [15]:
train_examples = []
train_data = dataset["set"] 
n_examples = dataset.num_rows

for i in tqdm(range(n_examples)):
    example = train_data[i]
    if example[0] == None: #remove None
        print(example)
        continue        
    train_examples.append(InputExample(texts=[str(example[0]), str(example[1])],
                                      # label=target[i]
                                      ))

  0%|          | 0/223829 [00:00<?, ?it/s]

from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2', max_seq_length=64)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=384, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [16]:
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
tokens = ["<|=t_sep=|> "]
word_embedding_model = model._first_module()
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model.max_seq_length = 64

In [17]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 64, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [18]:
#qmodel = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [19]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32,num_workers=1)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# num_epochs = 10
num_epochs = 20
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          show_progress_bar=True,
          use_amp = True,
          #gradient_checkpointing=True,
          save_best_model = True,
          #output_path='./paraphrase-multilingual-mpnet-base-v2-exp_fold0_epochs10',
          warmup_steps=warmup_steps
         )

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6995 [00:00<?, ?it/s]

In [20]:
model.save(f"model/paraphrase-multilingual-mpnet-base-v2-epochs-2.5-tuned/")

In [None]:
#model.save("models/all-MiniLM-L6-v2_fold0_epochs20/")