In [None]:
!pip install pandas pytorch_lightning pytorch_metric_learning wandb fastparquet
!pip install sklearn torch transformers sentence-transformers
# !pip3 install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# Domain adaptation

In [None]:
from pathlib import Path

import nltk
from sentence_transformers import SentenceTransformer, models, datasets, losses
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

# Assemble the sentence encoder from two explicit modules:
# a transformer backbone (inputs limited to 64 tokens) followed by CLS-token pooling.
model_name = 'paraphrase-MiniLM-L12-v2'
word_embedding_model = models.Transformer(model_name, max_seq_length=64)
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='cls',
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences).
# The path is built component-wise so it resolves on any OS, not only on Windows.
data_path = Path('..') / '1. crawling & parsing' / 'wiki_film_descriptions' / 'film_plots.txt'
train_sentences = []
# read_text() opens, reads and closes the file, so no handle is leaked.
texts = data_path.read_text(encoding='utf-8')
# Each line of the file is tokenized separately into Russian sentences.
# NOTE(review): nltk.sent_tokenize needs the NLTK "punkt" tokenizer data to be
# available locally — confirm it has been downloaded in this environment.
for text in tqdm(texts.split('\n')):
    train_sentences.extend(nltk.sent_tokenize(text, 'russian'))

#### TSDAE

In [None]:
# Denoising dataset: noise is added to the sentences on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

# Shuffled mini-batches of 8 sentences
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Denoising auto-encoder objective; the decoder starts from the same checkpoint
# as the encoder and its weights are tied to the encoder's
train_loss = losses.DenoisingAutoEncoderLoss(
    model,
    decoder_name_or_path=model_name,
    tie_encoder_decoder=True,
)

# Training configuration: one epoch, constant learning rate of 3e-5, no weight decay
fit_options = {
    'train_objectives': [(train_dataloader, train_loss)],
    'epochs': 1,
    'weight_decay': 0,
    'scheduler': 'constantlr',
    'optimizer_params': {'lr': 3e-5},
    'show_progress_bar': True,
}
model.fit(**fit_options)

# Persist the trained encoder
model.save('output/tsdae-model')

# Similarity VK - Wiki

In [None]:
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, evaluation, models, datasets, losses
from torch.utils.data import DataLoader

In [None]:
# Load the wiki corpus and the VK queries. Every file is read with Path.read_text(),
# which closes the handle after reading, and every path is built component-wise so
# the notebook also runs on non-Windows systems.
train_path = Path('..') / '1. crawling & parsing' / 'wiki_film_descriptions'

# wiki_data[i] and wiki_labels[i] are later used as parallel lists (looked up by the same index).
wiki_data = (train_path / 'film_plots.txt').read_text(encoding='utf-8').split('\n')
wiki_labels = (train_path / 'wiki_titles.txt').read_text(encoding='utf-8').split('\n')

test_path = Path('..') / '1. crawling & parsing' / 'vk_test_queries'

# vk_data[i] and vk_labels[i] are likewise used in parallel (zipped together later).
vk_data = (test_path / 'test_data.txt').read_text(encoding='utf-8').split('\n')
vk_labels = (test_path / 'test_titles.txt').read_text(encoding='utf-8').split('\n')

# data_100.txt holds exactly four sections separated by a '&&&' line; only the third
# one is used here, as one title entry per line.
_, _, vk_test_titles, _ = (
    (Path('data_test_100') / 'data_100.txt').read_text(encoding='utf-8').split('\n&&&\n')
)
vk_test_titles = vk_test_titles.split('\n')

# Build labelled (vk query text, wiki plot text, label) pairs and split them into
# train and validation sets. label is 1 for a matching pair and 0 for a negative pair.
negative_pairs = pd.read_csv('negative_pairs.tsv', sep='\t')
train_pairs = []
val_pairs = []

# Title -> index of its FIRST occurrence in wiki_labels (same result as list.index,
# without rescanning the list on every lookup). An unknown title raises KeyError.
wiki_label_to_id = {}
for position, label in enumerate(wiki_labels):
    wiki_label_to_id.setdefault(label, position)

# A query belongs to validation when its full label string is listed in vk_test_titles.
val_titles = set(vk_test_titles)

# Positive pairs: each query is paired with the plot of every title in its ';'-separated labels.
for query_labels, query_text in zip(vk_labels, vk_data):
    target_pairs = val_pairs if query_labels in val_titles else train_pairs
    for query_label in query_labels.split(';'):
        target_pairs.append((query_text, wiki_data[wiki_label_to_id[query_label]], 1))

# Negative pairs: negative_pairs.tsv must have exactly two columns (vk label, wiki label).
# The train/val decision uses the same rule as the positive pairs above, so a query can
# never contribute positives to one split and negatives to the other.
# NOTE(review): `vk_label in label` is a substring test, so a short title also matches
# longer titles that contain it — confirm this is intended.
for vk_label, wiki_label in negative_pairs.itertuples(index=False, name=None):
    wiki_text = wiki_data[wiki_label_to_id[wiki_label]]
    for vk_label_id, label in enumerate(vk_labels):
        if vk_label not in label:
            continue
        target_pairs = val_pairs if label in val_titles else train_pairs
        target_pairs.append((vk_data[vk_label_id], wiki_text, 0))

# Turn the mined pairs into de-duplicated DataFrames with a clean 0..n-1 index.
pair_columns = ['vk', 'wiki', 'label']
train_df = pd.DataFrame(train_pairs, columns=pair_columns).drop_duplicates().reset_index(drop=True)
# The validation frame must come from val_pairs; building it from train_pairs would
# make the evaluator score the model on its own training data.
val_df = pd.DataFrame(val_pairs, columns=pair_columns).drop_duplicates().reset_index(drop=True)
# Only the training pairs are written to disk.
train_df.to_csv('mined_pairs_test_100.tsv', sep='\t', index=False)

In [None]:
# Start from a pre-trained multilingual checkpoint.
# NOTE(review): this loads the stock checkpoint, not the model saved to
# 'output/tsdae-model' earlier in the notebook — confirm that is intended.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# One InputExample per row: the first two columns are the text pair, the third is the label.
train_examples = []
for _, pair_row in train_df.iterrows():
    pair_fields = pair_row.values
    train_examples.append(InputExample(texts=pair_fields[:2], label=pair_fields[2]))

# Shuffled batches of 16 examples, optimized with a contrastive loss.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.ContrastiveLoss(model=model)

# The evaluator compares embedding similarity of the validation pairs with their labels.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=val_df['vk'].to_list(),
    sentences2=val_df['wiki'].to_list(),
    scores=val_df['label'].to_list(),
)

# Fine-tune for 20 epochs, evaluating every 1000 steps; output goes to 'adopted_model'.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=100,
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path='adopted_model',
)