In [1]:
import re
from random import sample
import json
import jsonlines as jl
from collections import defaultdict

import pandas as pd
import numpy as np
from numpy.linalg import norm

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr, pearsonr

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel
from transformers.tokenization_utils_base import BatchEncoding
from transformers import get_scheduler

In [2]:
# --- File locations ---------------------------------------------------------
# Excel workbook read by load_data() (one dialogue sheet and one FAQ sheet).
INIT_DATA_TAGGED = r'./npf_dialogues_tagged.xlsx'
# JSON loaded into `dialogue_dict`; indexed as dialogue_dict[<dialogue id>][CLIENT].
NPF_DIALOGUE_JSON = r'./on-clean-texts/npf_dialogues.json'
# JSON loaded into `faq_dict`; indexed as faq_dict[<FAQ id>][_QUESTION].
NPF_FAQ_JSON = r'./FAQ_NPF.json'
# JSON loaded into `question2id`; indexed with the FAQ question text of a dialogue row.
QUESTION2FAQ = r'./question2id.json'
# JSON loaded into `answer2id` (not read again in the visible cells).
ANSWER2FAQ = r'./answer2id.json'
# JSON-lines file the triplet records are written to and later read back from.
TRIPLET_DATA = r'./npf_dialogue_triplets.jsonl'

# --- Column names of the Excel sheets ---------------------------------------
ID = 'ID'
DIALOGUE = 'Диалог'
FAQ = 'FAQ'
QUESTION = 'Вопрос'
ANSWER = 'Ответ'

# --- Keys inside the JSON dictionaries --------------------------------------
OPERATOR = 'operator'
CLIENT = 'client'
_QUESTION = 'question'
_ANSWER = 'answer'

# --- Keys of a triplet record -----------------------------------------------
# The '... ids' entries hold ID collections and are removed before a record is
# dumped; the text entries are what ends up in TRIPLET_DATA.
ANCHOR = 'anchor ids'
POSITIVE = 'positive ids'
NEGATIVE = 'negative ids'
ANCHOR_TEXT = 'anchor'
POSITIVE_TEXT = 'positive'
NEGATIVE_TEXT = 'negative'

# --- Sheet names inside INIT_DATA_TAGGED ------------------------------------
DIALOGUE_SHEET = 'гн'
FAQ_SHEET = 'FAQs'

# --- Model locations --------------------------------------------------------
# Path handed to AutoTokenizer / AutoModel.from_pretrained.
SBERT_MODEL = r'./sbert_large_nlu_ru'
# Target of torch.save() in ModelPipeline.validate().
OUTPUT_MODEL = r'./triplet_loss_model.sav'

In [3]:
class _dict(dict):
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [4]:
# Tokenizer of the SBERT checkpoint; TripletSet.get_token_ids reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(SBERT_MODEL)

# Data preprocessing

In [4]:
# FAQ records, indexed later as faq_dict[<FAQ id>][_QUESTION].
with open(NPF_FAQ_JSON, mode='r', encoding='utf-8') as infile:
    faq_dict = json.loads(infile.read())

# Lookup indexed later with the FAQ question text of a tagged dialogue.
with open(QUESTION2FAQ, mode='r', encoding='utf-8') as infile:
    question2id = json.loads(infile.read())

# Counterpart lookup stored next to question2id.
with open(ANSWER2FAQ, mode='r', encoding='utf-8') as infile:
    answer2id = json.loads(infile.read())

# Dialogue records, indexed later as dialogue_dict[<dialogue id>][CLIENT].
with open(NPF_DIALOGUE_JSON, mode='r', encoding='utf-8') as infile:
    dialogue_dict = json.loads(infile.read())

In [5]:
def load_data(_path):
    """Read the tagged dialogues and the FAQ list from one Excel workbook.

    Args:
        _path: location of the workbook; it must contain the sheets named by
            DIALOGUE_SHEET and FAQ_SHEET.

    Returns:
        A ``(tagged_dialogues, faqs)`` pair of DataFrames. The DIALOGUE / FAQ
        columns of the first and the QUESTION / ANSWER columns of the second
        are converted with ``str()`` and stripped of surrounding whitespace
        (so a missing cell becomes the string ``'nan'``).
    """
    def _as_stripped_text(value):
        return str(value).strip()

    def _read_sheet(sheet, text_columns):
        frame = pd.read_excel(_path, sheet_name=sheet)
        for column in text_columns:
            frame[column] = frame[column].map(_as_stripped_text)
        return frame

    # The tuple is evaluated left to right: dialogue sheet first, FAQs second.
    return (_read_sheet(DIALOGUE_SHEET, [DIALOGUE, FAQ]),
            _read_sheet(FAQ_SHEET, [QUESTION, ANSWER]))

In [6]:
# Load both sheets of the tagged workbook into DataFrames.
dialogues, faqs = load_data(INIT_DATA_TAGGED)

  warn(msg)


In [7]:
# Preview: one row per dialogue; the FAQ column holds 'нет' or an FAQ question text.
dialogues.head(5)

Unnamed: 0,ID,Диалог,FAQ,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,610722d4000000000a77c6f024670002,operator: меня зовут рита\nclient: здравствуйт...,нет,,,,
1,610726dd000000000a77c62723f80002,operator: алло максим здравствуйте\nclient: зд...,нет,,,,
2,61072ba5000500000a77c62723f80002,operator: меня зовут деда\noperator: я вас слу...,Как получить выписку о состоянии пенсионного с...,,,,
3,61073228001100000a77c77724250002,operator: максим здравствуйте\noperator: улице...,"Как изменить персональные данные, указанные в ...",,,,
4,61073327000400000a6d28a8239e0002,operator: сергей здравствуйте\nclient: сергей ...,нет,,,,


In [8]:
# Preview: one row per FAQ entry (ID, question, answer).
faqs.head(5)

Unnamed: 0,ID,Вопрос,Ответ
0,d50f98e783994ad0956ad50967543dbd,как передать в управление накопительную пенсию,"В заявлении нужно указать, что управление ваши..."
1,945c4c9316f6460aa1ef2349ada0c7f2,В какие сроки банк зачисляет доход по Социальн...,Договор отрывается без участия работника путем...
2,c494590c064e439da8386fb836f9c50b,В какие сроки банк зачисляет софинансирование ...,Софинансирование производится два раза в год: ...
3,505bf511bae14f10846b79225a872c8b,В какие сроки будет произведена срочная выплат...,Решение выносится по истечении 10 дней с даты ...
4,150d3dfd81d547e5a57891d698196a12,В каким способом и в какие сроки пенсионный фо...,Решение о выплате по ОПС направляется в течени...


In [9]:
def clean_text(_text):
    """Normalise one utterance string.

    In order: newlines and ``<...>`` tags become a space, the greetings
    'здравствуйте' and 'добрый день' are deleted (exact, case-sensitive
    match), every whitespace run is collapsed to a single space, and the
    result is stripped.
    """
    substitutions = (
        ("\n|(</?[^>]*>)", " "),
        ('здравствуйте', ''),
        ('добрый день', ''),
        (r"\s+", " "),
    )
    cleaned = _text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [10]:
# Build one record per FAQ and dump the records as JSON lines:
#   anchor   - the FAQ question text
#   positive - cleaned client utterances of the dialogues tagged with that FAQ
#   negative - cleaned client utterances of randomly drawn other dialogues
# The records are built completely before the output file is opened, so a
# failure half-way through no longer leaves a truncated TRIPLET_DATA behind.
triplet_dict = dict()

# Filling in the anchor FAQ questions
for _id in faqs[ID]:
    try:
        _question = faq_dict[_id][_QUESTION]
    except (LookupError, TypeError):
        # Without a question text the FAQ cannot act as an anchor, so no
        # record is created for it (the recorded run reported one such id: nan).
        print('Invalid FAQ ID:', _id)
        continue
    triplet_dict[_id] = {
        ANCHOR: {_id},
        ANCHOR_TEXT: [_question],
        POSITIVE: set(),
        POSITIVE_TEXT: [],
        NEGATIVE: set(),
        NEGATIVE_TEXT: []
    }

# Filling in the positive examples
for _, row in dialogues.iterrows():
    _question = row[FAQ]
    if _question == 'нет':  # dialogue is not tagged with any FAQ
        continue
    _id = row[ID]  # Dialogue ID
    try:
        _client = clean_text(' '.join(dialogue_dict[_id][CLIENT]))
        record = triplet_dict[question2id[_question]]
    except (LookupError, TypeError):
        # Unknown dialogue id, unknown FAQ question, or an FAQ without a record.
        print('Invalid dialogue ID:', _id)
        continue
    record[POSITIVE].add(_id)
    record[POSITIVE_TEXT].append(_client)

# Filling in the negative examples
dialogue_id_set = set(dialogue_dict.keys())
# Iterate over a snapshot of the keys: records are removed inside the loop,
# and every FAQ id is visited exactly once even if the sheet repeats it.
for faq_id in list(triplet_dict):
    record = triplet_dict[faq_id]
    if not record[POSITIVE]:
        # FAQs nobody asked about yield no triplets.
        del triplet_dict[faq_id]
        continue
    # random.sample() needs a sequence (passing a set raises TypeError on
    # Python >= 3.11); sorting also makes the draw independent of set order.
    negative_candidates = sorted(dialogue_id_set - record[POSITIVE])
    # One negative per positive *text* so both lists always have equal length.
    # Raises ValueError if there are fewer candidates than positives.
    record[NEGATIVE] = sample(negative_candidates, len(record[POSITIVE_TEXT]))
    for _id in record[NEGATIVE]:
        record[NEGATIVE_TEXT].append(
            clean_text(' '.join(dialogue_dict[_id][CLIENT])))

# Dumping the triplet items
with jl.open(TRIPLET_DATA, mode='w') as outfile:
    for triplet_item in triplet_dict.values():
        # The id collections are bookkeeping only (and sets are not JSON
        # serialisable); just the three text lists are written.
        triplet_item.pop(ANCHOR, None)
        triplet_item.pop(POSITIVE, None)
        triplet_item.pop(NEGATIVE, None)
        outfile.write(triplet_item)

Invalid FAQ ID: nan


# Preprocessed Data 

In [5]:
# Read every triplet record (one per FAQ) back from the JSON-lines dump.
with jl.open(TRIPLET_DATA, mode='r') as infile:
    raw_data = list(infile)

# 70 % train; the remaining 30 % is halved into validation and test.
# The split is made over records, i.e. over FAQs, with a fixed seed.
train_data, val_test_data = train_test_split(
    raw_data, test_size=0.3, random_state=1)
val_data, test_data = train_test_split(
    val_test_data, test_size=0.5, random_state=1)

In [6]:
class TripletDataset(Dataset):
    """Dataset of (anchor, positive, negative) token triplets.

    ``_data`` is expected to expose ``input_ids``, ``token_type_ids`` and
    ``attention_mask`` tensors of identical 2-D shape ``(3 * n, seq_len)``,
    laid out as three consecutive groups of ``n`` rows: all anchors, then
    all positives, then all negatives. They are regrouped into tensors of
    shape ``(n, 3, seq_len)`` so that index ``i`` yields one full triplet.
    """

    def __init__(self, _data, labels):
        """
        Args:
            _data: tokenizer output as described in the class docstring.
            labels: array-like with one row per triplet; stored as a float
                tensor and indexed as ``targets[idx, :]``.
        """
        assert _data.input_ids.shape == _data.token_type_ids.shape == \
               _data.attention_mask.shape
        assert _data.input_ids.shape[0] % 3 == 0

        triplet_len = _data.input_ids.shape[0] // 3

        anchors = self.triplet_item(_data, 0, triplet_len)
        positives = self.triplet_item(_data, 1, triplet_len)
        negatives = self.triplet_item(_data, 2, triplet_len)

        # A single stack along dim 1 gives the same (n, 3, seq_len) layout as
        # stacking every triplet row by row, without the Python-level loop
        # (and it also works when n == 0).
        input_ids = torch.stack(
            (anchors.input_ids, positives.input_ids, negatives.input_ids), dim=1)
        token_type_ids = torch.stack(
            (anchors.token_type_ids, positives.token_type_ids,
             negatives.token_type_ids), dim=1)
        attention_mask = torch.stack(
            (anchors.attention_mask, positives.attention_mask,
             negatives.attention_mask), dim=1)

        self.features = BatchEncoding(_dict(input_ids=input_ids,
                                            token_type_ids=token_type_ids,
                                            attention_mask=attention_mask))
        self.targets = torch.tensor(labels).float()
        # A label/feature mismatch would otherwise surface only later as an
        # IndexError or as silently misaligned targets.
        assert self.targets.shape[0] == triplet_len, \
            "labels must contain exactly one row per triplet"

    def __len__(self):
        """Number of triplets."""
        return self.features.input_ids.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for triplet ``idx``.

        Each feature tensor has shape ``(3, seq_len)``: row 0 is the anchor,
        row 1 the positive, row 2 the negative.
        """
        _features = BatchEncoding(_dict(
            input_ids=self.features.input_ids[idx, :, :],
            token_type_ids=self.features.token_type_ids[idx, :, :],
            attention_mask=self.features.attention_mask[idx, :, :]))
        _label = self.targets[idx, :]
        return (_features, _label)

    @staticmethod
    def triplet_item(_data, item_type, triplet_len):
        """Slice one group of rows out of the concatenated encoding.

        Args:
            _data: encoding holding the three groups back to back.
            item_type: 0 for anchors, 1 for positives, 2 for negatives.
            triplet_len: number of rows per group.
        """
        left, right = triplet_len * item_type, triplet_len * (item_type + 1)
        res_item = BatchEncoding(_dict(
            input_ids=_data.input_ids[left:right, :],
            token_type_ids=_data.token_type_ids[left:right, :],
            attention_mask=_data.attention_mask[left:right, :]
        ))
        return res_item

In [7]:
class TripletSet:
    """Tokenised train / validation triplets together with their DataLoaders.

    Attributes set by ``__init__``: ``train_set``, ``train_loader``,
    ``val_set`` and ``val_loader``.
    """

    def __init__(self, train_data, val_data, batch_size=32):
        """
        Args:
            train_data: iterable of triplet records (dicts with the
                ANCHOR_TEXT / POSITIVE_TEXT / NEGATIVE_TEXT lists).
            val_data: records of the same form used for validation.
            batch_size: batch size of both loaders.
        """
        kwargs = self.make_kwargs()

        train_texts, train_labels = self.make_triplets(train_data)
        val_texts, val_labels = self.make_triplets(val_data)

        self.train_set = TripletDataset(train_texts, train_labels)
        self.train_loader = DataLoader(self.train_set, batch_size=batch_size,
                                       drop_last=True, shuffle=True, **kwargs)
        self.val_set = TripletDataset(val_texts, val_labels)
        # Validation must see every sample: with drop_last=True the trailing
        # partial batch was discarded, and a validation set smaller than
        # batch_size produced an empty loader. Shuffling is pointless here.
        self.val_loader = DataLoader(self.val_set, batch_size=batch_size,
                                     drop_last=False, shuffle=False, **kwargs)

    @staticmethod
    def get_token_ids(_text, seq_len=64, _padding=True,
                      _truncation=True, tensor_type='pt'):
        """Tokenise ``_text`` with the module-level ``tokenizer``.

        The arguments are forwarded as ``padding``, ``truncation``,
        ``max_length`` and ``return_tensors``.
        """
        tokenized_text = tokenizer(_text, padding=_padding, truncation=_truncation,
                                   max_length=seq_len, return_tensors=tensor_type)
        return tokenized_text

    def make_triplets(self, _data):
        """Flatten the records into aligned text triplets and tokenise them.

        Every record contributes one triplet per positive text; the record's
        first anchor text is repeated for each of them.

        Returns:
            ``(tokenized_texts, labels)``. The texts are tokenised in one call
            as anchors + positives + negatives (the layout TripletDataset
            expects); ``labels`` is an all-ones array of shape (n_triplets, 1).
        """
        anchors, positives, negatives = [], [], []

        for raw_item in _data:
            assert len(raw_item[POSITIVE_TEXT]) == len(raw_item[NEGATIVE_TEXT]), \
                "every positive text needs a matching negative text"
            anchor_text = raw_item[ANCHOR_TEXT][0]
            for positive_text, negative_text in zip(raw_item[POSITIVE_TEXT],
                                                    raw_item[NEGATIVE_TEXT]):
                anchors.append(anchor_text)
                positives.append(positive_text)
                negatives.append(negative_text)

        labels = np.ones((len(anchors), 1))
        all_texts = anchors + positives + negatives
        tokenized_texts = self.get_token_ids(all_texts)

        return tokenized_texts, labels

    @staticmethod
    def make_kwargs():
        """Extra DataLoader keyword arguments (currently always empty)."""
        # TODO deal with pin_memory for custom datasets
        use_cuda = False # torch.cuda.is_available()
        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        return kwargs

In [8]:
class SoftmaxLoss(nn.Module):
    """Softmax-style ranking loss with in-batch negatives.

    For every anchor the loss is
    ``loss_func(logsumexp_j(anchor . negative_j) - anchor . positive)``,
    where ``j`` runs over all negatives of the batch.
    """

    def __init__(self, _loss=(lambda _: F.relu(_))):
        """
        Args:
            _loss: callable applied element-wise to the similarity gap;
                ReLU by default.
        """
        super(SoftmaxLoss, self).__init__()
        self.loss_func = _loss

    def forward(self, anchor, positive, negative, is_test=False):
        """Compute the per-anchor loss.

        Args:
            anchor, positive, negative: embedding matrices; ``anchor`` and
                ``positive`` are multiplied element-wise and ``anchor`` is
                matrix-multiplied with ``negative`` transposed, so all three
                must be 2-D with the same embedding size.
            is_test: unused; kept for interface compatibility.

        Returns:
            Tensor with one loss value per anchor row (last dimension 1).
        """
        positive_similarity = torch.sum(anchor * positive, dim=-1, keepdim=True)
        negative_logits = torch.matmul(anchor, torch.transpose(negative, 0, 1))
        # logsumexp is the numerically stable form of log(sum(exp(x))); the
        # naive form overflows to inf in float32 once a dot product exceeds ~88.
        negative_similarity = torch.logsumexp(negative_logits, dim=-1, keepdim=True)
        loss = self.loss_func(negative_similarity - positive_similarity)
        return loss

    @staticmethod
    def mean_loss(_true, _predicted):
        """Mean of ``_predicted`` over dim 0; ``_true`` enters only as ``0 * _true``."""
        _mean = torch.mean(_predicted - 0 * _true, dim=0)
        return _mean

In [9]:
class TripletModel(nn.Module):
    """Encodes the three members of a triplet with one shared transformer."""

    def __init__(self, model_path=SBERT_MODEL):
        """Load the encoder from ``model_path`` and set up cosine similarity."""
        super().__init__()
        self.sbert = AutoModel.from_pretrained(model_path)
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, _input, c=0.5):
        """Embed anchor, positive and negative of every triplet in the batch.

        Args:
            _input: mapping with ``input_ids``, ``token_type_ids`` and
                ``attention_mask``, each indexed as ``[:, item, :]`` with
                item 0 = anchor, 1 = positive, 2 = negative.
            c: factor every pooled embedding is multiplied by.

        Returns:
            ``(anchor, positive, negative, (cos_pos, cos_neg))`` - the three
            scaled embeddings plus the anchor/positive and anchor/negative
            cosine similarities.
        """
        batch = BatchEncoding(_input)
        # Items are encoded in the fixed order anchor, positive, negative.
        items = [self.fetch_triplet_item(batch, position) for position in range(3)]
        embeddings = []
        for item in items:
            pooled = self.mean_pooling(self.sbert(**item), item.attention_mask)
            embeddings.append(pooled * c)
        anchor_vec, positive_vec, negative_vec = embeddings

        similarities = (self.cos(anchor_vec, positive_vec),
                        self.cos(anchor_vec, negative_vec))
        return anchor_vec, positive_vec, negative_vec, similarities

    @staticmethod
    def mean_pooling(_output, attention_mask):
        """Average the token embeddings in ``_output[0]``, weighted by the mask."""
        token_embeddings = _output[0]
        weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        weighted_sum = (token_embeddings * weights).sum(1)
        # Clamp so that a fully masked row does not divide by zero.
        weight_total = weights.sum(1).clamp(min=1e-9)
        return weighted_sum / weight_total

    @staticmethod
    def fetch_triplet_item(_input, item_idx):
        """Select member ``item_idx`` (second dimension) from every tensor of ``_input``."""
        selected = _dict()
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            selected[field] = getattr(_input, field)[:, item_idx, :]
        return BatchEncoding(selected)

    def cos(self, u, v):
        """Cosine similarity of ``u`` and ``v`` along dim 1."""
        return self.cosine_similarity(u, v)

In [10]:
class ModelPipeline:
    """Training / validation loop around a triplet model.

    ``_dataset`` must expose ``train_loader`` and ``val_loader`` that yield
    ``(input, target)`` batches; the model is called as ``model(input)`` and
    must return ``(anchor, positive, negative, (cos_pos, cos_neg))``.
    """

    def __init__(self, _dataset, _model, _criterion, _optimizer, _device='cpu'):
        self.dataset = _dataset
        self.model = _model
        self.criterion = _criterion
        self.optimizer = _optimizer
        self.device = torch.device(_device)

        # Best value seen so far per metric name; unseen metrics start at 0.
        self.best = defaultdict(int)

    def fit(self, epoch_num, log_interval=5):
        """Train for ``epoch_num`` epochs with a linearly decaying learning rate.

        Validation runs after the first epoch and then after every
        ``log_interval``-th epoch.
        """
        self.model.to(self.device)
        self.model.train()

        num_steps = epoch_num * len(self.dataset.train_loader)
        lr_scheduler = get_scheduler(name='linear', optimizer=self.optimizer,
                                     num_warmup_steps=0, num_training_steps=num_steps)

        # TODO deal with IProgress
        # progress_bar = tqdm(range(num_steps))

        for epoch in range(epoch_num):
            mean_epoch_loss = self.train_epoch(lr_scheduler)
            print("Epoch #{epoch_num}\nMean epoch loss: {mean_loss}".format(
                    epoch_num=epoch + 1, mean_loss=mean_epoch_loss))

            if (epoch + 1) % log_interval == 0 or epoch == 0:
                print()
                # validate() switches to eval mode; train_epoch() switches back.
                self.validate()
                print()

    @staticmethod
    def mean_loss(_true, _predicted):
        """Mean of ``_predicted`` over dim 0; ``_true`` enters only as ``0 * _true``."""
        _mean = torch.mean(_predicted - 0 * _true, dim=0)
        return _mean

    def _to_device(self, _input):
        """Move a batch of input tensors to ``self.device``.

        Objects with a callable ``to`` (e.g. BatchEncoding) are moved through
        it; other mappings are rebuilt as a plain dict of moved tensors.
        """
        mover = getattr(_input, 'to', None)
        if callable(mover):
            return mover(self.device)
        return {key: value.to(self.device) for key, value in _input.items()}

    def train_epoch(self, lr_scheduler):
        """Run one pass over the training loader.

        Returns:
            Mean of the per-batch losses, or NaN if the loader yielded no batch.
        """
        self.model.to(self.device)
        self.model.train()

        epoch_loss = []
        for _input, _target in self.dataset.train_loader:
            # The model lives on self.device, so its inputs must be there too.
            _input = self._to_device(_input)
            anchor, positive, negative, _ = self.model(_input)
            _loss = self.criterion(anchor, positive, negative)
            mean_loss = self.mean_loss(_loss, _loss)

            epoch_loss.append(mean_loss.item())
            mean_loss.backward()

            self.optimizer.step()
            lr_scheduler.step()
            self.optimizer.zero_grad()
            # progress_bar.update(1)

        if not epoch_loss:
            # Nothing was trained on (e.g. fewer samples than one full batch).
            return float('nan')
        return sum(epoch_loss) / len(epoch_loss)

    def validate(self):
        """Correlate cosine similarities with the 1 / 0 pair labels.

        Anchor-positive similarities are paired with the batch targets and
        anchor-negative similarities with zeros. Spearman and Pearson
        coefficients are printed; whenever Spearman improves on the best value
        so far, the model's state_dict is saved to OUTPUT_MODEL.
        """
        self.model.eval()

        with torch.no_grad():
            cosine_dist, targets = [], []
            for _input, _target in self.dataset.val_loader:
                _, _, _, cos = self.model(self._to_device(_input))
                # Bring the results back to the CPU for the numpy conversion.
                cosine_dist.append(cos[0].cpu())
                cosine_dist.append(cos[1].cpu())
                targets.append(_target)
                targets.append(_target * 0)

            if not cosine_dist:
                print("Validation skipped: the validation loader is empty")
                return

            cosine_dist = torch.cat(cosine_dist).detach().numpy()
            targets = torch.cat(targets)
            targets = torch.squeeze(targets).detach().numpy()

        for metric, func in [("spearman_r", spearmanr), ("pearson_r", pearsonr)]:
            coef, _ = func(targets, cosine_dist)
            coef = np.round(coef, 4)

            metric_name = f"{metric}"
            message = f"{metric_name} = {coef}"
            if coef > self.best[metric_name]:
                self.best[metric_name] = coef
                message = "*** New best: " + message
                if metric == "spearman_r":
                    torch.save(self.model.state_dict(), OUTPUT_MODEL)

            print(message)

In [11]:
# Tokenise the train / validation records and wrap them in DataLoaders.
dataset = TripletSet(train_data, val_data)
# Encoder loaded from the default SBERT_MODEL path.
model = TripletModel()
# Ranking loss with its default ReLU wrapper.
criterion = SoftmaxLoss()
optimizer = Adam(model.parameters(), lr=1e-5)

# No _device is passed, so the pipeline uses its 'cpu' default.
evaluation_pipeline = ModelPipeline(_dataset=dataset, _model=model,
                                   _criterion=criterion, _optimizer=optimizer)

In [None]:
# Train for up to 50 epochs; validation runs after epoch 1 and every 5th epoch (default log_interval).
evaluation_pipeline.fit(epoch_num=50)

Epoch #1
Mean epoch loss: 4.020175554535606

*** New best: spearman_r = 0.1887
*** New best: pearson_r = 0.1755

Epoch #2
Mean epoch loss: 2.2606953219933943
Epoch #3
Mean epoch loss: 1.6565082235769792
Epoch #4
Mean epoch loss: 1.129652207547968
Epoch #5
Mean epoch loss: 0.7738909016955983

*** New best: spearman_r = 0.2572
*** New best: pearson_r = 0.2471

Epoch #6
Mean epoch loss: 0.42020295424894855
Epoch #7
Mean epoch loss: 0.2411789677359841
Epoch #8
Mean epoch loss: 0.16618689623746005
Epoch #9
Mean epoch loss: 0.10451420870694247
Epoch #10
Mean epoch loss: 0.095385957847942

*** New best: spearman_r = 0.328
*** New best: pearson_r = 0.3069

Epoch #11
Mean epoch loss: 0.13102248040112582
Epoch #12
Mean epoch loss: 0.04371857101267034
Epoch #13
Mean epoch loss: 0.04948553172024814
Epoch #14
Mean epoch loss: 0.04179584438150579


In [31]:
# Toy label column: alternating 1 / 0 for i = 1..10, shaped (10, 1).
b = torch.tensor([i % 2 for i in range(1, 11)]).float().unsqueeze(1)
b

tensor([[1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.]])

In [33]:
# Toy score column: the reciprocals 1/1 .. 1/10, shaped (10, 1).
a = torch.tensor([i ** -1 for i in range(1, 11)]).float().unsqueeze(1)
a

tensor([[1.0000],
        [0.5000],
        [0.3333],
        [0.2500],
        [0.2000],
        [0.1667],
        [0.1429],
        [0.1250],
        [0.1111],
        [0.1000]])

In [34]:
# Convert both toy tensors to numpy arrays in place.
# NOTE: not re-runnable - once `a` and `b` are arrays they have no .detach().
a = a.detach().numpy()
b = b.detach().numpy()
a

array([[1.        ],
       [0.5       ],
       [0.33333334],
       [0.25      ],
       [0.2       ],
       [0.16666667],
       [0.14285715],
       [0.125     ],
       [0.11111111],
       [0.1       ]], dtype=float32)

In [35]:
# Display the label column after the numpy conversion.
b

array([[1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.]], dtype=float32)

In [36]:
# Sanity check of spearmanr on the toy label / score columns (both shaped (10, 1)).
res = spearmanr(b, a)
res

SpearmanrResult(correlation=0.17407765595569785, pvalue=0.6305360755569764)

In [26]:
from scipy.stats.mstats import spearmanr, pearsonr

In [20]:
# Wrap `b` in a float tensor again and show its first row.
# NOTE(review): the captured output echoes this line as a warning, which
# suggests `b` was already a tensor when the cell ran - the execution order
# differs from the cell order, so confirm what `b` holds here.
b = torch.tensor(b).float()
b[0, :]

  b = torch.tensor(b).float()


tensor([1.])

In [12]:
# Scratch reload of the raw checkpoint.
# NOTE: this rebinds the globals `tokenizer` and `model`; `model` previously
# referred to the TripletModel handed to ModelPipeline.
tokenizer = AutoTokenizer.from_pretrained(SBERT_MODEL)
model = AutoModel.from_pretrained(SBERT_MODEL)

In [11]:
# Two sample sentences for inspecting the tokenizer output.
sentences = ['Привет! Как твои дела?',
             'А правда, что 42 твое любимое число?']

In [12]:
# Tokenise with padding and truncation at 24 tokens, returning PyTorch tensors.
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')

In [14]:
# The tokenizer returns a BatchEncoding (see the recorded output).
type(encoded_input)

transformers.tokenization_utils_base.BatchEncoding

In [35]:
# Shape of the token id matrix; the recorded run shows (2, 11) for the two sentences.
encoded_input.input_ids.shape

torch.Size([2, 11])

In [18]:
import tensorflow as tf

In [19]:
# Toy (4, 3) TensorFlow constant; rebinds the scratch name `a`.
a = tf.constant([[1.2989, -0.5805,  0.6631],
                [-0.6603,  0.3014,  1.9022],
                [0.0637,  2.7615, -1.6628],
                [-0.3582,  0.0641, -0.9180]])
a

2022-03-30 19:02:16.977766: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[ 1.2989, -0.5805,  0.6631],
       [-0.6603,  0.3014,  1.9022],
       [ 0.0637,  2.7615, -1.6628],
       [-0.3582,  0.0641, -0.918 ]], dtype=float32)>

In [20]:
# Second toy (4, 3) TensorFlow constant; rebinds the scratch name `b`.
b = tf.constant([[-0.1118,  0.1751, -0.3803],
                [-0.4642,  0.7397,  0.5235],
                [0.1512,  1.4405,  0.8461],
                [0.2314, -0.8797,  0.0437]])
b

<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[-0.1118,  0.1751, -0.3803],
       [-0.4642,  0.7397,  0.5235],
       [ 0.1512,  1.4405,  0.8461],
       [ 0.2314, -0.8797,  0.0437]], dtype=float32)>

In [21]:
# `*` on the two tensors is the element-wise product (the recorded output keeps the (4, 3) shape).
a * b

<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[-0.14521702, -0.10164555, -0.2521769 ],
       [ 0.30651125,  0.22294559,  0.99580175],
       [ 0.00963144,  3.9779406 , -1.406895  ],
       [-0.08288749, -0.05638877, -0.0401166 ]], dtype=float32)>