https://github.com/UKPLab/sentence-transformers

In [None]:
!pip install sentence-transformers

In [None]:
import numpy as np
import pandas as pd
import os
import shutil
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics.classification import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

import sys
import logging
import math
from torch.utils.data import DataLoader
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import BinaryEmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
from datetime import datetime

# data

Expected columns in `data.csv`:
- txt_1
- txt_2
- label

In [None]:
# Input CSV holding the sentence pairs; its first column is used as the index.
file_name = 'data.csv'
data_pd = pd.read_csv(file_name, index_col=0)
# Sanity check: (rows, columns) of the loaded frame.
print(data_pd.shape)

# sentence encoding using pretrained model

In [None]:
# NOTE: SentenceTransformer is already imported in the first code cell; this
# repeat import is harmless and keeps the cell runnable on its own.
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model identified by 'bert-base-nli-mean-tokens'.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Pull both text columns out of the frame as plain Python lists.
sent_1 = data_pd['txt_1'].values.tolist()
sent_2 = data_pd['txt_2'].values.tolist()

# Encode each list of sentences (first column, then second) into embeddings.
sent_1_emb, sent_2_emb = model.encode(sent_1), model.encode(sent_2)

In [None]:
# Row-wise cosine similarity of each (txt_1, txt_2) pair: the sklearn helper
# returns distances, so they are subtracted from 1.
cosine_scores = 1 - (paired_cosine_distances(sent_1_emb, sent_2_emb))
# NOTE(review): the column is named 'cosine_distance' but the stored value is
# the similarity computed above (1 - distance).
data_pd['cosine_distance'] = np.array(cosine_scores)
# Last expression of the cell: shows the first rows with the new column.
data_pd.head()

# fine-tune the model with our data

In [None]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 40% of the rows, then cut that hold-out in half to
# get the dev and test sets. The label series is passed alongside the frame,
# but its split halves are discarded (the frames already carry the column).
train_pd, valid_pd, _, _ = train_test_split(data_pd, data_pd['label'],
                                            test_size=0.4, random_state=42)
dev_pd, test_pd, _, _ = train_test_split(valid_pd, valid_pd['label'],
                                         test_size=0.5, random_state=42)

# Share of positive rows (label == 1) and total row count for each split.
# The target column is 'label' -- the same column used for the split above;
# indexing a 'class' column here fails on the documented schema.
print(train_pd[train_pd['label']==1].shape[0]/train_pd.shape[0], train_pd.shape[0])
print(dev_pd[dev_pd['label']==1].shape[0]/dev_pd.shape[0], dev_pd.shape[0])
print(test_pd[test_pd['label']==1].shape[0]/test_pd.shape[0], test_pd.shape[0])

## load pre-trained weights and set up model

In [None]:
# Send INFO-level log records through sentence-transformers' LoggingHandler,
# each prefixed with a timestamp.
logging.basicConfig(level=logging.INFO,
                    handlers=[LoggingHandler()],
                    format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Hugging Face checkpoint to fine-tune from; other options include
# bert-base-uncased, roberta-base and xlm-roberta-base.
model_name = 'distilbert-base-uncased'

# Token-level encoder built from the checkpoint, with max_seq_length set to 128.
word_embedding_model = models.Transformer(model_name, max_seq_length=128)

# Collapse the token embeddings into one fixed-size sentence vector using mean
# pooling only (CLS-token and max pooling are switched off).
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Full sentence encoder: transformer followed by pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

## load data

In [None]:
from sentence_transformers.readers import InputExample

class DataReader(object):
    """
    Turns a tabular dataset of sentence pairs into ``InputExample`` objects.

    The wrapped dataset is indexed by the column names ``txt_1``, ``txt_2``
    and ``label``.
    """
    def __init__(self, dataset):
        """Keep a reference to the dataset the examples are read from."""
        self.dataset = dataset

    def get_examples(self, filename, max_examples=0):
        """
        Build one ``InputExample`` per row of the wrapped dataset.

        :param filename: prefix for each example's guid, which has the form
            ``"<filename>-<row position>"`` with positions starting at 0.
        :param max_examples: when greater than 0, stop once this many
            examples have been collected; otherwise every row is returned.
        :return: list of ``InputExample`` in dataset order.
        """
        s1 = self.dataset['txt_1'].values.tolist()
        s2 = self.dataset['txt_2'].values.tolist()
        labels = self.dataset['label'].values.tolist()

        examples = []
        # enumerate supplies the running position, so no manual counter is
        # needed and the builtin ``id`` is no longer shadowed.
        for position, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
            guid = "%s-%d" % (filename, position)
            examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=label))

            # A non-positive max_examples disables the cap.
            if 0 < max_examples <= len(examples):
                break

        return examples

In [None]:
# Wrap the train and dev frames in the custom reader defined above.
batch_size = 16
train_data_reader = DataReader(train_pd)
dev_data_reader = DataReader(dev_pd)
# Number of classes handed to the softmax loss below.
train_num_labels = 2

# convert dataset to a DataLoader ready for training
logging.info("Read train dataset")
train_data = SentencesDataset(train_data_reader.get_examples('train'), model=model)
# Training batches are shuffled.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Softmax loss configured with the model's sentence-embedding size and
# train_num_labels output classes.
train_loss = losses.SoftmaxLoss(model=model, 
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(), 
                                num_labels=train_num_labels)

logging.info("Read dev dataset")
dev_data = SentencesDataset(dev_data_reader.get_examples('dev'), model=model)
# Dev batches keep their original order (no shuffling).
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
# Evaluator built on the dev dataloader; it is passed to model.fit() later.
evaluator = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)

In [None]:
# Output directory: output/<model name>, with any '/' in the name turned into '-'.
model_save_path = 'output/{}'.format(model_name.replace("/", "-"))

# Start from a clean slate: remove whatever a previous run left at that path.
if os.path.exists(model_save_path):
    shutil.rmtree(model_save_path)

print('model_save_path:', model_save_path)

In [None]:
# configure the training
num_epochs = 1

# Warm up over 10% of all training steps. len(train_dataloader) is already the
# number of batches per epoch, so it must not be divided by batch_size again:
# doing so made the warm-up batch_size times shorter than intended.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train with the single (dataloader, loss) objective. The dev evaluator,
# evaluation_steps=1000 and the output path are handed to fit() as configured
# in the earlier cells.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

## model evaluation

In [None]:
# Load the model back from the directory passed to fit() as output_path.
model_path = model_save_path
model = SentenceTransformer(model_path)

# Wrap the held-out test split the same way as the train and dev splits.
test_data_reader = DataReader(test_pd)
test_data = SentencesDataset(test_data_reader.get_examples('test'), model=model)

# Test batches keep their original order (no shuffling).
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
# NOTE: rebinds `evaluator`, replacing the dev-set evaluator created earlier.
evaluator = BinaryEmbeddingSimilarityEvaluator(test_dataloader)

# Run the evaluator against the reloaded model; the result is the cell output.
model.evaluate(evaluator)

In [None]:
# Sentence pairs of the test split as plain Python lists.
txt_1 = test_pd['txt_1'].values.tolist()
txt_2 = test_pd['txt_2'].values.tolist()

# Embed both sides of each pair with the current model (first column first).
embed_1, embed_2 = model.encode(txt_1), model.encode(txt_2)

In [None]:
# Cosine similarity of each test pair (1 - paired cosine distance).
# NOTE: rebinds `cosine_scores`, which earlier held the scores for the full dataset.
cosine_scores = 1 - (paired_cosine_distances(embed_1, embed_2))
# Median similarity over the test pairs, displayed as the cell output.
cosine_middle = np.median(cosine_scores)
cosine_middle

In [None]:
# Ground-truth labels of the test split. The target column is 'label', the
# column the reader class and the train/dev/test split use; indexing a
# 'class' column here fails on the documented schema.
y_test = test_pd['label']
# Predict the positive class (1) when the pair's cosine similarity exceeds 0.5.
y_test_pred_classes = [1 if score > 0.5 else 0 for score in cosine_scores]

print('accuracy:', accuracy_score(y_test, y_test_pred_classes))
# Last expression of the cell: precision, recall, F-score and support.
precision_recall_fscore_support(y_test, y_test_pred_classes)