In [1]:
import os
from sentence_transformers import SentenceTransformer, losses, BiSentenceTransformer
from sentence_transformers.readers import STSDataReader, FEVERReader
from sentence_transformers.datasets import SentencesDataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import psycopg2
import time
import numpy as np

## Training with Siamese Model

In [2]:
# Pretrained sentence encoder; the training cell below builds its datasets
# and its loss on top of this object.
base_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Hyper-parameters for the plain siamese run and the directory handed to
# `fit` as its output path.
train_batch_size, num_epochs, warmup_steps = 16, 1, 100
model_save_path = './fever-model'

In [None]:
# Fine-tune the plain siamese encoder (`base_model`) on the FEVER article
# re-ranking table and evaluate on the dev table every 1000 steps.
reader = FEVERReader()

# Training split: examples -> tokenised dataset -> shuffled batches.
train_examples = reader.get_examples('train', table='test.train_article_rerank')
train_data = SentencesDataset(train_examples, base_model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=base_model)

# Dev split: same pipeline, kept in order so evaluation is repeatable.
dev_examples = reader.get_examples('dev', table='test.test_article_rerank')
dev_data = SentencesDataset(examples=dev_examples, model=base_model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Train the model the loss was built around. The name `model` is only bound
# further down (to the BiSentenceTransformer wrapper), so calling
# `model.fit` here either raises NameError on a fresh kernel or trains the
# wrong object when cells have been run out of order.
base_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    fp16=True,
    fp16_opt_level='O1',
)

## Training with Modified Siamese (dense feedforward layer for query)

In [3]:
# Hyper-parameters for the modified-siamese run and the path later passed to
# `fit` as `output_path_base`.
train_batch_size, num_epochs, warmup_steps = 16, 1, 100
model_save_path = './modified-fever'

# Wrap the already-loaded encoder. Per the section heading this variant adds
# a dense feed-forward layer on the query side (implementation not shown here).
model = BiSentenceTransformer(base_model)

In [4]:
# Build the train/dev loaders for the modified-siamese run. Datasets are
# built with `base_model`, while the loss wraps the BiSentenceTransformer
# bound to `model`.
reader = FEVERReader()

# --- training split ---
train_examples = reader.get_examples('train', table='test.train_article_rerank')
train_data = SentencesDataset(examples=train_examples, model=base_model)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=train_batch_size,
    shuffle=True,
)
train_loss = losses.BiCosineSimilarityLoss(model=model)

# --- dev split (order preserved) ---
dev_examples = reader.get_examples('dev', table='test.test_article_rerank')
dev_data = SentencesDataset(examples=dev_examples, model=base_model)
dev_dataloader = DataLoader(
    dataset=dev_data,
    batch_size=train_batch_size,
    shuffle=False,
)

trying to connect to postgres...
connected to postgres
downloading data
trying to connect to postgres...
connected to postgres
downloading data


In [5]:
# Train the BiSentenceTransformer bound to `model` on the single
# (dataloader, loss) objective prepared in the previous cell.
# NOTE(review): the second positional argument is None -- presumably the
# evaluator slot, which would mean no dev evaluation runs here even though
# `evaluation_steps` is set; confirm against BiSentenceTransformer.fit.
# NOTE(review): this call uses `output_path_base`, whereas the plain siamese
# run passes `output_path`; confirm the keyword against the class.
model.fit((train_dataloader, train_loss),
          None,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path_base=model_save_path,
          fp16=True,
          fp16_opt_level='O1'
)

AttributeError: module 'torch.nn.functional' has no attribute 'cosine_similiarity'

In [4]:
# Reload the modified-siamese model from the directory the training run
# wrote to.
load_path = './modified-fever'
# Rebinds `base_model`: from here on it is the encoder read from `load_path`,
# no longer the pretrained 'bert-base-nli-stsb-mean-tokens' model.
base_model = SentenceTransformer(load_path)
# The wrapper receives both the encoder and the same directory via `path=`.
# NOTE(review): presumably `path` is where the extra wrapper weights are
# read from -- confirm against BiSentenceTransformer.__init__.
loaded_model = BiSentenceTransformer(base_model, path=load_path)

## Encode document set and dump to disk

In [None]:
# Load the fine-tuned modified-siamese model used by the encoding cells
# below (they call `model.model_b.encode`).
load_path = './modified-fever/'
# Everywhere else in this notebook BiSentenceTransformer is constructed from
# a SentenceTransformer instance (optionally with `path=` when reloading),
# not from a bare path string, so build the encoder first and pass both.
# NOTE(review): confirm against BiSentenceTransformer.__init__ that `path`
# is the directory holding the wrapper's own weights.
model = BiSentenceTransformer(SentenceTransformer(load_path), path=load_path)

In [None]:
# Fetch (article id, first line of text) for every cleaned wiki article.
# The result is left in `res` as a list of (id, text) tuples.
HOST = '54.196.150.193'
USER = 'postgres'
DBNAME = 'fever'

# Credentials come from the environment so they never land in the notebook.
# PGSSLROOTCERT is only checked for presence here; it is not passed to
# connect() explicitly (presumably libpq reads it from the environment).
PASS = os.environ.get('PGPASS')
PGSSLROOTCERT = os.environ.get('PGSSLROOTCERT')
if PASS is None or PGSSLROOTCERT is None:
    # Name the variables exactly as they are read above (PGPASS, not PG_PASS).
    print("Please set PGPASS and PGSSLROOTCERT env variable")
    raise SystemExit()

POSTGRES_DSN = f'''dbname='{DBNAME}' user='{USER}' host='{HOST}' password='{PASS}' '''
query = '''
select a.id, l.text 
from wiki.articles_clean a
join wiki.lines l on l.article_id = a.id and line_number = 0
'''

print('trying to connect...')
conn = psycopg2.connect(POSTGRES_DSN)
try:
    # The cursor context manager closes the cursor on exit.
    with conn.cursor() as cur:
        print('executing query...')
        cur.execute(query)
        res = cur.fetchall()
finally:
    # `with conn:` would only end the transaction, so close explicitly to
    # avoid leaving a server connection open for the life of the kernel.
    conn.close()

trying to connect...
executing query...


In [None]:
# Encode every (id, text) row in `res` with `model.model_b` and dump the
# result to ./fever-embs/ in chunks of BATCH_SIZE rows.
BATCH_SIZE = 100000
os.makedirs('./fever-embs/', exist_ok=True)


def _encode_and_save(ids, sentences, tag):
    """Encode one chunk and write it to ``./fever-embs/emb-{tag}``.

    Args:
        ids: article ids, one per sentence.
        sentences: texts to encode, aligned with ``ids``.
        tag: suffix for the output file name (the index of the last row
            in the chunk).

    Returns:
        The saved 2-D array: column 0 holds the ids, the remaining columns
        hold the embedding. Ids share the array's dtype with the embeddings,
        so they are stored as whatever common dtype numpy picks.
    """
    embs = model.model_b.encode(sentences, batch_size=32)
    id_column = np.expand_dims(np.array(ids), 1)
    stacked = np.concatenate((id_column, embs), 1)
    np.save(f'./fever-embs/emb-{tag}', stacked)
    return stacked


sent_buffer = []
ids_buffer = []
start = time.time()
for i, (row_id, text) in enumerate(res):
    sent_buffer.append(text)
    ids_buffer.append(row_id)
    if (i + 1) % BATCH_SIZE == 0:
        to_save = _encode_and_save(ids_buffer, sent_buffer, i)
        sent_buffer = []
        ids_buffer = []
        # i is zero-based, so i + 1 rows have been processed so far.
        print(f'Running {(i + 1)/(time.time() - start)} per second')

# Flush the final partial chunk. Skip it when nothing is left: otherwise an
# empty batch would be encoded and written to the same emb-{i} file the loop
# just produced (and `i` would be undefined if `res` were empty).
if sent_buffer:
    to_save = _encode_and_save(ids_buffer, sent_buffer, i)

In [None]:
# Writes whatever array is currently bound to `to_save` (set by the encoding
# cell above) to ./fever-embs/emb-last.
# The commented-out lines are a disabled recomputation of `to_save` from the
# leftover buffers; note they call `model.encode`, not `model.model_b.encode`
# as the encoding cell does.
# NOTE(review): as written this re-saves the array the encoding cell already
# stored under emb-{i} -- confirm this cell is still needed.
#embs = model.encode(sent_buffer, batch_size=32)
#ids_buffer = np.array(ids_buffer)
#ids_buffer = np.expand_dims(ids_buffer, 1)
#to_save = np.concatenate((ids_buffer, embs), 1)
np.save(f'./fever-embs/emb-last',to_save)