In [13]:
import os
import os.path as p

import pandas as pd
from sentence_transformers import SentenceTransformer

from definitions import *
from dataset_helper_functions import *

Load the dev, test and train splits (skipped when `IS_MASTER` is already set to a truthy value).

In [17]:
# Default IS_MASTER to False when no earlier code has defined it.
# Only NameError is caught: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    IS_MASTER
except NameError:
    IS_MASTER = False

# Populate `data` (split name -> DataFrame) unless IS_MASTER is truthy
# (presumably a driving notebook has then already provided `data` -- confirm).
if not IS_MASTER:
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    # split name -> [TSV path, pickle path (`*_spacy.pkl`)]
    data_paths = {
        'dev': [
            p.join(dev_path, 'dev.tsv'),
            p.join(dev_path, 'dev_spacy.pkl'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'test', 'test_spacy.pkl'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'train', 'train_spacy.pkl'),
        ],
        # 'val': [
        #     p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        #     p.join(PROC_DATA_DIR_PATH, 'val', 'val_spacy.pkl')
        # ],
    }

    data = {}
    for dtype, dpaths in data_paths.items():
        tsv_path, pkl_path = dpaths

        # The pickle takes precedence over the TSV whenever it exists.
        if p.exists(pkl_path):
            data[dtype] = pd.read_pickle(pkl_path)
            continue

        # A missing dev TSV triggers sample_development_set() first
        # (presumably that call writes dev.tsv -- confirm in the helper module).
        if dtype == 'dev' and not p.exists(tsv_path):
            sample_development_set()

        data[dtype] = pd.read_csv(tsv_path, sep='\t', index_col=False)

Check whether embeddings data folder exists. If not, create it.

In [15]:
# Folder that receives the embedding pickles written later in the notebook.
embeddings_path = p.join(PROC_DATA_DIR_PATH, 'embeddings')
# makedirs(exist_ok=True) also creates missing parent folders and is a no-op
# when the folder already exists, so no separate existence check is needed.
os.makedirs(embeddings_path, exist_ok=True)

Load model.

3 models to try:
- `all-mpnet-base-v2`: best performance overall, slow
- `multi-qa-mpnet-base-dot-v1`: tuned for semantic search, good performance, slow 
- `all-MiniLM-L6-v2`: smallest with moderately good performance, fast; **dev done with this due to its speed**

In [16]:
# Name of the pretrained model to load; it is also embedded in the file names
# of the embedding pickles saved further down.
model_name = 'all-mpnet-base-v2'
# Instantiate the sentence encoder (the recorded cell output shows the model
# files being downloaded when this ran).
model = SentenceTransformer(model_name)

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 359kB/s]
Downloading: 100%|██████████| 10.1k/10.1k [00:00<00:00, 2.80MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 256kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 51.4kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 125kB/s] 
Downloading: 100%|██████████| 349/349 [00:00<00:00, 188kB/s]
Downloading: 100%|██████████| 438M/438M [10:39<00:00, 685kB/s]   
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 24.2kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 102kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 294kB/s]  
Downloading: 100%|██████████| 363/363 [00:00<00:00, 184kB/s]
Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 4.70MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 400kB/s] 
Downloading: 100%|██████████| 190/190 [00:00<00:00, 86.2kB/s]


Get sentences from dataset splits.

Models accept a list of sentences to encode.

In [18]:
# Encode the `content` column of every split and save one pickle per split,
# holding the sentence `id` followed by one column per embedding dimension.
for dtype, df in data.items():
    # Pass a plain Python list of sentences to the encoder.
    sentences = df['content'].tolist()
    embeddings = model.encode(sentences)

    # Pair ids and embeddings by position, not by index label: an index-on-index
    # merge against a fresh 0..n-1 index silently drops or misaligns rows when
    # `df` does not carry a default RangeIndex (e.g. a sampled subset).
    embeddings_df = pd.DataFrame(embeddings, index=df.index)
    embeddings_df.insert(0, 'id', df['id'].to_numpy())

    embeddings_df.to_pickle(p.join(embeddings_path, f'{dtype}_sent_emb_{model_name}.pkl'))

Create a dataframe with the dev-set sentence ids and embeddings and save it.

In [7]:
# embeddings_df = dev.loc[:, ['id']].merge(pd.DataFrame(embeddings), left_index=True, right_index=True)

# embeddings_df.to_csv(p.join(embeddings_path, f'dev_sent_emb_{model_name}.tsv'), sep='\t', index=False)
# embeddings_df.to_pickle(p.join(embeddings_path, f'dev_sent_emb_{model_name}.pkl'))