Install the dependencies

In [None]:
pip install flair segtok bpemb deprecated pytorch_transformers allennlp

Clone the repo

In [3]:
!git clone https://github.com/EsterHlav/Quantitative-Comparison-NLP-Embeddings-from-GloVe-to-RoBERTa.git

Cloning into 'Quantitative-Comparison-NLP-Embeddings-from-GloVe-to-RoBERTa'...
remote: Enumerating objects: 81, done.
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81
Receiving objects: 100% (81/81), 33.96 MiB | 36.07 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [11]:
cd Quantitative-Comparison-NLP-Embeddings-from-GloVe-to-RoBERTa/

/content/Quantitative-Comparison-NLP-Embeddings-from-GloVe-to-RoBERTa


Download the downstream (transfer) task data used by SentEval

In [None]:
# Fetch the SentEval downstream-task data using the script shipped in the repo.
# The script path is relative, so the working directory must contain data/downstream/.
!bash data/downstream/get_transfer_data.bash

In [1]:
from flair.data import Sentence
from flair.embeddings import (
    WordEmbeddings,
    TransformerWordEmbeddings
)

In [2]:
# NOTE(review): flair's 'en' identifier downloads the fastText news vectors
# (en-fasttext-news-300d-1M, see the download log of this cell), not word2vec.
# The variable name is kept because other cells refer to WORD2VEC.
WORD2VEC = WordEmbeddings('en')
GLOVE    = WordEmbeddings('glove')
# Name the checkpoint explicitly instead of relying on the library default, so
# the evaluated model is visible here and stays the same across flair versions.
BERT   = TransformerWordEmbeddings('bert-base-uncased')

2023-11-13 09:51:07,668 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-news-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpczzl59ed


100%|██████████| 1.12G/1.12G [00:49<00:00, 24.3MB/s]

2023-11-13 09:51:57,508 copying /tmp/tmpczzl59ed to cache at /root/.flair/embeddings/en-fasttext-news-300d-1M.vectors.npy





2023-11-13 09:52:10,041 removing temp file /tmp/tmpczzl59ed
2023-11-13 09:52:10,773 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-news-300d-1M not found in cache, downloading to /tmp/tmphkflfrtj


100%|██████████| 52.1M/52.1M [00:03<00:00, 17.2MB/s]

2023-11-13 09:52:14,446 copying /tmp/tmphkflfrtj to cache at /root/.flair/embeddings/en-fasttext-news-300d-1M





2023-11-13 09:52:14,643 removing temp file /tmp/tmphkflfrtj
2023-11-13 09:52:29,195 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpi9y2078v


100%|██████████| 153M/153M [00:07<00:00, 22.1MB/s]

2023-11-13 09:52:36,876 copying /tmp/tmpi9y2078v to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2023-11-13 09:52:37,132 removing temp file /tmp/tmpi9y2078v
2023-11-13 09:52:37,651 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpzs_f99sn


100%|██████████| 20.5M/20.5M [00:01<00:00, 11.0MB/s]

2023-11-13 09:52:40,092 copying /tmp/tmpzs_f99sn to cache at /root/.flair/embeddings/glove.gensim
2023-11-13 09:52:40,111 removing temp file /tmp/tmpzs_f99sn





Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [4]:
# Each embedding is declared next to the label used when reporting its results;
# the two parallel lists are derived from these pairs so they cannot drift apart.
EMBEDDINGS_NAMES, EMBEDDINGS = map(list, zip(
    ('WORD2VEC', WORD2VEC),
    ('GLOVE', GLOVE),
    ('BERT', BERT),
))

In [13]:
from __future__ import absolute_import, division, unicode_literals

import sys
import io
import numpy as np
import logging
from flair.data import Sentence

# Set PATHs
# Both paths are relative to the current working directory.
# NOTE(review): presumably this must be the root of the cloned repo — confirm.
PATH_TO_SENTEVAL = './'
PATH_TO_DATA = './data'

# import SentEval
# Prepend the SentEval locations to the module search path; this has to happen
# before the `import senteval` statement below so that the import can resolve
# against PATH_TO_SENTEVAL.
sys.path.insert(0, PATH_TO_SENTEVAL)
sys.path.insert(0, PATH_TO_SENTEVAL+'senteval/tools/')
import senteval

# SentEval prepare and batcher
def prepare(params, samples):
    """SentEval ``prepare`` hook; intentionally a no-op.

    Both arguments are accepted and ignored, and the function always
    returns ``None``.
    """
    return None

def sentence_emb(sent, type_sent='avg'):
    """Pool the token embeddings of an embedded sentence into a single vector.

    Args:
        sent: iterable of tokens, each exposing an ``embedding`` attribute on
            which ``.cpu().numpy()`` can be called.
        type_sent: pooling strategy. Only ``'avg'`` (element-wise mean over
            the tokens) is implemented.

    Returns:
        The mean of the token vectors, taken along axis 0.

    Raises:
        ValueError: if ``type_sent`` is not a supported strategy. The function
            used to fall through and return ``None`` in that case, which only
            surfaced later as an unrelated stacking error.
    """
    if type_sent != 'avg':
        raise ValueError(
            "Unsupported type_sent {!r}; only 'avg' is implemented".format(type_sent)
        )

    # Move every token embedding to host memory as a NumPy array before pooling.
    token_vectors = [tok.embedding.cpu().numpy() for tok in sent]
    return np.mean(token_vectors, axis=0)

# integration of Flair in the batcher
def batcher(params, batch):
    """Embed a SentEval batch with Flair and return one vector per sentence.

    Args:
        params: SentEval parameters; ``params['embeddings']`` must provide an
            ``embed(sentence)`` method.
        batch: list of tokenised sentences, each a sequence of string tokens.

    Returns:
        The per-sentence vectors stacked row-wise with ``np.vstack``.
    """
    rows = []

    for tokens in batch:
        # Replace an empty sentence (any empty sequence, not just ``[]``)
        # with a lone '.' token before embedding.
        if len(tokens) == 0:
            tokens = ['.']

        # create the Flair sentence, embed it in place and pool its tokens
        sentence = Sentence(' '.join(tokens))
        params['embeddings'].embed(sentence)
        rows.append(sentence_emb(sentence))

    return np.vstack(rows)

# Set up logger: root logger at INFO level, each record prefixed with its timestamp.
LOG_FORMAT = '%(asctime)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Set params for SentEval
params_senteval = {
    'task_path': PATH_TO_DATA,
    'batch_size': 256,
    'seed': 1,
    # control the number of max iterations in Bayesian Optimization (most likely will always reach max anyway with NN)
    'iter_bayes': 500,
    'cudaEfficient': True,
}

# define embeddings: label -> embedding object, in the order they are listed
embs = dict(zip(EMBEDDINGS_NAMES, EMBEDDINGS))

# Full task list, defined once instead of on every loop iteration.
# Only 'CR' is evaluated below.
# NOTE(review): presumably se.eval also accepts this list to run every task — confirm.
transfer_tasks = ['CR', 'TREC', 'MRPC', 'SUBJ', 'MPQA'] # can be long to run on SST2

results_embs = {}
banner = "###" * 20

for i, (emb_name, embedding) in enumerate(embs.items()):
    print(banner)
    print('Training step {} on embedding {}'.format(i+1, emb_name))
    print(banner)

    # Select the embedding for this run; batcher looks it up as params['embeddings'].
    params_senteval['embeddings'] = embedding

    se = senteval.engine.SE(params_senteval, batcher, prepare)

    results = se.eval(name='CR')
    print("Results for embedding '{}'".format(emb_name))
    print(results)
    results_embs[emb_name] = results

print(results_embs)