In [47]:
#default_exp word2vec

In [48]:
#export


from github_search import paperswithcode_tasks
import gensim.models
import pandas as pd

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
%cd ..

/home/kuba/Projects/github_search


In [3]:
paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()
papers_with_readmes_df = pd.read_csv("output/papers_with_readmes.csv")

INFO - 13:15:07: Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO - 13:15:07: NumExpr defaulting to 8 threads.


In [52]:
papers_with_readmes_df.shape

(12224, 25)

In [4]:
!ls output/*readme*

output/papers_with_readmes.csv


In [16]:
#export


def get_sentences(papers_df, papers_with_readmes_df, max_length=1000):
    abstract_sentences = papers_df['abstract'].str.lower().str.split(" ").dropna()
    readme_sentences = papers_with_readmes_df['readme'].str.lower().str.split(" ").dropna()
    sentences = list(abstract_sentences) + list(readme_sentences)
    return [sent[:max_length] for sent in sentences]

In [17]:
sentences = get_sentences(all_papers_df, papers_with_readmes_df)

In [18]:

sentences = get_sentences(all_papers_df, papers_with_readmes_df)
lengths = pd.Series(map(len, sentences))#.describe()

In [49]:
#export


from gensim.models.callbacks import CallbackAny2Vec


class LossLogger(CallbackAny2Vec):
    '''Output loss at each epoch'''
    def __init__(self):
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        self.losses.append(loss)
        print(f'  Loss: {loss}')
        self.epoch += 1
        
        
        
class LossCallback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [50]:
embedding_dim = 200

In [51]:
#export

def make_w2v_model(sentences, embedding_dim=200):
    w2v_model = gensim.models.Word2Vec(size=embedding_dim, window=5, min_count=5, workers=24, callbacks=[LossCallback()])
    w2v_model.build_vocab(sentences, progress_per=10000)
    return w2v_model

In [1]:
#export


def train_abstract_readme_w2v(embedding_dim, epochs, upstream, product):
 
    paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()
    papers_with_readmes_df = pd.read_csv(upstream["make_readmes"])
    
    sentences = get_sentences(all_papers_df, papers_with_readmes_df)
    lengths = pd.Series(map(len, sentences))#.describe()
   
    w2v_model = make_w2v_model(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=epochs, report_delay=1, compute_loss=True)
    w2v_model.save(str(product))

In [34]:
w2v_model = make_w2v_model(sentences)

INFO - 13:22:06: collecting all words and their counts
INFO - 13:22:06: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:22:06: PROGRESS: at sentence #10000, processed 1533202 words, keeping 112056 word types
INFO - 13:22:06: PROGRESS: at sentence #20000, processed 3081944 words, keeping 184077 word types
INFO - 13:22:06: PROGRESS: at sentence #30000, processed 4592495 words, keeping 245351 word types
INFO - 13:22:06: PROGRESS: at sentence #40000, processed 6095252 words, keeping 302891 word types
INFO - 13:22:06: PROGRESS: at sentence #50000, processed 7600080 words, keeping 356535 word types
INFO - 13:22:06: PROGRESS: at sentence #60000, processed 9071210 words, keeping 408868 word types
INFO - 13:22:07: PROGRESS: at sentence #70000, processed 10566770 words, keeping 457473 word types
INFO - 13:22:07: PROGRESS: at sentence #80000, processed 12088703 words, keeping 502932 word types
INFO - 13:22:07: PROGRESS: at sentence #90000, processed 13668998 words, kee

In [35]:
len(w2v_model.wv.vocab)

144098

In [36]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1, compute_loss=True)

INFO - 13:22:22: training model with 24 workers on 144098 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO - 13:22:23: EPOCH 1 - PROGRESS: at 7.43% examples, 1786780 words/s, in_qsize 45, out_qsize 2
INFO - 13:22:24: EPOCH 1 - PROGRESS: at 15.66% examples, 1878979 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:25: EPOCH 1 - PROGRESS: at 23.61% examples, 1876082 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:26: EPOCH 1 - PROGRESS: at 32.00% examples, 1886753 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:27: EPOCH 1 - PROGRESS: at 40.30% examples, 1911051 words/s, in_qsize 43, out_qsize 4
INFO - 13:22:28: EPOCH 1 - PROGRESS: at 48.32% examples, 1923741 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:29: EPOCH 1 - PROGRESS: at 56.23% examples, 1926390 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:30: EPOCH 1 - PROGRESS: at 65.16% examples, 1926468 words/s, in_qsize 48, out_qsize 0
INFO - 13:22:31: EPOCH 1 - PROGRESS: at 75.15% examples, 1918257 words/s,

Loss after epoch 0: 2057522.5


INFO - 13:22:36: EPOCH 2 - PROGRESS: at 7.73% examples, 1861076 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:37: EPOCH 2 - PROGRESS: at 15.90% examples, 1903972 words/s, in_qsize 48, out_qsize 0
INFO - 13:22:38: EPOCH 2 - PROGRESS: at 23.87% examples, 1894421 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:39: EPOCH 2 - PROGRESS: at 32.09% examples, 1897040 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:40: EPOCH 2 - PROGRESS: at 40.24% examples, 1911448 words/s, in_qsize 43, out_qsize 4
INFO - 13:22:41: EPOCH 2 - PROGRESS: at 48.14% examples, 1908642 words/s, in_qsize 43, out_qsize 4
INFO - 13:22:42: EPOCH 2 - PROGRESS: at 56.35% examples, 1922902 words/s, in_qsize 44, out_qsize 3
INFO - 13:22:43: EPOCH 2 - PROGRESS: at 65.54% examples, 1927768 words/s, in_qsize 48, out_qsize 0
INFO - 13:22:44: EPOCH 2 - PROGRESS: at 75.88% examples, 1927803 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:45: EPOCH 2 - PROGRESS: at 85.14% examples, 1925170 words/s, in_qsize 45, out_qsize 2
INFO - 13:2

Loss after epoch 1: 1819594.0


INFO - 13:22:49: EPOCH 3 - PROGRESS: at 7.73% examples, 1840435 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:50: EPOCH 3 - PROGRESS: at 16.06% examples, 1916283 words/s, in_qsize 45, out_qsize 2
INFO - 13:22:51: EPOCH 3 - PROGRESS: at 24.25% examples, 1925502 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:52: EPOCH 3 - PROGRESS: at 32.84% examples, 1944272 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:53: EPOCH 3 - PROGRESS: at 40.81% examples, 1943535 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:54: EPOCH 3 - PROGRESS: at 48.57% examples, 1939869 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:55: EPOCH 3 - PROGRESS: at 56.88% examples, 1952650 words/s, in_qsize 46, out_qsize 1
INFO - 13:22:56: EPOCH 3 - PROGRESS: at 65.77% examples, 1944586 words/s, in_qsize 48, out_qsize 0
INFO - 13:22:57: EPOCH 3 - PROGRESS: at 75.99% examples, 1939641 words/s, in_qsize 47, out_qsize 0
INFO - 13:22:58: EPOCH 3 - PROGRESS: at 85.54% examples, 1942053 words/s, in_qsize 47, out_qsize 0
INFO - 13:2

Loss after epoch 2: 1651906.5


INFO - 13:23:02: EPOCH 4 - PROGRESS: at 7.79% examples, 1878169 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:03: EPOCH 4 - PROGRESS: at 15.88% examples, 1879934 words/s, in_qsize 43, out_qsize 4
INFO - 13:23:04: EPOCH 4 - PROGRESS: at 23.93% examples, 1891679 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:05: EPOCH 4 - PROGRESS: at 31.81% examples, 1875084 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:06: EPOCH 4 - PROGRESS: at 39.54% examples, 1872515 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:07: EPOCH 4 - PROGRESS: at 47.21% examples, 1875319 words/s, in_qsize 43, out_qsize 4
INFO - 13:23:08: EPOCH 4 - PROGRESS: at 55.46% examples, 1891495 words/s, in_qsize 47, out_qsize 1
INFO - 13:23:09: EPOCH 4 - PROGRESS: at 64.29% examples, 1897249 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:10: EPOCH 4 - PROGRESS: at 74.48% examples, 1897747 words/s, in_qsize 48, out_qsize 0
INFO - 13:23:11: EPOCH 4 - PROGRESS: at 83.66% examples, 1894502 words/s, in_qsize 47, out_qsize 0
INFO - 13:2

Loss after epoch 3: 1492426.0


INFO - 13:23:15: EPOCH 5 - PROGRESS: at 7.73% examples, 1870772 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:16: EPOCH 5 - PROGRESS: at 15.97% examples, 1909369 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:17: EPOCH 5 - PROGRESS: at 24.34% examples, 1932697 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:18: EPOCH 5 - PROGRESS: at 32.39% examples, 1909217 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:19: EPOCH 5 - PROGRESS: at 40.15% examples, 1899050 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:20: EPOCH 5 - PROGRESS: at 48.29% examples, 1917656 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:21: EPOCH 5 - PROGRESS: at 56.57% examples, 1931535 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:22: EPOCH 5 - PROGRESS: at 65.88% examples, 1936716 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:23: EPOCH 5 - PROGRESS: at 76.30% examples, 1938511 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:24: EPOCH 5 - PROGRESS: at 85.54% examples, 1936081 words/s, in_qsize 47, out_qsize 0
INFO - 13:2

Loss after epoch 4: 1462750.0


INFO - 13:23:28: EPOCH 6 - PROGRESS: at 7.55% examples, 1832605 words/s, in_qsize 43, out_qsize 4
INFO - 13:23:29: EPOCH 6 - PROGRESS: at 15.62% examples, 1881147 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:30: EPOCH 6 - PROGRESS: at 23.61% examples, 1876663 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:31: EPOCH 6 - PROGRESS: at 32.00% examples, 1892954 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:32: EPOCH 6 - PROGRESS: at 39.89% examples, 1885165 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:33: EPOCH 6 - PROGRESS: at 47.59% examples, 1890008 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:34: EPOCH 6 - PROGRESS: at 55.58% examples, 1895958 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:35: EPOCH 6 - PROGRESS: at 64.33% examples, 1897148 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:36: EPOCH 6 - PROGRESS: at 74.27% examples, 1894352 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:37: EPOCH 6 - PROGRESS: at 83.81% examples, 1897953 words/s, in_qsize 46, out_qsize 1
INFO - 13:2

Loss after epoch 5: 1328508.0


INFO - 13:23:41: EPOCH 7 - PROGRESS: at 7.61% examples, 1841138 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:42: EPOCH 7 - PROGRESS: at 15.66% examples, 1880140 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:43: EPOCH 7 - PROGRESS: at 24.00% examples, 1909642 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:44: EPOCH 7 - PROGRESS: at 32.21% examples, 1902162 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:45: EPOCH 7 - PROGRESS: at 40.39% examples, 1916975 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:46: EPOCH 7 - PROGRESS: at 47.83% examples, 1906001 words/s, in_qsize 43, out_qsize 4
INFO - 13:23:47: EPOCH 7 - PROGRESS: at 55.87% examples, 1910029 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:48: EPOCH 7 - PROGRESS: at 64.75% examples, 1913099 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:49: EPOCH 7 - PROGRESS: at 74.91% examples, 1910494 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:50: EPOCH 7 - PROGRESS: at 83.73% examples, 1900392 words/s, in_qsize 45, out_qsize 2
INFO - 13:2

Loss after epoch 6: 1316486.0


INFO - 13:23:54: EPOCH 8 - PROGRESS: at 7.52% examples, 1818033 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:55: EPOCH 8 - PROGRESS: at 15.47% examples, 1838028 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:56: EPOCH 8 - PROGRESS: at 23.71% examples, 1860652 words/s, in_qsize 45, out_qsize 2
INFO - 13:23:57: EPOCH 8 - PROGRESS: at 32.18% examples, 1884680 words/s, in_qsize 46, out_qsize 1
INFO - 13:23:58: EPOCH 8 - PROGRESS: at 40.12% examples, 1887221 words/s, in_qsize 47, out_qsize 0
INFO - 13:23:59: EPOCH 8 - PROGRESS: at 48.04% examples, 1895736 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:00: EPOCH 8 - PROGRESS: at 56.08% examples, 1905779 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:01: EPOCH 8 - PROGRESS: at 64.82% examples, 1904261 words/s, in_qsize 43, out_qsize 4
INFO - 13:24:02: EPOCH 8 - PROGRESS: at 74.87% examples, 1900175 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:03: EPOCH 8 - PROGRESS: at 84.02% examples, 1898878 words/s, in_qsize 47, out_qsize 0
INFO - 13:2

Loss after epoch 7: 1275684.0


INFO - 13:24:07: EPOCH 9 - PROGRESS: at 7.70% examples, 1861543 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:08: EPOCH 9 - PROGRESS: at 15.87% examples, 1893453 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:09: EPOCH 9 - PROGRESS: at 23.90% examples, 1897904 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:10: EPOCH 9 - PROGRESS: at 31.97% examples, 1889130 words/s, in_qsize 44, out_qsize 3
INFO - 13:24:11: EPOCH 9 - PROGRESS: at 39.80% examples, 1888150 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:12: EPOCH 9 - PROGRESS: at 47.47% examples, 1890515 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:13: EPOCH 9 - PROGRESS: at 55.40% examples, 1897453 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:14: EPOCH 9 - PROGRESS: at 64.03% examples, 1892525 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:15: EPOCH 9 - PROGRESS: at 73.96% examples, 1890743 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:16: EPOCH 9 - PROGRESS: at 82.98% examples, 1880399 words/s, in_qsize 46, out_qsize 1
INFO - 13:2

Loss after epoch 8: 1307585.0


INFO - 13:24:20: EPOCH 10 - PROGRESS: at 7.61% examples, 1839313 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:21: EPOCH 10 - PROGRESS: at 15.72% examples, 1886720 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:22: EPOCH 10 - PROGRESS: at 24.03% examples, 1915056 words/s, in_qsize 48, out_qsize 0
INFO - 13:24:23: EPOCH 10 - PROGRESS: at 32.12% examples, 1900598 words/s, in_qsize 43, out_qsize 4
INFO - 13:24:24: EPOCH 10 - PROGRESS: at 40.36% examples, 1920696 words/s, in_qsize 44, out_qsize 3
INFO - 13:24:25: EPOCH 10 - PROGRESS: at 48.28% examples, 1929029 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:26: EPOCH 10 - PROGRESS: at 55.97% examples, 1922102 words/s, in_qsize 42, out_qsize 5
INFO - 13:24:27: EPOCH 10 - PROGRESS: at 64.78% examples, 1920074 words/s, in_qsize 43, out_qsize 4
INFO - 13:24:28: EPOCH 10 - PROGRESS: at 74.99% examples, 1918392 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:29: EPOCH 10 - PROGRESS: at 83.91% examples, 1910462 words/s, in_qsize 43, out_qsize 4
I

Loss after epoch 9: 1258963.0


INFO - 13:24:33: EPOCH 11 - PROGRESS: at 7.79% examples, 1875602 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:34: EPOCH 11 - PROGRESS: at 16.03% examples, 1913749 words/s, in_qsize 48, out_qsize 3
INFO - 13:24:35: EPOCH 11 - PROGRESS: at 24.16% examples, 1917328 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:37: EPOCH 11 - PROGRESS: at 32.37% examples, 1910481 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:38: EPOCH 11 - PROGRESS: at 40.27% examples, 1909539 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:39: EPOCH 11 - PROGRESS: at 48.10% examples, 1912675 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:40: EPOCH 11 - PROGRESS: at 56.08% examples, 1918751 words/s, in_qsize 48, out_qsize 4
INFO - 13:24:41: EPOCH 11 - PROGRESS: at 64.91% examples, 1915730 words/s, in_qsize 40, out_qsize 7
INFO - 13:24:42: EPOCH 11 - PROGRESS: at 75.09% examples, 1912284 words/s, in_qsize 44, out_qsize 3
INFO - 13:24:43: EPOCH 11 - PROGRESS: at 84.02% examples, 1904290 words/s, in_qsize 47, out_qsize 0
I

Loss after epoch 10: 1272975.0


INFO - 13:24:47: EPOCH 12 - PROGRESS: at 7.58% examples, 1836383 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:48: EPOCH 12 - PROGRESS: at 15.47% examples, 1849761 words/s, in_qsize 44, out_qsize 3
INFO - 13:24:49: EPOCH 12 - PROGRESS: at 23.33% examples, 1853938 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:50: EPOCH 12 - PROGRESS: at 31.50% examples, 1864278 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:51: EPOCH 12 - PROGRESS: at 39.59% examples, 1878235 words/s, in_qsize 46, out_qsize 1
INFO - 13:24:52: EPOCH 12 - PROGRESS: at 47.36% examples, 1886532 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:53: EPOCH 12 - PROGRESS: at 55.28% examples, 1892191 words/s, in_qsize 43, out_qsize 4
INFO - 13:24:54: EPOCH 12 - PROGRESS: at 63.88% examples, 1888315 words/s, in_qsize 45, out_qsize 2
INFO - 13:24:55: EPOCH 12 - PROGRESS: at 73.69% examples, 1884294 words/s, in_qsize 47, out_qsize 0
INFO - 13:24:56: EPOCH 12 - PROGRESS: at 83.05% examples, 1885708 words/s, in_qsize 46, out_qsize 1
I

Loss after epoch 11: 1135052.0


INFO - 13:25:00: EPOCH 13 - PROGRESS: at 7.55% examples, 1826397 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:01: EPOCH 13 - PROGRESS: at 15.74% examples, 1893800 words/s, in_qsize 45, out_qsize 2
INFO - 13:25:02: EPOCH 13 - PROGRESS: at 23.45% examples, 1871971 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:03: EPOCH 13 - PROGRESS: at 31.40% examples, 1859375 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:04: EPOCH 13 - PROGRESS: at 39.09% examples, 1857137 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:05: EPOCH 13 - PROGRESS: at 46.55% examples, 1853925 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:06: EPOCH 13 - PROGRESS: at 54.65% examples, 1866029 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:07: EPOCH 13 - PROGRESS: at 62.95% examples, 1868589 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:08: EPOCH 13 - PROGRESS: at 73.11% examples, 1874065 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:09: EPOCH 13 - PROGRESS: at 82.36% examples, 1872224 words/s, in_qsize 48, out_qsize 0
I

Loss after epoch 12: 1017576.0


INFO - 13:25:13: EPOCH 14 - PROGRESS: at 7.46% examples, 1749318 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:14: EPOCH 14 - PROGRESS: at 15.72% examples, 1862260 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:15: EPOCH 14 - PROGRESS: at 23.58% examples, 1860941 words/s, in_qsize 42, out_qsize 5
INFO - 13:25:16: EPOCH 14 - PROGRESS: at 31.93% examples, 1868402 words/s, in_qsize 47, out_qsize 1
INFO - 13:25:17: EPOCH 14 - PROGRESS: at 39.63% examples, 1865981 words/s, in_qsize 43, out_qsize 4
INFO - 13:25:18: EPOCH 14 - PROGRESS: at 47.36% examples, 1871540 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:19: EPOCH 14 - PROGRESS: at 55.17% examples, 1876893 words/s, in_qsize 45, out_qsize 2
INFO - 13:25:20: EPOCH 14 - PROGRESS: at 63.77% examples, 1881607 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:21: EPOCH 14 - PROGRESS: at 73.70% examples, 1875684 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:22: EPOCH 14 - PROGRESS: at 83.15% examples, 1879150 words/s, in_qsize 47, out_qsize 0
I

Loss after epoch 13: 1032434.0


INFO - 13:25:26: EPOCH 15 - PROGRESS: at 7.79% examples, 1867722 words/s, in_qsize 48, out_qsize 0
INFO - 13:25:27: EPOCH 15 - PROGRESS: at 16.00% examples, 1906783 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:28: EPOCH 15 - PROGRESS: at 24.00% examples, 1901651 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:29: EPOCH 15 - PROGRESS: at 32.15% examples, 1893289 words/s, in_qsize 41, out_qsize 6
INFO - 13:25:30: EPOCH 15 - PROGRESS: at 40.12% examples, 1899814 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:31: EPOCH 15 - PROGRESS: at 47.93% examples, 1905666 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:32: EPOCH 15 - PROGRESS: at 56.14% examples, 1921464 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:33: EPOCH 15 - PROGRESS: at 64.44% examples, 1907477 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:34: EPOCH 15 - PROGRESS: at 74.34% examples, 1902948 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:35: EPOCH 15 - PROGRESS: at 83.59% examples, 1895272 words/s, in_qsize 44, out_qsize 3
I

Loss after epoch 14: 1001370.0


INFO - 13:25:39: EPOCH 16 - PROGRESS: at 7.85% examples, 1873307 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:40: EPOCH 16 - PROGRESS: at 15.84% examples, 1891225 words/s, in_qsize 48, out_qsize 0
INFO - 13:25:41: EPOCH 16 - PROGRESS: at 23.77% examples, 1884882 words/s, in_qsize 48, out_qsize 4
INFO - 13:25:42: EPOCH 16 - PROGRESS: at 31.87% examples, 1883801 words/s, in_qsize 45, out_qsize 2
INFO - 13:25:43: EPOCH 16 - PROGRESS: at 39.74% examples, 1885473 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:44: EPOCH 16 - PROGRESS: at 47.24% examples, 1881533 words/s, in_qsize 43, out_qsize 4
INFO - 13:25:45: EPOCH 16 - PROGRESS: at 55.28% examples, 1892590 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:46: EPOCH 16 - PROGRESS: at 63.81% examples, 1892356 words/s, in_qsize 42, out_qsize 5
INFO - 13:25:47: EPOCH 16 - PROGRESS: at 73.81% examples, 1889074 words/s, in_qsize 40, out_qsize 7
INFO - 13:25:48: EPOCH 16 - PROGRESS: at 83.16% examples, 1887475 words/s, in_qsize 46, out_qsize 1
I

Loss after epoch 15: 996368.0


INFO - 13:25:53: EPOCH 17 - PROGRESS: at 7.79% examples, 1881651 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:54: EPOCH 17 - PROGRESS: at 15.93% examples, 1915782 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:55: EPOCH 17 - PROGRESS: at 24.00% examples, 1907497 words/s, in_qsize 44, out_qsize 3
INFO - 13:25:56: EPOCH 17 - PROGRESS: at 32.00% examples, 1895000 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:57: EPOCH 17 - PROGRESS: at 40.03% examples, 1903620 words/s, in_qsize 46, out_qsize 1
INFO - 13:25:58: EPOCH 17 - PROGRESS: at 47.74% examples, 1904681 words/s, in_qsize 47, out_qsize 0
INFO - 13:25:59: EPOCH 17 - PROGRESS: at 55.70% examples, 1910418 words/s, in_qsize 44, out_qsize 3
INFO - 13:26:00: EPOCH 17 - PROGRESS: at 64.22% examples, 1904199 words/s, in_qsize 46, out_qsize 1
INFO - 13:26:01: EPOCH 17 - PROGRESS: at 74.18% examples, 1900723 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:02: EPOCH 17 - PROGRESS: at 83.34% examples, 1895858 words/s, in_qsize 47, out_qsize 0
I

Loss after epoch 16: 989056.0


INFO - 13:26:06: EPOCH 18 - PROGRESS: at 7.46% examples, 1797936 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:07: EPOCH 18 - PROGRESS: at 15.47% examples, 1853033 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:08: EPOCH 18 - PROGRESS: at 23.08% examples, 1839122 words/s, in_qsize 44, out_qsize 3
INFO - 13:26:09: EPOCH 18 - PROGRESS: at 30.45% examples, 1807135 words/s, in_qsize 43, out_qsize 4
INFO - 13:26:10: EPOCH 18 - PROGRESS: at 38.12% examples, 1806343 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:11: EPOCH 18 - PROGRESS: at 45.34% examples, 1803523 words/s, in_qsize 43, out_qsize 4
INFO - 13:26:12: EPOCH 18 - PROGRESS: at 53.02% examples, 1811081 words/s, in_qsize 44, out_qsize 3
INFO - 13:26:13: EPOCH 18 - PROGRESS: at 60.81% examples, 1811454 words/s, in_qsize 43, out_qsize 4
INFO - 13:26:14: EPOCH 18 - PROGRESS: at 70.63% examples, 1821600 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:15: EPOCH 18 - PROGRESS: at 79.53% examples, 1812122 words/s, in_qsize 47, out_qsize 0
I

Loss after epoch 17: 974142.0


INFO - 13:26:19: EPOCH 19 - PROGRESS: at 7.34% examples, 1767251 words/s, in_qsize 44, out_qsize 3
INFO - 13:26:20: EPOCH 19 - PROGRESS: at 15.72% examples, 1888360 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:21: EPOCH 19 - PROGRESS: at 23.55% examples, 1872425 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:22: EPOCH 19 - PROGRESS: at 31.87% examples, 1887386 words/s, in_qsize 45, out_qsize 2
INFO - 13:26:23: EPOCH 19 - PROGRESS: at 39.77% examples, 1891959 words/s, in_qsize 43, out_qsize 4
INFO - 13:26:24: EPOCH 19 - PROGRESS: at 47.84% examples, 1909108 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:25: EPOCH 19 - PROGRESS: at 55.61% examples, 1908154 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:26: EPOCH 19 - PROGRESS: at 64.33% examples, 1906627 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:27: EPOCH 19 - PROGRESS: at 74.59% examples, 1909016 words/s, in_qsize 46, out_qsize 1
INFO - 13:26:28: EPOCH 19 - PROGRESS: at 83.98% examples, 1910309 words/s, in_qsize 45, out_qsize 2
I

Loss after epoch 18: 937938.0


INFO - 13:26:32: EPOCH 20 - PROGRESS: at 7.34% examples, 1777550 words/s, in_qsize 46, out_qsize 1
INFO - 13:26:33: EPOCH 20 - PROGRESS: at 15.10% examples, 1802488 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:34: EPOCH 20 - PROGRESS: at 22.86% examples, 1814998 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:35: EPOCH 20 - PROGRESS: at 30.68% examples, 1813076 words/s, in_qsize 45, out_qsize 2
INFO - 13:26:36: EPOCH 20 - PROGRESS: at 38.24% examples, 1808284 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:37: EPOCH 20 - PROGRESS: at 45.56% examples, 1809384 words/s, in_qsize 46, out_qsize 1
INFO - 13:26:38: EPOCH 20 - PROGRESS: at 53.27% examples, 1814629 words/s, in_qsize 44, out_qsize 3
INFO - 13:26:39: EPOCH 20 - PROGRESS: at 61.00% examples, 1815879 words/s, in_qsize 47, out_qsize 0
INFO - 13:26:40: EPOCH 20 - PROGRESS: at 70.75% examples, 1819457 words/s, in_qsize 45, out_qsize 2
INFO - 13:26:41: EPOCH 20 - PROGRESS: at 79.99% examples, 1818181 words/s, in_qsize 47, out_qsize 0
I

Loss after epoch 19: 936440.0


(497189361, 681442220)

In [45]:
w2v_model.save("output/abstract_readme_w2v{}.bin".format(embedding_dim))

INFO - 13:26:48: saving Word2Vec object under output/abstract_readme_w2v200.bin, separately None
INFO - 13:26:48: storing np array 'vectors' to output/abstract_readme_w2v200.bin.wv.vectors.npy
INFO - 13:26:48: not storing attribute vectors_norm
INFO - 13:26:48: storing np array 'syn1neg' to output/abstract_readme_w2v200.bin.trainables.syn1neg.npy
INFO - 13:26:48: not storing attribute cum_table
INFO - 13:26:48: saved output/abstract_readme_w2v200.bin


In [38]:
!ls output

abstract_w2v100.bin
abstract_w2v100.bin.trainables.syn1neg.npy
abstract_w2v100.bin.wv.vectors.npy
abstract_w2v200.bin
abstract_w2v200.bin.trainables.syn1neg.npy
abstract_w2v200.bin.wv.vectors.npy
abstract_w2v300.bin
abstract_w2v300.bin.trainables.syn1neg.npy
abstract_w2v300.bin.wv.vectors.npy
abstract_w2v.bin
abstract_w2v.bin.trainables.syn1neg.npy
abstract_w2v.bin.wv.vectors.npy
call_igraph.pkl
data_w2v.bin
data_w2v.bin.trainables.syn1neg.npy
data_w2v.bin.wv.vectors.npy
dependency_records.csv
gnn_embeddings_fasttext_dim200_epochs2_dim200_layers2.bin
gnn_embeddings_fasttext_dim200_epochs2_dim200_layers2.bin.vectors.npy
gnn_embeddings_fasttext_dim200_epochs50_dim200_layers2.bin
gnn_embeddings_fasttext_dim200_epochs50_dim200_layers2.bin.vectors.npy
gnn_model_2_dim200_layers2.pth
gnn_model_50_dim200_layers2.pth
graph_infomax_embeddings_fasttext_dim200_epochs100_dim200_layers2.bin
graph_infomax_embeddings_fasttext_dim200_epochs100_dim200_layers2.bin.vectors.npy
gra

In [39]:
test_tasks = pd.read_csv('output/test_tasks.csv').iloc[:,0]
train_tasks = pd.read_csv('output/train_tasks.csv').iloc[:,0]

In [40]:
from mlutil.feature_extraction import embeddings

In [41]:
avg_embedder = embeddings.AverageWordEmbeddingsVectorizer(w2v_model)

In [42]:
test_task_embeddings = avg_embedder.transform(test_tasks)
train_task_embeddings = avg_embedder.transform(train_tasks)

In [43]:
task_similarities = pd.DataFrame(test_task_embeddings @ train_task_embeddings.T, columns=train_tasks, index=test_tasks)

In [44]:
task_similarities.idxmax().to_dict()

{'image classification': 'audio classification',
 'representation learning': 'sparse learning',
 'object detection': 'object recognition',
 'neural architecture search': 'bilevel optimization',
 'optical character recognition': 'object recognition',
 'time series': 'real-time strategy games',
 'car racing': 'autonomous vehicles',
 'dimensionality reduction': 'sparse learning',
 'pose estimation': 'density estimation',
 'knowledge graphs': 'dependency parsing',
 'machine translation': 'speech-to-text translation',
 'text matching': 'semantic parsing',
 'hyperparameter optimization': 'bilevel optimization',
 'information retrieval': 'semantic segmentation',
 'word alignment': 'word sense disambiguation',
 'natural language inference': 'natural language understanding',
 'variational inference': 'bilevel optimization',
 'boundary detection': 'lane detection',
 'point processes': 'latent variable models',
 'sentiment analysis': 'lexical analysis',
 'named entity recognition': 'entity linkin

In [17]:
gensim.models.Word2Vec.load('output/abstract_w2v.bin')

INFO - 19:02:03: loading Word2Vec object from output/abstract_w2v.bin
INFO - 19:02:03: loading wv recursively from output/abstract_w2v.bin.wv.* with mmap=None
INFO - 19:02:03: loading vectors from output/abstract_w2v.bin.wv.vectors.npy with mmap=None
INFO - 19:02:04: setting ignored attribute vectors_norm to None
INFO - 19:02:04: loading vocabulary recursively from output/abstract_w2v.bin.vocabulary.* with mmap=None
INFO - 19:02:04: loading trainables recursively from output/abstract_w2v.bin.trainables.* with mmap=None
INFO - 19:02:04: loading syn1neg from output/abstract_w2v.bin.trainables.syn1neg.npy with mmap=None
INFO - 19:02:04: setting ignored attribute cum_table to None
INFO - 19:02:04: loaded output/abstract_w2v.bin


<gensim.models.word2vec.Word2Vec at 0x7f8c81468520>