In [77]:
import gensim
import pandas as pd
import nltk
import numpy as np
import logging
from copy import deepcopy
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, Doc2Vec, FastText, KeyedVectors
from gensim.models.doc2vec import TaggedDocument
from tqdm._tqdm_notebook import tqdm
from pathlib import Path

tqdm.pandas()
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
import qiqc
import qiqc.datasets as QD
import qiqc.embeddings as QE
import qiqc.preprocessors as QP
import qiqc.models as QM

## Preprocessor definition

In [3]:
config = {
    'n_rows': None,
    'maxlen': 72,
    'embedding': {
        'src': ['glove', 'paragram']
    },
    'vocab': {
        'min_count': 5
    }
}
preprocessor = qiqc.builder.build_preprocessor([
    'lower',
    'misspell',
    'punct',
    'number+underscore',
])
tokenizer = qiqc.builder.build_tokenizer(
    'space'
)

## Setup & preprocess

In [4]:
%%time
# Load dataset
train_df, submit_df = QD.load_qiqc(n_rows=config['n_rows'])

CPU times: user 6.21 s, sys: 560 ms, total: 6.77 s
Wall time: 6.36 s


In [5]:
%%time
train_df['tokens'] = train_df.question_text.progress_apply(
    lambda x: tokenizer(preprocessor(x)))
submit_df['tokens'] = submit_df.question_text.progress_apply(
    lambda x: tokenizer(preprocessor(x)))
all_df = pd.concat([train_df, submit_df], ignore_index=True, sort=False)

100%|██████████| 1306122/1306122 [01:30<00:00, 14368.97it/s]
100%|██████████| 56370/56370 [00:03<00:00, 14531.70it/s]


CPU times: user 1min 34s, sys: 1.76 s, total: 1min 36s
Wall time: 1min 35s


## Build vocabulary

In [6]:
%%time
print('Build vocabulary...')
vocab = Dictionary(all_df.tokens.values, prune_at=None)
dfs = sorted(vocab.dfs.items(), key=lambda x: x[1], reverse=True)
token2id = dict(
    **{'<PAD>': 0},
    **dict([(vocab[idx], i + 1) for i, (idx, freq) in enumerate(dfs)]))
word_freq = dict(
    **{'<PAD>': 1},
    **dict([(vocab[idx], freq) for idx, freq in dfs]))
assert token2id['<PAD>'] == 0

Build vocabulary...
CPU times: user 30.7 s, sys: 20 ms, total: 30.8 s
Wall time: 30.8 s


In [7]:
%%time
train_df['token_ids'] = train_df.tokens.progress_apply(
    lambda xs: qiqc.utils.pad_sequence([token2id[x] for x in xs], config['maxlen']))
submit_df['token_ids'] = submit_df.tokens.progress_apply(
    lambda xs: qiqc.utils.pad_sequence([token2id[x] for x in xs], config['maxlen']))
all_df = pd.concat([train_df, submit_df], ignore_index=True, sort=False)

100%|██████████| 1306122/1306122 [00:24<00:00, 52682.03it/s]
100%|██████████| 56370/56370 [00:01<00:00, 45794.43it/s]


CPU times: user 26.8 s, sys: 1.11 s, total: 27.9 s
Wall time: 27.2 s


In [8]:
%%time
tokens = all_df.tokens.values

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 358 µs


## Load Pretrained Embedding

In [9]:
%%time
pretrained_vectors = QE.load_pretrained_vectors(
    config['embedding']['src'], token2id, test=config['n_rows'])
glove = pretrained_vectors['glove']

CPU times: user 1.62 s, sys: 1.51 s, total: 3.13 s
Wall time: 1min 31s


In [10]:
%%time
initial_vectors, unk_freqs = [], []
for name, _pretrained_vectors in pretrained_vectors.items():
    vec, known_freq, unk_freq = qiqc.embeddings.build_word_vectors(
        word_freq, _pretrained_vectors, config['vocab']['min_count'])
    initial_vectors.append(vec)
    unk_freqs.append(unk_freq)
initial_vectors = np.array(initial_vectors).mean(axis=0)

CPU times: user 5.75 s, sys: 1.26 s, total: 7.01 s
Wall time: 4.29 s


## Word2Vec

In [11]:
w2v = Word2Vec(size=300, min_count=1)
w2v.build_vocab_from_freq(word_freq)
idxmap = np.array([token2id[w] for w in w2v.wv.index2word])

In [72]:
%%time
# Fine-tuning
w2v_ft = deepcopy(w2v)
w2v_ft.wv.vectors[:] = initial_vectors[idxmap]
w2v_ft.trainables.syn1neg[:] = initial_vectors[idxmap]
w2v_ft.train(tokens, total_examples=len(tokens), epochs=5)

CPU times: user 4min 58s, sys: 4.41 s, total: 5min 2s
Wall time: 1min 44s


In [75]:
w2v_ft.wv.most_similar('h2')

[('h2o', 0.6760711669921875),
 ('nh3', 0.6650677919387817),
 ('h3', 0.6128498911857605),
 ('supercsupercompress', 0.610657811164856),
 ('so4', 0.6090301275253296),
 ('n2', 0.6033170223236084),
 ('astatide', 0.5922127366065979),
 ('o2', 0.5795282125473022),
 ('h2o2', 0.5735531449317932),
 ('hn3', 0.5703514814376831)]

In [71]:
glove.most_similar('polynomialfeatures')

KeyError: "word 'polynomialfeatures' not in vocabulary"

## FastText

In [None]:
ft = FastText(size=300, min_count=1)
ft.build_vocab_from_freq(word_freq)
idxmap = np.array([token2id[w] for w in ft.wv.index2word])

In [1]:
print(1)

1


## Doc2Vec

In [15]:
d2v = Doc2Vec(vector_size=300, min_count=1)
d2v.build_vocab(documents)
idxmap = np.array([token2id[w] for w in d2v.wv.index2word])

NameError: name 'documents' is not defined

In [None]:
%%time
# Fine-tuning
d2v_ft = deepcopy(d2v)
d2v_ft.wv.vectors[:] = initial_vectors[idxmap]
d2v_ft.train(documents, total_examples=len(documents), epochs=3)

In [None]:
d2v_ft.most_similar('gay')

In [None]:
d2v_ft.similar_by_vector(d2v_ft.docvecs[1])

In [21]:
len(d2v_ft.docvecs)

2

In [22]:
documents[0]

TaggedDocument(words=['how', 'did', 'quebec', 'nationalists', 'see', 'their', 'province', 'as', 'a', 'nation', 'in', 'the', '__####__', 's', '?'], tags=[0])

In [26]:
d2v_ft

<gensim.models.doc2vec.Doc2Vec at 0x7f3ce3c8c5f8>

In [1]:
w2v_ft.wv.most_similar('cos2x')

NameError: name 'w2v_ft' is not defined

In [35]:
pretrained_vectors['glove'].most_similar('sin2x')

[('cosx', 0.7151340246200562),
 ('sinx', 0.6846055388450623),
 ('2sin', 0.5574997067451477),
 ('arcsin', 0.5251956582069397),
 ('arctan', 0.5152665376663208),
 ('cosθ', 0.5137823820114136),
 ('2cos', 0.5127534866333008),
 ('squareroot', 0.49551451206207275),
 ('sinθ', 0.4815032482147217),
 ('dxdy', 0.4660615622997284)]

In [45]:
w2v_ft.trainables.syn1neg

array([[ 2.5964957e-01, -8.4670648e-02,  2.7851504e-01, ...,
         1.7954341e-01, -2.6291129e-01,  2.1134192e-02],
       [ 2.1204364e-01, -1.1398113e-01,  2.8634911e-02, ...,
         1.1775280e-01, -5.4585841e-02,  5.0044555e-02],
       [ 1.1414719e-01,  1.9381207e-01, -8.6145736e-02, ...,
         5.8750734e-02,  1.3702831e-01,  4.9093388e-02],
       ...,
       [ 9.9715188e-02, -3.9235208e-02,  3.6428217e-02, ...,
         6.1835591e-02,  4.0068314e-03,  1.7250087e-02],
       [ 8.6653128e-02, -3.9510202e-02,  3.7852433e-02, ...,
         7.8722239e-02, -2.2151789e-02,  6.6478060e-05],
       [ 4.7680137e-01, -7.3896992e-01,  1.6436531e-01, ...,
         1.9286922e-01,  1.6693333e-01, -3.0646479e-01]], dtype=float32)

In [46]:
w2v_ft.wv.vectors

array([[-2.9384291e-01, -5.7541598e-02,  1.0705809e-01, ...,
         3.4006652e-01,  3.6447838e-01,  4.7817993e-01],
       [-1.5696748e-01,  1.6157227e+00,  1.4872612e-01, ...,
        -2.1004108e-01, -3.3103815e-01, -1.0355601e-01],
       [-8.9807987e-01,  1.4232678e+00,  1.0708263e+00, ...,
        -2.7188110e-01, -6.9967037e-01, -4.8269230e-01],
       ...,
       [ 3.7492380e-02, -1.7966816e-02,  2.9923102e-02, ...,
        -3.2731801e-02, -3.8572319e-02,  1.9709233e-02],
       [-6.0223154e-04, -6.0140961e-03,  1.2912366e-03, ...,
        -1.0942204e-02, -5.3032041e-03, -1.4645704e-02],
       [ 4.5158985e-01, -7.2535056e-01,  1.4970027e-01, ...,
         2.0482789e-01,  1.8570842e-01, -2.9702303e-01]], dtype=float32)