In [1]:
import gensim
import pandas as pd
import nltk
import numpy as np
import logging
from copy import deepcopy
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument
from tqdm import tqdm_notebook as tqdm
from pathlib import Path

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
import qiqc.datasets as QD
import qiqc.featurizers as QF
import qiqc.preprocessors as QP
import qiqc.models as QM

## Prepare dataset

In [None]:
%%time
# Load the training and submission datasets. `n_rows` is forwarded as `nrows`
# to both loaders (presumably None means "no row limit" — confirm in qiqc.datasets).
n_rows = None
train_rawdata = QD.QIQCTrainDataset(nrows=n_rows)
submit_rawdata = QD.QIQCSubmitDataset(nrows=n_rows)

# Build the preprocessor from a sentence-normalization stage (typo fixing) and
# NLTK word tokenization (assumes PreprocessPipeline applies its stages in the
# order given — TODO confirm).
preprocessor = QP.PreprocessPipeline(
    QP.SentenceNormalizationPipeline(
        QP.TypoNormalizer(),
    ),
    nltk.word_tokenize,
)

# Tokenize texts. NOTE: this writes a new `tokens` column into each dataset's
# DataFrame in place.
train_rawdata.df['tokens'] = train_rawdata.texts.apply(preprocessor)
submit_rawdata.df['tokens'] = submit_rawdata.texts.apply(preprocessor)
# Combined corpus: train rows first, then submit rows. Later cells use it to
# build vocabularies and to train the models.
tokens = np.concatenate([
    train_rawdata.df.tokens.values,
    submit_rawdata.df.tokens.values,
])

In [None]:
%%time
# Load the pretrained embedding registered under the key 'gnews'
# (presumably the Google News word2vec vectors — confirm in qiqc.featurizers).
pretrained_vector = QF.load_pretrained_vector('gnews')

In [8]:
%%time
# Create the extended Word2Vec model and wrap it in a featurizer.
# `size`/`window`/`workers` look like gensim's pre-4.0 Word2Vec keyword names
# (assuming Word2VecEx forwards them unchanged — TODO confirm).
model = QM.Word2VecEx(size=300, window=3, workers=2)
w2v = QF.Word2VecFeaturizer(
    model=model,
    maxlen=100,
    standardize=True,
)
# Build the vocabulary from the corpus tokens together with the pretrained
# vectors; no training happens in this cell.
w2v.model.build_vocab_with_pretraining(
    tokens, pretrained_vector, keep_raw_vocab=True)

CPU times: user 6.54 s, sys: 36 ms, total: 6.58 s
Wall time: 6.58 s


In [10]:
# Work on a deep copy so the featurizer `w2v` built above stays untouched.
w2v_scratch = deepcopy(w2v)
# Sanity check of the vocabulary built with pretrained vectors. Query through
# `.wv` (the KeyedVectors), as the comparison cells further down do: the
# model-level `most_similar` is deprecated in gensim 3.x and removed in 4.0.
w2v_scratch.model.wv.most_similar('king')

  
  if np.issubdtype(vec.dtype, np.int):


[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864823460578918),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('throne', 0.5422105193138123),
 ('royal', 0.5239794254302979),
 ('kingdom', 0.5210405588150024)]

In [48]:
# Registry of the embedding variants compared below, keyed by variant name.
# Re-running this cell discards every model already stored in it.
models = {}

In [47]:
%%time
min_count = 10
_w2v = Word2VecEx(size=300, window=3, workers=2)
_w2v_scratch = deepcopy(_w2v)
_w2v_scratch.build_vocab(tokens, min_count=min_count)
_w2v_gnews = deepcopy(_w2v)
_w2v_gnews.build_vocab_with_pretraining(tokens, gnews, min_count=min_count)

CPU times: user 10.4 s, sys: 12 ms, total: 10.4 s
Wall time: 10.4 s


In [49]:
%%time
# Variant 'scratch': corpus-only vocabulary, trained for one epoch.
# NOTE: this rebinds `w2v_scratch`, which an earlier cell bound to a deep copy
# of the featurizer `w2v`; from here on the name refers to a model.
w2v_scratch = deepcopy(_w2v_scratch)
w2v_scratch.train(tokens, total_examples=len(tokens), epochs=1)
models['scratch'] = w2v_scratch

CPU times: user 45.6 s, sys: 116 ms, total: 45.7 s
Wall time: 22.9 s


In [50]:
# Variant 'gnews': a copy of the pretrained-initialised model, registered
# without calling train() on it.
w2v_gnews = deepcopy(_w2v_gnews)
models['gnews'] = w2v_gnews

In [51]:
%%time
# Variant 'gnews_ft1000': pretrained-initialised model trained for one epoch on
# tokens[:n_tokens].
# NOTE(review): n_tokens is None, so tokens[:n_tokens] is the whole corpus and
# this run receives the same training input as the 'gnews_ft' variant, despite
# the 'ft1000' name. Presumably a 1000-item subset was intended — confirm.
n_tokens = None
w2v_gnews_ft1000 = deepcopy(_w2v_gnews)
w2v_gnews_ft1000.train(tokens[:n_tokens], total_examples=len(tokens[:n_tokens]), epochs=1)
models['gnews_ft1000'] = w2v_gnews_ft1000

CPU times: user 47.3 s, sys: 140 ms, total: 47.4 s
Wall time: 24.6 s


In [52]:
%%time
# Variant 'gnews_ft': pretrained-initialised model trained for one epoch on the
# full token corpus.
w2v_gnews_ft = deepcopy(_w2v_gnews)
w2v_gnews_ft.train(tokens, total_examples=len(tokens), epochs=1)
models['gnews_ft'] = w2v_gnews_ft

CPU times: user 46.8 s, sys: 128 ms, total: 47 s
Wall time: 23.5 s


In [53]:
%%time
# Variant 'gnews_ft_freeze': pretrained-initialised model trained for one epoch
# inside the freeze_pretrained_vector() context manager.
# n_tokens is None, so tokens[:n_tokens] is the full corpus.
n_tokens = None
w2v_gnews_ft_freeze = deepcopy(_w2v_gnews)
# Presumably the context manager keeps the pretrained vectors fixed while the
# remaining words are updated — confirm in qiqc.models.
with w2v_gnews_ft_freeze.freeze_pretrained_vector():
    w2v_gnews_ft_freeze.train(tokens[:n_tokens], total_examples=len(tokens[:n_tokens]), epochs=1)
models['gnews_ft_freeze'] = w2v_gnews_ft_freeze

CPU times: user 1min 32s, sys: 2.19 s, total: 1min 34s
Wall time: 1min


In [54]:
# Nearest neighbours of the query word, one column per registered model.
word = 'Paris'
pd.DataFrame({
    variant: [neighbour for neighbour, _score in fitted.wv.most_similar(word)]
    for variant, fitted in models.items()
})

  if np.issubdtype(vec.dtype, np.int):


Unnamed: 0,scratch,gnews,gnews_ft1000,gnews_ft,gnews_ft_freeze
0,Maryland,France,London,London,London
1,Atlanta,French,France,France,France
2,Chicago,Brussels,Berlin,Berlin,Berlin
3,Dallas,Versailles,Amsterdam,Amsterdam,Amsterdam
4,Manhattan,Madrid,Rome,Rome,Rome
5,Boston,Rome,Spain,Spain,Spain
6,Georgia,Berlin,Italy,Italy,Italy
7,Morocco,Marseille,Morocco,Morocco,Morocco
8,Arizona,London,Brussels,Brussels,Brussels
9,Charleston,Vienna,1980s,1980s,1980s


In [55]:
# Nearest neighbours of the query word, one column per registered model.
word = '1960s'
pd.DataFrame({
    variant: [neighbour for neighbour, _score in fitted.wv.most_similar(word)]
    for variant, fitted in models.items()
})

Unnamed: 0,scratch,gnews,gnews_ft1000,gnews_ft,gnews_ft_freeze
0,70s,Forces,70s,70s,70s
1,90s,Liberation,1997,1980s,1980s
2,Grand,Gays,1980s,1997,1997
3,Tennessee,PKK,1994,1994,1994
4,1988,Homosexual,1930s,1930s,1930s
5,Warsaw,Gauntlet,90s,2003,90s
6,Pluto,Transform,2002,90s,2002
7,Louisiana,Labour,2003,2002,1800s
8,1997,Nationalists,1800s,1800s,2003
9,Philadelphia,Future,1942,1942,1942


In [56]:
# Nearest neighbours of the query word, one column per registered model.
word = 'Quora'
pd.DataFrame({
    variant: [neighbour for neighbour, _score in fitted.wv.most_similar(word)]
    for variant, fitted in models.items()
})

Unnamed: 0,scratch,gnews,gnews_ft1000,gnews_ft,gnews_ft_freeze
0,here,Tumblr,Facebook,Facebook,Facebook
1,Facebook,Instagram,Instagram,Instagram,Instagram
2,comments,Squarespace,here,here,here
3,Instagram,Reddit,Snapchat,Snapchat,Snapchat
4,answers,reddit,answers,answers,Twitter
5,questions,DuckDuckGo,Twitter,Twitter,answers
6,Snapchat,Facebook,answer,answer,answer
7,Twitter,Yelp,questions,questions,questions
8,question,LinkedIn,comments,comments,comments
9,comment,Wordpress,Tumblr,Tumblr,Tumblr


In [31]:
class BaseWordEmbeddingsModelEx:
    """Scratch mixin whose methods only print a marker string when called."""

    def hoge(self):
        """Print the marker ``'hoge'``."""
        print('hoge')

    def _clear_post_train(self):
        """Print the marker ``'fuga'``."""
        print('fuga')

In [44]:
class Word2VecEx2(BaseWordEmbeddingsModelEx, Word2Vec):
    """Scratch subclass mixing ``BaseWordEmbeddingsModelEx`` into gensim's ``Word2Vec``.

    The mixin is listed first, so its ``hoge`` and ``_clear_post_train`` are
    found before any same-named attribute on ``Word2Vec``.
    """
    pass

In [40]:
# Smoke test: construct the mixed-in model with default arguments only.
mm = Word2VecEx2()

In [40]:
# Scratch check: star-unpacking a dict inside a set display yields its keys.
{1, *{'a': 1}}

{1, 'a'}