In [1]:
from common import *
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
fnames = glob('../data/lingvo/raw/*')

In [3]:
import gc, io

# prog = re.compile("[\W\d]", re.UNICODE)

def process(fnames, i):
    with io.open('../data/lingvo/json/%s.txt'%i, 'w', encoding='utf8') as fw:
        for fn in fnames:
            with io.open(fn, encoding='utf8') as fr:
                text = fr.read()
            sents = sent_tokenize(text)
            sents = [[w for w in s.split() if w not in stop_list and len(w)>1] 
                     for s in sents]
            sents = [s for s in sents if len(s)]
            s = json.dumps((basename(fn).split('.')[0], sents), ensure_ascii=False)            
            fw.write(s + u'\n')
    gc.collect()

In [5]:
parallelizer = Parallel(n_jobs=cpu_count)

tasks_iterator = ( delayed(process)(list_block, i) for 
                  i, list_block in enumerate(grouper(len(fnames)//500, fnames)) ) 
result = parallelizer( tasks_iterator )

In [6]:
pwd = !pwd
%cd ../data/lingvo/json
!cat *.txt > corpus_json.txt&&gzip -f corpus_json.txt&&rm *.txt
%cd {pwd[0]}

/notebooks/data/lingvo/json
cat: write error: No space left on device
/notebooks/pat


In [4]:
import ujson

def iter_docs(corpus_path, jsn):
    with GzipFile(corpus_path, 'r') as fr:
        for line in fr:
            if jsn:
                _id, sents = ujson.loads(line)
            else:
                _id, sents = '', [line.split()]
            yield _id, sents
    
def iter_sents(corpus_path, jsn=True):
    for _id, sents in iter_docs(corpus_path, jsn):
        for s in sents:
            yield s
                
class Sentences(object):
    def __init__(self, corpus_path, jsn=True):
        self.corpus_path = corpus_path
        self.jsn = jsn
    def __iter__(self):
        for sent in iter_sents(self.corpus_path, self.jsn):
            yield sent
            
def save_bigram_corpus(corpus_path, bigram_corpus_path, bigram_ph, jsn):
    logging.info('saving %s' % bigram_corpus_path)
    with GzipFile(bigram_corpus_path, 'w') as f:
        for sent in bigram_ph[iter_sents(corpus_path, jsn)]:
            f.write(unicode(' '.join(sent)).encode('utf8') + '\n')    
            
def extract_bigrams(corpus_path, name, min_count=5, threshold=10, jsn=True):
    bigram_path = '../data/lingvo/%s' % name
    bigram = gensim.models.Phrases(iter_sents(corpus_path, jsn),
                                   progress_per=100000)
    bigram.min_count = min_count
    bigram.threshold = threshold

    bigram.save(bigram_path)
    logging.info('vocab size %s' % len(bigram.vocab))
    
    bigram_ph = gensim.models.phrases.Phraser(bigram)
    bigram_ph_path = bigram_path + '_ph_%s_%s' % (min_count, threshold)
    bigram_ph.save(bigram_ph_path)
    
    bigram_corpus_path = '../data/lingvo/%s_corpus.txt.gz' % name
    save_bigram_corpus(corpus_path, bigram_corpus_path, bigram_ph, jsn)

# Bigrams

In [10]:
name = 'bigram'
corpus_path = '../data/lingvo/corpus_json.txt.gz'
min_count, threshold = 20, 30

In [11]:
extract_bigrams(corpus_path, name, min_count, threshold)

2017-10-31 11:02:31,187 [MainThread  ] [INFO ]  collecting all words and their counts
2017-10-31 11:02:31,197 [MainThread  ] [INFO ]  PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-10-31 11:02:33,325 [MainThread  ] [INFO ]  PROGRESS: at sentence #100000, processed 1182675 words and 559428 word types
2017-10-31 11:02:35,404 [MainThread  ] [INFO ]  PROGRESS: at sentence #200000, processed 2349165 words and 1042414 word types
2017-10-31 11:02:37,489 [MainThread  ] [INFO ]  PROGRESS: at sentence #300000, processed 3456172 words and 1449201 word types
2017-10-31 11:02:39,563 [MainThread  ] [INFO ]  PROGRESS: at sentence #400000, processed 4580458 words and 1839734 word types
2017-10-31 11:02:41,629 [MainThread  ] [INFO ]  PROGRESS: at sentence #500000, processed 5703555 words and 2225198 word types
2017-10-31 11:02:43,748 [MainThread  ] [INFO ]  PROGRESS: at sentence #600000, processed 6817605 words and 2582610 word types
2017-10-31 11:02:45,947 [MainThread  ] [INFO ]  PR

2017-10-31 11:04:45,532 [MainThread  ] [INFO ]  PROGRESS: at sentence #6400000, processed 72469950 words and 16846576 word types
2017-10-31 11:04:47,633 [MainThread  ] [INFO ]  PROGRESS: at sentence #6500000, processed 73620282 words and 17045308 word types
2017-10-31 11:04:49,664 [MainThread  ] [INFO ]  PROGRESS: at sentence #6600000, processed 74748297 words and 17237597 word types
2017-10-31 11:04:51,843 [MainThread  ] [INFO ]  PROGRESS: at sentence #6700000, processed 75908781 words and 17436455 word types
2017-10-31 11:04:53,975 [MainThread  ] [INFO ]  PROGRESS: at sentence #6800000, processed 77043813 words and 17634109 word types
2017-10-31 11:04:56,032 [MainThread  ] [INFO ]  PROGRESS: at sentence #6900000, processed 78175956 words and 17840260 word types
2017-10-31 11:04:58,093 [MainThread  ] [INFO ]  PROGRESS: at sentence #7000000, processed 79284365 words and 18023510 word types
2017-10-31 11:05:00,137 [MainThread  ] [INFO ]  PROGRESS: at sentence #7100000, processed 8042414

2017-10-31 11:07:03,503 [MainThread  ] [INFO ]  PROGRESS: at sentence #12800000, processed 145128887 words and 28190707 word types
2017-10-31 11:07:09,417 [MainThread  ] [INFO ]  PROGRESS: at sentence #12900000, processed 146237371 words and 28351716 word types
2017-10-31 11:07:11,489 [MainThread  ] [INFO ]  PROGRESS: at sentence #13000000, processed 147369370 words and 28515410 word types
2017-10-31 11:07:13,551 [MainThread  ] [INFO ]  PROGRESS: at sentence #13100000, processed 148516599 words and 28678467 word types
2017-10-31 11:07:15,594 [MainThread  ] [INFO ]  PROGRESS: at sentence #13200000, processed 149662710 words and 28835921 word types
2017-10-31 11:07:17,609 [MainThread  ] [INFO ]  PROGRESS: at sentence #13300000, processed 150782065 words and 29006448 word types
2017-10-31 11:07:19,628 [MainThread  ] [INFO ]  PROGRESS: at sentence #13400000, processed 151914947 words and 29157760 word types
2017-10-31 11:07:21,631 [MainThread  ] [INFO ]  PROGRESS: at sentence #13500000, pr

2017-10-31 11:09:34,430 [MainThread  ] [INFO ]  PROGRESS: at sentence #19100000, processed 216229098 words and 37643602 word types
2017-10-31 11:09:36,454 [MainThread  ] [INFO ]  PROGRESS: at sentence #19200000, processed 217361840 words and 37787453 word types
2017-10-31 11:09:38,477 [MainThread  ] [INFO ]  PROGRESS: at sentence #19300000, processed 218484057 words and 37935427 word types
2017-10-31 11:09:40,539 [MainThread  ] [INFO ]  PROGRESS: at sentence #19400000, processed 219633303 words and 38073887 word types
2017-10-31 11:09:42,523 [MainThread  ] [INFO ]  PROGRESS: at sentence #19500000, processed 220736105 words and 38218196 word types
2017-10-31 11:09:44,606 [MainThread  ] [INFO ]  PROGRESS: at sentence #19600000, processed 221888872 words and 38364867 word types
2017-10-31 11:09:46,660 [MainThread  ] [INFO ]  PROGRESS: at sentence #19700000, processed 223038449 words and 38503545 word types
2017-10-31 11:09:48,689 [MainThread  ] [INFO ]  PROGRESS: at sentence #19800000, pr

2017-10-31 11:12:30,732 [MainThread  ] [INFO ]  PROGRESS: at sentence #25200000, processed 285291804 words and 23569903 word types
2017-10-31 11:12:32,865 [MainThread  ] [INFO ]  PROGRESS: at sentence #25300000, processed 286410275 words and 23717957 word types
2017-10-31 11:12:35,047 [MainThread  ] [INFO ]  PROGRESS: at sentence #25400000, processed 287565377 words and 23879278 word types
2017-10-31 11:12:37,117 [MainThread  ] [INFO ]  PROGRESS: at sentence #25500000, processed 288700647 words and 24031933 word types
2017-10-31 11:12:39,206 [MainThread  ] [INFO ]  PROGRESS: at sentence #25600000, processed 289841498 words and 24193324 word types
2017-10-31 11:12:45,077 [MainThread  ] [INFO ]  PROGRESS: at sentence #25700000, processed 290970759 words and 24342097 word types
2017-10-31 11:12:47,151 [MainThread  ] [INFO ]  PROGRESS: at sentence #25800000, processed 292091935 words and 24495665 word types
2017-10-31 11:12:49,222 [MainThread  ] [INFO ]  PROGRESS: at sentence #25900000, pr

2017-10-31 11:14:56,073 [MainThread  ] [INFO ]  PROGRESS: at sentence #31500000, processed 356487881 words and 32884572 word types
2017-10-31 11:14:58,385 [MainThread  ] [INFO ]  PROGRESS: at sentence #31600000, processed 357647272 words and 33032726 word types
2017-10-31 11:15:00,594 [MainThread  ] [INFO ]  PROGRESS: at sentence #31700000, processed 358780391 words and 33168906 word types
2017-10-31 11:15:02,769 [MainThread  ] [INFO ]  PROGRESS: at sentence #31800000, processed 359920480 words and 33314168 word types
2017-10-31 11:15:05,015 [MainThread  ] [INFO ]  PROGRESS: at sentence #31900000, processed 361071358 words and 33476949 word types
2017-10-31 11:15:07,203 [MainThread  ] [INFO ]  PROGRESS: at sentence #32000000, processed 362206662 words and 33628199 word types
2017-10-31 11:15:09,500 [MainThread  ] [INFO ]  PROGRESS: at sentence #32100000, processed 363323627 words and 33756461 word types
2017-10-31 11:15:11,667 [MainThread  ] [INFO ]  PROGRESS: at sentence #32200000, pr

2017-10-31 11:17:41,107 [MainThread  ] [INFO ]  PROGRESS: at sentence #37700000, processed 426747970 words and 16993940 word types
2017-10-31 11:17:43,263 [MainThread  ] [INFO ]  PROGRESS: at sentence #37800000, processed 427890254 words and 17155363 word types
2017-10-31 11:17:45,357 [MainThread  ] [INFO ]  PROGRESS: at sentence #37900000, processed 428998289 words and 17319029 word types
2017-10-31 11:17:47,452 [MainThread  ] [INFO ]  PROGRESS: at sentence #38000000, processed 430102512 words and 17477315 word types
2017-10-31 11:17:49,630 [MainThread  ] [INFO ]  PROGRESS: at sentence #38100000, processed 431258296 words and 17644044 word types
2017-10-31 11:17:51,770 [MainThread  ] [INFO ]  PROGRESS: at sentence #38200000, processed 432388788 words and 17819469 word types
2017-10-31 11:17:53,886 [MainThread  ] [INFO ]  PROGRESS: at sentence #38300000, processed 433509726 words and 17976988 word types
2017-10-31 11:17:56,073 [MainThread  ] [INFO ]  PROGRESS: at sentence #38400000, pr

2017-10-31 11:20:05,036 [MainThread  ] [INFO ]  PROGRESS: at sentence #44000000, processed 498028275 words and 26852337 word types
2017-10-31 11:20:07,087 [MainThread  ] [INFO ]  PROGRESS: at sentence #44100000, processed 499146146 words and 26996779 word types
2017-10-31 11:20:09,126 [MainThread  ] [INFO ]  PROGRESS: at sentence #44200000, processed 500251662 words and 27139049 word types
2017-10-31 11:20:11,227 [MainThread  ] [INFO ]  PROGRESS: at sentence #44300000, processed 501395790 words and 27285851 word types
2017-10-31 11:20:13,326 [MainThread  ] [INFO ]  PROGRESS: at sentence #44400000, processed 502541572 words and 27427576 word types
2017-10-31 11:20:15,457 [MainThread  ] [INFO ]  PROGRESS: at sentence #44500000, processed 503702842 words and 27577306 word types
2017-10-31 11:20:17,557 [MainThread  ] [INFO ]  PROGRESS: at sentence #44600000, processed 504837275 words and 27724683 word types
2017-10-31 11:20:19,610 [MainThread  ] [INFO ]  PROGRESS: at sentence #44700000, pr

2017-10-31 11:22:21,566 [MainThread  ] [INFO ]  PROGRESS: at sentence #50300000, processed 569245636 words and 35674843 word types
2017-10-31 11:22:23,714 [MainThread  ] [INFO ]  PROGRESS: at sentence #50400000, processed 570381025 words and 35803421 word types
2017-10-31 11:22:25,832 [MainThread  ] [INFO ]  PROGRESS: at sentence #50500000, processed 571515337 words and 35931014 word types
2017-10-31 11:22:27,870 [MainThread  ] [INFO ]  PROGRESS: at sentence #50600000, processed 572615247 words and 36061705 word types
2017-10-31 11:22:30,005 [MainThread  ] [INFO ]  PROGRESS: at sentence #50700000, processed 573770779 words and 36194760 word types
2017-10-31 11:22:32,076 [MainThread  ] [INFO ]  PROGRESS: at sentence #50800000, processed 574880235 words and 36330777 word types
2017-10-31 11:22:34,119 [MainThread  ] [INFO ]  PROGRESS: at sentence #50900000, processed 575981246 words and 36460778 word types
2017-10-31 11:22:36,180 [MainThread  ] [INFO ]  PROGRESS: at sentence #51000000, pr

2017-10-31 11:25:01,449 [MainThread  ] [INFO ]  PROGRESS: at sentence #56500000, processed 639513598 words and 19438738 word types
2017-10-31 11:25:03,547 [MainThread  ] [INFO ]  PROGRESS: at sentence #56600000, processed 640631145 words and 19602913 word types
2017-10-31 11:25:05,698 [MainThread  ] [INFO ]  PROGRESS: at sentence #56700000, processed 641788497 words and 19768926 word types
2017-10-31 11:25:07,857 [MainThread  ] [INFO ]  PROGRESS: at sentence #56800000, processed 642951943 words and 19929104 word types
2017-10-31 11:25:09,965 [MainThread  ] [INFO ]  PROGRESS: at sentence #56900000, processed 644076535 words and 20090238 word types
2017-10-31 11:25:12,051 [MainThread  ] [INFO ]  PROGRESS: at sentence #57000000, processed 645196956 words and 20250452 word types
2017-10-31 11:25:14,135 [MainThread  ] [INFO ]  PROGRESS: at sentence #57100000, processed 646310259 words and 20404070 word types
2017-10-31 11:25:16,313 [MainThread  ] [INFO ]  PROGRESS: at sentence #57200000, pr

2017-10-31 11:27:21,034 [MainThread  ] [INFO ]  PROGRESS: at sentence #62800000, processed 710977764 words and 28993239 word types
2017-10-31 11:27:23,211 [MainThread  ] [INFO ]  PROGRESS: at sentence #62900000, processed 712141751 words and 29137255 word types
2017-10-31 11:27:25,367 [MainThread  ] [INFO ]  PROGRESS: at sentence #63000000, processed 713298786 words and 29277722 word types
2017-10-31 11:27:27,480 [MainThread  ] [INFO ]  PROGRESS: at sentence #63100000, processed 714429538 words and 29420306 word types
2017-10-31 11:27:29,623 [MainThread  ] [INFO ]  PROGRESS: at sentence #63200000, processed 715571769 words and 29552659 word types
2017-10-31 11:27:31,766 [MainThread  ] [INFO ]  PROGRESS: at sentence #63300000, processed 716717467 words and 29690239 word types
2017-10-31 11:27:33,942 [MainThread  ] [INFO ]  PROGRESS: at sentence #63400000, processed 717874375 words and 29857973 word types
2017-10-31 11:27:36,057 [MainThread  ] [INFO ]  PROGRESS: at sentence #63500000, pr

2017-10-31 11:29:39,715 [MainThread  ] [INFO ]  PROGRESS: at sentence #69100000, processed 782349102 words and 37683168 word types
2017-10-31 11:29:41,876 [MainThread  ] [INFO ]  PROGRESS: at sentence #69200000, processed 783505237 words and 37811027 word types
2017-10-31 11:29:49,427 [MainThread  ] [INFO ]  PROGRESS: at sentence #69300000, processed 784622638 words and 37944800 word types
2017-10-31 11:29:51,539 [MainThread  ] [INFO ]  PROGRESS: at sentence #69400000, processed 785755748 words and 38074163 word types
2017-10-31 11:29:53,664 [MainThread  ] [INFO ]  PROGRESS: at sentence #69500000, processed 786902403 words and 38195665 word types
2017-10-31 11:29:55,802 [MainThread  ] [INFO ]  PROGRESS: at sentence #69600000, processed 788049836 words and 38331175 word types
2017-10-31 11:29:57,873 [MainThread  ] [INFO ]  PROGRESS: at sentence #69700000, processed 789162021 words and 38461126 word types
2017-10-31 11:29:59,972 [MainThread  ] [INFO ]  PROGRESS: at sentence #69800000, pr

2017-10-31 11:32:26,551 [MainThread  ] [INFO ]  PROGRESS: at sentence #75300000, processed 852592830 words and 21224126 word types
2017-10-31 11:32:28,642 [MainThread  ] [INFO ]  PROGRESS: at sentence #75400000, processed 853698474 words and 21376220 word types
2017-10-31 11:32:30,753 [MainThread  ] [INFO ]  PROGRESS: at sentence #75500000, processed 854815489 words and 21537299 word types
2017-10-31 11:32:32,884 [MainThread  ] [INFO ]  PROGRESS: at sentence #75600000, processed 855947098 words and 21690657 word types
2017-10-31 11:32:34,993 [MainThread  ] [INFO ]  PROGRESS: at sentence #75700000, processed 857066738 words and 21845957 word types
2017-10-31 11:32:37,136 [MainThread  ] [INFO ]  PROGRESS: at sentence #75800000, processed 858204029 words and 22005543 word types
2017-10-31 11:32:39,192 [MainThread  ] [INFO ]  PROGRESS: at sentence #75900000, processed 859299333 words and 22151464 word types
2017-10-31 11:32:41,216 [MainThread  ] [INFO ]  PROGRESS: at sentence #76000000, pr

2017-10-31 11:34:45,255 [MainThread  ] [INFO ]  PROGRESS: at sentence #81600000, processed 923914217 words and 30624370 word types
2017-10-31 11:34:47,349 [MainThread  ] [INFO ]  PROGRESS: at sentence #81700000, processed 925053195 words and 30768184 word types
2017-10-31 11:34:49,494 [MainThread  ] [INFO ]  PROGRESS: at sentence #81800000, processed 926229453 words and 30908580 word types
2017-10-31 11:34:51,590 [MainThread  ] [INFO ]  PROGRESS: at sentence #81900000, processed 927370191 words and 31045857 word types
2017-10-31 11:34:53,677 [MainThread  ] [INFO ]  PROGRESS: at sentence #82000000, processed 928507577 words and 31186316 word types
2017-10-31 11:34:55,776 [MainThread  ] [INFO ]  PROGRESS: at sentence #82100000, processed 929649926 words and 31333026 word types
2017-10-31 11:34:57,865 [MainThread  ] [INFO ]  PROGRESS: at sentence #82200000, processed 930783255 words and 31475581 word types
2017-10-31 11:34:59,964 [MainThread  ] [INFO ]  PROGRESS: at sentence #82300000, pr

2017-10-31 11:37:01,815 [MainThread  ] [INFO ]  PROGRESS: at sentence #87900000, processed 995394739 words and 39222677 word types
2017-10-31 11:37:03,909 [MainThread  ] [INFO ]  PROGRESS: at sentence #88000000, processed 996535145 words and 39358282 word types
2017-10-31 11:37:11,664 [MainThread  ] [INFO ]  PROGRESS: at sentence #88100000, processed 997656873 words and 39491265 word types
2017-10-31 11:37:13,741 [MainThread  ] [INFO ]  PROGRESS: at sentence #88200000, processed 998776930 words and 39623500 word types
2017-10-31 11:37:15,830 [MainThread  ] [INFO ]  PROGRESS: at sentence #88300000, processed 999905768 words and 39750326 word types
2017-10-31 11:37:17,908 [MainThread  ] [INFO ]  PROGRESS: at sentence #88400000, processed 1001036787 words and 39879087 word types
2017-10-31 11:37:45,780 [MainThread  ] [INFO ]  pruned out 26420710 tokens with count <=6 (before 40000009, after 13579299)
2017-10-31 11:37:45,968 [MainThread  ] [INFO ]  PROGRESS: at sentence #88500000, processe

2017-10-31 11:40:07,745 [MainThread  ] [INFO ]  saved ../data/lingvo/bigram
2017-10-31 11:40:07,746 [MainThread  ] [INFO ]  vocab size 21986537
2017-10-31 11:40:07,747 [MainThread  ] [INFO ]  source_vocab length 21986537
2017-10-31 11:40:59,587 [MainThread  ] [INFO ]  Phraser added 50000 phrasegrams
2017-10-31 11:41:51,721 [MainThread  ] [INFO ]  Phraser added 100000 phrasegrams
2017-10-31 11:42:45,436 [MainThread  ] [INFO ]  Phraser added 150000 phrasegrams
2017-10-31 11:43:39,535 [MainThread  ] [INFO ]  Phraser added 200000 phrasegrams
2017-10-31 11:44:15,889 [MainThread  ] [INFO ]  Phraser built with 233482 233482 phrasegrams
2017-10-31 11:44:15,890 [MainThread  ] [INFO ]  saving Phraser object under ../data/lingvo/bigram_ph_20_30, separately None
2017-10-31 11:44:16,186 [MainThread  ] [INFO ]  saved ../data/lingvo/bigram_ph_20_30
2017-10-31 11:44:16,187 [MainThread  ] [INFO ]  saving ../data/lingvo/bigram_corpus.txt.gz


In [15]:
bigram_path = '../data/lingvo/%s' % name
bigram = gensim.models.phrases.Phrases.load(bigram_path)
for phrase, score in bigram.export_phrases(islice(iter_sents(corpus_path), 500)):
    print('{0}   {1}'.format(phrase, score))

торговый наименование   46.0968005278
ti zr   54.2023204961
et al   40.2187126168
gas turbine   1327.44429617
advanced materials   222.224706823
цирконий гафний   40.7752414915
цирконий гафний   40.7752414915
american society   297.733331399
test materials   106.495165158
американский общество   160.174541596
беговой дорожка   82.4931135071
беговой дорожка   82.4931135071
беговой дорожка   82.4931135071
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхр

In [11]:
bigram_ph_path = bigram_path + '_ph_%s_%s' % (min_count, threshold)
bigram_ph = gensim.models.phrases.Phraser.load(bigram_ph_path)

2017-10-25 19:21:05,457 [MainThread  ] [INFO ]  loading Phraser object from ../data/lingvo/bigram_ph_20_30
2017-10-25 19:21:05,605 [MainThread  ] [INFO ]  loaded ../data/lingvo/bigram_ph_20_30


In [12]:
len(bigram_ph.phrasegrams)

183888

# Trigrams

In [12]:
name = 'trigram'
corpus_path = '../data/lingvo/bigram_corpus.txt.gz'
min_count, threshold = 20, 30

In [13]:
extract_bigrams(corpus_path, name, min_count, threshold, jsn=False)

2017-10-31 13:46:47,080 [MainThread  ] [INFO ]  collecting all words and their counts
2017-10-31 13:46:47,081 [MainThread  ] [INFO ]  PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-10-31 13:46:51,816 [MainThread  ] [INFO ]  PROGRESS: at sentence #100000, processed 1137214 words and 561346 word types
2017-10-31 13:46:56,502 [MainThread  ] [INFO ]  PROGRESS: at sentence #200000, processed 2266321 words and 1048805 word types
2017-10-31 13:47:01,005 [MainThread  ] [INFO ]  PROGRESS: at sentence #300000, processed 3337934 words and 1460845 word types
2017-10-31 13:47:05,497 [MainThread  ] [INFO ]  PROGRESS: at sentence #400000, processed 4427761 words and 1856833 word types
2017-10-31 13:47:10,015 [MainThread  ] [INFO ]  PROGRESS: at sentence #500000, processed 5516246 words and 2248945 word types
2017-10-31 13:47:14,513 [MainThread  ] [INFO ]  PROGRESS: at sentence #600000, processed 6594456 words and 2612859 word types
2017-10-31 13:47:19,236 [MainThread  ] [INFO ]  PR

2017-10-31 13:51:38,919 [MainThread  ] [INFO ]  PROGRESS: at sentence #6400000, processed 70085121 words and 17363116 word types
2017-10-31 13:51:43,525 [MainThread  ] [INFO ]  PROGRESS: at sentence #6500000, processed 71197180 words and 17570336 word types
2017-10-31 13:51:48,059 [MainThread  ] [INFO ]  PROGRESS: at sentence #6600000, processed 72290557 words and 17770859 word types
2017-10-31 13:51:52,687 [MainThread  ] [INFO ]  PROGRESS: at sentence #6700000, processed 73407136 words and 17978375 word types
2017-10-31 13:51:57,252 [MainThread  ] [INFO ]  PROGRESS: at sentence #6800000, processed 74505433 words and 18184028 word types
2017-10-31 13:52:01,788 [MainThread  ] [INFO ]  PROGRESS: at sentence #6900000, processed 75599283 words and 18399077 word types
2017-10-31 13:52:06,257 [MainThread  ] [INFO ]  PROGRESS: at sentence #7000000, processed 76672185 words and 18590511 word types
2017-10-31 13:52:10,830 [MainThread  ] [INFO ]  PROGRESS: at sentence #7100000, processed 7777362

2017-10-31 13:56:34,622 [MainThread  ] [INFO ]  PROGRESS: at sentence #12800000, processed 140379031 words and 29231283 word types
2017-10-31 13:56:39,107 [MainThread  ] [INFO ]  PROGRESS: at sentence #12900000, processed 141448655 words and 29400599 word types
2017-10-31 13:56:43,697 [MainThread  ] [INFO ]  PROGRESS: at sentence #13000000, processed 142544641 words and 29572636 word types
2017-10-31 13:56:48,338 [MainThread  ] [INFO ]  PROGRESS: at sentence #13100000, processed 143653363 words and 29744033 word types
2017-10-31 13:56:52,951 [MainThread  ] [INFO ]  PROGRESS: at sentence #13200000, processed 144759614 words and 29909451 word types
2017-10-31 13:56:57,471 [MainThread  ] [INFO ]  PROGRESS: at sentence #13300000, processed 145839646 words and 30088391 word types
2017-10-31 13:57:02,018 [MainThread  ] [INFO ]  PROGRESS: at sentence #13400000, processed 146926773 words and 30247802 word types
2017-10-31 13:57:06,404 [MainThread  ] [INFO ]  PROGRESS: at sentence #13500000, pr

2017-10-31 14:01:22,746 [MainThread  ] [INFO ]  PROGRESS: at sentence #19100000, processed 209157049 words and 39166022 word types
2017-10-31 14:01:27,333 [MainThread  ] [INFO ]  PROGRESS: at sentence #19200000, processed 210250711 words and 39317269 word types
2017-10-31 14:01:31,893 [MainThread  ] [INFO ]  PROGRESS: at sentence #19300000, processed 211334506 words and 39473350 word types
2017-10-31 14:01:36,564 [MainThread  ] [INFO ]  PROGRESS: at sentence #19400000, processed 212451170 words and 39619150 word types
2017-10-31 14:01:41,042 [MainThread  ] [INFO ]  PROGRESS: at sentence #19500000, processed 213515427 words and 39770880 word types
2017-10-31 14:01:45,730 [MainThread  ] [INFO ]  PROGRESS: at sentence #19600000, processed 214630981 words and 39925297 word types
2017-10-31 14:02:08,603 [MainThread  ] [INFO ]  pruned out 0 tokens with count <=1 (before 40000001, after 40000001)
2017-10-31 14:02:33,976 [MainThread  ] [INFO ]  pruned out 24068811 tokens with count <=2 (before

2017-10-31 14:06:50,558 [MainThread  ] [INFO ]  PROGRESS: at sentence #25200000, processed 275973054 words and 25594599 word types
2017-10-31 14:06:55,154 [MainThread  ] [INFO ]  PROGRESS: at sentence #25300000, processed 277057286 words and 25748748 word types
2017-10-31 14:06:59,852 [MainThread  ] [INFO ]  PROGRESS: at sentence #25400000, processed 278172183 words and 25916813 word types
2017-10-31 14:07:04,487 [MainThread  ] [INFO ]  PROGRESS: at sentence #25500000, processed 279271143 words and 26076247 word types
2017-10-31 14:07:09,151 [MainThread  ] [INFO ]  PROGRESS: at sentence #25600000, processed 280376318 words and 26243451 word types
2017-10-31 14:07:13,756 [MainThread  ] [INFO ]  PROGRESS: at sentence #25700000, processed 281472939 words and 26398604 word types
2017-10-31 14:07:18,320 [MainThread  ] [INFO ]  PROGRESS: at sentence #25800000, processed 282553515 words and 26560014 word types
2017-10-31 14:07:22,951 [MainThread  ] [INFO ]  PROGRESS: at sentence #25900000, pr

2017-10-31 14:11:41,305 [MainThread  ] [INFO ]  PROGRESS: at sentence #31500000, processed 344862248 words and 35308270 word types
2017-10-31 14:11:45,961 [MainThread  ] [INFO ]  PROGRESS: at sentence #31600000, processed 345979808 words and 35463173 word types
2017-10-31 14:11:50,557 [MainThread  ] [INFO ]  PROGRESS: at sentence #31700000, processed 347080347 words and 35605398 word types
2017-10-31 14:11:55,172 [MainThread  ] [INFO ]  PROGRESS: at sentence #31800000, processed 348181842 words and 35757699 word types
2017-10-31 14:11:59,834 [MainThread  ] [INFO ]  PROGRESS: at sentence #31900000, processed 349294001 words and 35927719 word types
2017-10-31 14:12:04,448 [MainThread  ] [INFO ]  PROGRESS: at sentence #32000000, processed 350393724 words and 36085253 word types
2017-10-31 14:12:08,976 [MainThread  ] [INFO ]  PROGRESS: at sentence #32100000, processed 351474094 words and 36219876 word types
2017-10-31 14:12:13,595 [MainThread  ] [INFO ]  PROGRESS: at sentence #32200000, pr

2017-10-31 14:16:51,866 [MainThread  ] [INFO ]  PROGRESS: at sentence #37700000, processed 412808019 words and 20189248 word types
2017-10-31 14:16:56,769 [MainThread  ] [INFO ]  PROGRESS: at sentence #37800000, processed 413910042 words and 20355824 word types
2017-10-31 14:17:01,564 [MainThread  ] [INFO ]  PROGRESS: at sentence #37900000, processed 414978586 words and 20522473 word types
2017-10-31 14:17:06,204 [MainThread  ] [INFO ]  PROGRESS: at sentence #38000000, processed 416049336 words and 20685744 word types
2017-10-31 14:17:10,876 [MainThread  ] [INFO ]  PROGRESS: at sentence #38100000, processed 417163964 words and 20856940 word types
2017-10-31 14:17:15,471 [MainThread  ] [INFO ]  PROGRESS: at sentence #38200000, processed 418255465 words and 21037499 word types
2017-10-31 14:17:20,017 [MainThread  ] [INFO ]  PROGRESS: at sentence #38300000, processed 419337204 words and 21199925 word types
2017-10-31 14:17:24,722 [MainThread  ] [INFO ]  PROGRESS: at sentence #38400000, pr

2017-10-31 14:21:50,175 [MainThread  ] [INFO ]  PROGRESS: at sentence #44000000, processed 481733668 words and 30334311 word types
2017-10-31 14:21:54,718 [MainThread  ] [INFO ]  PROGRESS: at sentence #44100000, processed 482817608 words and 30483143 word types
2017-10-31 14:21:59,190 [MainThread  ] [INFO ]  PROGRESS: at sentence #44200000, processed 483884818 words and 30629555 word types
2017-10-31 14:22:03,802 [MainThread  ] [INFO ]  PROGRESS: at sentence #44300000, processed 484987706 words and 30780795 word types
2017-10-31 14:22:08,449 [MainThread  ] [INFO ]  PROGRESS: at sentence #44400000, processed 486101653 words and 30927017 word types
2017-10-31 14:22:13,093 [MainThread  ] [INFO ]  PROGRESS: at sentence #44500000, processed 487216296 words and 31081518 word types
2017-10-31 14:22:17,707 [MainThread  ] [INFO ]  PROGRESS: at sentence #44600000, processed 488314013 words and 31234006 word types
2017-10-31 14:22:22,236 [MainThread  ] [INFO ]  PROGRESS: at sentence #44700000, pr

2017-10-31 14:26:38,805 [MainThread  ] [INFO ]  PROGRESS: at sentence #50300000, processed 550644545 words and 39458927 word types
2017-10-31 14:26:43,415 [MainThread  ] [INFO ]  PROGRESS: at sentence #50400000, processed 551743645 words and 39592343 word types
2017-10-31 14:26:47,990 [MainThread  ] [INFO ]  PROGRESS: at sentence #50500000, processed 552838514 words and 39725256 word types
2017-10-31 14:26:52,466 [MainThread  ] [INFO ]  PROGRESS: at sentence #50600000, processed 553905168 words and 39860126 word types
2017-10-31 14:26:57,129 [MainThread  ] [INFO ]  PROGRESS: at sentence #50700000, processed 555021033 words and 39998350 word types
2017-10-31 14:27:22,062 [MainThread  ] [INFO ]  pruned out 25711775 tokens with count <=4 (before 40000001, after 14288226)
2017-10-31 14:27:26,562 [MainThread  ] [INFO ]  PROGRESS: at sentence #50800000, processed 556093955 words and 14465091 word types
2017-10-31 14:27:31,064 [MainThread  ] [INFO ]  PROGRESS: at sentence #50900000, processed

2017-10-31 14:31:50,323 [MainThread  ] [INFO ]  PROGRESS: at sentence #56500000, processed 618626720 words and 24152034 word types
2017-10-31 14:31:54,908 [MainThread  ] [INFO ]  PROGRESS: at sentence #56600000, processed 619711509 words and 24317398 word types
2017-10-31 14:31:59,621 [MainThread  ] [INFO ]  PROGRESS: at sentence #56700000, processed 620832243 words and 24486715 word types
2017-10-31 14:32:04,352 [MainThread  ] [INFO ]  PROGRESS: at sentence #56800000, processed 621958145 words and 24649152 word types
2017-10-31 14:32:08,960 [MainThread  ] [INFO ]  PROGRESS: at sentence #56900000, processed 623049056 words and 24813558 word types
2017-10-31 14:32:13,541 [MainThread  ] [INFO ]  PROGRESS: at sentence #57000000, processed 624132465 words and 24976807 word types
2017-10-31 14:32:18,068 [MainThread  ] [INFO ]  PROGRESS: at sentence #57100000, processed 625204940 words and 25133232 word types
2017-10-31 14:32:22,820 [MainThread  ] [INFO ]  PROGRESS: at sentence #57200000, pr

2017-10-31 14:36:44,924 [MainThread  ] [INFO ]  PROGRESS: at sentence #62800000, processed 687754034 words and 33884987 word types
2017-10-31 14:36:49,602 [MainThread  ] [INFO ]  PROGRESS: at sentence #62900000, processed 688878693 words and 34032309 word types
2017-10-31 14:36:54,258 [MainThread  ] [INFO ]  PROGRESS: at sentence #63000000, processed 689998950 words and 34175556 word types
2017-10-31 14:36:58,825 [MainThread  ] [INFO ]  PROGRESS: at sentence #63100000, processed 691096620 words and 34321853 word types
2017-10-31 14:37:03,449 [MainThread  ] [INFO ]  PROGRESS: at sentence #63200000, processed 692203200 words and 34457258 word types
2017-10-31 14:37:08,079 [MainThread  ] [INFO ]  PROGRESS: at sentence #63300000, processed 693311633 words and 34597425 word types
2017-10-31 14:37:12,743 [MainThread  ] [INFO ]  PROGRESS: at sentence #63400000, processed 694428226 words and 34768679 word types
2017-10-31 14:37:17,294 [MainThread  ] [INFO ]  PROGRESS: at sentence #63500000, pr

2017-10-31 14:41:53,995 [MainThread  ] [INFO ]  PROGRESS: at sentence #69000000, processed 755674935 words and 17014652 word types
2017-10-31 14:41:58,579 [MainThread  ] [INFO ]  PROGRESS: at sentence #69100000, processed 756767820 words and 17183400 word types
2017-10-31 14:42:03,286 [MainThread  ] [INFO ]  PROGRESS: at sentence #69200000, processed 757884988 words and 17351423 word types
2017-10-31 14:42:07,838 [MainThread  ] [INFO ]  PROGRESS: at sentence #69300000, processed 758967180 words and 17529368 word types
2017-10-31 14:42:12,452 [MainThread  ] [INFO ]  PROGRESS: at sentence #69400000, processed 760066945 words and 17698109 word types
2017-10-31 14:42:17,078 [MainThread  ] [INFO ]  PROGRESS: at sentence #69500000, processed 761171830 words and 17857365 word types
2017-10-31 14:42:21,746 [MainThread  ] [INFO ]  PROGRESS: at sentence #69600000, processed 762282231 words and 18032505 word types
2017-10-31 14:42:26,286 [MainThread  ] [INFO ]  PROGRESS: at sentence #69700000, pr

2017-10-31 14:46:52,029 [MainThread  ] [INFO ]  PROGRESS: at sentence #75300000, processed 824720740 words and 27438132 word types
2017-10-31 14:46:56,469 [MainThread  ] [INFO ]  PROGRESS: at sentence #75400000, processed 825787883 words and 27590904 word types
2017-10-31 14:47:00,964 [MainThread  ] [INFO ]  PROGRESS: at sentence #75500000, processed 826870126 words and 27750308 word types
2017-10-31 14:47:05,476 [MainThread  ] [INFO ]  PROGRESS: at sentence #75600000, processed 827960231 words and 27904834 word types
2017-10-31 14:47:09,990 [MainThread  ] [INFO ]  PROGRESS: at sentence #75700000, processed 829049357 words and 28060930 word types
2017-10-31 14:47:14,543 [MainThread  ] [INFO ]  PROGRESS: at sentence #75800000, processed 830148007 words and 28221215 word types
2017-10-31 14:47:18,942 [MainThread  ] [INFO ]  PROGRESS: at sentence #75900000, processed 831204443 words and 28367546 word types
2017-10-31 14:47:23,223 [MainThread  ] [INFO ]  PROGRESS: at sentence #76000000, pr

2017-10-31 14:51:38,406 [MainThread  ] [INFO ]  PROGRESS: at sentence #81600000, processed 893703663 words and 36905543 word types
2017-10-31 14:51:42,996 [MainThread  ] [INFO ]  PROGRESS: at sentence #81700000, processed 894807340 words and 37050352 word types
2017-10-31 14:51:47,699 [MainThread  ] [INFO ]  PROGRESS: at sentence #81800000, processed 895945961 words and 37192328 word types
2017-10-31 14:51:52,288 [MainThread  ] [INFO ]  PROGRESS: at sentence #81900000, processed 897052412 words and 37330331 word types
2017-10-31 14:51:56,871 [MainThread  ] [INFO ]  PROGRESS: at sentence #82000000, processed 898155257 words and 37473132 word types
2017-10-31 14:52:01,469 [MainThread  ] [INFO ]  PROGRESS: at sentence #82100000, processed 899258538 words and 37621147 word types
2017-10-31 14:52:06,046 [MainThread  ] [INFO ]  PROGRESS: at sentence #82200000, processed 900355949 words and 37765339 word types
2017-10-31 14:52:10,639 [MainThread  ] [INFO ]  PROGRESS: at sentence #82300000, pr

2017-10-31 14:56:51,321 [MainThread  ] [INFO ]  PROGRESS: at sentence #87800000, processed 961760397 words and 20197631 word types
2017-10-31 14:56:55,912 [MainThread  ] [INFO ]  PROGRESS: at sentence #87900000, processed 962863161 words and 20366463 word types
2017-10-31 14:57:00,526 [MainThread  ] [INFO ]  PROGRESS: at sentence #88000000, processed 963967527 words and 20538423 word types
2017-10-31 14:57:05,032 [MainThread  ] [INFO ]  PROGRESS: at sentence #88100000, processed 965050026 words and 20706138 word types
2017-10-31 14:57:09,565 [MainThread  ] [INFO ]  PROGRESS: at sentence #88200000, processed 966134858 words and 20875664 word types
2017-10-31 14:57:14,125 [MainThread  ] [INFO ]  PROGRESS: at sentence #88300000, processed 967227130 words and 21041072 word types
2017-10-31 14:57:18,691 [MainThread  ] [INFO ]  PROGRESS: at sentence #88400000, processed 968322005 words and 21206384 word types
2017-10-31 14:57:23,208 [MainThread  ] [INFO ]  PROGRESS: at sentence #88500000, pr

2017-10-31 15:02:03,565 [MainThread  ] [INFO ]  vocab size 29646800
2017-10-31 15:02:03,566 [MainThread  ] [INFO ]  source_vocab length 29646800
2017-10-31 15:02:34,370 [MainThread  ] [INFO ]  Phraser added 50000 phrasegrams
2017-10-31 15:03:05,480 [MainThread  ] [INFO ]  Phraser added 100000 phrasegrams
2017-10-31 15:03:37,728 [MainThread  ] [INFO ]  Phraser added 150000 phrasegrams
2017-10-31 15:04:10,182 [MainThread  ] [INFO ]  Phraser added 200000 phrasegrams
2017-10-31 15:04:44,250 [MainThread  ] [INFO ]  Phraser added 250000 phrasegrams
2017-10-31 15:05:18,380 [MainThread  ] [INFO ]  Phraser added 300000 phrasegrams
2017-10-31 15:05:53,691 [MainThread  ] [INFO ]  Phraser added 350000 phrasegrams
2017-10-31 15:06:30,271 [MainThread  ] [INFO ]  Phraser added 400000 phrasegrams
2017-10-31 15:07:06,290 [MainThread  ] [INFO ]  Phraser added 450000 phrasegrams
2017-10-31 15:07:43,169 [MainThread  ] [INFO ]  Phraser added 500000 phrasegrams
2017-10-31 15:08:10,601 [MainThread  ] [INFO ]

In [None]:
trigram_path = '../data/lingvo/%s' % name
trigram_ph_path = bigram_path + '_ph_%s_%s' % (min_count, threshold)
trigram_ph = gensim.models.phrases.Phraser.load(trigram_ph_path)

# Word2Vec

In [None]:
corpus_path = '../data/lingvo/trigram_corpus.txt.gz'
model = Word2Vec(Sentences(corpus_path, False), size=200, sg=1, 
                 min_count=10, window=10, workers=cpu_count)

In [None]:
model.save('../data/lingvo/w2v_200_sg_5_w10_trigram')

In [6]:
model = Word2Vec.load('../data/lingvo/w2v_200_sg_5_w10_trigram_2')

2017-11-03 10:26:13,148 [MainThread  ] [INFO ]  loading Word2Vec object from ../data/lingvo/w2v_200_sg_5_w10_trigram_2
2017-11-03 10:26:17,565 [MainThread  ] [INFO ]  loading wv recursively from ../data/lingvo/w2v_200_sg_5_w10_trigram_2.wv.* with mmap=None
2017-11-03 10:26:17,566 [MainThread  ] [INFO ]  loading syn0 from ../data/lingvo/w2v_200_sg_5_w10_trigram_2.wv.syn0.npy with mmap=None
2017-11-03 10:26:17,768 [MainThread  ] [INFO ]  setting ignored attribute syn0norm to None
2017-11-03 10:26:17,769 [MainThread  ] [INFO ]  loading syn1neg from ../data/lingvo/w2v_200_sg_5_w10_trigram_2.syn1neg.npy with mmap=None
2017-11-03 10:26:17,964 [MainThread  ] [INFO ]  setting ignored attribute cum_table to None
2017-11-03 10:26:17,965 [MainThread  ] [INFO ]  loaded ../data/lingvo/w2v_200_sg_5_w10_trigram_2


In [7]:
for w ,s in model.most_similar('стол', topn=10):
    print('%s %s' % (w,s))

2017-11-03 10:26:24,389 [MainThread  ] [INFO ]  precomputing L2-norms of word weight vectors


столешница 0.704067349434
столик 0.700281560421
станина 0.689512729645
глобусный 0.671980023384
vnc_realvnc_ltd 0.651404380798
zd2). 0.648195981979
раскроечный_стол 0.64441460371
стол-подставка 0.644305050373
многофункциональный_диагностико-хирургический_операционный 0.641507506371
журнально-игровой 0.640967011452


# Fast text

In [4]:
from gensim.models.wrappers import FastText

corpus_path = '../data/lingvo/trigram_corpus.txt.gz'
model = FastText.load_fasttext_format('../data/lingvo/model_ft_corpus_norm.bin')

2017-11-01 12:28:36,569 [MainThread  ] [INFO ]  loading 1561227 words for fastText model from ../data/lingvo/model_ft_corpus_norm.bin
2017-11-01 12:32:05,258 [MainThread  ] [INFO ]  loading weights for 1561227 words for fastText model from ../data/lingvo/model_ft_corpus_norm.bin
2017-11-01 12:36:43,658 [MainThread  ] [INFO ]  loaded (1561227, 100) weight matrix for fastText model from ../data/lingvo/model_ft_corpus_norm.bin


In [5]:
import datetime
now = datetime.datetime.now().strftime("%Y-%m-%d %H")

with io.open('../data/lingvo/classes_%s.txt' % now, 'w', encoding='utf8') as f:
    for w in tqdm_notebook(model.wv.vocab):
        sims = model.most_similar([w], topn=10)
        sims = [[wi, s] for wi,s in sims if s>0.7]
        if len(sims):
            s = '%s %s' % (w, json.dumps(sims, ensure_ascii=False))
            f.write(s + '\n')

2017-11-01 12:36:44,090 [MainThread  ] [INFO ]  precomputing L2-norms of word weight vectors
2017-11-01 12:36:44,694 [MainThread  ] [INFO ]  precomputing L2-norms of ngram weight vectors


KeyboardInterrupt: 

# query expansion

In [8]:
import datetime
now = datetime.datetime.now().strftime("%Y-%m-%d %H")

with io.open('../data/lingvo/classes_%s.txt' % now, 'w', encoding='utf8') as f:
    for w in tqdm_notebook(model.wv.vocab):
        sims = model.most_similar([w], topn=10)
        sims = [[wi, s] for wi,s in sims if s>0.7]
        if len(sims):
            s = '%s %s' % (w, json.dumps(sims, ensure_ascii=False))
            f.write(s.decode('utf8') + '\n')




# Save for Misha

In [14]:
corpus_path = '../data/lingvo/corpus_json.txt.gz'
_exclude = set()
for sent in iter_sents(corpus_path):
    tags = [w for w in sent if '_' in w]
    if len(tags):
        _exclude.update(tags)

In [15]:
corpus_path = '../data/lingvo/trigram_corpus.txt.gz'
_set = set()
for sent in iter_sents(corpus_path, False):
    tags = [w for w in sent if '_' in w]
    if len(tags):
        _set.update(tags)

In [19]:
import datetime

now = datetime.datetime.now().strftime("%Y-%m-%d %H")
with GzipFile('../data/lingvo/collocations_%s.txt.gz' % now, 'w') as f:
    for tag in sorted(_set-_exclude):
        f.write(tag + '\n')

In [20]:
len(_set-_exclude)

354003