In [4]:
from common import *
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Collect the paths of every entry under the raw-data directory.
fnames = glob('../data/lingvo/raw/*')

In [3]:
import gc, io

# prog = re.compile("[\W\d]", re.UNICODE)

def process(fnames, i):
    """Tokenize a batch of raw documents and write them as one shard file.

    Every file in ``fnames`` is read as UTF-8, split into sentences with
    ``sent_tokenize`` and then into whitespace-separated tokens. Tokens that
    appear in ``stop_list`` or are a single character long are dropped, and
    sentences left without tokens are discarded. For each document one line
    is written to ``../data/lingvo/<i>.txt``: the JSON-encoded pair
    ``(doc_id, sentences)``, where ``doc_id`` is the file's base name up to
    its first dot.

    :param fnames: iterable of paths to raw text files.
    :param i: value interpolated into the output shard's file name.
    """
    shard_path = '../data/lingvo/%s.txt' % i
    with io.open(shard_path, 'w', encoding='utf8') as shard:
        for path in fnames:
            with io.open(path, encoding='utf8') as src:
                raw_text = src.read()

            tokenized = []
            for sentence in sent_tokenize(raw_text):
                kept = [tok for tok in sentence.split()
                        if tok not in stop_list and len(tok) > 1]
                if len(kept):
                    tokenized.append(kept)

            doc_id = basename(path).split('.')[0]
            record = json.dumps((doc_id, tokenized), ensure_ascii=False)
            shard.write(record + u'\n')
    # Explicit collection once the whole batch has been written.
    gc.collect()

In [4]:
parallelizer = Parallel(n_jobs=cpu_count)

# Aim for roughly 500 chunks of files. The chunk size is clamped to at least 1:
# with fewer than 500 input files the integer division yields 0, which is not
# a usable group size.
chunk_size = max(1, len(fnames) // 500)

# Each chunk is handed to `process` together with its index, which names the
# shard file that chunk is written to.
# NOTE(review): if `grouper` pads the last chunk with a fill value (as the
# itertools recipe does), `process` will receive those fill values as file
# names -- confirm against its definition in `common`.
tasks_iterator = ( delayed(process)(list_block, i) for 
                  i, list_block in enumerate(grouper(chunk_size, fnames)) ) 
result = parallelizer( tasks_iterator )

In [5]:
# Remember the current working directory so it can be restored at the end.
pwd = !pwd
%cd ../data/lingvo/
# Concatenate all .txt files into corpus_json.txt, gzip it (-f overwrites an
# existing archive) and then delete the remaining .txt files.
!cat *.txt > corpus_json.txt&&gzip -f corpus_json.txt&&rm *.txt
%cd {pwd[0]}

/notebooks/data/lingvo
/notebooks/pat


In [4]:
import ujson

def iter_docs(corpus_path, jsn):
    """Yield ``(doc_id, sentences)`` pairs from a gzip-compressed corpus.

    :param corpus_path: path to the gzip file, read line by line.
    :param jsn: if true, every line is decoded with ``ujson`` and must
        unpack into a ``(doc_id, sentences)`` pair. Otherwise every line is
        treated as one whitespace-tokenized sentence and is yielded as a
        single-sentence document with an empty id.
    """
    with GzipFile(corpus_path, 'r') as gz:
        for raw_line in gz:
            if not jsn:
                # Plain-text mode: one sentence per line, no document id.
                yield '', [raw_line.split()]
                continue
            doc_id, doc_sents = ujson.loads(raw_line)
            yield doc_id, doc_sents
    
def iter_sents(corpus_path, jsn=True):
    """Yield every sentence of the corpus, flattened across documents.

    :param corpus_path: path forwarded to ``iter_docs``.
    :param jsn: corpus format flag forwarded to ``iter_docs``.
    """
    for doc in iter_docs(corpus_path, jsn):
        # doc is an (id, sentences) pair; only the sentences are needed here.
        for sentence in doc[1]:
            yield sentence
                
class Sentences(object):
    """Re-iterable view over the sentences of a corpus file.

    Every call to ``iter()`` starts a fresh pass over the file through
    ``iter_sents``, so the object can be iterated more than once.
    """

    def __init__(self, corpus_path, jsn=True):
        """Store the corpus location and its format flag.

        :param corpus_path: path handed to ``iter_sents`` on iteration.
        :param jsn: format flag handed to ``iter_sents`` on iteration.
        """
        self.jsn = jsn
        self.corpus_path = corpus_path

    def __iter__(self):
        """Yield the corpus sentences one by one."""
        sentence_stream = iter_sents(self.corpus_path, self.jsn)
        for sentence in sentence_stream:
            yield sentence
            
def save_bigram_corpus(corpus_path, bigram_corpus_path, bigram_ph, jsn):
    """Write the corpus, transformed by ``bigram_ph``, to a gzip text file.

    Each transformed sentence becomes one line: its tokens joined by single
    spaces and encoded as UTF-8.

    :param corpus_path: source corpus, read through ``iter_sents``.
    :param bigram_corpus_path: destination gzip file (opened for writing).
    :param bigram_ph: object indexed with the sentence stream to obtain the
        transformed sentences.
    :param jsn: format flag of the source corpus, forwarded to ``iter_sents``.
    """
    logging.info('saving %s' % bigram_corpus_path)
    with GzipFile(bigram_corpus_path, 'w') as out:
        transformed = bigram_ph[iter_sents(corpus_path, jsn)]
        for tokens in transformed:
            encoded = unicode(' '.join(tokens)).encode('utf8')
            out.write(encoded + '\n')
            
def extract_bigrams(corpus_path, name, min_count=5, threshold=10, jsn=True):
    """Train a phrase detector on a corpus and save all derived artifacts.

    Three things are written under ``../data/lingvo/``:

    * ``<name>`` -- the ``Phrases`` model;
    * ``<name>_ph_<min_count>_<threshold>`` -- the ``Phraser`` built from it;
    * ``<name>_corpus.txt.gz`` -- the corpus transformed by that ``Phraser``.

    :param corpus_path: gzip corpus read through ``iter_sents``.
    :param name: prefix used for the output file names.
    :param min_count: assigned to the model's ``min_count`` attribute.
    :param threshold: assigned to the model's ``threshold`` attribute.
    :param jsn: format flag of the corpus, forwarded to ``iter_sents``.
    """
    out_prefix = '../data/lingvo/%s' % name

    # The model is built without explicit thresholds; min_count and threshold
    # are assigned afterwards, before the model is saved and the Phraser built.
    phrases = gensim.models.Phrases(iter_sents(corpus_path, jsn),
                                    progress_per=100000)
    phrases.min_count, phrases.threshold = min_count, threshold
    phrases.save(out_prefix)
    logging.info('vocab size %s' % len(phrases.vocab))

    phraser = gensim.models.phrases.Phraser(phrases)
    phraser.save(out_prefix + '_ph_%s_%s' % (min_count, threshold))

    save_bigram_corpus(corpus_path,
                       '../data/lingvo/%s_corpus.txt.gz' % name,
                       phraser, jsn)

# Bigrams

In [9]:
# Settings for the bigram pass: output name prefix, input corpus (gzip JSON
# lines) and the min_count / threshold values handed to extract_bigrams.
name = 'bigram'
corpus_path = '../data/lingvo/corpus_json.txt.gz'
min_count, threshold = 20, 30

In [6]:
# Train and save the phrase models and the transformed corpus for the
# current name / corpus_path / min_count / threshold settings.
extract_bigrams(corpus_path, name, min_count, threshold)

2017-10-25 16:27:30,871 [MainThread  ] [INFO ]  collecting all words and their counts
2017-10-25 16:27:30,875 [MainThread  ] [INFO ]  PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-10-25 16:27:33,083 [MainThread  ] [INFO ]  PROGRESS: at sentence #100000, processed 1188296 words and 522201 word types
2017-10-25 16:27:35,247 [MainThread  ] [INFO ]  PROGRESS: at sentence #200000, processed 2384153 words and 977232 word types
2017-10-25 16:27:37,398 [MainThread  ] [INFO ]  PROGRESS: at sentence #300000, processed 3534796 words and 1373178 word types
2017-10-25 16:27:39,487 [MainThread  ] [INFO ]  PROGRESS: at sentence #400000, processed 4682532 words and 1736085 word types
2017-10-25 16:27:41,591 [MainThread  ] [INFO ]  PROGRESS: at sentence #500000, processed 5837275 words and 2103779 word types
2017-10-25 16:27:43,705 [MainThread  ] [INFO ]  PROGRESS: at sentence #600000, processed 6986029 words and 2461962 word types
2017-10-25 16:27:45,791 [MainThread  ] [INFO ]  PRO

2017-10-25 16:29:48,746 [MainThread  ] [INFO ]  PROGRESS: at sentence #6400000, processed 74443030 words and 16210603 word types
2017-10-25 16:29:50,862 [MainThread  ] [INFO ]  PROGRESS: at sentence #6500000, processed 75639109 words and 16399324 word types
2017-10-25 16:29:52,890 [MainThread  ] [INFO ]  PROGRESS: at sentence #6600000, processed 76758948 words and 16587963 word types
2017-10-25 16:29:55,001 [MainThread  ] [INFO ]  PROGRESS: at sentence #6700000, processed 77932429 words and 16775779 word types
2017-10-25 16:29:57,100 [MainThread  ] [INFO ]  PROGRESS: at sentence #6800000, processed 79097462 words and 16965387 word types
2017-10-25 16:29:59,212 [MainThread  ] [INFO ]  PROGRESS: at sentence #6900000, processed 80276965 words and 17150046 word types
2017-10-25 16:30:01,307 [MainThread  ] [INFO ]  PROGRESS: at sentence #7000000, processed 81437931 words and 17329960 word types
2017-10-25 16:30:03,442 [MainThread  ] [INFO ]  PROGRESS: at sentence #7100000, processed 8262826

2017-10-25 16:32:04,076 [MainThread  ] [INFO ]  PROGRESS: at sentence #12700000, processed 147997960 words and 26944937 word types
2017-10-25 16:32:06,173 [MainThread  ] [INFO ]  PROGRESS: at sentence #12800000, processed 149173638 words and 27100941 word types
2017-10-25 16:32:08,306 [MainThread  ] [INFO ]  PROGRESS: at sentence #12900000, processed 150372925 words and 27256189 word types
2017-10-25 16:32:14,187 [MainThread  ] [INFO ]  PROGRESS: at sentence #13000000, processed 151556813 words and 27402366 word types
2017-10-25 16:32:16,265 [MainThread  ] [INFO ]  PROGRESS: at sentence #13100000, processed 152712659 words and 27556658 word types
2017-10-25 16:32:18,378 [MainThread  ] [INFO ]  PROGRESS: at sentence #13200000, processed 153883744 words and 27709042 word types
2017-10-25 16:32:20,427 [MainThread  ] [INFO ]  PROGRESS: at sentence #13300000, processed 155026675 words and 27856109 word types
2017-10-25 16:32:22,522 [MainThread  ] [INFO ]  PROGRESS: at sentence #13400000, pr

2017-10-25 16:34:24,090 [MainThread  ] [INFO ]  PROGRESS: at sentence #19000000, processed 221184530 words and 36010681 word types
2017-10-25 16:34:26,178 [MainThread  ] [INFO ]  PROGRESS: at sentence #19100000, processed 222343288 words and 36147224 word types
2017-10-25 16:34:28,331 [MainThread  ] [INFO ]  PROGRESS: at sentence #19200000, processed 223538496 words and 36286943 word types
2017-10-25 16:34:30,464 [MainThread  ] [INFO ]  PROGRESS: at sentence #19300000, processed 224718899 words and 36425872 word types
2017-10-25 16:34:32,558 [MainThread  ] [INFO ]  PROGRESS: at sentence #19400000, processed 225886367 words and 36554319 word types
2017-10-25 16:34:34,622 [MainThread  ] [INFO ]  PROGRESS: at sentence #19500000, processed 227021803 words and 36690586 word types
2017-10-25 16:34:41,744 [MainThread  ] [INFO ]  PROGRESS: at sentence #19600000, processed 228177761 words and 36824998 word types
2017-10-25 16:34:43,852 [MainThread  ] [INFO ]  PROGRESS: at sentence #19700000, pr

2017-10-25 16:37:27,023 [MainThread  ] [INFO ]  PROGRESS: at sentence #25100000, processed 292194661 words and 21360453 word types
2017-10-25 16:37:29,165 [MainThread  ] [INFO ]  PROGRESS: at sentence #25200000, processed 293349183 words and 21512621 word types
2017-10-25 16:37:31,354 [MainThread  ] [INFO ]  PROGRESS: at sentence #25300000, processed 294543418 words and 21656718 word types
2017-10-25 16:37:33,537 [MainThread  ] [INFO ]  PROGRESS: at sentence #25400000, processed 295721254 words and 21810199 word types
2017-10-25 16:37:35,670 [MainThread  ] [INFO ]  PROGRESS: at sentence #25500000, processed 296877494 words and 21966495 word types
2017-10-25 16:37:37,825 [MainThread  ] [INFO ]  PROGRESS: at sentence #25600000, processed 298054540 words and 22119852 word types
2017-10-25 16:37:39,951 [MainThread  ] [INFO ]  PROGRESS: at sentence #25700000, processed 299210864 words and 22265657 word types
2017-10-25 16:37:42,114 [MainThread  ] [INFO ]  PROGRESS: at sentence #25800000, pr

2017-10-25 16:39:47,653 [MainThread  ] [INFO ]  PROGRESS: at sentence #31400000, processed 365517249 words and 30462466 word types
2017-10-25 16:39:49,731 [MainThread  ] [INFO ]  PROGRESS: at sentence #31500000, processed 366657980 words and 30602158 word types
2017-10-25 16:39:51,850 [MainThread  ] [INFO ]  PROGRESS: at sentence #31600000, processed 367825935 words and 30732395 word types
2017-10-25 16:39:53,948 [MainThread  ] [INFO ]  PROGRESS: at sentence #31700000, processed 368988888 words and 30867723 word types
2017-10-25 16:39:56,082 [MainThread  ] [INFO ]  PROGRESS: at sentence #31800000, processed 370162141 words and 31012695 word types
2017-10-25 16:39:58,242 [MainThread  ] [INFO ]  PROGRESS: at sentence #31900000, processed 371361515 words and 31144444 word types
2017-10-25 16:40:00,404 [MainThread  ] [INFO ]  PROGRESS: at sentence #32000000, processed 372550097 words and 31284588 word types
2017-10-25 16:40:02,541 [MainThread  ] [INFO ]  PROGRESS: at sentence #32100000, pr

2017-10-25 16:42:05,665 [MainThread  ] [INFO ]  PROGRESS: at sentence #37700000, processed 438949089 words and 38689723 word types
2017-10-25 16:42:07,722 [MainThread  ] [INFO ]  PROGRESS: at sentence #37800000, processed 440078875 words and 38811258 word types
2017-10-25 16:42:09,830 [MainThread  ] [INFO ]  PROGRESS: at sentence #37900000, processed 441244338 words and 38937179 word types
2017-10-25 16:42:11,964 [MainThread  ] [INFO ]  PROGRESS: at sentence #38000000, processed 442422004 words and 39066501 word types
2017-10-25 16:42:14,053 [MainThread  ] [INFO ]  PROGRESS: at sentence #38100000, processed 443575334 words and 39186232 word types
2017-10-25 16:42:16,144 [MainThread  ] [INFO ]  PROGRESS: at sentence #38200000, processed 444729096 words and 39306652 word types
2017-10-25 16:42:18,289 [MainThread  ] [INFO ]  PROGRESS: at sentence #38300000, processed 445898341 words and 39434843 word types
2017-10-25 16:42:25,619 [MainThread  ] [INFO ]  PROGRESS: at sentence #38400000, pr

2017-10-25 16:44:49,157 [MainThread  ] [INFO ]  PROGRESS: at sentence #43900000, processed 511069457 words and 23240036 word types
2017-10-25 16:44:51,259 [MainThread  ] [INFO ]  PROGRESS: at sentence #44000000, processed 512229343 words and 23387163 word types
2017-10-25 16:44:53,376 [MainThread  ] [INFO ]  PROGRESS: at sentence #44100000, processed 513396117 words and 23533324 word types
2017-10-25 16:44:55,474 [MainThread  ] [INFO ]  PROGRESS: at sentence #44200000, processed 514550951 words and 23684564 word types
2017-10-25 16:44:57,546 [MainThread  ] [INFO ]  PROGRESS: at sentence #44300000, processed 515690136 words and 23831157 word types
2017-10-25 16:44:59,660 [MainThread  ] [INFO ]  PROGRESS: at sentence #44400000, processed 516859192 words and 23966791 word types
2017-10-25 16:45:01,766 [MainThread  ] [INFO ]  PROGRESS: at sentence #44500000, processed 518018650 words and 24113162 word types
2017-10-25 16:45:03,852 [MainThread  ] [INFO ]  PROGRESS: at sentence #44600000, pr

2017-10-25 16:47:05,980 [MainThread  ] [INFO ]  PROGRESS: at sentence #50200000, processed 584366280 words and 31951772 word types
2017-10-25 16:47:08,089 [MainThread  ] [INFO ]  PROGRESS: at sentence #50300000, processed 585552495 words and 32081031 word types
2017-10-25 16:47:10,058 [MainThread  ] [INFO ]  PROGRESS: at sentence #50400000, processed 586646002 words and 32208735 word types
2017-10-25 16:47:12,140 [MainThread  ] [INFO ]  PROGRESS: at sentence #50500000, processed 587809262 words and 32337503 word types
2017-10-25 16:47:14,237 [MainThread  ] [INFO ]  PROGRESS: at sentence #50600000, processed 588983135 words and 32467690 word types
2017-10-25 16:47:16,353 [MainThread  ] [INFO ]  PROGRESS: at sentence #50700000, processed 590172390 words and 32597079 word types
2017-10-25 16:47:18,440 [MainThread  ] [INFO ]  PROGRESS: at sentence #50800000, processed 591331868 words and 32729821 word types
2017-10-25 16:47:20,538 [MainThread  ] [INFO ]  PROGRESS: at sentence #50900000, pr

2017-10-25 16:49:21,902 [MainThread  ] [INFO ]  PROGRESS: at sentence #56500000, processed 657808501 words and 39884107 word types
2017-10-25 16:49:47,644 [MainThread  ] [INFO ]  pruned out 25222344 tokens with count <=4 (before 40000001, after 14777657)
2017-10-25 16:49:47,724 [MainThread  ] [INFO ]  PROGRESS: at sentence #56600000, processed 658996475 words and 14784530 word types
2017-10-25 16:49:49,841 [MainThread  ] [INFO ]  PROGRESS: at sentence #56700000, processed 660164265 words and 14946544 word types
2017-10-25 16:49:51,963 [MainThread  ] [INFO ]  PROGRESS: at sentence #56800000, processed 661346634 words and 15102974 word types
2017-10-25 16:49:54,053 [MainThread  ] [INFO ]  PROGRESS: at sentence #56900000, processed 662505031 words and 15259863 word types
2017-10-25 16:49:56,150 [MainThread  ] [INFO ]  PROGRESS: at sentence #57000000, processed 663669079 words and 15423640 word types
2017-10-25 16:49:58,242 [MainThread  ] [INFO ]  PROGRESS: at sentence #57100000, processed

2017-10-25 16:51:58,574 [MainThread  ] [INFO ]  PROGRESS: at sentence #62700000, processed 730176668 words and 24026109 word types
2017-10-25 16:52:00,679 [MainThread  ] [INFO ]  PROGRESS: at sentence #62800000, processed 731349625 words and 24175556 word types
2017-10-25 16:52:02,791 [MainThread  ] [INFO ]  PROGRESS: at sentence #62900000, processed 732522513 words and 24319788 word types
2017-10-25 16:52:04,822 [MainThread  ] [INFO ]  PROGRESS: at sentence #63000000, processed 733653860 words and 24456761 word types
2017-10-25 16:52:06,918 [MainThread  ] [INFO ]  PROGRESS: at sentence #63100000, processed 734813579 words and 24610001 word types
2017-10-25 16:52:08,994 [MainThread  ] [INFO ]  PROGRESS: at sentence #63200000, processed 735974058 words and 24745469 word types
2017-10-25 16:52:11,127 [MainThread  ] [INFO ]  PROGRESS: at sentence #63300000, processed 737174213 words and 24883423 word types
2017-10-25 16:52:13,266 [MainThread  ] [INFO ]  PROGRESS: at sentence #63400000, pr

2017-10-25 16:54:18,334 [MainThread  ] [INFO ]  PROGRESS: at sentence #69000000, processed 803609497 words and 32650101 word types
2017-10-25 16:54:20,414 [MainThread  ] [INFO ]  PROGRESS: at sentence #69100000, processed 804778420 words and 32783618 word types
2017-10-25 16:54:22,492 [MainThread  ] [INFO ]  PROGRESS: at sentence #69200000, processed 805942655 words and 32921821 word types
2017-10-25 16:54:24,601 [MainThread  ] [INFO ]  PROGRESS: at sentence #69300000, processed 807124340 words and 33056114 word types
2017-10-25 16:54:26,675 [MainThread  ] [INFO ]  PROGRESS: at sentence #69400000, processed 808283641 words and 33184603 word types
2017-10-25 16:54:28,750 [MainThread  ] [INFO ]  PROGRESS: at sentence #69500000, processed 809451419 words and 33312037 word types
2017-10-25 16:54:30,801 [MainThread  ] [INFO ]  PROGRESS: at sentence #69600000, processed 810607949 words and 33439386 word types
2017-10-25 16:54:32,909 [MainThread  ] [INFO ]  PROGRESS: at sentence #69700000, pr

2017-10-25 16:56:54,701 [MainThread  ] [INFO ]  PROGRESS: at sentence #75200000, processed 875924009 words and 14814507 word types
2017-10-25 16:56:56,785 [MainThread  ] [INFO ]  PROGRESS: at sentence #75300000, processed 877076920 words and 14971210 word types
2017-10-25 16:56:58,905 [MainThread  ] [INFO ]  PROGRESS: at sentence #75400000, processed 878246856 words and 15127185 word types
2017-10-25 16:57:01,059 [MainThread  ] [INFO ]  PROGRESS: at sentence #75500000, processed 879425115 words and 15292458 word types
2017-10-25 16:57:03,185 [MainThread  ] [INFO ]  PROGRESS: at sentence #75600000, processed 880587841 words and 15452977 word types
2017-10-25 16:57:05,297 [MainThread  ] [INFO ]  PROGRESS: at sentence #75700000, processed 881765861 words and 15609341 word types
2017-10-25 16:57:07,412 [MainThread  ] [INFO ]  PROGRESS: at sentence #75800000, processed 882935032 words and 15767150 word types
2017-10-25 16:57:09,433 [MainThread  ] [INFO ]  PROGRESS: at sentence #75900000, pr

2017-10-25 16:59:12,532 [MainThread  ] [INFO ]  PROGRESS: at sentence #81500000, processed 949153307 words and 24270369 word types
2017-10-25 16:59:14,629 [MainThread  ] [INFO ]  PROGRESS: at sentence #81600000, processed 950325202 words and 24410695 word types
2017-10-25 16:59:16,703 [MainThread  ] [INFO ]  PROGRESS: at sentence #81700000, processed 951476337 words and 24547234 word types
2017-10-25 16:59:18,831 [MainThread  ] [INFO ]  PROGRESS: at sentence #81800000, processed 952638908 words and 24690617 word types
2017-10-25 16:59:20,953 [MainThread  ] [INFO ]  PROGRESS: at sentence #81900000, processed 953815288 words and 24830610 word types
2017-10-25 16:59:23,088 [MainThread  ] [INFO ]  PROGRESS: at sentence #82000000, processed 954996290 words and 24975781 word types
2017-10-25 16:59:25,201 [MainThread  ] [INFO ]  PROGRESS: at sentence #82100000, processed 956163368 words and 25117539 word types
2017-10-25 16:59:27,336 [MainThread  ] [INFO ]  PROGRESS: at sentence #82200000, pr

2017-10-25 17:01:29,033 [MainThread  ] [INFO ]  PROGRESS: at sentence #87800000, processed 1022739595 words and 32913005 word types
2017-10-25 17:01:31,125 [MainThread  ] [INFO ]  PROGRESS: at sentence #87900000, processed 1023915389 words and 33046146 word types
2017-10-25 17:01:33,226 [MainThread  ] [INFO ]  PROGRESS: at sentence #88000000, processed 1025091728 words and 33182874 word types
2017-10-25 17:01:35,285 [MainThread  ] [INFO ]  PROGRESS: at sentence #88100000, processed 1026250262 words and 33307896 word types
2017-10-25 17:01:37,383 [MainThread  ] [INFO ]  PROGRESS: at sentence #88200000, processed 1027426632 words and 33439596 word types
2017-10-25 17:01:39,467 [MainThread  ] [INFO ]  PROGRESS: at sentence #88300000, processed 1028595914 words and 33571855 word types
2017-10-25 17:01:41,576 [MainThread  ] [INFO ]  PROGRESS: at sentence #88400000, processed 1029769218 words and 33707525 word types
2017-10-25 17:01:43,639 [MainThread  ] [INFO ]  PROGRESS: at sentence #88500

2017-10-25 17:04:02,625 [MainThread  ] [INFO ]  collected 14558244 word types from a corpus of 1094237693 words (unigram + bigrams) and 93927755 sentences
2017-10-25 17:04:02,626 [MainThread  ] [INFO ]  using 14558244 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2017-10-25 17:04:02,627 [MainThread  ] [INFO ]  saving Phrases object under ../data/lingvo/bigram, separately None
2017-10-25 17:04:18,349 [MainThread  ] [INFO ]  saved ../data/lingvo/bigram
2017-10-25 17:04:18,350 [MainThread  ] [INFO ]  vocab size 14558244
2017-10-25 17:04:18,351 [MainThread  ] [INFO ]  source_vocab length 14558244
2017-10-25 17:05:03,279 [MainThread  ] [INFO ]  Phraser added 50000 phrasegrams
2017-10-25 17:05:48,895 [MainThread  ] [INFO ]  Phraser added 100000 phrasegrams
2017-10-25 17:06:34,759 [MainThread  ] [INFO ]  Phraser added 150000 phrasegrams
2017-10-25 17:07:05,729 [MainThread  ] [INFO ]  Phraser built with 183888 183888 phrasegrams
2017-10-25 17:07:05,7

In [15]:
bigram_path = '../data/lingvo/%s' % name
bigram = gensim.models.phrases.Phrases.load(bigram_path)

# Inspect the detected phrases on the first 500 corpus sentences only.
sample_sents = islice(iter_sents(corpus_path), 500)
for phrase, score in bigram.export_phrases(sample_sents):
    print('{0}   {1}'.format(phrase, score))

торговый наименование   46.0968005278
ti zr   54.2023204961
et al   40.2187126168
gas turbine   1327.44429617
advanced materials   222.224706823
цирконий гафний   40.7752414915
цирконий гафний   40.7752414915
american society   297.733331399
test materials   106.495165158
американский общество   160.174541596
беговой дорожка   82.4931135071
беговой дорожка   82.4931135071
беговой дорожка   82.4931135071
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхронный резонанс   64.1262090751
подсинхр

In [11]:
# Rebuild the Phraser file name from the current min_count / threshold values
# and load it. Relies on bigram_path already being defined in the kernel.
bigram_ph_path = bigram_path + '_ph_%s_%s' % (min_count, threshold)
bigram_ph = gensim.models.phrases.Phraser.load(bigram_ph_path)

2017-10-25 19:21:05,457 [MainThread  ] [INFO ]  loading Phraser object from ../data/lingvo/bigram_ph_20_30
2017-10-25 19:21:05,605 [MainThread  ] [INFO ]  loaded ../data/lingvo/bigram_ph_20_30


In [12]:
# Number of phrasegrams held by the loaded Phraser.
len(bigram_ph.phrasegrams)

183888

# Trigrams

In [5]:
# Settings for the second pass: the input is the plain-text corpus written by
# the bigram pass, and outputs are prefixed with 'trigram'.
name = 'trigram'
corpus_path = '../data/lingvo/bigram_corpus.txt.gz'
min_count, threshold = 20, 30

In [None]:
# jsn=False: the input corpus is read as plain text, one whitespace-tokenized
# sentence per line, instead of JSON lines.
extract_bigrams(corpus_path, name, min_count, threshold, jsn=False)

2017-10-25 20:05:40,287 [MainThread  ] [INFO ]  collecting all words and their counts
2017-10-25 20:05:40,293 [MainThread  ] [INFO ]  PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-10-25 20:05:45,290 [MainThread  ] [INFO ]  PROGRESS: at sentence #100000, processed 1158314 words and 522282 word types
2017-10-25 20:05:50,305 [MainThread  ] [INFO ]  PROGRESS: at sentence #200000, processed 2325047 words and 980167 word types
2017-10-25 20:05:55,180 [MainThread  ] [INFO ]  PROGRESS: at sentence #300000, processed 3451287 words and 1379125 word types
2017-10-25 20:05:59,967 [MainThread  ] [INFO ]  PROGRESS: at sentence #400000, processed 4567836 words and 1745674 word types
2017-10-25 20:06:04,789 [MainThread  ] [INFO ]  PROGRESS: at sentence #500000, processed 5696465 words and 2118043 word types


In [None]:
trigram_path = '../data/lingvo/%s' % name
# The Phraser file name is derived from trigram_path. Deriving it from
# bigram_path would load the bigram Phraser under the trigram name, and would
# also depend on a variable defined in another cell.
trigram_ph_path = trigram_path + '_ph_%s_%s' % (min_count, threshold)
trigram_ph = gensim.models.phrases.Phraser.load(trigram_ph_path)

# Word2Vec

In [None]:
corpus_path = '../data/lingvo/trigram_corpus.txt.gz'

# Train skip-gram (sg=1) vectors on the plain-text corpus. Sentences is
# re-iterable, so the corpus can be streamed from disk more than once.
model = Word2Vec(
    sentences=Sentences(corpus_path, jsn=False),
    size=200,
    window=10,
    min_count=10,
    sg=1,
    workers=cpu_count,
)

In [None]:
# Persist the trained model so later sessions can load it instead of retraining.
model.save('../data/lingvo/w2v_200_sg_5_w10_trigram')

In [5]:
# Load the previously saved model from disk.
model = Word2Vec.load('../data/lingvo/w2v_200_sg_5_w10_trigram')

2017-10-27 09:28:34,630 [MainThread  ] [INFO ]  loading Word2Vec object from ../data/lingvo/w2v_200_sg_5_w10_trigram
2017-10-27 09:28:40,419 [MainThread  ] [INFO ]  loading wv recursively from ../data/lingvo/w2v_200_sg_5_w10_trigram.wv.* with mmap=None
2017-10-27 09:28:40,420 [MainThread  ] [INFO ]  loading syn0 from ../data/lingvo/w2v_200_sg_5_w10_trigram.wv.syn0.npy with mmap=None
2017-10-27 09:28:42,161 [MainThread  ] [INFO ]  setting ignored attribute syn0norm to None
2017-10-27 09:28:42,163 [MainThread  ] [INFO ]  loading syn1neg from ../data/lingvo/w2v_200_sg_5_w10_trigram.syn1neg.npy with mmap=None
2017-10-27 09:28:43,435 [MainThread  ] [INFO ]  setting ignored attribute cum_table to None
2017-10-27 09:28:43,446 [MainThread  ] [INFO ]  loaded ../data/lingvo/w2v_200_sg_5_w10_trigram


In [6]:
# Spot check: print the 10 nearest neighbours of one word with their scores.
for w ,s in model.most_similar('стол', topn=10):
    print('%s %s' % (w,s))

2017-10-27 09:28:47,534 [MainThread  ] [INFO ]  precomputing L2-norms of word weight vectors


столешница 0.708369255066
столик 0.693453729153
станина 0.678220748901
глобусный 0.673517644405
zd2). 0.654453396797
стол-подъемник 0.651747107506
vnc_realvnc_ltd 0.648538589478
поворотно-качающаяся_двухкоординатный 0.64672678709
журнально-игровой 0.640999734402
многофункциональный_дагностико-хиругический_робототехнический 0.640973091125


# Semantic clusters

In [None]:
import datetime
now = datetime.datetime.now().strftime("%Y-%m-%d %H")

# For every vocabulary word, write the word followed by the JSON list of its
# top-10 neighbours whose similarity exceeds 0.7. Words with no such
# neighbour are skipped.
with io.open('../data/lingvo/classes_%s.txt' % now, 'w', encoding='utf8') as f:
    for w in tqdm_notebook(model.wv.vocab):
        neighbours = model.most_similar([w], topn=10)
        sims = [[wi, s] for wi, s in neighbours if s > 0.7]
        if not len(sims):
            continue
        s = '%s %s' % (w, json.dumps(sims, ensure_ascii=False))
        f.write(s.decode('utf8') + '\n')




In [None]:
import datetime
now = datetime.datetime.now().strftime("%Y-%m-%d %H")

index2word = model.wv.index2word
with io.open('../data/lingvo/classes_%s.txt' % now, 'w', encoding='utf8') as f:
    # Work through the vocabulary in 1000 chunks, so only one
    # (chunk x vocabulary) similarity matrix exists at a time.
    for tags in tqdm_notebook(np.array_split(index2word, 1000)):
        vecs = model.wv[tags]
        sim_mat = cosine_similarity(vecs, model.wv.syn0)
        # argsort is ascending: take the 10 columns ranked just below the top
        # one (presumably the query word itself), most similar first.
        train_ixs = [ixs[-11:-1][::-1] for ixs in sim_mat.argsort(axis=1)]
        for i, w in enumerate(tags):
            # float(): sim_mat holds numpy float32 scalars, which the standard
            # json encoder refuses to serialize.
            sims = [[index2word[ix], float(sim_mat[i, ix])] for ix in train_ixs[i]]
            # The comprehension uses its own names (wi, si). In Python 2 its
            # loop variables leak into the enclosing scope, so reusing `w`
            # would overwrite the head word written below.
            sims = [[wi, si] for wi, si in sims if si > 0.7]
            if len(sims):
                s = '%s %s' % (w, json.dumps(sims, ensure_ascii=False))
                f.write(s.decode('utf8') + '\n')

In [19]:
# Scratch cell: recompute the similarity matrix. Relies on `vecs` and `model`
# already being defined in the kernel.
sim_mat = cosine_similarity(vecs, model.wv.syn0)

In [40]:
# For each row of sim_mat: the 200 column indices ranked just below the
# highest-scoring one, in descending order of similarity.
[ixs[-201:-1][::-1] for ixs in sim_mat.argsort(axis=1)]

[array([   4619, 1012731,  782728,    3179, 1067538,   29076,  764746,
        1025772, 1048564,  942290,  676746,  755085,  127314,  782563,
          10781,  914108,  713020,  519604,    8332,  852968, 1020937,
         813788,  354903,  768706,  753452, 1078282,  925699,  300497,
         850039,  975138,  937567,  709813,  932656,  935778,  365141,
        1067192,  874034,  389684,  225354,  968834,  964933,  981967,
         853638,  421873, 1063215,  967491, 1068433,  538633,  793452,
         774262,  788864, 1040008,  311977,  315620,  358969, 1065643,
         933903,  900202,  738651, 1075343, 1048931,  441801,  889598,
         565919, 1027306, 1048638,  956236, 1055214, 1021768,  805635,
         691708,  729828,  121847, 1020803,  657466,  827159, 1078525,
        1018351,  958422,  922354,  924489, 1052835,  928956,  865434,
         888463, 1061034,  458525,  944853,  213401,  503413, 1081696,
         777247, 1068593,  994826,  919847,  952227,  486400,  140130,
      

In [23]:
# Display the similarity matrix computed in an earlier cell.
sim_mat

array([[ 0.99999994,  0.45720625,  0.3541134 , ...,  0.19154783,
         0.36999533,  0.2816523 ],
       [ 0.45720625,  1.        ,  0.33350614, ...,  0.07257227,
         0.40997708,  0.30174717],
       [ 0.3541134 ,  0.33350614,  1.00000012, ...,  0.29840857,
         0.33412886,  0.31311023],
       ..., 
       [ 0.24369814,  0.12588844,  0.27569562, ...,  0.25806665,
         0.19089819,  0.32050702],
       [ 0.24003631,  0.21360095,  0.50081283, ...,  0.22694927,
         0.21680561,  0.34303862],
       [ 0.3345114 ,  0.16328987,  0.27063552, ...,  0.22568609,
         0.19072032,  0.23730876]], dtype=float32)

In [146]:
# Check whether str() of a list holding a Cyrillic string reproduces the
# readable text; the recorded result is False.
'[мама]' == str(['мама'])

False

In [153]:
# json.dumps with ensure_ascii=False keeps the Cyrillic text readable.
print(json.dumps(['мама'], ensure_ascii=False) )

["мама"]


# Save for Misha

In [None]:
corpus_path = '../data/lingvo/corpus_json.txt.gz'

# Gather every token of the JSON corpus that contains an underscore.
_exclude = set()
for sent in iter_sents(corpus_path):
    _exclude.update(tok for tok in sent if '_' in tok)

In [13]:
corpus_path = '../data/lingvo/trigram_corpus.txt.gz'

# Gather every token of the plain-text trigram corpus that contains an
# underscore.
_set = set()
for sent in iter_sents(corpus_path, False):
    _set.update(tok for tok in sent if '_' in tok)

In [34]:
import datetime

# Format the timestamp once as a string. Calling `now()` on a datetime
# instance raises TypeError, so the file name is built from the formatted
# value instead.
now = datetime.datetime.now().strftime("%Y-%m-%d %H")

# Write the underscore-joined tokens that occur in _set but not in _exclude,
# sorted, one per line.
# NOTE(review): _set comes from a corpus read with jsn=False and _exclude from
# one read with jsn=True; confirm both hold the same string type, otherwise
# the set difference will not remove anything.
with GzipFile('../data/lingvo/collocations_%s.txt.gz' % now, 'w') as f:
    for tag in sorted(_set-_exclude):
        f.write(tag + '\n')

In [None]:
# Number of tokens present in _set but not in _exclude.
len(_set-_exclude)