In [1]:
import gensim
import smart_open
import pandas
import itertools
import random

import csv
import StringIO

import logging

import numpy as np

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

class CsvDialect(csv.Dialect):
    strict = True
    skipinitialspace = True
    quoting = csv.QUOTE_ALL
    delimiter = ','
    quotechar = '"'
    lineterminator = '\n'


In [4]:
train_data_csv = "doc2vec-training-data.csv"
train_data = read_corpus(train_data_csv)

In [3]:
def read_corpus(fname, tokens_only=False, index_from=0):
    with smart_open.smart_open(fname, encoding="utf-8") as f:
        print "Headers: ", f.readline()
        for i, line in enumerate(f):
            csv_reader = csv.reader([line.encode('utf-8')])
            for row in csv_reader:
                try:
                    line = row[1].decode('utf-8')
                except IndexError as e:
                    continue
                if tokens_only:
                    yield gensim.utils.simple_preprocess(line)
                else:
                    # For training data, add tags
                    yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [index_from + i])


In [5]:
# Run this cell to rebuild the model from scratch
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=3, iter=10, workers=8)
train_data = list(read_corpus(train_data_csv))
model.build_vocab(train_data)



Headers:  created_at,text,stock_val,stock_val_1hr_delta,stock_val_2hr_delta,stock_val_6hr_delta,after_hours



INFO : collecting all words and their counts
INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO : PROGRESS: at example #10000, processed 173471 words (1866544/s), 16974 word types, 10041 tags
INFO : PROGRESS: at example #20000, processed 345831 words (1710397/s), 27008 word types, 20058 tags
INFO : PROGRESS: at example #30000, processed 519609 words (1907802/s), 34865 word types, 30072 tags
INFO : PROGRESS: at example #40000, processed 694517 words (1996737/s), 42419 word types, 40082 tags
INFO : PROGRESS: at example #50000, processed 868155 words (1973767/s), 50107 word types, 50131 tags
INFO : PROGRESS: at example #60000, processed 1040791 words (2068615/s), 57581 word types, 60132 tags
INFO : PROGRESS: at example #70000, processed 1211927 words (1888187/s), 63620 word types, 70135 tags
INFO : PROGRESS: at example #80000, processed 1382850 words (1979148/s), 70271 word types, 80140 tags
INFO : PROGRESS: at example #90000, processed 1553324 words (17157

INFO : PROGRESS: at example #790000, processed 13680537 words (1704691/s), 459110 word types, 790784 tags
INFO : PROGRESS: at example #800000, processed 13854294 words (1790053/s), 464152 word types, 800787 tags
INFO : PROGRESS: at example #810000, processed 14028710 words (1781283/s), 469073 word types, 810790 tags
INFO : PROGRESS: at example #820000, processed 14202085 words (1639571/s), 473858 word types, 820791 tags
INFO : PROGRESS: at example #830000, processed 14377989 words (1802572/s), 478970 word types, 830843 tags
INFO : PROGRESS: at example #840000, processed 14553038 words (1834930/s), 483616 word types, 840854 tags
INFO : PROGRESS: at example #850000, processed 14724014 words (1809825/s), 488652 word types, 850863 tags
INFO : PROGRESS: at example #860000, processed 14893061 words (1767685/s), 493730 word types, 860863 tags
INFO : PROGRESS: at example #870000, processed 15064190 words (1704407/s), 499089 word types, 870869 tags
INFO : PROGRESS: at example #880000, processed

INFO : PROGRESS: at example #1560000, processed 26810712 words (1705905/s), 829163 word types, 1561474 tags
INFO : PROGRESS: at example #1570000, processed 26982375 words (1679613/s), 833760 word types, 1571476 tags
INFO : PROGRESS: at example #1580000, processed 27151048 words (1647714/s), 838635 word types, 1581480 tags
INFO : PROGRESS: at example #1590000, processed 27320059 words (1617266/s), 843841 word types, 1591482 tags
INFO : PROGRESS: at example #1600000, processed 27489568 words (1218762/s), 849031 word types, 1601498 tags
INFO : PROGRESS: at example #1610000, processed 27659499 words (1310944/s), 854234 word types, 1611501 tags
INFO : PROGRESS: at example #1620000, processed 27837048 words (1468645/s), 860933 word types, 1621506 tags
INFO : PROGRESS: at example #1630000, processed 28010353 words (1498385/s), 867036 word types, 1631516 tags
INFO : PROGRESS: at example #1640000, processed 28187373 words (1463917/s), 872327 word types, 1641518 tags
INFO : PROGRESS: at example 

INFO : PROGRESS: at example #2320000, processed 39722952 words (1777976/s), 1179856 word types, 2321804 tags
INFO : PROGRESS: at example #2330000, processed 39888629 words (1708894/s), 1184253 word types, 2331806 tags
INFO : PROGRESS: at example #2340000, processed 40054642 words (1715558/s), 1188883 word types, 2341813 tags
INFO : PROGRESS: at example #2350000, processed 40221475 words (1692961/s), 1193746 word types, 2351813 tags
INFO : PROGRESS: at example #2360000, processed 40387715 words (1705128/s), 1198548 word types, 2361813 tags
INFO : PROGRESS: at example #2370000, processed 40552667 words (1924535/s), 1203545 word types, 2371813 tags
INFO : PROGRESS: at example #2380000, processed 40719114 words (1668944/s), 1208427 word types, 2381813 tags
INFO : PROGRESS: at example #2390000, processed 40884905 words (1761168/s), 1213253 word types, 2391813 tags
INFO : PROGRESS: at example #2400000, processed 41049338 words (1767943/s), 1217780 word types, 2401813 tags
INFO : PROGRESS: at

In [8]:
# Run this cell to retrain with data
train_data = list(read_corpus(train_data_csv))
for i in xrange(2):
    model.train(train_data, total_examples=model.corpus_count, epochs=model.iter)

Headers:  created_at,text,stock_val,stock_val_1hr_delta,stock_val_2hr_delta,stock_val_6hr_delta,after_hours



INFO : training model with 8 workers on 185112 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO : PROGRESS: at 0.07% examples, 250703 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.15% examples, 247797 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.23% examples, 263971 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.31% examples, 265584 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.39% examples, 267860 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.48% examples, 270769 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.56% examples, 271345 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.65% examples, 273960 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.73% examples, 274092 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.82% examples, 277394 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.89% examples, 276055 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 0.98% exa

INFO : PROGRESS: at 9.02% examples, 289255 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.10% examples, 288894 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.17% examples, 288634 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.26% examples, 288653 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.34% examples, 288408 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 9.42% examples, 288158 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.51% examples, 288237 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.61% examples, 288400 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.69% examples, 288431 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.78% examples, 288348 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.86% examples, 288168 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 9.94% examples, 287867 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 10.03% examples, 287834 words/s, in_qsize 15, out_qsize 

INFO : PROGRESS: at 17.45% examples, 279588 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 17.52% examples, 279345 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.59% examples, 279075 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 17.66% examples, 278914 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.73% examples, 278777 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.81% examples, 278609 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.87% examples, 278359 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.96% examples, 278347 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 18.03% examples, 278305 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 18.11% examples, 278156 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 18.18% examples, 278036 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 18.26% examples, 277981 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 18.34% examples, 278013 words/s, in_qsize 16

INFO : PROGRESS: at 26.29% examples, 280491 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.37% examples, 280509 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.45% examples, 280560 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.54% examples, 280614 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.62% examples, 280682 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.70% examples, 280826 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.79% examples, 280908 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.88% examples, 280896 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.96% examples, 280943 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.05% examples, 281006 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.14% examples, 281021 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.23% examples, 281088 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.32% examples, 281130 words/s, in_qsize 15

INFO : PROGRESS: at 35.58% examples, 284531 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 35.67% examples, 284585 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 35.76% examples, 284634 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 35.86% examples, 284701 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 35.95% examples, 284742 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.03% examples, 284751 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.12% examples, 284805 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.21% examples, 284836 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.29% examples, 284832 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.38% examples, 284893 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.47% examples, 284960 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.56% examples, 285058 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.65% examples, 285133 words/s, in_qsize 15

INFO : PROGRESS: at 44.13% examples, 282473 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 44.22% examples, 282518 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.31% examples, 282566 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.40% examples, 282553 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 44.48% examples, 282566 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.56% examples, 282551 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.63% examples, 282489 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.71% examples, 282415 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.78% examples, 282358 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.85% examples, 282299 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 44.93% examples, 282239 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 45.01% examples, 282240 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 45.09% examples, 282214 words/s, in_qsize 15

INFO : PROGRESS: at 52.60% examples, 280503 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 52.68% examples, 280440 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 52.75% examples, 280395 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 52.82% examples, 280327 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 52.91% examples, 280328 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 53.00% examples, 280373 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 53.08% examples, 280344 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 53.16% examples, 280380 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 53.24% examples, 280340 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 53.31% examples, 280280 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 53.37% examples, 280139 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 53.43% examples, 280026 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 53.50% examples, 279971 words/s, in_qsize 16

INFO : PROGRESS: at 61.17% examples, 279430 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.26% examples, 279469 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.34% examples, 279474 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.43% examples, 279496 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.52% examples, 279519 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.61% examples, 279550 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.69% examples, 279541 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 61.76% examples, 279477 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 61.83% examples, 279466 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 61.92% examples, 279485 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 62.00% examples, 279476 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 62.09% examples, 279505 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 62.18% examples, 279530 words/s, in_qsize 15

INFO : PROGRESS: at 70.31% examples, 281088 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 70.40% examples, 281120 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 70.49% examples, 281157 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 70.58% examples, 281176 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 70.67% examples, 281203 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 70.75% examples, 281225 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 70.84% examples, 281258 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 70.93% examples, 281286 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 71.02% examples, 281296 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 71.11% examples, 281330 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 71.20% examples, 281361 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 71.29% examples, 281370 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 71.38% examples, 281391 words/s, in_qsize 15

INFO : PROGRESS: at 79.62% examples, 283034 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 79.69% examples, 282985 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 79.77% examples, 282952 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 79.83% examples, 282882 words/s, in_qsize 14, out_qsize 1
INFO : PROGRESS: at 79.91% examples, 282863 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 79.99% examples, 282804 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 80.07% examples, 282790 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 80.15% examples, 282807 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 80.24% examples, 282815 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 80.33% examples, 282845 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 80.42% examples, 282870 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 80.52% examples, 282895 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 80.60% examples, 282914 words/s, in_qsize 16

INFO : PROGRESS: at 89.03% examples, 284886 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 89.12% examples, 284887 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.21% examples, 284902 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.30% examples, 284915 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.39% examples, 284917 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 89.48% examples, 284931 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 89.57% examples, 284945 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.65% examples, 284943 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.75% examples, 284948 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 89.84% examples, 284959 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 89.94% examples, 284969 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 90.02% examples, 284957 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 90.12% examples, 284984 words/s, in_qsize 16

INFO : PROGRESS: at 98.54% examples, 286603 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 98.63% examples, 286619 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 98.71% examples, 286616 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 98.80% examples, 286636 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 98.89% examples, 286672 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 98.98% examples, 286687 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 99.07% examples, 286686 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 99.16% examples, 286699 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 99.25% examples, 286711 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 99.34% examples, 286711 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 99.43% examples, 286721 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 99.52% examples, 286724 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 99.61% examples, 286728 words/s, in_qsize 15

INFO : PROGRESS: at 7.26% examples, 305390 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 7.35% examples, 305155 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 7.44% examples, 305025 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 7.53% examples, 304919 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 7.62% examples, 304993 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 7.71% examples, 304983 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 7.80% examples, 304759 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 7.89% examples, 304783 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 7.98% examples, 304759 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 8.08% examples, 304687 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 8.17% examples, 304788 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 8.25% examples, 304896 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 8.34% examples, 304665 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 16.77% examples, 304943 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 16.87% examples, 304978 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 16.95% examples, 304918 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.04% examples, 304817 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 17.13% examples, 304812 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.23% examples, 304809 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.33% examples, 304760 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.42% examples, 304779 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 17.51% examples, 304719 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.60% examples, 304711 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.69% examples, 304706 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 17.78% examples, 304619 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 17.87% examples, 304623 words/s, in_qsize 15

INFO : PROGRESS: at 26.29% examples, 304471 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.38% examples, 304447 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 26.46% examples, 304429 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 26.54% examples, 304429 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.63% examples, 304470 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.72% examples, 304559 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 26.81% examples, 304588 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.90% examples, 304607 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 26.99% examples, 304596 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.08% examples, 304559 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 27.17% examples, 304537 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.25% examples, 304410 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 27.34% examples, 304438 words/s, in_qsize 16

INFO : PROGRESS: at 35.58% examples, 302578 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 35.66% examples, 302496 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 35.74% examples, 302462 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 35.82% examples, 302403 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 35.90% examples, 302329 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 35.98% examples, 302298 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.07% examples, 302240 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.14% examples, 302123 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.22% examples, 302003 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 36.30% examples, 301937 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.38% examples, 301869 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.46% examples, 301835 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 36.55% examples, 301847 words/s, in_qsize 16

INFO : PROGRESS: at 42.33% examples, 117329 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 42.35% examples, 117277 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.38% examples, 117256 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.41% examples, 117202 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.44% examples, 117188 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.46% examples, 117132 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 42.49% examples, 117108 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 42.51% examples, 117070 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.53% examples, 117015 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.56% examples, 116996 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.58% examples, 116964 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.61% examples, 116911 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 42.63% examples, 116887 words/s, in_qsize 15

INFO : PROGRESS: at 46.16% examples, 69523 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 46.24% examples, 69614 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 46.32% examples, 69701 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 46.40% examples, 69792 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 46.48% examples, 69881 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 46.55% examples, 69971 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 46.62% examples, 70047 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 46.70% examples, 70143 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 46.78% examples, 70243 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 46.86% examples, 70325 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 46.94% examples, 70413 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 47.02% examples, 70505 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 47.10% examples, 70588 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 54.45% examples, 76321 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.47% examples, 76316 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 54.49% examples, 76311 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 54.50% examples, 76297 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.52% examples, 76295 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.55% examples, 76288 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.57% examples, 76290 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 54.59% examples, 76288 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.61% examples, 76283 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.64% examples, 76281 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.66% examples, 76276 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 54.68% examples, 76277 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 54.70% examples, 76276 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 57.76% examples, 58835 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 57.83% examples, 58888 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 57.90% examples, 58936 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 57.97% examples, 58989 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 58.04% examples, 59040 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 58.10% examples, 59091 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 58.17% examples, 59139 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 58.24% examples, 59198 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 58.31% examples, 59251 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 58.38% examples, 59308 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 58.46% examples, 59372 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 58.54% examples, 59430 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 58.61% examples, 59483 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 66.34% examples, 65467 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 66.42% examples, 65528 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 66.51% examples, 65599 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 66.59% examples, 65667 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 66.68% examples, 65745 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 66.77% examples, 65817 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 66.85% examples, 65880 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 66.94% examples, 65948 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 67.04% examples, 66019 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 67.13% examples, 66091 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 67.23% examples, 66162 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 67.32% examples, 66230 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 67.40% examples, 66293 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 75.85% examples, 72595 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 75.94% examples, 72660 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 76.03% examples, 72723 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.12% examples, 72787 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.21% examples, 72849 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 76.30% examples, 72913 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 76.39% examples, 72979 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.46% examples, 73036 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.55% examples, 73104 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 76.64% examples, 73173 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.72% examples, 73240 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 76.82% examples, 73309 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 76.90% examples, 73370 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 85.39% examples, 79323 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 85.48% examples, 79385 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 85.57% examples, 79449 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 85.66% examples, 79512 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 85.75% examples, 79574 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 85.84% examples, 79634 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 85.93% examples, 79696 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 86.02% examples, 79757 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 86.11% examples, 79815 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 86.20% examples, 79879 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 86.29% examples, 79939 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 86.38% examples, 79997 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 86.47% examples, 80062 words/s, in_qsize 15, out_qsize 0

INFO : PROGRESS: at 95.01% examples, 85754 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.11% examples, 85813 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.19% examples, 85867 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.29% examples, 85925 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.38% examples, 85982 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 95.47% examples, 86039 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 95.55% examples, 86095 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.64% examples, 86150 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 95.73% examples, 86207 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 95.82% examples, 86268 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 95.91% examples, 86327 words/s, in_qsize 16, out_qsize 0
INFO : PROGRESS: at 96.00% examples, 86384 words/s, in_qsize 15, out_qsize 0
INFO : PROGRESS: at 96.09% examples, 86441 words/s, in_qsize 15, out_qsize 0

In [3]:
# Run this if we need to reload the model
model = gensim.models.doc2vec.Doc2Vec.load("doc2vec-model.gs")

INFO : loading Doc2Vec object from doc2vec-model.gs
INFO : loading docvecs recursively from doc2vec-model.gs.docvecs.* with mmap=None
INFO : loading doctag_syn0 from doc2vec-model.gs.docvecs.doctag_syn0.npy with mmap=None
INFO : loading wv recursively from doc2vec-model.gs.wv.* with mmap=None
INFO : setting ignored attribute syn0norm to None
INFO : setting ignored attribute cum_table to None
INFO : loaded doc2vec-model.gs


In [11]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_data))

# Compare and print the most/median/least similar documents from the train corpus
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_data[doc_id].words)))
inferred_vector = model.infer_vector(train_data[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=2)
sim_id = sims[0]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_data[sim_id[0]].words)))

Train Document (453702): «square is doing bitcoin trials we re always listening to our customers and we ve found that they are interested https co zlbymcoa»

Similar Document (2329934, 0.6125656962394714): «experty io streamlining and decentralizing the information economy bitcoin https co ovswffuuem»



In [35]:
test = "excited amd is amazing crap"

iv = model.infer_vector(test.split(" "))
sims = model.docvecs.most_similar([iv], topn=2)
sims[0]
train_data[sims[0][0]].words

[u'rt',
 u'xboxtavern',
 u'giveaway',
 u'follow',
 u'and',
 u'rt',
 u'for',
 u'chance',
 u'to',
 u'win',
 u'venom',
 u'gaming',
 u'headset',
 u'and',
 u'three',
 u'games',
 u'winner',
 u'chosen',
 u'sunday']

In [9]:
model.save("doc2vec-model.gs")

INFO : saving Doc2Vec object under doc2vec-model.gs, separately None
INFO : storing np array 'doctag_syn0' to doc2vec-model.gs.docvecs.doctag_syn0.npy
INFO : not storing attribute syn0norm
INFO : not storing attribute cum_table
INFO : saved doc2vec-model.gs


In [7]:
gensim.utils.simple_preprocess("asds sda")

[u'asds', u'sda']

In [9]:
!ls -lah

total 1886432
drwxr-xr-x  12 hongee  staff   384B Dec  9 23:34 [34m.[m[m
drwxr-xr-x  12 hongee  staff   384B Dec  9 17:40 [34m..[m[m
-rw-r--r--@  1 hongee  staff   6.0K Dec  9 22:46 .DS_Store
drwxr-xr-x   3 hongee  staff    96B Dec  9 23:20 [34m.ipynb_checkpoints[m[m
-rw-r--r--@  1 hongee  staff    61M Dec  9 16:11 amd-streamed-20171109-231920.txt.csv
-rwxr-xr-x@  1 hongee  staff   157M Dec  9 17:25 [31mamd-streamed-20171115-011457.txt.csv[m[m
-rw-r--r--   1 hongee  staff    53M Dec  9 23:34 doc2vec-model.gs
-rw-r--r--   1 hongee  staff   227M Dec  9 23:34 doc2vec-model.gs.docvecs.doctag_syn0.npy
-rw-r--r--   1 hongee  staff    41M Dec  9 22:07 doc2vec-trainable
-rw-r--r--   1 hongee  staff   162M Dec  9 22:07 doc2vec-trainable.docvecs.doctag_syn0.npy
-rw-r--r--   1 hongee  staff   219M Dec  9 23:17 doc2vec-training-data.csv
-rw-r--r--   1 hongee  staff    74K Dec  9 23:34 train_doc2vec.ipynb
