In [None]:

# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

In [None]:

class LabeledLineSentence(object):
    """Corpus of labelled sentences read line by line from several text files.

    ``sources`` maps a file path to a prefix. Every line of a file becomes one
    ``LabeledSentence`` whose words are the whitespace-split line and whose
    single tag is ``'<prefix>_<line number>'``; line numbers restart at 0 for
    each file.
    """

    def __init__(self, sources):
        """Store ``sources`` and verify that every prefix is used only once.

        Args:
            sources: dict mapping file path -> tag prefix.

        Raises:
            ValueError: if two paths share the same prefix (their tags would
                collide). ``ValueError`` is an ``Exception`` subclass, so
                existing ``except Exception`` handlers keep working.
        """
        self.sources = sources
        # Populated by to_array(); None means "not loaded yet".
        self.sentences = None

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _labeled_sentences(self):
        """Yield one LabeledSentence per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    words = utils.to_unicode(line).split()
                    yield LabeledSentence(words, [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the labelled sentences straight from disk without caching."""
        return self._labeled_sentences()

    def to_array(self):
        """Read every source file, cache the result on ``self.sentences`` and return it."""
        self.sentences = list(self._labeled_sentences())
        return self.sentences

    def sentences_perm(self):
        """Return a new, randomly shuffled list of the cached sentences.

        The cached list itself is left in its original order. If nothing has
        been loaded yet, ``to_array()`` is called first instead of failing
        with an ``AttributeError``.
        """
        sentences = self.sentences
        if sentences is None:
            sentences = self.to_array()
        shuffled = list(sentences)
        random.shuffle(shuffled)
        return shuffled

In [None]:
# Each corpus file is paired with the prefix used to tag its lines
# (line k of a file gets the tag '<prefix>_k').
sources = {
    'data/neg_train_clean.txt': 'TRAIN_NEG',
    'data/pos_train_clean.txt': 'TRAIN_POS',
}

sentences = LabeledLineSentence(sources)

In [None]:

# Doc2Vec hyperparameters, one per line so they are easy to tune.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=7,
)

# to_array() reads every source file and caches the list on `sentences`,
# which sentences_perm() later shuffles for training.
model.build_vocab(sentences.to_array())

In [None]:
# total_examples must equal the number of documents (tweets) in the corpus.
# Use the count the model recorded during build_vocab() instead of a
# hard-coded figure, so the value stays right if the input files change.
# NOTE(review): each train() call with epochs=1 presumably restarts the
# learning-rate decay; confirm against the gensim docs whether
# start_alpha/end_alpha should be passed to decay across all epochs.
epochs = 20
for epoch in range(epochs):
    # Every pass sees the documents in a fresh random order.
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=1)

In [None]:
# Sanity check on the learned word vectors: which words are closest to
# 'mother' + 'woman' - 'man'? Queried through model.wv because calling
# most_similar directly on the model is the deprecated form.
model.wv.most_similar(positive=['mother', 'woman'], negative=['man'])

In [None]:
# Number of training documents per class; tags run from
# '<prefix>_0' to '<prefix>_<N_PER_CLASS - 1>'.
N_PER_CLASS = 100000

# One row per document: positives in the first half, negatives in the second.
# The width follows the model's vector size rather than a hard-coded 100.
train_arrays = numpy.zeros((2 * N_PER_CLASS, model.vector_size))
train_labels = numpy.zeros(2 * N_PER_CLASS)

for i in range(N_PER_CLASS):
    train_arrays[i] = model.docvecs['TRAIN_POS_%s' % i]
    train_arrays[N_PER_CLASS + i] = model.docvecs['TRAIN_NEG_%s' % i]

# Labels: 1 for the positive rows; the negative rows keep the initial 0.
train_labels[:N_PER_CLASS] = 1

In [None]:
# Logistic regression with default settings, fitted on the document vectors
# (features) and their 0/1 labels (targets).
classifier = LogisticRegression()
classifier.fit(X=train_arrays, y=train_labels)

In [None]:
# Scored on the same arrays the classifier was fitted on, so this is
# training-set performance, not a held-out estimate.
classifier.score(X=train_arrays, y=train_labels)