# Document to Vector

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import gensim
import csv2bow
reload(csv2bow)

# other utilities
import getpass
import csv
from datetime import datetime
import os
import sys
import collections
import random

### Load patent file and build tagged training corpus

In [34]:
# Set the file path for patent file, dictionary and corpus
current_user = getpass.getuser()
base_file_path = '/home/' + current_user + '/'
patent_file_path = ''.join((base_file_path, 'patent_data/patent_claims_fulltext.csv'))
dictionary_path = ''.join((base_file_path, 'patent_data/dictionary.dict'))
corpus_path = ''.join((base_file_path, '/patent_data/corpus.mm'))

In [35]:
def create_test_train_corpus(filepath, tokens_only=False):
    if tokens_only == True:
        yield [csv2bow.prune(doc[1]) for doc in csv2bow.clump(filepath)]
    else:
        
        yield (gensim.models.doc2vec.TaggedDocument(
            csv2bow.prune(doc), [i]) for i, doc in csv2bow.clump(filepath))

In [36]:
training_corpus = create_test_train_corpus(patent_file_path, tokens_only=False).next()
# load same corpus without patent id for evaluations
test_corpus = create_test_train_corpus(patent_file_path, tokens_only=True)

In [20]:
#training_corpus = create_test_train_corpus(patent_file_path, tokens_only=False).next()
#for i, doc in enumerate(training_corpus):
#    if i < 2:
#        print doc[1][0]
        

## Build Doc2Vec model

In [37]:
%time mod_doc2vec = gensim.models.doc2vec.Doc2Vec(training_corpus, size=160, min_count=2, iter=50)
## Uncomment the following lines if no corpus list or corpus generator is given to initiate the model.
#mod_doc2vec.build_vocab(training_corpus)
#%time model.train(training_corpus, total_examples=mod_doc2vec.corpus_count, epochs=mod_doc2vec.iter)

2017-12-19 22:18:28,270 : INFO : collecting all words and their counts
2017-12-19 22:18:28,326 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-12-19 22:48:49,890 : INFO : collected 20768 word types and 10000 unique tags from a corpus of 10000 examples and 4049558 words
2017-12-19 22:48:49,892 : INFO : Loading a fresh vocabulary
2017-12-19 22:48:49,940 : INFO : min_count=2 retains 14181 unique words (68% of original 20768, drops 6587)
2017-12-19 22:48:49,942 : INFO : min_count=2 leaves 4042971 word corpus (99% of original 4049558, drops 6587)
2017-12-19 22:48:49,984 : INFO : deleting the raw counts dictionary of 20768 items
2017-12-19 22:48:49,988 : INFO : sample=0.001 downsamples 50 most-common words
2017-12-19 22:48:49,990 : INFO : downsampling leaves estimated 3260464 word corpus (80.6% of prior 4042971)
2017-12-19 22:48:49,992 : INFO : estimated required memory for 14181 words and 160 dimensions: 33642180 bytes
2017-12-19 22:48:50,063 : INFO : re

CPU times: user 14min 47s, sys: 31.9 s, total: 15min 19s
Wall time: 30min 22s


In [38]:
print len(mod_doc2vec.docvecs)

10000


In [39]:
training_eva_corpus = create_test_train_corpus(patent_file_path, tokens_only=False).next()
ranks = []
second_ranks = []
nr_mod_docs = len(mod_doc2vec.docvecs)
for i, doc in enumerate(training_eva_corpus):
    if i < 100:
        inferred_vector = mod_doc2vec.infer_vector(doc.words)
        sims = mod_doc2vec.docvecs.most_similar([inferred_vector], topn=nr_mod_docs)
        # get the rank of the same inferred document 
        rank = [i for i, sim in sims].index(doc[1][0])
        ranks.append(rank)
        second_ranks.append(sims[1])


2017-12-19 23:18:49,027 : INFO : precomputing L2-norms of doc weight vectors


In [40]:
collections.Counter(ranks)

Counter({75: 1,
         325: 1,
         332: 1,
         353: 1,
         371: 1,
         381: 1,
         395: 1,
         458: 1,
         546: 1,
         589: 1,
         798: 1,
         999: 1,
         1122: 1,
         1172: 1,
         1181: 1,
         1285: 1,
         1339: 1,
         1418: 1,
         1564: 1,
         1647: 1,
         1663: 1,
         1674: 1,
         1702: 1,
         1749: 1,
         1867: 1,
         1885: 1,
         2107: 1,
         2134: 1,
         2353: 1,
         2403: 1,
         2622: 1,
         2628: 1,
         2768: 1,
         2990: 1,
         3042: 1,
         3109: 1,
         3110: 1,
         3345: 1,
         3389: 1,
         3487: 1,
         3510: 1,
         3529: 1,
         3679: 1,
         3833: 1,
         4077: 1,
         4240: 1,
         4281: 1,
         4290: 1,
         4324: 1,
         4655: 1,
         4847: 1,
         4966: 1,
         4981: 1,
         5082: 1,
         5150: 1,
         5309: 1,
     

================Just testing below this line=====================

### Try out Doc2Vec with toy corpus


In [15]:
toy_corpus = [['I','hate','debugging'],
               ['everyone','hate','debugging','too','no','joke'],
               ['But','NLP','is','FUN', 'no', 'joke']]

def generate_test_corpus(corpus):
    for i, doc in enumerate(corpus):
        yield gensim.models.doc2vec.TaggedDocument(doc, [i])

tagged_corpus = list(generate_test_corpus(toy_corpus))
print "Corpus: ", tagged_corpus

test_model = gensim.models.doc2vec.Doc2Vec(size=40, min_count=2, iter=5)
test_model.build_vocab(tagged_corpus)

print "Vocabulary: "
print test_model.wv.vocab

2017-12-19 16:36:06,861 : INFO : collecting all words and their counts
2017-12-19 16:36:06,863 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-12-19 16:36:06,864 : INFO : collected 11 word types and 3 unique tags from a corpus of 3 examples and 15 words
2017-12-19 16:36:06,865 : INFO : Loading a fresh vocabulary
2017-12-19 16:36:06,866 : INFO : min_count=2 retains 4 unique words (36% of original 11, drops 7)
2017-12-19 16:36:06,867 : INFO : min_count=2 leaves 8 word corpus (53% of original 15, drops 7)
2017-12-19 16:36:06,868 : INFO : deleting the raw counts dictionary of 11 items
2017-12-19 16:36:06,870 : INFO : sample=0.001 downsamples 4 most-common words
2017-12-19 16:36:06,871 : INFO : downsampling leaves estimated 0 word corpus (6.7% of prior 8)
2017-12-19 16:36:06,872 : INFO : estimated required memory for 4 words and 40 dimensions: 3760 bytes
2017-12-19 16:36:06,873 : INFO : resetting layer weights


Corpus:  [TaggedDocument(words=['I', 'hate', 'debugging'], tags=[0]), TaggedDocument(words=['everyone', 'hate', 'debugging', 'too', 'no', 'joke'], tags=[1]), TaggedDocument(words=['But', 'NLP', 'is', 'FUN', 'no', 'joke'], tags=[2])]
Vocabulary: 
{'debugging': <gensim.models.keyedvectors.Vocab object at 0x7f64c82ebe50>, 'joke': <gensim.models.keyedvectors.Vocab object at 0x7f648807f510>, 'hate': <gensim.models.keyedvectors.Vocab object at 0x7f648807f450>, 'no': <gensim.models.keyedvectors.Vocab object at 0x7f648807f4d0>}


In [16]:
# Test infer vector on a new doc
test_model.infer_vector(['Only', 'I', 'can', 'dealt', 'with', 'debugging', 'no', 'joke'])

array([ 0.0074688 , -0.00860952,  0.00651344,  0.0067819 , -0.00940804,
       -0.00573374,  0.00446753,  0.01036623,  0.01099162, -0.00230265,
        0.01001389,  0.00422939, -0.00035249,  0.00650207,  0.00496055,
        0.01139371, -0.00698338, -0.00481543,  0.00873296,  0.00184248,
       -0.01131732, -0.00826996,  0.00937591, -0.0049616 , -0.00500902,
        0.01158319,  0.00515296, -0.00621486, -0.00086252, -0.00193153,
        0.00424253, -0.00150474, -0.00628895, -0.00585909, -0.01022338,
       -0.00081373,  0.00304008,  0.0053918 ,  0.00058929,  0.01156284], dtype=float32)