In [4]:
import numpy as np
import pandas as pd
import re

In [6]:
# read patent high level mapping and claims
claims = pd.read_csv("../data/claims_test.csv", encoding = 'utf8')
#patent_spec_map = pd.read_csv("../data/patent_spec_map.csv", encoding='utf8')

In [None]:
print("claim size: ", len(set(claims["PATENT_ID"])))
print("patent size: ", len(set(patent_spec_map["PATENT_ID"])))

In [5]:
from gensim.utils import tokenize
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing

Using TensorFlow backend.


In [6]:
>>> import logging
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
lemma = WordNetLemmatizer()

process data into a document corpus and clean

In [8]:
def sentence_process(sent):
    words = str(sent).split()

    # 1. alphanumeric only
    alphanum_only = [re.sub("[^a-zA-Z0-9]", "", w) for w in words]

    # 2. remove stop words
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in alphanum_only if not w in stops]

    # 3. lemmatization
    words_lemma = [lemma.lemmatize(w) for w in meaningful_words]

    return words_lemma 

In [9]:
patent_corpus = {}
for i, v in claims.iterrows():
    pid = v["PATENT_ID"]
    text = sentence_process(v["CLAIM_TEXT"])
    if(pid not in patent_corpus):
        patent_corpus[pid] = text
    else:
        patent_corpus[pid].extend(text)


In [10]:
del claims
#del patent_spec_map

Define Document Embedding class

In [11]:
class TaggedPatentDocument:
    def __init__(self, patent):
        self.patent = patent
    def __iter__(self):
        # try only 10 documents
        for pid, content in self.patent.items():
            yield TaggedDocument(content, [pid])       

In [12]:
patent_docs = TaggedPatentDocument(patent_corpus)

Scaling min_cout size for vocabulary size

In [19]:
pre = Doc2Vec(min_count=0)
pre.scan_vocab(patent_docs)

2017-08-28 14:19:05,198 : INFO : collecting all words and their counts
2017-08-28 14:19:05,200 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-08-28 14:19:05,203 : INFO : collected 444 word types and 86897599 unique tags from a corpus of 9 examples and 2838 words


In [20]:
for num in range(0, 20):
    print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)

2017-08-28 14:19:08,936 : INFO : Loading a fresh vocabulary
2017-08-28 14:19:08,939 : INFO : min_count=0 retains 444 unique words (100% of original 444, drops 0)
2017-08-28 14:19:08,941 : INFO : min_count=0 leaves 2838 word corpus (100% of original 2838, drops 0)
2017-08-28 14:19:08,952 : INFO : sample=0.001 downsamples 93 most-common words
2017-08-28 14:19:08,954 : INFO : downsampling leaves estimated 1790 word corpus (63.1% of prior 2838)
2017-08-28 14:19:08,956 : INFO : estimated required memory for 444 words and 100 dimensions: 34759616800 bytes
2017-08-28 14:19:08,958 : INFO : Loading a fresh vocabulary
2017-08-28 14:19:08,961 : INFO : min_count=1 retains 444 unique words (100% of original 444, drops 0)
2017-08-28 14:19:08,963 : INFO : min_count=1 leaves 2838 word corpus (100% of original 2838, drops 0)
2017-08-28 14:19:08,967 : INFO : sample=0.001 downsamples 93 most-common words
2017-08-28 14:19:08,969 : INFO : downsampling leaves estimated 1790 word corpus (63.1% of prior 2838)

min_count: 0, size of vocab:  317.14285714285717
min_count: 1, size of vocab:  317.14285714285717
min_count: 2, size of vocab:  203.57142857142858
min_count: 3, size of vocab:  147.85714285714286
min_count: 4, size of vocab:  122.85714285714286
min_count: 5, size of vocab:  99.28571428571429
min_count: 6, size of vocab:  89.28571428571429
min_count: 7, size of vocab:  77.85714285714286
min_count: 8, size of vocab:  66.42857142857143
min_count: 9, size of vocab:  61.42857142857143
min_count: 10, size of vocab:  57.857142857142854
min_count: 11, size of vocab:  52.857142857142854
min_count: 12, size of vocab:  48.57142857142857


2017-08-28 14:19:09,159 : INFO : downsampling leaves estimated 534 word corpus (31.0% of prior 1723)
2017-08-28 14:19:09,164 : INFO : estimated required memory for 64 words and 100 dimensions: 34759122800 bytes
2017-08-28 14:19:09,166 : INFO : Loading a fresh vocabulary
2017-08-28 14:19:09,169 : INFO : min_count=14 retains 62 unique words (13% of original 444, drops 382)
2017-08-28 14:19:09,172 : INFO : min_count=14 leaves 1697 word corpus (59% of original 2838, drops 1141)
2017-08-28 14:19:09,177 : INFO : sample=0.001 downsamples 62 most-common words
2017-08-28 14:19:09,180 : INFO : downsampling leaves estimated 516 word corpus (30.4% of prior 1697)
2017-08-28 14:19:09,182 : INFO : estimated required memory for 62 words and 100 dimensions: 34759120200 bytes
2017-08-28 14:19:09,184 : INFO : Loading a fresh vocabulary
2017-08-28 14:19:09,195 : INFO : min_count=15 retains 56 unique words (12% of original 444, drops 388)
2017-08-28 14:19:09,199 : INFO : min_count=15 leaves 1613 word corpu

min_count: 13, size of vocab:  45.714285714285715
min_count: 14, size of vocab:  44.285714285714285
min_count: 15, size of vocab:  40.0
min_count: 16, size of vocab:  36.42857142857143
min_count: 17, size of vocab:  33.57142857142857
min_count: 18, size of vocab:  30.714285714285715
min_count: 19, size of vocab:  27.857142857142858


#Training the Doc2vec Model
##PV-DBOW: paragraph vector - Distributed Bag of Words
##PV-DM: paragraph vector - Distributed Memory

In [13]:
cores = multiprocessing.cpu_count()

# try both PV-DBOW and PV-DM model

models = [
    # PV-DBOW 
    Doc2Vec(dm=0, dbow_words=1, size=300, window=10, min_count=10, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=300, window=10, min_count=10, workers=cores),
]

#model = Doc2Vec(alpha=0.025, min_alpha=0.025)

In [14]:
#model.build_vocab(patent_docs)
models[0].build_vocab(patent_docs)
print(str(models[0]))
models[1].reset_from(models[0])
print(str(models[1]))

2017-08-28 14:26:56,477 : INFO : collecting all words and their counts
2017-08-28 14:26:56,479 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-08-28 14:26:56,482 : INFO : collected 444 word types and 86897599 unique tags from a corpus of 9 examples and 2838 words
2017-08-28 14:26:56,484 : INFO : Loading a fresh vocabulary
2017-08-28 14:26:56,488 : INFO : min_count=10 retains 81 unique words (18% of original 444, drops 363)
2017-08-28 14:26:56,490 : INFO : min_count=10 leaves 1907 word corpus (67% of original 2838, drops 931)
2017-08-28 14:26:56,493 : INFO : deleting the raw counts dictionary of 444 items
2017-08-28 14:26:56,497 : INFO : sample=0.001 downsamples 81 most-common words
2017-08-28 14:26:56,499 : INFO : downsampling leaves estimated 677 word corpus (35.5% of prior 1907)
2017-08-28 14:26:56,501 : INFO : estimated required memory for 81 words and 300 dimensions: 104277353700 bytes
2017-08-28 14:26:56,504 : INFO : resetting layer weights


MemoryError: 

In [None]:
%debug

> [1;32mc:\users\jasoliu\appdata\local\continuum\anaconda3\envs\tensorflow\lib\site-packages\gensim\models\doc2vec.py[0m(390)[0;36mreset_weights[1;34m()[0m
[1;32m    388 [1;33m            [0mself[0m[1;33m.[0m[0mdoctag_syn0_lockf[0m[1;33m.[0m[0mfill[0m[1;33m([0m[1;36m1.0[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m    389 [1;33m        [1;32melse[0m[1;33m:[0m[1;33m[0m[0m
[0m[1;32m--> 390 [1;33m            [0mself[0m[1;33m.[0m[0mdoctag_syn0[0m [1;33m=[0m [0mempty[0m[1;33m([0m[1;33m([0m[0mlength[0m[1;33m,[0m [0mmodel[0m[1;33m.[0m[0mvector_size[0m[1;33m)[0m[1;33m,[0m [0mdtype[0m[1;33m=[0m[0mREAL[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m    391 [1;33m            [0mself[0m[1;33m.[0m[0mdoctag_syn0_lockf[0m [1;33m=[0m [0mones[0m[1;33m([0m[1;33m([0m[0mlength[0m[1;33m,[0m[1;33m)[0m[1;33m,[0m [0mdtype[0m[1;33m=[0m[0mREAL[0m[1;33m)[0m  [1;31m# zeros suppress learning[0m[1;33m[0m[0m
[0m[1;32m    39

In [None]:
for model in models:
    %%time model.train(patent_docs, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
for epoch in range(10):
    model.train(patent_docs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    #%%time model.train(patent_docs, total_examples=model.corpus_count, epochs=model.iter)

2017-08-24 16:03:54,103 : INFO : training model with 4 workers on 130707 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=8
Exception in thread Thread-7:
Traceback (most recent call last):
  File "c:\users\jasoliu\appdata\local\continuum\anaconda3\envs\tensorflow\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "c:\users\jasoliu\appdata\local\continuum\anaconda3\envs\tensorflow\lib\threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "c:\users\jasoliu\appdata\local\continuum\anaconda3\envs\tensorflow\lib\site-packages\gensim\models\word2vec.py", line 837, in worker_loop
    tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
  File "c:\users\jasoliu\appdata\local\continuum\anaconda3\envs\tensorflow\lib\site-packages\gensim\models\doc2vec.py", line 714, in _do_train_job
    indexed_doctags = self.docvecs.indexed_doctags(doc.tags)
  File "c:\users\jasoliu\appdata\local\continuum\anaconda3\e

In [None]:
model.save('D2V')

Word2Vect

In [14]:
class patentSentence:
    def __init__(self, patent_doc):
        self.patent_doc = patent_doc
    def __iter__(self):
        for pid, claims in self.patent_doc.items():
            yield claims

In [18]:
sentences = patentSentence(patent_corpus)

In [23]:
import gensim
word2vect_model = gensim.models.Word2Vec(sentences, min_count = 0)

2017-08-25 13:47:45,784 : INFO : collecting all words and their counts
2017-08-25 13:47:45,790 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-25 13:47:55,228 : INFO : PROGRESS: at sentence #10000, processed 11311126 words, keeping 96531 word types
2017-08-25 13:48:00,821 : INFO : PROGRESS: at sentence #20000, processed 22423771 words, keeping 148596 word types
2017-08-25 13:48:05,039 : INFO : PROGRESS: at sentence #30000, processed 33935598 words, keeping 190634 word types
2017-08-25 13:48:09,175 : INFO : PROGRESS: at sentence #40000, processed 45887431 words, keeping 234732 word types
2017-08-25 13:48:12,427 : INFO : PROGRESS: at sentence #50000, processed 56519952 words, keeping 271546 word types
2017-08-25 13:48:14,900 : INFO : collected 294968 word types from a corpus of 64957171 raw words and 58860 sentences
2017-08-25 13:48:14,900 : INFO : Loading a fresh vocabulary
2017-08-25 13:48:21,587 : INFO : min_count=0 retains 294968 unique words (100% 

2017-08-25 13:49:36,133 : INFO : PROGRESS: at 17.30% examples, 717875 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:49:37,138 : INFO : PROGRESS: at 17.59% examples, 718161 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:38,157 : INFO : PROGRESS: at 17.91% examples, 718664 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:39,173 : INFO : PROGRESS: at 18.22% examples, 718834 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:49:40,163 : INFO : PROGRESS: at 18.55% examples, 719232 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:41,166 : INFO : PROGRESS: at 18.88% examples, 719904 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:42,174 : INFO : PROGRESS: at 19.23% examples, 720651 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:43,174 : INFO : PROGRESS: at 19.59% examples, 720838 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:44,178 : INFO : PROGRESS: at 19.90% examples, 721015 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:49:45,182 : INFO : PROGRESS: at 20.21% examples, 721075 wor

2017-08-25 13:50:56,713 : INFO : PROGRESS: at 40.27% examples, 734350 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:50:57,733 : INFO : PROGRESS: at 40.56% examples, 734519 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:50:58,751 : INFO : PROGRESS: at 40.80% examples, 734556 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:50:59,755 : INFO : PROGRESS: at 41.08% examples, 734886 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:00,787 : INFO : PROGRESS: at 41.34% examples, 735070 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:01,792 : INFO : PROGRESS: at 41.62% examples, 735296 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:02,796 : INFO : PROGRESS: at 41.89% examples, 735571 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:03,815 : INFO : PROGRESS: at 42.17% examples, 735715 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:04,819 : INFO : PROGRESS: at 42.46% examples, 735954 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:51:05,828 : INFO : PROGRESS: at 42.75% examples, 736184 wor

2017-08-25 13:52:17,353 : INFO : PROGRESS: at 63.12% examples, 743587 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:18,373 : INFO : PROGRESS: at 63.40% examples, 743441 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:19,374 : INFO : PROGRESS: at 63.69% examples, 743549 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:52:20,381 : INFO : PROGRESS: at 63.98% examples, 743617 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:21,383 : INFO : PROGRESS: at 64.28% examples, 743705 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:52:22,383 : INFO : PROGRESS: at 64.58% examples, 743832 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:23,390 : INFO : PROGRESS: at 64.85% examples, 743930 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:24,395 : INFO : PROGRESS: at 65.12% examples, 744051 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:25,400 : INFO : PROGRESS: at 65.42% examples, 744090 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:52:26,432 : INFO : PROGRESS: at 65.66% examples, 743933 wor

2017-08-25 13:53:37,968 : INFO : PROGRESS: at 85.45% examples, 742344 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:38,972 : INFO : PROGRESS: at 85.65% examples, 741816 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:53:39,964 : INFO : PROGRESS: at 85.84% examples, 741208 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:40,983 : INFO : PROGRESS: at 86.11% examples, 740755 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:41,987 : INFO : PROGRESS: at 86.34% examples, 740321 words/s, in_qsize 6, out_qsize 0
2017-08-25 13:53:42,990 : INFO : PROGRESS: at 86.58% examples, 739963 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:43,992 : INFO : PROGRESS: at 86.82% examples, 739652 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:44,997 : INFO : PROGRESS: at 86.99% examples, 738788 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:46,002 : INFO : PROGRESS: at 87.22% examples, 738261 words/s, in_qsize 5, out_qsize 0
2017-08-25 13:53:47,005 : INFO : PROGRESS: at 87.44% examples, 737742 wor

In [24]:
word2vect_model.save("word2vecmodel")

2017-08-25 13:56:40,054 : INFO : saving Word2Vec object under word2vecmodel, separately None
2017-08-25 13:56:40,059 : INFO : not storing attribute syn0norm
2017-08-25 13:56:40,062 : INFO : storing np array 'syn0' to word2vecmodel.wv.syn0.npy
2017-08-25 13:56:40,720 : INFO : storing np array 'syn1neg' to word2vecmodel.syn1neg.npy
2017-08-25 13:56:41,272 : INFO : not storing attribute cum_table
2017-08-25 13:56:42,649 : INFO : saved word2vecmodel


In [26]:
word2vect_model.predict_output_word(['emergency', 'beacon', 'received'])

[('emergency', 0.025825158),
 ('beacon', 0.0017108296),
 ('message', 0.0013486465),
 ('IVS', 0.0013114822),
 ('PPT', 0.0011884869),
 ('missed', 0.00047500365),
 ('DTIM', 0.0004336031),
 ('microcell', 0.00033761081),
 ('signaltransmitting', 0.00028154752),
 ('alert', 0.00027658624)]