In [12]:
import pandas as pd
train = pd.read_csv("shortArticles.tsv", header=0, delimiter="\t", quoting=3, encoding="utf8")

In [13]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def article_to_wordlist(article, remove_stopwords=False):
    article_text = BeautifulSoup(article).get_text()
    article_text = re.sub("[^a-zA-Z]", " ", article_text)
    words = article_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

In [14]:
import nltk.data

In [15]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def article_to_sentences(article, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(article.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(article_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [16]:
sentences = []
for article in train["article"]:
    sentences += article_to_sentences(article, tokenizer)

In [17]:
print len(sentences)

14


In [18]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO )
num_features = 300
min_word_count = 2
num_workers = 4
context = 10
downsampling = 1e-3
from gensim.models import word2vec

2016-11-18 15:15:37,146 : INFO : 'pattern' package not found; tag filters are not available for English


In [19]:
model = word2vec.Word2Vec(sentences, workers=num_workers, size = num_features, min_count = min_word_count, \
                         window = context, sample = downsampling)
model.init_sims(replace=True)

2016-11-18 15:16:03,897 : INFO : collecting all words and their counts
2016-11-18 15:16:03,899 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2016-11-18 15:16:03,902 : INFO : collected 189 word types from a corpus of 365 raw words and 14 sentences
2016-11-18 15:16:03,903 : INFO : Loading a fresh vocabulary
2016-11-18 15:16:03,906 : INFO : min_count=2 retains 61 unique words (32% of original 189, drops 128)
2016-11-18 15:16:03,908 : INFO : min_count=2 leaves 237 word corpus (64% of original 365, drops 128)
2016-11-18 15:16:03,909 : INFO : deleting the raw counts dictionary of 189 items
2016-11-18 15:16:03,912 : INFO : sample=0.001 downsamples 61 most-common words
2016-11-18 15:16:03,914 : INFO : downsampling leaves estimated 68 word corpus (29.1% of prior 237)
2016-11-18 15:16:03,915 : INFO : estimated required memory for 61 words and 300 dimensions: 176900 bytes
2016-11-18 15:16:03,918 : INFO : resetting layer weights
2016-11-18 15:16:03,924 : INFO : trainin

In [21]:
model.doesnt_match("president federal representative bill".split())

'federal'

In [22]:
model.doesnt_match("president federal representative history".split())

'federal'

In [23]:
print model[0]

TypeError: 'int' object is not iterable

In [24]:
print model["president"]

[ 0.0013956  -0.00210436  0.05805452 -0.02370084 -0.03050801 -0.01432053
  0.0807432   0.08043005  0.09147038  0.00712296 -0.00173326  0.01953176
 -0.04967156 -0.03017672 -0.09568081  0.02561516  0.054412   -0.01374455
  0.06454805 -0.05624231 -0.08034775  0.07308684  0.07476535  0.08578315
 -0.06941005 -0.01849161 -0.00855363 -0.0169792   0.01250665 -0.07572077
 -0.03231204 -0.03740505 -0.08180846  0.05642473 -0.07529686 -0.08524871
 -0.01019788 -0.04184644  0.07991029  0.02961547 -0.03034598  0.05348257
  0.05229228 -0.0155925   0.08380798 -0.01205648  0.04505677 -0.05106147
  0.00501827 -0.09186471 -0.06070689  0.04925759 -0.05451013  0.06463555
 -0.02835889 -0.09044255  0.07299845  0.01782253 -0.01032326 -0.03468812
 -0.03869071 -0.06799056 -0.05330624  0.07641754  0.01393117 -0.03008528
  0.09190113 -0.04422194 -0.00120128 -0.09625824  0.03224762 -0.06899031
 -0.0266252  -0.05264938  0.00483153  0.04320577  0.02717433  0.04078924
 -0.03600953  0.02427658  0.09025032 -0.05035673 -0

In [25]:
model.vocab

{u'a': <gensim.models.word2vec.Vocab at 0x10e8d1ed0>,
 u'accountability': <gensim.models.word2vec.Vocab at 0x10e8d1910>,
 u'act': <gensim.models.word2vec.Vocab at 0x10e8d1690>,
 u'american': <gensim.models.word2vec.Vocab at 0x10e8d1d50>,
 u'an': <gensim.models.word2vec.Vocab at 0x10e8d1c10>,
 u'and': <gensim.models.word2vec.Vocab at 0x10e8d1b90>,
 u'are': <gensim.models.word2vec.Vocab at 0x10e8d13d0>,
 u'assets': <gensim.models.word2vec.Vocab at 0x10e8d1f10>,
 u'be': <gensim.models.word2vec.Vocab at 0x10e8d1450>,
 u'blind': <gensim.models.word2vec.Vocab at 0x10e8d1710>,
 u'bush': <gensim.models.word2vec.Vocab at 0x10e58a210>,
 u'business': <gensim.models.word2vec.Vocab at 0x10e8d1490>,
 u'c': <gensim.models.word2vec.Vocab at 0x10e8d1550>,
 u'can': <gensim.models.word2vec.Vocab at 0x10e8d1b50>,
 u'clark': <gensim.models.word2vec.Vocab at 0x10e8d1790>,
 u'conflict': <gensim.models.word2vec.Vocab at 0x10e8d1e50>,
 u'conflicts': <gensim.models.word2vec.Vocab at 0x10e8d14d0>,
 u'd': <gensim

In [27]:
import numpy as np

In [30]:
model['']

KeyError: 'love'