# Representing Words

In [1]:
# Get some text data (message board posts).
from sklearn.datasets import fetch_20newsgroups
raw_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,
                              remove=('headers', 'footers', 'quotes'))

In [2]:
print('read %d documents' % len(raw_data.data))
print('>>>for example:\n%s\n' % raw_data.data[0])
print('>>>for example:\n%s' % raw_data.data[1000])

read 11314 documents
>>>for example:
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

>>>for example:
Anybody seen mouse cursor distortion running the Diamond 1024x768x256 driver?
Sorry, don't know the version of the driver (no indication in the menus) but it's a recently
delivered Gateway system.  Am going to try the latest drivers from Diamond BBS but wondered
if anyone else had seen this.

post or email


In [31]:
import re
tokens = [re.findall('\w+', s.lower()) for s in raw_data.data]
# sample 5k docs
# tokens = tokens[:5000]
print('tokenized documents; for example:\n%s' % tokens[0])

tokenized documents; for example:
['i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', '2', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', 'early', '70s', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'e', 'mail']


## Cluster words by their context

In [4]:
from collections import Counter, defaultdict
import numpy as np

# dict from term to context vector.
contexts = defaultdict(lambda: Counter())
window = 2
for toks in tokens:
    for i, token in enumerate(toks):
        features = []
        for j in range(np.amax([0, i-window]), i):
            features.append(toks[j] + "@" + str(j-i))
        for j in range(i+1, min(i + window, len(toks))):
            features.append(toks[j] + "@" + str(j-i))
        contexts[token].update(features)

In [5]:
contexts = dict((k,v) for k, v in contexts.items() if sum(v.values()) > 10)

In [6]:
print(contexts['i'].most_common(10))

[('have@1', 1133), ('m@1', 1078), ('am@1', 809), ('the@-2', 730), ('think@1', 713), ('don@1', 700), ('and@-1', 686), ('but@-1', 647), ('ve@1', 591), ('can@1', 583)]


In [7]:
print(contexts['car'].most_common(10))

[('the@-1', 87), ('a@-1', 43), ('my@-1', 31), ('a@-2', 28), ('of@-2', 28), ('and@1', 25), ('i@1', 22), ('in@-2', 17), ('is@1', 14), ('this@-1', 14)]


In [8]:
print(contexts['gun'].most_common(10))

[('control@1', 87), ('a@-1', 49), ('the@-1', 39), ('the@-2', 29), ('anti@-1', 23), ('of@-1', 22), ('of@-2', 21), ('a@-2', 20), ('on@-1', 12), ('pro@-1', 11)]


In [9]:
# Compute the number of different contexts each term appears in.
doc_freq = Counter()
for context in contexts.values():
    for term in context:
        doc_freq[term] += 1.
doc_freq.most_common(5)

[('the@-1', 6502.0),
 ('and@1', 5893.0),
 ('the@-2', 5837.0),
 ('of@-2', 4819.0),
 ('the@1', 4634.0)]

In [10]:
len(contexts)

16920

In [11]:
to_remove = set([t for t, v in doc_freq.items() if v < 10])

In [12]:
for w, context in contexts.items():
    d = dict()
    for k, v in context.items():
        if k not in to_remove:
            d[k] = v
    contexts[w] = Counter(d)

In [13]:
# Transform each context vector to be term freq / tweet frequency. 
# Also then normalize by length.
import math
for term, context in contexts.items():
    for term2, frequency in context.items():
        context[term2] = frequency / (1. + math.log(doc_freq[term2]))
    length = math.sqrt(sum([v*v for v in context.values()]))
    for term2, frequency in context.items():
        context[term2] = 1. * frequency / length

contexts['i'].most_common(5)

[('have@1', 0.33676022127839716),
 ('ve@1', 0.33081880741285874),
 ('am@1', 0.3245826170925442),
 ('m@1', 0.31135022907131166),
 ('think@1', 0.2871391254258928)]

In [14]:
contexts['gun'].most_common(10)

[('control@1', 0.7649046244521146),
 ('a@-1', 0.2748044135561928),
 ('anti@-1', 0.24620735646234973),
 ('the@-1', 0.2070426831444815),
 ('the@-2', 0.15567221355940442),
 ('owners@1', 0.13182927982147039),
 ('of@-1', 0.12396703634205102),
 ('of@-2', 0.11500702355700627),
 ('pro@-1', 0.11447448925177982),
 ('ownership@1', 0.11421855663134342)]

In [15]:
# Make a sparse matrix.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X = vec.fit_transform(contexts.values())
features = np.array(vec.get_feature_names())

In [16]:
# X is now a sparse matrix where each row is a term and each column is a context feature

In [17]:
X.shape

(16920, 19786)

In [18]:
terms = contexts.keys()
term2id = dict((t, i) for i, t in enumerate(terms))
X[term2id['gun']]

<1x19786 sparse matrix of type '<class 'numpy.float64'>'
	with 427 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.preprocessing import scale
# SVD of X
# Too big!
# U, s, Vh = np.linalg.svd(X.toarray(), full_matrices=False)

In [None]:
help(np.linalg.svd)

## word2vec

In [20]:
from gensim.models import word2vec
# Tutorial: http://rare-technologies.com/deep-learning-with-word2vec-and-gensim/
help(word2vec)

Help on module gensim.models.word2vec in gensim.models:

NAME
    gensim.models.word2vec

DESCRIPTION
    Deep learning via word2vec's "skip-gram and CBOW models", using either
    hierarchical softmax or negative sampling [1]_ [2]_.
    
    The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/
    and extended with additional functionality.
    
    For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/
    
    **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training**
    (70x speedup compared to plain NumPy implementation [3]_).
    
    Initialize a model with e.g.::
    
    >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    
    Persist a model to disk with::
    
    >>> model.save(fname)
    >>> model = Word2Vec.load(fname)  # you can continue training with the

In [56]:
sum(len(t) for t in tokens)

2407167

In [44]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = word2vec.Word2Vec(tokens, size=100, window=5, min_count=3)

In [45]:
model.init_sims(replace=True)  # free unneeded variables and precompute normalized vectors.

In [66]:
model.most_similar(positive=['cars', 'trucks'])

[('bikes', 0.8544794321060181),
 ('spots', 0.8441545367240906),
 ('decades', 0.8351261615753174),
 ('mice', 0.8316686749458313),
 ('plants', 0.8287297487258911),
 ('seats', 0.822428286075592),
 ('masters', 0.8212233781814575),
 ('antibiotics', 0.8196142315864563),
 ('adults', 0.8149713277816772),
 ('buses', 0.812675952911377)]

In [58]:
model.most_similar(positive=['gun'])

[('crime', 0.882451593875885),
 ('military', 0.8497803211212158),
 ('criminal', 0.8445150852203369),
 ('safety', 0.7943019866943359),
 ('defense', 0.7925150394439697),
 ('community', 0.7909201979637146),
 ('armed', 0.7859578132629395),
 ('drug', 0.780666708946228),
 ('violent', 0.777873158454895),
 ('self', 0.7714388370513916)]

In [63]:
model.doesnt_match(['mouse', 'engine', 'cpu'])

'engine'

In [64]:
model.n_similarity(['chip', 'cpu'], ['software', 'algorithm'])

0.79739832314678638

In [65]:
model.n_similarity(['chip', 'cpu'], ['religion', 'belief'])

0.11561933793959725

In [62]:
model.most_similar(positive=['cars', 'guns', 'prices'], negative=['car', 'gun'])

[('items', 0.6851003170013428),
 ('vendors', 0.667102038860321),
 ('inanimate', 0.6526215672492981),
 ('prophesied', 0.6507330536842346),
 ('models', 0.6335091590881348),
 ('solicited', 0.625587522983551),
 ('places', 0.6187167167663574),
 ('sizes', 0.6169173717498779),
 ('assemblers', 0.6047966480255127),
 ('tapes', 0.6001349091529846)]