In [2]:
from __future__ import absolute_import, division, print_function

In [3]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [4]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
# Timestamped log lines at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [7]:
# Fetch the NLTK "punkt" and "stopwords" packages. The punkt data backs the
# 'tokenizers/punkt/english.pickle' tokenizer loaded later in this notebook;
# "stopwords" is not referenced in the visible cells.
# The value of the last call is what the cell displays.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/jalaj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jalaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Directory holding the book text files. Set the GOT_DATA_DIR environment
# variable to point at your own copy; the default keeps the original location
# so existing runs behave exactly as before.
data_dir = os.environ.get(
    "GOT_DATA_DIR",
    "/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data"
)
# Sorted so the books are always read in the same order.
book_filenames = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

In [9]:
# Show which book files the glob picked up (the list is the cell's output).
print("Found books:")
book_filenames

Found books:


['/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got1.txt',
 '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got2.txt',
 '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got3.txt',
 '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got4.txt',
 '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got5.txt']

In [10]:
# Read each book as UTF-8, in order, reporting the running corpus size, then
# stitch all texts into a single unicode string.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    book_texts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(book_texts)

Reading '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got1.txt'...
Corpus is now 1770659 characters long

Reading '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got2.txt'...
Corpus is now 4071041 characters long

Reading '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got3.txt'...
Corpus is now 6391405 characters long

Reading '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got4.txt'...
Corpus is now 8107945 characters long

Reading '/home/jalaj/PycharmProjects/NLPython/NLPython/ch6/gameofthrones2vec/data/got5.txt'...
Corpus is now 9719485 characters long



In [11]:
# Load the English punkt tokenizer pickle from NLTK's data files.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
# Run the tokenizer over the whole corpus; each resulting item is treated as
# one raw sentence by the cells that follow.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [13]:
# Convert a raw sentence into a list of words:
# remove unnecessary characters, split into words, no hyphens.
# Returns a list of words.
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic words.

    Every character outside a-z / A-Z acts as a separator, so digits,
    punctuation and hyphens never appear in the returned tokens. Case is
    preserved. An input with no letters yields an empty list.
    """
    # Each maximal run of ASCII letters is one word.
    return re.findall("[a-zA-Z]+", raw)

In [14]:
# One word list per non-empty raw sentence.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [15]:
# Sanity check: show one raw sentence next to the word list produced from it.
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

Heraldic crest by Virginia Norey.
[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']


In [16]:
# Total number of word tokens across all tokenized sentences.
token_count = 0
for words in sentences:
    token_count += len(words)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


In [17]:
# Step 3 - build the model.
# Once the words are represented as vectors, the vectors help with
# three main tasks:
# DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
# More dimensions are more computationally expensive to train.
# (Original author's note: more dimensions are also more accurate and
# more generalized - treat this as a rule of thumb, not a guarantee.)
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
# More workers means faster training.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
# NOTE(review): the original comment called 0 - 1e-5 a good range, yet the
# value below is 1e-3 - confirm which one is intended.
downsampling = 1e-3

# Seed for the random number generator, intended to make the results
# reproducible (deterministic, good for debugging).
# NOTE(review): with more than one worker thread, runs may presumably still
# differ between executions - confirm against the gensim documentation.
seed = 1

In [18]:
# Untrained Word2Vec model (sg=1) wired to the hyperparameters chosen earlier
# in the notebook; no sentences are passed in at construction time.
thrones2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,
    workers=num_workers,
    seed=seed
)

In [19]:
# Build the model's vocabulary from the tokenized sentences.
thrones2vec.build_vocab(sentences)

2017-05-22 12:51:29,328 : INFO : collecting all words and their counts
2017-05-22 12:51:29,330 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-22 12:51:29,376 : INFO : PROGRESS: at sentence #10000, processed 140984 words, keeping 10280 word types
2017-05-22 12:51:29,452 : INFO : PROGRESS: at sentence #20000, processed 279730 words, keeping 13558 word types
2017-05-22 12:51:29,494 : INFO : PROGRESS: at sentence #30000, processed 420336 words, keeping 16598 word types
2017-05-22 12:51:29,540 : INFO : PROGRESS: at sentence #40000, processed 556581 words, keeping 18324 word types
2017-05-22 12:51:29,591 : INFO : PROGRESS: at sentence #50000, processed 686247 words, keeping 19714 word types
2017-05-22 12:51:29,642 : INFO : PROGRESS: at sentence #60000, processed 828497 words, keeping 21672 word types
2017-05-22 12:51:29,694 : INFO : PROGRESS: at sentence #70000, processed 973830 words, keeping 23093 word types
2017-05-22 12:51:29,752 : INFO : PROGRESS: at 

In [20]:
# Report how many entries the model's vocabulary holds.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

Word2Vec vocabulary length: 17277


In [21]:
# Train the model on the tokenized sentences; the call's return value is
# displayed as the cell output.
# NOTE(review): newer gensim releases are understood to require explicit
# `total_examples` and `epochs` arguments here - confirm against the
# installed version before re-running.
thrones2vec.train(sentences)

2017-05-22 12:51:39,496 : INFO : training model with 4 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-05-22 12:51:39,499 : INFO : expecting 128868 sentences, matching count from corpus used for vocabulary survey
2017-05-22 12:51:40,780 : INFO : PROGRESS: at 1.85% examples, 111834 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:41,788 : INFO : PROGRESS: at 3.73% examples, 120112 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:42,812 : INFO : PROGRESS: at 5.53% examples, 120326 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:43,861 : INFO : PROGRESS: at 7.44% examples, 119793 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:44,899 : INFO : PROGRESS: at 9.10% examples, 118149 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:45,933 : INFO : PROGRESS: at 10.49% examples, 114781 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:47,059 : INFO : PROGRESS: at 11.89% examples, 110930 words/s, in_qsize 8, out_qsize 0
2017-05-22 12:51:48

7022378

In [22]:
# Create the output directory for the saved model if it is not there yet.
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [23]:
# Persist the trained model to trained/thrones2vec.w2v.
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

2017-05-22 12:53:18,291 : INFO : saving Word2Vec object under trained/thrones2vec.w2v, separately None
2017-05-22 12:53:18,293 : INFO : not storing attribute syn0norm
2017-05-22 12:53:18,296 : INFO : not storing attribute cum_table
2017-05-22 12:53:18,922 : INFO : saved trained/thrones2vec.w2v


In [24]:
# Reload the model from the file saved earlier, rebinding `thrones2vec` to the
# loaded instance.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

2017-05-22 12:53:22,785 : INFO : loading Word2Vec object from trained/thrones2vec.w2v
2017-05-22 12:53:23,020 : INFO : loading wv recursively from trained/thrones2vec.w2v.wv.* with mmap=None
2017-05-22 12:53:23,021 : INFO : setting ignored attribute syn0norm to None
2017-05-22 12:53:23,022 : INFO : setting ignored attribute cum_table to None
2017-05-22 12:53:23,023 : INFO : loaded trained/thrones2vec.w2v


In [25]:
# t-SNE reducer producing two output components, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [26]:
# Word-vector matrix taken from the model's keyed vectors.
# Presumably one row per vocabulary word - confirm against the gensim version in use.
all_word_vectors_matrix = thrones2vec.wv.syn0

In [None]:
# Project the word vectors down to two dimensions with the t-SNE reducer.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was presumably never run to completion here - expect it to be slow.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
# Table of every vocabulary word with its 2-D t-SNE coordinates.
# The vocabulary is read through `thrones2vec.wv`, matching how the rest of
# the notebook accesses it (e.g. `len(thrones2vec.wv.vocab)`); the bare
# `thrones2vec.vocab` attribute is not available once the vocabulary lives on
# the keyed vectors.
word_vocab = thrones2vec.wv.vocab
point_rows = []
for word in word_vocab:
    # Each vocab entry's `index` selects that word's row in the 2-D matrix.
    coords = all_word_vectors_matrix_2d[word_vocab[word].index]
    point_rows.append((word, coords[0], coords[1]))
points = pd.DataFrame(point_rows, columns=["word", "x", "y"])

In [None]:
# Preview the first ten rows of the word/coordinate table.
points.head(10)

In [None]:
# Switch seaborn to its "poster" plotting context for the figures below.
sns.set_context("poster")

In [None]:
# Scatter plot of every word at its 2-D coordinates.
points.plot.scatter("x", "y", s=10, figsize=(20, 12))

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words that fall inside a rectangular region.

    x_bounds and y_bounds are indexable (low, high) pairs; both ends are
    inclusive. The words come from the module-level `points` frame, which
    must have the columns "word", "x" and "y".
    """
    in_x_range = (x_bounds[0] <= points.x) & (points.x <= x_bounds[1])
    in_y_range = (y_bounds[0] <= points.y) & (points.y <= y_bounds[1])
    region = points[in_x_range & in_y_range]

    axes = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Nudge each label slightly up and to the right of its marker.
    for _, row in region.iterrows():
        axes.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

In [None]:
# Zoom into the region 4.0 <= x <= 4.2, -0.5 <= y <= -0.1.
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))

In [None]:
# Zoom into the region 0 <= x <= 1, 4 <= y <= 4.5.
plot_region(x_bounds=(0, 1), y_bounds=(4, 4.5))

In [27]:
# Words the trained model ranks as most similar to "Stark", with their scores.
thrones2vec.wv.most_similar("Stark")

2017-05-22 12:53:41,884 : INFO : precomputing L2-norms of word weight vectors


[(u'Eddard', 0.7480276226997375),
 (u'Winterfell', 0.6750659346580505),
 (u'direwolf', 0.6425904035568237),
 (u'Hornwood', 0.6366876363754272),
 (u'Lyanna', 0.6365906000137329),
 (u'beheaded', 0.6254189014434814),
 (u'Karstark', 0.6238248348236084),
 (u'executed', 0.6236813068389893),
 (u'Brandon', 0.6221044659614563),
 (u'Robb', 0.620850682258606)]

In [28]:
# Words the trained model ranks as most similar to "Aerys".
# Queried through `.wv`, the same way as the "Stark" lookup, rather than via
# the model-level shortcut.
thrones2vec.wv.most_similar("Aerys")

[(u'Jaehaerys', 0.7663156390190125),
 (u'Mad', 0.7609344124794006),
 (u'Daeron', 0.7542939782142639),
 (u'reign', 0.7378734350204468),
 (u'Cruel', 0.7255479097366333),
 (u'Unworthy', 0.722900927066803),
 (u'Conquest', 0.7144717574119568),
 (u'Since', 0.7070102691650391),
 (u'Rhaegar', 0.706188976764679),
 (u'II', 0.7050039172172546)]

In [29]:
# Words the trained model ranks as most similar to "direwolf".
# Queried through `.wv`, the same way as the "Stark" lookup, rather than via
# the model-level shortcut.
thrones2vec.wv.most_similar("direwolf")

[(u'wolf', 0.6853885054588318),
 (u'SHAGGYDOG', 0.6638862490653992),
 (u'Rickon', 0.651175856590271),
 (u'Stark', 0.6425904035568237),
 (u'Ghost', 0.6304522752761841),
 (u'pup', 0.6185530424118042),
 (u'Robb', 0.6010775566101074),
 (u'GHOST', 0.5987852215766907),
 (u'eagle', 0.5979127883911133),
 (u'RICKON', 0.5888221263885498)]

In [30]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the module-level `thrones2vec` model with `end2` and `start1` as
    positive words and `end1` as the negative word, prints the resulting
    sentence and returns the top-ranked word.

    Args:
        start1: first word of the known pair.
        end1: second word of the known pair.
        end2: second word of the pair to complete.

    Returns:
        The word ranked first by the cosmul similarity query.

    NOTE(review): gensim is expected to raise KeyError when a word is missing
    from the vocabulary - confirm for the installed version.
    """
    # Go through `.wv`, matching how the other similarity queries in this
    # notebook access the keyed vectors.
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    # Results are (word, score) pairs; take the word of the first one.
    start2 = similarities[0][0]
    # Explicit keywords instead of **locals(), so the message only depends on
    # the four values it actually names.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2
    ))
    return start2

In [31]:
# Three example analogies; each call prints its sentence, and the return
# value of the last call is displayed as the cell output.
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")

Stark is related to Winterfell, as Tully is related to Riverrun
Jaime is related to sword, as drank is related to wine
Arya is related to Nymeria, as Dany is related to dragons


u'Dany'