# Game of Thrones word vectors
Based on [video tutorial by Siraj Raval](https://www.youtube.com/watch?v=pY9EwZ02sXU&feature=em-lss)

Goal: Create word vectors from a Game of Thrones dataset and analyse them to explore semantic similarity


In [1]:
# Step 0: import necessary libraries

# future will create a bridge between Python 2.7 and 3.5 syntax
from __future__ import absolute_import, division, print_function
#for word encoding
import codecs
#finding files by pathname pattern (used to list the book .txt files)
import glob
#concurrency / multi-processing (used here only to get the CPU count)
import multiprocessing
#dealing with the operating system, like building paths and creating directories
import os
#pretty printing, human readable
import pprint
#regular expression
import re
#natural language toolkit
import nltk
#word 2 vec
import gensim.models.word2vec as w2v
#dimensionality reduction, video to see: visualise a dataset easily
import sklearn.manifold
#math
import numpy as np
#plotting
import matplotlib.pyplot as plt
#data frames for tabular data
import pandas as pd
#visualisation
import seaborn as sns



In [None]:
%pylab inline

# Step 1: Process data

In [2]:
# Fetch the NLTK resources used by this notebook (see http://www.nltk.org/).
# "punkt" is the pretrained tokenizer used to split the corpus into sentences;
# "stopwords" is a list of very common words such as "the", "a", "an", "of".
# NOTE(review): no stopword filtering is visible in this notebook — confirm the second download is still needed.
nltk.download("punkt") # pretrained tokenizer
nltk.download("stopwords") # words like and, the, an, a, of

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\t902587\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\t902587\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Collect every .txt file in the data directory, in sorted order
book_filenames = glob.glob("data/*.txt")
book_filenames.sort()


In [7]:
# Show which book files were found; the bare name on the last line is
# rendered as the cell's output.
print("Found books:")
book_filenames

Found books:


['data\\got1.txt',
 'data\\got2.txt',
 'data\\got3.txt',
 'data\\got4.txt',
 'data\\got5.txt']

## start creating the corpus

In [8]:
# Build the whole corpus as a single unicode string held in memory.
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    # Decode each book as UTF-8 while reading it
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    # Append this book and report the running size of the corpus
    corpus_raw = corpus_raw + book_text
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'data\got1.txt'...
Corpus is now 1770659 characters long

Reading 'data\got2.txt'...
Corpus is now 4071041 characters long

Reading 'data\got3.txt'...
Corpus is now 6391405 characters long

Reading 'data\got4.txt'...
Corpus is now 8107945 characters long

Reading 'data\got5.txt'...
Corpus is now 9719485 characters long



## Split the corpus into sentences

In [9]:
# Load the English punkt tokenizer from the NLTK data directory.
# NOTE(review): this path points at a pickled model; recent NLTK releases may
# ship punkt in a different format — confirm it loads with the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
# Split the full corpus into sentence strings (indexable, see raw_sentences[5] below)
raw_sentences = tokenizer.tokenize(corpus_raw)

In [11]:
def sentence_to_wordlist(raw):
    """Split one raw sentence into a list of words.

    Every character that is not an ASCII letter (digits, punctuation,
    hyphens, accented letters, ...) is treated as a separator, so the result
    contains only runs of a-z / A-Z. Case is preserved. A sentence with no
    letters yields an empty list.
    """
    # Blank out the non-letters, then let whitespace splitting do the rest
    return re.sub("[^a-zA-Z]"," ", raw).split()

In [12]:
# Turn every non-empty raw sentence into its list of words
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [13]:
# Print one example: the raw sentence first, then the word list derived from it
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

Heraldic crest by Virginia Norey.
['Heraldic', 'crest', 'by', 'Virginia', 'Norey']


In [14]:
# Count the word tokens in the corpus (each sentence is a list of words)
words_per_sentence = [len(sentence) for sentence in sentences]
token_count = sum(words_per_sentence)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


# Step 2: Train Word2Vec model

Vectors are good for 3 main tasks:
* Distance
* Similarity
* Ranking

Another option is GloVe. A vector is a type of tensor.

## define hyperparameters

In [15]:
# Dimensionality of the resulting word vectors (passed to Word2Vec as size).
# More dimensions cost more to train; the hope is a richer representation.
num_features = 300

#
# Minimum word count threshold (passed to Word2Vec as min_count).
min_word_count = 3

# Number of threads to run in parallel: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Context window length (passed to Word2Vec as window).
context_size = 7

# Downsample setting for frequent words (passed to Word2Vec as sample).
# NOTE(review): the original note mentioned values between 0 and 1e-5;
# confirm that 1e-3 is the intended setting.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# NOTE(review): with several worker threads, runs may still differ — confirm.
seed = 1

In [16]:
# Create the Word2Vec model from the hyperparameters defined above. No
# sentences are passed here; vocabulary building and training happen in
# later cells.
# sg=1 selects the skip-gram training algorithm according to the gensim docs.
# NOTE(review): `size=` is the keyword used by older gensim releases; newer
# ones reportedly call it `vector_size` — confirm against the installed version.
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [17]:
# Build the model's vocabulary from the tokenized sentences
thrones2vec.build_vocab(sentences)

In [18]:
# Report how many distinct words ended up in the vocabulary.
# NOTE(review): newer gensim releases reportedly expose the vocabulary under
# `thrones2vec.wv` rather than on the model itself — confirm.
print("Word2Vec vocabulary length:", len(thrones2vec.vocab))

Word2Vec vocabulary length: 17277


## train model, this takes some time

In [19]:
# Train the model on the tokenized sentences.
# NOTE(review): only the corpus is passed here; newer gensim releases
# reportedly also require total_examples and epochs — confirm against the
# installed version.
thrones2vec.train(sentences)

7021684

In [20]:
# Make sure the output directory for the saved model exists
if not os.path.exists("trained"):
    os.makedirs("trained")

In [21]:
# Persist the trained model to trained/thrones2vec.w2v
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

In [22]:
# Reload the model from the file written by the save cell, so the remaining
# cells work from the saved file.
# NOTE(review): gensim model files are presumably pickle-based — only load
# files from a trusted source.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

In [23]:
# Set up t-SNE to squash the word vectors down to 2 dimensions for plotting.
# random_state=0 pins the random seed of the embedding.
# https://www.oreilly.com/learning/an-illustrated-introduction-to-the-t-sne-algorithm
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [24]:
# Take the model's word vectors as one matrix.
# NOTE(review): `syn0` is presumably a (vocabulary size x num_features) array
# with one row per word; newer gensim releases expose it under `wv` — confirm.
all_word_vectors_matrix = thrones2vec.syn0

In [25]:
# Fit t-SNE on the word vectors and get their 2-D coordinates.
# NOTE(review): this is likely the slowest cell after training — confirm the
# runtime is acceptable for a full re-run.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

# Step 3: Plot the big picture

In [None]:
def _word_row(word):
    """Return (word, x, y) using the word's row in the 2-D t-SNE matrix."""
    # The vocabulary entry's .index selects the matching matrix row
    coords = all_word_vectors_matrix_2d[thrones2vec.vocab[word].index]
    return (word, coords[0], coords[1])


# One row per vocabulary word, holding its position in 2-D space
points = pd.DataFrame(
    [_word_row(word) for word in thrones2vec.vocab],
    columns=["word", "x", "y"]
)

In [None]:
# Preview the first ten rows of the word/x/y frame
points.head(10)

In [None]:
# Use seaborn's "poster" context for the plots that follow
sns.set_context("poster")

In [None]:
# Scatter-plot every word's 2-D position (small markers, wide figure)
points.plot.scatter("x", "y", s=10, figsize=(20, 12))

## Zoom in to some interesting places

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words whose 2-D position lies inside a box.

    x_bounds and y_bounds are (low, high) pairs; both ends are inclusive.
    Reads the notebook-level `points` frame (columns "word", "x", "y").
    """
    x_low, x_high = x_bounds[0], x_bounds[1]
    y_low, y_high = y_bounds[0], y_bounds[1]
    inside_x = (x_low <= points.x) & (points.x <= x_high)
    inside_y = (y_low <= points.y) & (points.y <= y_high)
    region = points[inside_x & inside_y]

    axes = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Label every dot with its word, nudged slightly up and to the right
    for _, row in region.iterrows():
        axes.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

## People related to Kingsguard end up together

In [None]:
# Bounds are in t-SNE coordinate units.
# NOTE(review): they were presumably picked by eye for one particular t-SNE
# run — confirm they still frame the intended cluster after retraining.
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))

## Food products are grouped nicely as well. Aerys (The Mad King) being close to "roasted" also looks sadly correct

In [None]:
# Bounds are in t-SNE coordinate units.
# NOTE(review): they were presumably picked by eye for one particular t-SNE
# run — confirm they still frame the intended cluster after retraining.
plot_region(x_bounds=(0, 1), y_bounds=(4, 4.5))

# Explore semantic similarities between book characters
## Check the words closest with some specific word

In [None]:
# Words the model ranks as most similar to "Stark" (lookup is case-sensitive,
# since the tokenizer above keeps the original casing)
thrones2vec.most_similar("Stark")

In [None]:
# Words the model ranks as most similar to "Aerys"
thrones2vec.most_similar("Aerys")

In [None]:
# Words the model ranks as most similar to "direwolf"
thrones2vec.most_similar("direwolf")

## Linear relationships between word pairs

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries thrones2vec.most_similar_cosmul with end2 and start1 as positive
    terms and end1 as the negative term, prints the resulting sentence and
    returns the top-ranked word.
    """
    ranked = thrones2vec.most_similar_cosmul(
        negative=[end1],
        positive=[end2, start1],
    )
    # Each result's first element is the word; take the best match
    start2 = ranked[0][0]
    message = "{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1,
        end1=end1,
        start2=start2,
        end2=end2,
    )
    print(message)
    return start2

In [None]:
# Try a few analogies. Each call prints its sentence; only the last call's
# return value is rendered as the cell's output.
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")