## Word Vectors Game of Thrones

In [69]:
import codecs                         # for encoding words
import glob                           # file-name pattern matching (e.g. 'data/*.txt')
import re                             # regular expression as well
import nltk                           # nltk for language process
import gensim.models.word2vec as w2v 
import gensim
import numpy as np
import sklearn.manifold               # visualize high-dimensional data
import multiprocessing 
import os               
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [48]:
## Download the NLTK resources used by this notebook.
## The recorded output shows both packages were already up to date on the author's machine.
nltk.download('punkt')        # pretrained Punkt tokenizer models
nltk.download('stopwords')    # stop-word lists (not referenced in the cells shown here)

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 1. Prepare raw data 

In [49]:
# Collect the book text files. sorted() makes the order deterministic
# (got1.txt ... got5.txt in the recorded output).
book_filenames = sorted(glob.glob('data/*.txt'))  ## every .txt file directly inside data/
print("Found books:")
book_filenames

Found books:


['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [50]:
## Read every book into memory as a single corpus string.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (assumes the book files are UTF-8 -- TODO confirm).
# The pieces are collected in a list and joined once at the end instead of
# growing one big string with += on every iteration.
book_texts = []
corpus_length = 0   # running character count, reported after each book
for book_filename in book_filenames:
    with open(book_filename, encoding="utf-8") as f:
        book_texts.append(f.read())
    corpus_length += len(book_texts[-1])
    print("Finish text {}, Corpus is now {} characters long".format(book_filename, corpus_length))
corpus_raw = "".join(book_texts)

Finish text data/got1.txt, Corpus is now 1770659 characters long
Finish text data/got2.txt, Corpus is now 4071041 characters long
Finish text data/got3.txt, Corpus is now 6391405 characters long
Finish text data/got4.txt, Corpus is now 8107945 characters long
Finish text data/got5.txt, Corpus is now 9719485 characters long


##### Split the corpus into sentences and tokenize all of them

In [51]:
from nltk.tokenize import sent_tokenize, word_tokenize  # import tokenizer 

In [52]:
# Split the raw corpus text into sentences with NLTK's sentence tokenizer.
raw_sentences = sent_tokenize(corpus_raw)

In [53]:
## word tokenize each word in each sentence 
def sentence_to_wordlist(raw):
    """Return the words of ``raw`` as a list, keeping ASCII letters only.

    Every character outside a-z / A-Z (digits, punctuation, apostrophes,
    whitespace, ...) is turned into a space before splitting, so it acts as
    a word separator: "It's" becomes ["It", "s"]. Case is preserved, and a
    string without any letters yields an empty list.
    """
    non_letter = re.compile("[^a-zA-Z]")  # ^ inside [] negates the character class
    return non_letter.sub(" ", raw).split()

# One list of word tokens per non-empty raw sentence
sentences = [
    sentence_to_wordlist(sentence_text)
    for sentence_text in raw_sentences
    if sentence_text
]

len(sentences)

128868

### 2. Train Word2Vec

In [54]:
## Hyperparameters handed to the Word2Vec constructor in the next cell.
## NOTE(review): gensim documents that a seed alone does not make training
## reproducible when several workers are used -- confirm whether that matters here.
num_features = 300   ## dimensionality of each word vector (passed as size=)
min_word_count = 3    ## minimum word count threshold: rarer words are ignored as noise (passed as min_count=)
context_size = 7     ## context window size, in words (passed as window=)
downsampling = 1e-3  ## downsampling setting for frequent words (passed as sample=)
seed = 1             ## seed for the random number generator
num_workers = multiprocessing.cpu_count()  ## number of parallel workers: one per CPU reported by the OS

In [55]:
# Create the Word2Vec model from the hyperparameters defined above.
# No sentences are passed here; the vocabulary is built and the model is
# trained in the following cells.
# NOTE(review): size= is the keyword used by gensim < 4.0 (renamed to
# vector_size in 4.0) -- confirm the gensim version this notebook is pinned to.
thrones2vec = w2v.Word2Vec(
    sg=1,  # per the gensim docs: 1 selects skip-gram, 0 selects CBOW
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [56]:
# Build the model's vocabulary from the tokenized sentences.
thrones2vec.build_vocab(sentences)

In [57]:
# Report how many distinct words the model's vocabulary holds.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

Word2Vec vocabulary length: 17277


In [68]:
# Quick check of the installed gensim version (first character of the version string only).
gensim.__version__[0]

'1'

In [58]:
## Train the model.
# The train() signature differs between gensim releases:
#   * gensim < 2.0  : train(sentences) -- there is no `epochs` keyword, and
#                     passing it raises TypeError.
#   * gensim >= 2.0 : total_examples and epochs must be given explicitly.
# The major version is parsed as an integer instead of comparing the first
# character of the version string, so 0.x releases take the legacy call and
# a two-digit major version is not mistaken for "1".
corpus_count = thrones2vec.corpus_count   # number of sentences seen by build_vocab
iteration = thrones2vec.iter              # number of epochs configured on the model
gensim_major = int(gensim.__version__.split(".")[0])
if gensim_major < 2:
    thrones2vec.train(sentences)
else:
    thrones2vec.train(sentences, total_examples=corpus_count, epochs=iteration)

TypeError: train() got an unexpected keyword argument 'epochs'

In [None]:
## Save the trained model to trained/thrones2vec.w2v.
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check (which could race with another process creating it).
os.makedirs("trained", exist_ok=True)

thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))