In [2]:
# All the native dependencies. 

import codecs
import re
import glob
import multiprocessing
import os
import pprint
import logging

In [3]:
# All the non native dependencies.

import nltk 
import gensim.models.word2vec as word
import numpy as np
import sklearn.manifold
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## ALL THE SETUP HAS BEEN DONE! SOME PRELIMINARY DATA CLEANING!

In [6]:
# Collect the book files with glob rather than os.listdir: glob does the
# filename expansion (only *.txt, with the directory prefix kept), and sorting
# keeps the books in a stable order from run to run.
book_names = sorted(glob.glob('./game_of_thrones/*.txt'))
# Fail fast: glob returns an empty list (instead of raising) when the folder
# is missing or holds no .txt files, which would otherwise yield an empty corpus.
assert book_names, "No .txt files found in ./game_of_thrones/"

In [7]:
book_names

['./game_of_thrones/got1.txt',
 './game_of_thrones/got2.txt',
 './game_of_thrones/got3.txt',
 './game_of_thrones/got4.txt',
 './game_of_thrones/got5.txt']

In [8]:
# for book_name in book_names:
#     this_one = open(book_name)
#     this_one.read()

# COME BACK AND DO MORE EXTENSIVE DATA CLEANING TO SEE IF IT CHANGES RESULTS.

### https://people.duke.edu/~ccc14/sta-663/TextProcessingExtras.html
### https://www.analyticsvidhya.com/blog/2014/11/text-data-cleaning-steps-python/

In [9]:
# Converting all books to UTF8 and putting them into the same string. 
# Interesting article on encodings - if you ever need it again.
# http://kunststube.net/encoding/

In [10]:
# Read every book as UTF-8, reporting the running corpus size after each one,
# then stitch the pieces together into a single string.
book_texts = []
chars_so_far = 0
for book_path in book_names:
    with codecs.open(book_path, "r", "utf-8") as handle:
        text = handle.read()
    book_texts.append(text)
    chars_so_far += len(text)
    print("Corpus is {0} characters long".format(chars_so_far))
corpus_raw = u"".join(book_texts)

Corpus is 1770659 characters long
Corpus is 4071041 characters long
Corpus is 6391405 characters long
Corpus is 8107945 characters long
Corpus is 9719485 characters long


In [11]:
# DO NOT TRY AND CALL THE CORPUS TO VIEW ITS CONTENTS. 
# Learnt this the hard way. 

In [15]:
corpus_raw[:1000]
# ALL OF THIS JUNK NEEDS TO DISAPPEAR IN THE SECOND PASS.

'This edition contains the complete text of the original hardcover edition.\n\nNOT ONE WORD HAS BEEN OMITTED.\n\nA CLASH OF KINGS\n\nA Bantam Spectra Book\n\nPUBLISHING HISTORY\n\nBantam Spectra hardcover edition published February 1999\n\nBantam Spectra paperback edition / September 2000\n\nSPECTRA and the portrayal of a boxed “s” are trademarks of Bantam Books, a division of Random House, Inc.\n\nAll rights reserved.\n\nCopyright © 1999 by George R. R. Martin.\n\nMaps by James Sinclair.\n\nHeraldic crest by Virginia Norey.\n\nLibrary of Congress Catalog Card Number: 98-37954.\n\nNo part of this book may be reproduced or transmitted in any form or by any means, electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from the publisher.\n\nVisit our website at www.bantamdell.com\n\nBantam Books, the rooster colophon, Spectra and the portrayal of a boxed “s” are registered trademarks of Random House I

In [16]:
# Load NLTK's pickled Punkt tokenizer for English.
# NOTE(review): presumably requires the 'punkt' NLTK data package to be
# downloaded beforehand — confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split the whole corpus string into a list of sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [37]:
# NOTE: only the last expression in a cell is displayed, so this lookup shows
# nothing on its own; the tokenized form of the same sentence is shown below.
raw_sentences[10]

# Split these tokenized sentences down into lists of words.

def sentence_to_wordlist(raw):
    """Turn one raw sentence into a list of purely alphabetic words.

    Every character that is not an ASCII letter (digits, punctuation,
    accented or other non-ASCII characters) is replaced by a space, and the
    result is split on whitespace. Case is preserved.

    Args:
        raw: The sentence text to tokenize.

    Returns:
        A list of word strings; empty if the sentence contains no letters.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

# Sanity check: tokenize one sentence and display the resulting word list.
raw_example = sentence_to_wordlist(raw_sentences[10])
raw_example

['It', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight']

In [38]:
# Tokenize every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [40]:
# Spot-check: show one raw sentence next to its tokenized form.
sample_sentence = raw_sentences[5]
print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

Heraldic crest by Virginia Norey.
['Heraldic', 'crest', 'by', 'Virginia', 'Norey']


In [41]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(map(len, sentences))
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


## I DO NOT KNOW WHY WE AREN'T SIMPLY CONCATENATING ALL THE WORDS. THERE SEEMS TO BE SOME REASON TO PRESERVE SENTENCE-LEVEL INTEGRITY (PRESUMABLY SO THAT WORD2VEC CONTEXT WINDOWS DO NOT SPAN SENTENCE BOUNDARIES).

In [42]:
# HYPERPARAMETERS!
# Each value below is handed to word.Word2Vec(...) when the model is built.

# Passed as `size`: number of features (dimensions) in each word vector.
num_features = 300
# Passed as `min_count`: per the gensim docs, words seen fewer times than this are ignored.
min_word_count = 3
# Passed as `workers`: one training worker per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()
# Passed as `window`: per the gensim docs, the maximum distance between a word and its context words.
context_size = 7
# Passed as `sample`: per the gensim docs, the threshold for downsampling very frequent words.
downsampling = 1e-3
# Passed as `seed`: seed for the model's random number generator.
# NOTE(review): a fixed seed alone may not make multi-worker training fully deterministic — confirm.
seed = 1

In [43]:
# Create the (still untrained) Word2Vec model from the hyperparameters above.
# No corpus is passed here, so the vocabulary and training happen in later cells.
# NOTE(review): `size` is the keyword used by the gensim release this notebook
# was run with; gensim 4.x appears to name it `vector_size` — confirm before upgrading.
thrones = word.Word2Vec(
    sg=1,  # per the gensim docs, 1 selects skip-gram training (0 would be CBOW)
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [None]:
# Build the model's vocabulary from the tokenized sentences; this has to run
# before train() is called.
thrones.build_vocab(sentences)

# The next cell is the main step: it starts training on the given corpus.

In [54]:
# Train on the tokenized sentences, taking the example count and the number of
# passes from the model itself (set by build_vocab and the constructor defaults).
# NOTE(review): `thrones.iter` matches the gensim release this was run with;
# gensim 4.x appears to expose it as `thrones.epochs` — confirm before upgrading.
thrones.train(sentences, total_examples= thrones.corpus_count, epochs= thrones.iter)

2017-06-13 23:05:25,406 : INFO : training model with 4 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-06-13 23:05:26,496 : INFO : PROGRESS: at 1.97% examples, 134782 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:27,511 : INFO : PROGRESS: at 4.41% examples, 150675 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:28,522 : INFO : PROGRESS: at 6.95% examples, 156604 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:29,560 : INFO : PROGRESS: at 9.54% examples, 160191 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:30,568 : INFO : PROGRESS: at 12.01% examples, 163473 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:31,569 : INFO : PROGRESS: at 14.42% examples, 164392 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:32,647 : INFO : PROGRESS: at 17.26% examples, 166558 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:33,671 : INFO : PROGRESS: at 19.80% examples, 169152 words/s, in_qsize 7, out_qsize 0
2017-06-13 23:05:34,673 : INFO

7019467

In [56]:
# Persist the trained model. Create the output directory first so that a fresh
# checkout (where ./trained_models/ does not exist yet) does not fail on save.
model_dir = "./trained_models/"
os.makedirs(model_dir, exist_ok=True)
thrones.save(os.path.join(model_dir, "thrones.w2v"))

2017-06-13 23:07:42,043 : INFO : saving Word2Vec object under ./trained_models/thrones.w2v, separately None
2017-06-13 23:07:42,047 : INFO : not storing attribute syn0norm
2017-06-13 23:07:42,048 : INFO : not storing attribute cum_table
2017-06-13 23:07:43,616 : INFO : saved ./trained_models/thrones.w2v


In [59]:
## DONE. TRAINED THIS MODEL - GAME OF THRONES. NOW, TRAINING HARRY POTTER. 