In [38]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

import gensim.models.word2vec as w2v
from gensim.models import Phrases
import gensim.models
import gensim.models.keyedvectors as kv
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### 1. Prepare the data

In [30]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
#nltk.download('stopwords')
#nltk.download('wordnet')
f = open("The Handmaid's Tale - Margaret Atwood.txt",'r')
book = [f.read().decode('utf-8')]

In [31]:
#set up loggin
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [32]:
#split the text into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(book[0])

#transform the sentence in a list
def to_wordlist(raw_sentences):
    clean = re.sub("[^a-zA-Z]"," ", raw_sentences)
    words = clean.split()
    return words
raw_wordlist = [to_wordlist(i) for i in raw_sentences_cleaned]

#the length of these wordlists stays the same as len(raw_sentence)
if len(raw_wordlist) == len(raw_sentences):
    print True 


True


In [74]:
#remove stopwords 
stop = set(stopwords.words('english'))
#add a few more stopwords mannully after examining the results
stop.update(('something','almost','already','just','could','still','though'))

exclude = set(string.punctuation)
#punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
sentences = [] 
for s in raw_wordlist:
    s = [w for w in s if not w in stop]
    wl = ''.join(ch for ch in s if ch not in exclude)
    sentences.append(s)

#check if the operations yield the rigth results
print "raw sentence>> ", raw_sentences[37]
print "split into words>> ", raw_wordlist[37]       
print "with stopwords removed>> ", sentences[37]


raw sentence>>  When the window is partly open—it only opens partly—the air can come in and make the curtains move.
split into words>>  [u'window', u'partly', u'open', u'it', u'open', u'partly', u'the', u'air', u'come', u'make', u'curtain', u'move']
with stopwords removed>>  [u'window', u'partly', u'open', u'open', u'partly', u'air', u'come', u'make', u'curtain', u'move']


In [35]:
token_count = sum([len(sentences) for sentence in sentences])
print"The book corpus contains {0:,} tokens".format(token_count)

The book corpus contains 65,593,801 tokens


### 2.Train the model

There are 3 main tasks that word vectors can be useful for calculating: Distance, Similarity and Ranking 

Num_features: the dimension of the word vectors - the more features a vector has the more accurate it is but also means more computational complexity


In [53]:
###Create the word vector representations

#the dimension of the vector
num_features = 300
#minimum word count for a word to be considered
min_word_count = 7

# the number of words that preceed or follow a word -- these words are what we consider as "context" 
context_size = 7

#threshold for configuring which higher-frequency words are randomly downsampled (source: genism documentation)
downsample = 1e-3

#Number of worker threads to train the model (more threads, higher speed)
#the cpu_count returns the number of CPUs in the system
num_workers = multiprocessing.cpu_count() 

#To make the results reproducible and debugging easier
seed = 10

In [54]:
w2vmodel = w2v.Word2Vec(sentences=sentences,
    sg=1, #algorithm: skip-gram
    seed=seed,#random no. generator
    workers=num_workers, #number of worker threads
    size=num_features, #dimensions of word vectors
    min_count=min_word_count,#minimun occurences words
    window=context_size,#size of "context" around a word
    sample=downsample, #downsample setting
    #negative = 10 #negative sampling 
)

2017-12-14 18:39:08,652 : INFO : collecting all words and their counts
2017-12-14 18:39:08,656 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-14 18:39:08,699 : INFO : collected 8312 word types from a corpus of 43479 raw words and 8099 sentences
2017-12-14 18:39:08,702 : INFO : Loading a fresh vocabulary
2017-12-14 18:39:08,719 : INFO : min_count=7 retains 1152 unique words (13% of original 8312, drops 7160)
2017-12-14 18:39:08,723 : INFO : min_count=7 leaves 30647 word corpus (70% of original 43479, drops 12832)
2017-12-14 18:39:08,734 : INFO : deleting the raw counts dictionary of 8312 items
2017-12-14 18:39:08,738 : INFO : sample=0.001 downsamples 67 most-common words
2017-12-14 18:39:08,742 : INFO : downsampling leaves estimated 26501 word corpus (86.5% of prior 30647)
2017-12-14 18:39:08,746 : INFO : estimated required memory for 1152 words and 300 dimensions: 3340800 bytes
2017-12-14 18:39:08,757 : INFO : resetting layer weights
2017-12-14 18:39

In [55]:
w2vmodel.build_vocab(sentences,update=True)
w2vmodel.wv.vocab

2017-12-14 18:39:10,220 : INFO : collecting all words and their counts
2017-12-14 18:39:10,223 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-14 18:39:10,258 : INFO : collected 8312 word types from a corpus of 43479 raw words and 8099 sentences
2017-12-14 18:39:10,261 : INFO : Updating model with new vocabulary
2017-12-14 18:39:10,277 : INFO : New added 1152 unique words (12% of original 9464) and increased the count of 1152 pre-existing words (12% of original 9464)
2017-12-14 18:39:10,294 : INFO : deleting the raw counts dictionary of 8312 items
2017-12-14 18:39:10,296 : INFO : sample=0.001 downsamples 134 most-common words
2017-12-14 18:39:10,298 : INFO : downsampling leaves estimated 53002 word corpus (172.9% of prior 30647)
2017-12-14 18:39:10,301 : INFO : estimated required memory for 2304 words and 300 dimensions: 6681600 bytes
2017-12-14 18:39:10,308 : INFO : updating layer weights


{u'salvagings': <gensim.models.keyedvectors.Vocab at 0x11fc55810>,
 u'forget': <gensim.models.keyedvectors.Vocab at 0x11fc4e050>,
 u'chain': <gensim.models.keyedvectors.Vocab at 0x11f403410>,
 u'forbidden': <gensim.models.keyedvectors.Vocab at 0x11fc4e090>,
 u'wreath': <gensim.models.keyedvectors.Vocab at 0x11f49cc10>,
 u'bear': <gensim.models.keyedvectors.Vocab at 0x11fc60690>,
 u'yellow': <gensim.models.keyedvectors.Vocab at 0x1102e3410>,
 u'month': <gensim.models.keyedvectors.Vocab at 0x11f4bde10>,
 u'four': <gensim.models.keyedvectors.Vocab at 0x11f156c10>,
 u'known': <gensim.models.keyedvectors.Vocab at 0x11fc72650>,
 u'sleep': <gensim.models.keyedvectors.Vocab at 0x11fc60e90>,
 u'ice': <gensim.models.keyedvectors.Vocab at 0x11fc68f50>,
 u'hanging': <gensim.models.keyedvectors.Vocab at 0x11f685990>,
 u'go': <gensim.models.keyedvectors.Vocab at 0x11fbc5250>,
 u'chair': <gensim.models.keyedvectors.Vocab at 0x11f403490>,
 u'milk': <gensim.models.keyedvectors.Vocab at 0x11fb9f850>,
 u

In [56]:
#include phrases
bigramer = gensim.models.Phrases(sentences)
w2vmodel = w2v.Word2Vec(bigramer[sentences],
    sg=1, #algorithm: skip-gram
    seed=seed,#random no. generator
    workers=num_workers, #number of worker threads
    size=num_features, #dimensions of word vectors
    min_count=min_word_count,#minimun occurences words
    window=context_size,#size of "context" around a word
    sample=downsample, #downsample setting
    #negative = 10 #negative sampling 
)

w2vmodel.build_vocab(sentences,update=True)
w2vmodel.wv.vocab

2017-12-14 18:39:11,914 : INFO : collecting all words and their counts
2017-12-14 18:39:11,917 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-12-14 18:39:12,190 : INFO : collected 39755 word types from a corpus of 43479 words (unigram + bigrams) and 8099 sentences
2017-12-14 18:39:12,192 : INFO : using 39755 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2017-12-14 18:39:12,197 : INFO : collecting all words and their counts
2017-12-14 18:39:12,199 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-14 18:39:12,795 : INFO : collected 8354 word types from a corpus of 42873 raw words and 8099 sentences
2017-12-14 18:39:12,797 : INFO : Loading a fresh vocabulary
2017-12-14 18:39:12,820 : INFO : min_count=7 retains 1173 unique words (14% of original 8354, drops 7181)
2017-12-14 18:39:12,822 : INFO : min_count=7 leaves 29932 word corpus (69% of original 42873, drops 12941)
2017-12-14 18:39:1

{u'salvagings': <gensim.models.keyedvectors.Vocab at 0x11a886550>,
 u'forget': <gensim.models.keyedvectors.Vocab at 0x11ff1ebd0>,
 u'chain': <gensim.models.keyedvectors.Vocab at 0x11f591850>,
 u'white_cotton': <gensim.models.keyedvectors.Vocab at 0x11ff16b50>,
 u'forbidden': <gensim.models.keyedvectors.Vocab at 0x11ff1ec10>,
 u'wreath': <gensim.models.keyedvectors.Vocab at 0x11f7bc710>,
 u'bear': <gensim.models.keyedvectors.Vocab at 0x11ff162d0>,
 u'yellow': <gensim.models.keyedvectors.Vocab at 0x11fc4f7d0>,
 u'month': <gensim.models.keyedvectors.Vocab at 0x11ff24990>,
 u'four': <gensim.models.keyedvectors.Vocab at 0x11fc4f850>,
 u'known': <gensim.models.keyedvectors.Vocab at 0x11a8b0450>,
 u'sleep': <gensim.models.keyedvectors.Vocab at 0x11ff16b90>,
 u'ice': <gensim.models.keyedvectors.Vocab at 0x11ff12c50>,
 u'hanging': <gensim.models.keyedvectors.Vocab at 0x11ff1b690>,
 u'go': <gensim.models.keyedvectors.Vocab at 0x11fef6d50>,
 u'chair': <gensim.models.keyedvectors.Vocab at 0x11f591

In [57]:
#Finally time to train the model
w2vmodel.train(sentences, total_examples=w2vmodel.corpus_count, epochs=w2vmodel.iter)

v = w2vmodel.wv.vocab
vocablist = []
for key in v:
    vocablist.append(key)

print vocablist[:25]

2017-12-14 18:39:16,946 : INFO : training model with 4 workers on 1183 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-12-14 18:39:17,345 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-14 18:39:17,371 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-14 18:39:17,385 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-14 18:39:17,390 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-14 18:39:17,393 : INFO : training on 217395 raw words (132589 effective words) took 0.4s, 306224 effective words/s


[u'salvagings', u'forget', u'chain', u'white_cotton', u'forbidden', u'wreath', u'bear', u'yellow', u'month', u'four', u'known', u'sleep', u'ice', u'hanging', u'go', u'chair', u'milk', u'seemed', u'kitchen_table', u'hurry', u'tv', u'elizabeth', u'flash', u'grass', u'young']


In [None]:
if not os.path.exists("trained"):
    os.makedirs("trained")
w2vmodel.save(os.path.join("trained", "w2vmodel_Atwood.w2v"))

In [58]:
w2vmodel.wv.most_similar("offred")[:10]

2017-12-14 18:39:22,481 : INFO : precomputing L2-norms of word weight vectors


[(u'sofa', 0.999826967716217),
 (u'sun', 0.999817967414856),
 (u'carefully', 0.9998103380203247),
 (u'gray', 0.999808132648468),
 (u'flower', 0.9998079538345337),
 (u'system', 0.9998077154159546),
 (u'mark', 0.9998068809509277),
 (u'stopped', 0.9998066425323486),
 (u'become', 0.9998058676719666),
 (u'middle', 0.9998031258583069)]

In [62]:
w2vmodel.wv.most_similar("commander")[:10]

[(u'garden', 0.9996933937072754),
 (u'household', 0.9996857047080994),
 (u'stool', 0.999683141708374),
 (u'help', 0.9996800422668457),
 (u'neck', 0.9996781349182129),
 (u'watch', 0.9996774792671204),
 (u'sign', 0.9996774196624756),
 (u'cane', 0.9996772408485413),
 (u'marthas', 0.999674916267395),
 (u'reach', 0.9996746778488159)]

In [59]:
w2vmodel.wv.most_similar("ofglen")[:10]

[(u'stairs', 0.9997948408126831),
 (u'new', 0.999788224697113),
 (u'carefully', 0.9997872114181519),
 (u'pas', 0.9997871518135071),
 (u'clean', 0.999786913394928),
 (u'building', 0.9997867941856384),
 (u'sheet', 0.9997844099998474),
 (u'mark', 0.9997844099998474),
 (u'quickly', 0.9997843503952026),
 (u'flesh', 0.9997840523719788)]

In [63]:
w2vmodel.wv.most_similar("nick")[:10]

[(u'lead', 0.999846875667572),
 (u'reach', 0.9998407959938049),
 (u'purple', 0.999836802482605),
 (u'clock', 0.9998335242271423),
 (u'car', 0.9998334646224976),
 (u'mark', 0.9998327493667603),
 (u'cane', 0.9998326897621155),
 (u'gray', 0.9998321533203125),
 (u'middle', 0.9998319745063782),
 (u'catch', 0.9998318552970886)]

In [64]:
w2vmodel.wv.most_similar("luke")[:10]

[(u'every', 0.9997903108596802),
 (u'lying', 0.9997903108596802),
 (u'pregnant', 0.9997866749763489),
 (u'bathroom', 0.9997862577438354),
 (u'kept', 0.9997852444648743),
 (u'school', 0.9997844696044922),
 (u'told', 0.9997838735580444),
 (u'together', 0.999783456325531),
 (u'none', 0.9997833967208862),
 (u'true', 0.9997811317443848)]

In [72]:
w2vmodel.wv.most_similar("say")[:10]

[(u'would', 0.9997519254684448),
 (u'better', 0.999742329120636),
 (u'things', 0.9997395277023315),
 (u'done', 0.9997340440750122),
 (u'used', 0.9997304677963257),
 (u'well', 0.9997274279594421),
 (u'know', 0.9997270703315735),
 (u'think', 0.9997214078903198),
 (u'bad', 0.9997178912162781),
 (u'even', 0.9997156858444214)]

In [159]:
w2vmodel.most_similar_cosmul(
        positive=["commander", "offred"],
        negative=["handmaid"])

[(u'wings', 0.999957799911499),
 (u'sofa', 0.9999575614929199),
 (u'outside', 0.9999569654464722),
 (u'back', 0.9999558329582214),
 (u'sitting', 0.9999557733535767),
 (u'sky', 0.999954879283905),
 (u'feet', 0.9999547600746155),
 (u'forward', 0.9999547004699707),
 (u'aunt', 0.999954104423523),
 (u'keep', 0.9999540448188782)]

In [None]:
print w2vmodel.most_similar_cosmul(
        positive=["commander", "offred"],
        negative=["ofglen"])

print vocablist[:30]

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

w2vmodeltest = w2v.Word2Vec(bigramer[cleaned_sentences],
    sg=0, #algorithm: skip-gram
    seed=seed,#random no. generator
    workers=num_workers, #number of worker threads
    size=100, #dimensions of word vectors
    min_count=50,#minimun occurences words
    window=20,#size of "context" around a word
    sample=downsample, #downsample setting
    negative = 10 #negative sampling 
)

def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

tsne_plot(w2vmodeltest)


In [None]:
from sklearn.decomposition import PCA


In [None]:
w2vmodel.most_similar_cosmul(
        positive=["Commander", "Offred"],
        negative=["handmaid"])
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = w2vmodel.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [None]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix = thrones2vec.syn0
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)


In [22]:
'''exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in stop_free.split())
    return normalized

#raw_sentences_cleaned = [clean(s) for s in raw_sentences]

#transform the sentence in a list
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words
'''for s in raw_sentences_cleaned:
    if len(s) > 0:
        sentences.append(sentence_to_wordlist(s))
'''
#the length of sentence stays the same as len(raw_sentence)
if len(sentences) == len(raw_sentences):
    print True 



[u'window', u'partly', u'open', u'it', u'open', u'partly', u'the', u'air', u'come', u'make', u'curtain', u'move']
True
[raw sentence]: When the window is partly open—it only opens partly—the air can come in and make the curtains move.
[with stopwords removed]: window partly open—it open partly—the air come make curtain move.
[transform the sentence in a list]: [u'window', u'partly', u'open', u'it', u'open', u'partly', u'the', u'air', u'come', u'make', u'curtain', u'move']
