# Thinkful - 4.4.4 - Drill - As unsupervised neural network: word2vec


In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gensim
from gensim.models import word2vec
from nltk.corpus import gutenberg, stopwords

In [7]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
#for novel in ['persuasion','emma','sense']:
for novel in ['persuasion','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

# Parse the data. This can take some time.
nlp = spacy.load('en')
nlp.max_length=2100000
austen_doc = nlp(austen_clean)

# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
We have 8910 sentences and 1129402 tokens.


In [11]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('trouble', 0.7633296251296997), ('shirley', 0.7545658349990845), ('careless', 0.7409294843673706), ('dr', 0.729116678237915), ('equal', 0.7095347046852112), ('doubt', 0.703315258026123), ('understanding', 0.7027342915534973), ('pursue', 0.702026903629303), ('hearing', 0.7004645466804504), ('distinguish', 0.6991181373596191)]
0.597678
marriage
mother




# Drill 0

Take a few minutes to modify the hyperparameters of this model and see how its answers change. Can you wrangle any improvements?

** Reduce minimum word count threshold from 10 to 8**

In [12]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=8,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('differently', 0.8613290786743164), ('lower', 0.8540831208229065), ('palmers', 0.7809202671051025), ('deprive', 0.7759668827056885), ('indies', 0.7726827263832092), ('seriously', 0.7615227699279785), ('match', 0.7553141713142395), ('war', 0.7485188841819763), ('resemblance', 0.7450883388519287), ('no', 0.7431521415710449)]
0.695801
breakfast
mother




**Reduce minimum word count threshold from 10 to 6**

In [13]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=6,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('lower', 0.8047088980674744), ('thither', 0.7799767851829529), ('benwick', 0.7744273543357849), ('away', 0.7665448188781738), ('wentworth', 0.7580333352088928), ('harville', 0.7468580007553101), ('straight', 0.7401685118675232), ('low', 0.7250785827636719), ('punishment', 0.7124902009963989), ('affront', 0.70765620470047)]
0.652204
marriage
mother




**Reduce window of words from 6 to 4**

In [17]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('hall', 0.740590512752533), ('a', 0.6541040539741516), ('lodge', 0.6412724852561951), ('trouble', 0.6344130039215088), ('wallis', 0.6316900849342346), ('doubt', 0.6295942068099976), ('seven', 0.6240245699882507), ('pause', 0.6216627955436707), ('away', 0.6175362467765808), ('mile', 0.6168254017829895)]
0.606136
marriage
mother




**Increase window of words from 6 to 8**

In [18]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('shirley', 0.835257351398468), ('gentle', 0.8249930143356323), ('a', 0.806955099105835), ('doubt', 0.7989760637283325), ('trouble', 0.7963893413543701), ('hearing', 0.7922521829605103), ('uncomfortable', 0.7917709946632385), ('pause', 0.7876396775245667), ('reason', 0.7844228744506836), ('bloom', 0.7814496755599976)]
0.671588
marriage
mother




**Increase window of words from 6 to 8 AND reduce minimum word count threshold from 10 to 8**

In [20]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=8,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))
print(model.doesnt_match("daughter mother son dog".split()))

done!
[('away', 0.8516315221786499), ('lower', 0.8479422330856323), ('laconia', 0.8122842907905579), ('another', 0.79524165391922), ('forward', 0.794305145740509), ('party', 0.7930519580841064), ('palmers', 0.7892148494720459), ('fashion', 0.7738192081451416), ('tone', 0.7708668112754822), ('dressing', 0.7695345878601074)]
0.735969
breakfast
mother




# Drill 1

"However you access it, play around with a pretrained model. Is there anything interesting you're able to pull out about analogies, similar words, or words that don't match? Write up a quick note about your tinkering and discuss it with your mentor during your next session."

I ended up using the online Google module for comparison.

**Word Analogies**

**Dog** is to **puppy** as **cat** is to
> Kitten

**Stick** is to **tree** as **pebble** is to 
> Stick

**Twig** is to **tree** as **pebble** is to 
> Boulder

**Michigan** is to **USA** as **Ontario** is to
> Canada

**Word Associations**

Sierra
> Tahoe

Yankees
> Red_Sox

Oscar
> Academy_Award

**What Doesn't fit?**

dinner cereal breakfast lunch
> cereal

blue red green crimson transparent
> transparent

monkey ape baboon human chimp gorilla
> human