In [4]:
#
# hw7pr1.py ~ cs35 summer 2022   Natural Language!
#

# our libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
## Before we go further, let's make sure that you have these natural-language libraries working...
# first, textblob, at https://textblob.readthedocs.io/en/dev/install.html
# which installs nltk, the "natural language toolkit", https://www.nltk.org/
import textblob

In [7]:
# Natural language processing usually needs support files or "corpora" (or "vectors")
# Try grabbing the punkt package first (for sentence-handling):
import nltk
nltk.download('punkt')   # this is nltk's sentence-handling routines

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Then, try grabbing the corpora needed:
# !python3 -m textblob.download_corpora
# or
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\chung\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!

In [10]:
#
# Some example texts!
#

# https://xkcd.com/1443/  Language Nerd
xkcd_text = """
I don't mean to go all language nerd on you, 
but I just legit adverbed "legit," verbed "adverb," 
and adjectived "language nerd."
"""

# from the Stanford treebank (#7 and #42)
movie_review = """
Just the labor involved in creating the layered richness of the imagery in 
this chiaroscuro of madness and light is astonishing. Much of it comes from 
the brave, uninhibited performances by its lead actors.
"""

# the explanation from the treebank
example_text = """
The underlying technology of this demo is based on a new type of 
Recursive Neural Network that builds on top of grammatical structures. 
You can also browse the Stanford Sentiment Treebank, the dataset on
which this model was trained. Of course, no model is perfect!
"""

# from the Google API demo
other_example = """
Google, headquartered in Mountain View (1600 Amphitheatre Pkwy, Mountain View, CA 940430), 
unveiled the new Android phone for $799 at the Consumer Electronic Show. 
Sundar Pichai said in his keynote that users love their new Android phones.
"""

# Stanford's treebank:  https://nlp.stanford.edu/sentiment/treebank.html
# Google's site:  https://cloud.google.com/natural-language/#natural-language-api-demo

In [11]:
# tokenize with NLTK: sentences
print("Here is the tokenized list-of-sentences from movie_review:")
LoS = nltk.sent_tokenize(movie_review)
print(f"len(LoS) is {len(LoS)}")
print(f"LoW is {LoS}")
print()

# words - and punctuation...
print("And the tokenized words - and punctuation - from other_example:")
LoW = nltk.word_tokenize(other_example)
print(f"len(LoW) is {len(LoW)}")
print(f"LoW is {LoW}")
print()

# parts of speech!
# Here is a list of parts-of-speech tags:  https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html 
LoW = nltk.word_tokenize(xkcd_text)
PoS = nltk.pos_tag(LoW)
for i in range(8): # let's see the first eight
    print(f"part of speech {i} is {PoS[i]}")

Here is the tokenized list-of-sentences from movie_review:
len(LoS) is 2
LoW is ['\nJust the labor involved in creating the layered richness of the imagery in \nthis chiaroscuro of madness and light is astonishing.', 'Much of it comes from \nthe brave, uninhibited performances by its lead actors.']

And the tokenized words - and punctuation - from other_example:
len(LoW) is 46
LoW is ['Google', ',', 'headquartered', 'in', 'Mountain', 'View', '(', '1600', 'Amphitheatre', 'Pkwy', ',', 'Mountain', 'View', ',', 'CA', '940430', ')', ',', 'unveiled', 'the', 'new', 'Android', 'phone', 'for', '$', '799', 'at', 'the', 'Consumer', 'Electronic', 'Show', '.', 'Sundar', 'Pichai', 'said', 'in', 'his', 'keynote', 'that', 'users', 'love', 'their', 'new', 'Android', 'phones', '.']

part of speech 0 is ('I', 'PRP')
part of speech 1 is ('do', 'VBP')
part of speech 2 is ("n't", 'RB')
part of speech 3 is ('mean', 'VB')
part of speech 4 is ('to', 'TO')
part of speech 5 is ('go', 'VB')
part of speech 6 is ('

In [12]:
# Let's try textblob!

blob = textblob.TextBlob( example_text )
print("Tokenizing examples with textblob:")
print("Words:", blob.words)
print("Sentences:", blob.sentences)
print("Parts-of-speech:", blob.pos_tags)

Tokenizing examples with textblob:
Words: ['The', 'underlying', 'technology', 'of', 'this', 'demo', 'is', 'based', 'on', 'a', 'new', 'type', 'of', 'Recursive', 'Neural', 'Network', 'that', 'builds', 'on', 'top', 'of', 'grammatical', 'structures', 'You', 'can', 'also', 'browse', 'the', 'Stanford', 'Sentiment', 'Treebank', 'the', 'dataset', 'on', 'which', 'this', 'model', 'was', 'trained', 'Of', 'course', 'no', 'model', 'is', 'perfect']
Sentences: [Sentence("
The underlying technology of this demo is based on a new type of 
Recursive Neural Network that builds on top of grammatical structures."), Sentence("You can also browse the Stanford Sentiment Treebank, the dataset on
which this model was trained."), Sentence("Of course, no model is perfect!")]
Parts-of-speech: [('The', 'DT'), ('underlying', 'VBG'), ('technology', 'NN'), ('of', 'IN'), ('this', 'DT'), ('demo', 'NN'), ('is', 'VBZ'), ('based', 'VBN'), ('on', 'IN'), ('a', 'DT'), ('new', 'JJ'), ('type', 'NN'), ('of', 'IN'), ('Recursive',

<br>

#### Let's try to create an example that confuses textblob...

or, at least, causes it to get one or more parts-of-speech incorrect.

An online guide to the parts-of-speech tags is [at this link](https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html)

In [54]:
    # Create - or find - a new sentence for which NLTK or textblob get one or more parts-of-speech wrong...
#
# Parts of speech:  https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html 

example1 = "Will will write Will's will on the other will."
# Will is a name but TextBlob thinks it is a Modal (I don't really know what that is but it's not a noun)
# That last will is a noun but instead TextBlob thinks it's a Modal.

example2 = "Superlatives are superly super."
# Superly should be considered as an adverb and super should be an adjective...

blob1 = textblob.TextBlob( example1 )
blob2 = textblob.TextBlob( example2 )
print("Parts-of-speech:\n", blob1.pos_tags)
print()
print("Parts-of-speech:\n", blob2.pos_tags)

Parts-of-speech:
 [('Will', 'MD'), ('will', 'MD'), ('write', 'VB'), ('Will', 'NNP'), ("'s", 'POS'), ('will', 'MD'), ('on', 'IN'), ('the', 'DT'), ('other', 'JJ'), ('will', 'MD')]

Parts-of-speech:
 [('Superlatives', 'NNS'), ('are', 'VBP'), ('superly', 'JJ'), ('super', 'NN')]


''

In [14]:
# Let textblob show off! Examples from the TextBlob QuickStart Tutorial, 
# at https://textblob.readthedocs.io/en/dev/quickstart.html

from textblob import TextBlob

In [15]:
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

In [16]:
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print(f"{testimonial.sentiment.polarity=}")
print(f"{testimonial.sentiment.subjectivity=}")

testimonial.sentiment.polarity=0.39166666666666666
testimonial.sentiment.subjectivity=0.4357142857142857


In [17]:
zen = TextBlob("Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex.")
for sentence in zen.sentences:
    print(sentence, sentence.sentiment)

Beautiful is better than ugly. Sentiment(polarity=0.2166666666666667, subjectivity=0.8333333333333334)
Explicit is better than implicit. Sentiment(polarity=0.5, subjectivity=0.5)
Simple is better than complex. Sentiment(polarity=0.06666666666666667, subjectivity=0.41904761904761906)


In [18]:
sentence = TextBlob('Use 4 spaces per indentation level.')
print(f"{sentence.words[2]=}")
print(f"{sentence.words[2].singularize()=}")
print(f"{sentence.words[-1]=}")
print(f"{sentence.words[-1].pluralize()=}")


sentence.words[2]='spaces'
sentence.words[2].singularize()='space'
sentence.words[-1]='level'
sentence.words[-1].pluralize()='levels'


In [20]:
# Cool!
b = TextBlob("I havv goood speling!")
print(b.correct())

I have good spelling!


In [22]:
# import gensim!
import gensim

In [23]:
# The word-embeddings are in the file word2vec_model.txt
# Make sure that file is here:
%ls

 Volume in drive C is Blade 17
 Volume Serial Number is CAA9-6C59

 Directory of c:\Users\chung\OneDrive\Desktop\CS35\week7_sum22\week7_sum22

07/14/2022  01:41 PM    <DIR>          .
07/14/2022  01:41 PM    <DIR>          ..
07/14/2022  01:19 PM             1,283 hw7ec.ipynb
07/14/2022  01:19 PM             2,688 hw7pr0.ipynb
07/14/2022  04:57 PM            51,431 hw7pr1.ipynb
07/14/2022  01:19 PM             9,355 hw7pr2.ipynb
07/14/2022  01:19 PM       138,432,415 word2vec_model.txt
               5 File(s)    138,497,172 bytes
               2 Dir(s)  365,692,334,080 bytes free


In [24]:
#
# Create a model, m, with the line   m = read_word2vec_model()
#

from gensim.models import KeyedVectors

def read_word2vec_model(filename = "word2vec_model.txt"):  
    """ a function that reads a word2vec model from the file
        "word2vec_model.txt" and returns a model object that
        we will usually name m or model...
    """
    try:
        print("Starting to load the model in ", filename, "...")
        model = KeyedVectors.load_word2vec_format(filename, binary=False)
        print("Model loaded.\n")
    except FileNotFoundError as e:
        print(f"  [WARNING]    The file {filename} was not found.     [WARNING]  ")
        return None   # returning a placeholder, not a model

    # let's print some attributes
    print("The model built is", model, "\n")
    print("The vocabulary has", model.vectors.shape[0], "words")   # The vocabulary has 43981 words
    print("Each word is a vector of size", model.vector_size)  # 300
    print("\nTry m.get_vector('python') to see a the vector for 'python'!\n")
    model.fill_norms()  # freezes the model, m, as-is (no more training)
    # we weren't going to train more, so no worries (in week7, at least)
    return model

read_word2vec_model("word2vec_model.txt")

Starting to load the model in  word2vec_model.txt ...
Model loaded.

The model built is KeyedVectors<vector_size=300, 43981 keys> 

The vocabulary has 43981 words
Each word is a vector of size 300

Try m.get_vector('python') to see a the vector for 'python'!



<gensim.models.keyedvectors.KeyedVectors at 0x2414fb19330>

In [25]:
# 
# best to run this only once... or once in a while
#
m = read_word2vec_model()

Starting to load the model in  word2vec_model.txt ...
Model loaded.

The model built is KeyedVectors<vector_size=300, 43981 keys> 

The vocabulary has 43981 words
Each word is a vector of size 300

Try m.get_vector('python') to see a the vector for 'python'!



In [26]:
if 'king' in m:
    print("That word is in m")
else:
    print("That word is NOT in m")

That word is in m


In [27]:
#
# let's see the meaning of 'king', 'queen', and 'python'!
#
m.get_vector('king')   # m.get_vector('queen')  m.get_vector('python')   m.get_vector('snake')

array([ 4.34064e-02,  1.02628e-02,  2.96526e-03,  4.81172e-02,
       -8.83269e-03, -1.24499e-02,  3.85274e-02, -6.83062e-02,
        1.76654e-02,  1.25172e-01, -8.34479e-02, -1.04310e-01,
       -6.12400e-02, -8.58033e-03, -5.78752e-02, -5.85481e-02,
        1.19452e-02,  1.79808e-03,  1.59830e-02,  4.44158e-02,
        4.71077e-02,  3.88639e-02,  2.05255e-02,  4.71077e-02,
        3.48261e-02, -6.09035e-02, -8.68128e-02,  2.06096e-02,
        1.17769e-01, -1.07254e-02,  3.60037e-02,  2.12826e-02,
        4.29017e-02,  1.37958e-01, -1.11040e-01,  2.89376e-02,
        1.34593e-02,  2.01890e-03,  2.42268e-02,  5.95576e-02,
        4.77807e-02, -7.97466e-02,  9.75802e-02,  4.91266e-02,
        1.17769e-01, -8.24385e-03, -3.78544e-02,  1.14404e-02,
       -1.88431e-02,  5.27859e-03, -5.58563e-02,  5.45103e-02,
       -8.95046e-02,  6.93997e-03, -5.61928e-02,  4.67923e-04,
       -4.97996e-02, -1.96002e-02,  1.48053e-02, -8.49621e-03,
        6.39319e-02,  1.54109e-01,  3.30175e-03,  4.542

In [28]:
# variable-assignment statements start to get noticeably "meta" ...
python = m.get_vector('python')
snake = m.get_vector('snake')
language = m.get_vector('language')
code = m.get_vector('code')

In [29]:
print(np.linalg.norm(snake))  # this is the length of the vector...

# the word2vec model provides unit vectors

1.0


In [30]:
# the dot product is available in the numpy library
np.dot(python, snake)

# these are unit vectors, so this is the cosine "similarity"

0.66062933

In [31]:
# we can find the angle between the two vectors
deg = np.degrees(np.arccos(0.66063))  # converting from radians to degrees
print(f"which is {deg} degrees")

which is 48.65206211455501 degrees


In [32]:
# Let's try the built-in similarity method:
m.similarity('python','snake')   # should be the same .6606292...

0.6606292

In [33]:
m.distance( 'python', 'snake' )   # This is 1 - the similarity

0.3393707871437073

In [34]:
m.distance( 'python', 'coffee' )   # let's see...

0.9787417445331812

<br>

####  Let's explore dataset bias

In [35]:
# with similarity, the biases of the datset can show through: let's check "programmer" vs "woman" and "man"
#
simw = m.similarity("programmer","woman")
print(f"similarity w 'woman': {simw}")

simm = m.similarity("programmer","man")
print(f"similarity w 'man': {simm}")

simp= m.similarity("programmer","person")   # try it!
print(f"similarity w 'person': {simp}")

similarity w 'woman': 0.08992718160152435
similarity w 'man': 0.126168891787529
similarity w 'person': 0.22314053773880005


<br>

#### Computing multiple similarities...

In [37]:
# Let's compare multiple similarities:

python_snake = m.similarity('python','snake')
python_coffee = m.similarity('python','coffee')
snake_coffee = m.similarity('snake','coffee')

print(f"python_snake  similarity: {python_snake}")   # try :4.2f after the variable for formatting
print(f"python_coffee similarity: {python_coffee}")  # 4 characters wide, 2 places after the decimal point
print(f"snake_coffee  similarity: {snake_coffee}")

python_snake  similarity: 0.6606292128562927
python_coffee similarity: 0.02125825546681881
snake_coffee  similarity: 0.07976286113262177


In [42]:
# Comparing with multiple similarities

# Let's compare multiple similarities:

python_snake = m.similarity('python','snake')
python_coffee = m.similarity('python','coffee')
snake_coffee = m.similarity('snake','coffee')

print(f"python_snake  similarity: {python_snake}")   # try :4.2f after the variable for formatting
print(f"python_coffee similarity: {python_coffee}")  # 4 characters wide, 2 places after the decimal point
print(f"snake_coffee similarity: {snake_coffee}")

python_snake  similarity: 0.6606292128562927
python_coffee similarity: 0.02125825546681881
snake_coffee similarity: 0.07976286113262177


In [55]:
#
# Let's define an "odd-one-out" from any collection of words, 
# simply by considering all possible similarities (and adding them up for each word)

"""
here, for example:

python_snake  similarity: .66
python_coffee similarity: .02
snake_coffee  similarity: .08

So, summing the similarities for each word separately:
  python:  .66 + .02 == .68
  coffee:  .08 + .02 == .10
  snake:   .66 + .08 == .74

+++ In this case, "coffee" is the odd one out  (intuitive, in some ways)

"""

'\nhere, for example:\n\npython_snake  similarity: .66\npython_coffee similarity: .02\nsnake_coffee  similarity: .08\n\nSo, summing the similarities for each word separately:\n  python:  .66 + .02 == .68\n  coffee:  .08 + .02 == .10\n  snake:   .66 + .08 == .74\n\n+++ In this case, "coffee" is the odd one out  (intuitive, in some ways)\n\n\n# What do you think about python, serpent, snake?\n# or python, serpent, snake, code?\n\n'

In [56]:
# notice that the split function makes creating lists-of-words a bit easier
initial_words = "snake serpent python code ai ml programming".split()
initial_words

['snake', 'serpent', 'python', 'code', 'ai', 'ml', 'programming']

In [57]:
#
# here is a _single_ keyword, with similarities computed against every word w in initial_words
key = 'python'

LoS = []
LoW = []
for w in initial_words:
    if w in m: 
        similarity = m.similarity(key,w)
        print(f"similarity between {key} and {w}: {similarity:6.2f}", )
        LoS.append( similarity )
        LoW.append( w )
    else:
        print(f"  __  {w}  __ was not in the vocabulary", )   # not every word will be present

print(f"LoS is {LoS}")
print(f"LoW is {LoW}")


similarity between python and snake:   0.66
similarity between python and serpent:   0.45
similarity between python and python:   1.00
similarity between python and code:   0.11
  __  ai  __ was not in the vocabulary
similarity between python and ml:   0.08
similarity between python and programming:   0.09
LoS is [0.6606292, 0.44771382, 1.0, 0.10966147, 0.08480783, 0.090359524]
LoW is ['snake', 'serpent', 'python', 'code', 'ml', 'programming']


In [58]:
# with similarity, the biases of the datset can show through: let's check "president" vs "woman" and "man"
#
simw = m.similarity("president","woman")
print(f"similarity w 'woman': {simw}")

simm = m.similarity("president","man")
print(f"similarity w 'man': {simm}")

# notice that the values provide a starting-point to _quantify_ the bias in the dataset
# quantifying dataset bias is currently a very active area of research
# it would also be possible to compare both of these with 
simp= m.similarity("president","person")   # try it!
print(f"bias quantification; used 'person' as a neutral factor: {simp}")

similarity w 'woman': 0.06267661601305008
similarity w 'man': 0.02842460200190544
bias quantification; used 'person' as a neutral factor: 0.10806348919868469


##### From the cell above, we can see that 'president' and 'man' has the least similarity score, then 'president' and 'woman', and finally 'president' and 'person.' We use a term in m that serves as neutral between the words we are comparing (in this case, we are comparing man and woman to president). The neutral word here is 'person.' Although we can see that 'president' and 'woman' has 3x the similarity score compared to 'president' and 'man', we also see that the similarity score for 'president' and 'person' exceeds the other two. That means the a human being compared to president has a larger bias (for their similarity score) than when president is compared to a certain gender. 

##### We can also see that the comparison between 'president' and 'person' has a larger "sway" compared to the comparison for woman and man. Our dataset has proven this claim. We can compare these similarity differences to quantify the bias that we derived from our data.

In [59]:
# Same comparison, this time with man and woman.

president_woman = m.similarity('president','woman')
president_man = m.similarity('president','man')
woman_man = m.similarity('woman','man')

print(f"president_woman  similarity: {president_woman}")   # try :4.2f after the variable for formatting
print(f"president_man similarity: {president_man}")  # 4 characters wide, 2 places after the decimal point
print(f"woman_man  similarity: {woman_man}")

president_woman  similarity: 0.06267661601305008
president_man similarity: 0.02842460200190544
woman_man  similarity: 0.7664012908935547


In [61]:
#
# Let's define an "odd-one-out" from any collection of words, 
# simply by considering all possible similarities (and adding them up for each word)

"""
here, for example:

president_woman  similarity: .063
president_man similarity: .028
woman_man  similarity: .766

So, summing the similarities for each word separately:
  president:  .063 + .028 == 0.091
  woman:  .063 + .766 == 0.829
  man:   .028 + .766 == .0.794

+++ In this case, "president" is the odd one out  (intuitive, in some ways)

"""

'\nhere, for example:\n\npresident_woman  similarity: .063\npresident_man similarity: .028\nwoman_man  similarity: .766\n\nSo, summing the similarities for each word separately:\n  president:  .063 + .028 == 0.091\n  woman:  .063 + .766 == 0.829\n  man:   .028 + .766 == .0.794\n\n+++ In this case, "president" is the odd one out  (intuitive, in some ways)\n\n'

In [89]:
#
# here is a signature line for odd_one_out (a starting point)
#
def odd_one_out( LoW, m ):
    """ This function takes in a list of words LoW. It then finds each word's similarity score to every other word in LoW
        as long as that word is in m. 
        It then outputs the odd one out, or the word with the smallest similarity score when compared to all other words in LoW.
    """ 
    output = 999999
    value = 0
    odd_one = ''
    for word in LoW:
        if word in m:
            for w in LoW:
                if w in m:  # is the word, w present in the vocabulary?
                    similarity = m.similarity(word,w)
                    value += similarity
                else:
                    print(f"  __  {w}  __ was not in the vocabulary", )   # not every word will be present
            if value < output:
                output = value
                odd_one = word
                value = 0
            else:
                value = 0
        else:
            print(f"  __  {word}  __ was not in the vocabulary", )
    print(f"The odd word out is {odd_one} with a similarity score of {output}")
    
LoW = ['apple', 'banana', 'wallet', 'peach']
LoW2 = ['computer', 'machine', 'potato', 'algorithm']
LoW3 = ['hat', 'jacket', 'clothes', 'shoes', 'basketball']
LoW4 = ['game', 'turtle', 'lion', 'controller', 'fire']
odd_one_out(LoW, m)         # SUCCESSFUL!
odd_one_out(LoW2, m)        # SUCCESSFUL!
odd_one_out(LoW3, m)        # SUCCESSFUL! 
odd_one_out(LoW4, m)        # Is this one successful? Personally, I thought 'fire' would be the odd one out...

The odd word out is wallet with a similarity score of 1.3832762762904167
The odd word out is potato with a similarity score of 1.3954471051692963
The odd word out is basketball with a similarity score of 1.3539883252233267
The odd word out is game with a similarity score of 1.276320238597691


In [None]:
#
# Create and run three examples - of at least 4 words each - for your odd_one_out function, e.g.,
#        LoW = "apple banana cat pear".split()
# Also, note if you would describe them as successful, unsuccessful, or "other" !

In [None]:
# visualizing similarity through a heat map

# copy of the old starting code
key = 'python'
LoS = []
LoW = []
for w in initial_words:
    if w in m:  # is the word, w present in the vocabulary?
        similarity = m.similarity(key,w)
        print(f"similarity between {key} and {w}: {similarity:6.2f}", )
        LoS.append( similarity )
        LoW.append( w )
    else:
        print(f"  __  {w}  __ was not in the vocabulary", )   # not every word will be present

print(f"LoS is {LoS}")
print(f"LoW is {LoW}")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

my_data_list = [ LoS ]
my_dataframe = pd.DataFrame(my_data_list, columns=LoW)


# Drawing a heatmap with the numeric values in each cell
f, ax = plt.subplots(figsize=(15,10))  # (18, 12)
sns.heatmap(data=my_dataframe, annot=True, fmt="4.2f", linewidths=2, yticklabels=["python"], square=True, cmap="Purples", cbar=False, ax=ax)

ylocs, ylabels = plt.yticks()
plt.setp(ylabels, rotation=0, fontsize=15)
xlocs, xlabels = plt.xticks()
plt.setp(xlabels, rotation=70, fontsize=15)
"Result:"



<br>

####  An alternative geometric view of word-vectors...

In [None]:
def visualize_wordvecs(wordlist, model):
    """ example of finding an outlier with word2vec and graphically """

    for w in wordlist:
        if w not in model:
            print("Aargh - the model does not contain", w)
            print("Stopping...")
            return
    #
    # Next, we use PCA, Principal Components Analysis, to toss out 298 dimensions!
    # and create a scatterplot of the words...
    #
    # Intuitive description of PCA:   https://setosa.io/ev/principal-component-analysis/
    #
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    import numpy

    pca = PCA(n_components=2)             # 2 dimensions
    pca_model = pca.fit(model.vectors)    # all 43,981 words with 300 numbers each!
    LoM = [model.get_vector(w) for w in wordlist]   # list of models for each word w
    
    word_vectors = numpy.vstack(LoM)     # vstack creates a vertical column from a list
    transformed_words = pca_model.transform(word_vectors)  # transform to our 2d space

    # scatterplot
    plt.scatter(transformed_words[:,0],transformed_words[:,1])
    
    # This is matplotlib's code for _annotating_ graphs (yay!)
    for i, word in enumerate(wordlist):
        plt.annotate(word, (transformed_words[i,0], transformed_words[i,1]), size='large')
        # it's possible to be more sophisticated, but this is ok for now

    plt.show()
    return

In [None]:
#
# Example of calling visualize_wordvecs...
#
LoW = "breakfast lunch dinner".split()     #  cereal python, one two three four five twelve
visualize_wordvecs(LoW, m)    