# Data, Machines and the 🐍 
<img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/lessons/dmap/text/word-embeddings/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson 1•2•3
Each lesson will start with a similar template (given in the course schedule):  
1. **save** to your Google Drive (copy to drive)<br/><img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/copy-to-drive.png"/>
2. **update** the NET_ID to be your netID (no need to include @illinois.edu)
3. **run** the next cell to install the IDE. <img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/play-button.png"/>

In [0]:
LESSON_ID = 'dmap:text:word-embeddings'   # keep this as is
NET_ID    = 'CHANGE_ME' # CHANGE_ME to your netID (keep the quotes)

def install_ide(net_id, lesson_id):
  """Install the codestories package if needed and return a CodeStory object.

  net_id    -- passed to CodeStory as its first argument
  lesson_id -- passed to CodeStory as its second argument
  """
  import sys
  # sys.modules only lists modules already imported in this session, so the
  # pip step is skipped when this cell is re-run after a successful import
  if 'codestories' not in sys.modules:
      print('installing modules')
      # IPython shell escape (not plain Python); pip output goes to install.log
      !pip install git+https://mehaberman@bitbucket.org/mehaberman/codestories.git --upgrade &> install.log
  
  # imported here rather than at the top of the cell because the package may
  # only have been installed by the pip step above
  from codestories.cs.CodeStories import CodeStory
  return CodeStory(net_id, lesson_id)

ide = install_ide(NET_ID, LESSON_ID)
print(ide.welcome())

# Lesson Word Embeddings
(hit ▶️ to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

# Words as Vectors

In [0]:
# run to read the next section
ide.reader.view_section(2)

In [0]:
# this cell will take a long time to run
# you only need to run it once per session
def load_model(use_large=False):
    """Load the lesson's model through the LessonUtil helper module.

    Prints a warning when the installed spaCy is not version 2.2.4, then
    returns whatever ``LessonUtil.load_model(use_large)`` returns.
    """
    # import order kept as-is: LessonUtil first, then spacy
    import LessonUtil as Util
    import spacy

    expected_version = "2.2.4"
    version_matches = spacy.__version__ == expected_version
    if not version_matches:
        print("WARNING, spacy version may not have vectors")

    return Util.load_model(use_large)

# keep this as is
# load the model once and keep it in the notebook-wide name `nlp`;
# the later cells pass it to their demo functions
nlp = load_model(use_large=True)
# report the size of the model's vocabulary object
print('Total Words', len(nlp.vocab))

In [0]:
# run to read the next section
ide.reader.view_section(4)

In [0]:
def word_demo(md):
    """Print the length and the contents of the word vector for 'cat'.

    md is the loaded model; its vocab is indexed by the word's text.
    """
    lexeme = md.vocab['cat']
    vec = lexeme.vector

    # how many dimensions the word vector has
    print('vector length:', len(vec))

    # the raw numbers that make up the vector
    print('cat:', vec)
  
word_demo(nlp)

In [0]:
# run to read the next section
ide.reader.view_section(6)

In [0]:
# look up the vocabulary entries for three words
cat = nlp.vocab['cat']
dog = nlp.vocab['dog']
car = nlp.vocab['car']
# compare 'dog' with itself, with 'car' and with 'cat', using the
# similarity method of the vocabulary entry
print('The similarity between dog and dog:', dog.similarity(dog))
print('The similarity between dog and car:', dog.similarity(car))
print('The similarity between dog and cat:', dog.similarity(cat))

In [0]:
# run to read the next section
ide.reader.view_section(8)

In [0]:
import numpy as np
def spacy_sim(a, b):
    """Exercise placeholder: returns -1 for every input until implemented.

    a, b -- the two items to compare (the call below passes vocab entries)
    """
    not_implemented_yet = -1
    return not_implemented_yet

print(spacy_sim(nlp.vocab['dog'], nlp.vocab['car']))
# print(ide.tester.test_function(spacy_sim))

In [0]:
# run to read the next section
ide.reader.view_section(10)

In [0]:
def homemade_sim(a, b):
    """Return the cosine similarity of the vectors a and b.

    The result is dot(a, b) divided by the product of the two vector
    lengths, so it lies between -1 and 1.

    Returns 0.0 when either vector has zero length; dividing by a zero
    length would otherwise produce nan and a RuntimeWarning.
    """
    length_product = np.linalg.norm(a) * np.linalg.norm(b)
    if length_product == 0:
        # the direction of a zero vector is undefined; report "no similarity"
        return 0.0
    return np.dot(a, b) / length_product

def demo_vector_math(md):
    """Check whether two word pairs differ in a similar direction.

    Prints whether (father - mother) and (uncle - aunt) have a cosine
    similarity above 0.70.
    """
    names = ('father', 'mother', 'uncle', 'aunt')
    father, mother, uncle, aunt = [md.vocab[n].vector for n in names]

    # difference vectors for each pair
    parent_gap = father - mother
    relative_gap = uncle - aunt

    is_close = homemade_sim(parent_gap, relative_gap) > 0.70
    print('d1 and d2 close:', is_close)

demo_vector_math(nlp)

In [0]:
# run to read the next section
ide.reader.view_section(12)

In [0]:
def all_words_except(md, word):
    """Return the lowercase vocabulary entries that have a vector, minus `word`.

    md   -- the model; md.vocab is iterated entry by entry
    word -- the vocabulary entry to leave out (matched on its lowercase text)
    """
    skip = word.lower_
    kept = []
    for entry in md.vocab:
        if not entry.has_vector:
            continue
        if not entry.is_lower:
            continue
        if entry.lower_ == skip:
            continue
        kept.append(entry)
    return kept

def most_similar(md, name, topn=10):
    """Return the `topn` vocabulary entries most similar to the word `name`.

    Every candidate from all_words_except is scored with the target
    entry's own similarity method; the best match comes first.
    """
    target = md.vocab[name]

    ranked = list(all_words_except(md, target))

    # highest similarity first
    ranked.sort(key=target.similarity, reverse=True)

    return ranked[:topn]

def demo_close_words(md, word):
    """Print the text of the entries that most_similar returns for `word`."""
    texts = []
    for neighbor in most_similar(md, word):
        texts.append(neighbor.text)
    print(texts)

# this can take a long time to run
demo_close_words(nlp, 'car')

In [0]:
# run to read the next section
ide.reader.view_section(14)

In [0]:
def cos(v1, v2):
    """Return the cosine similarity of the vectors v1 and v2.

    Returns 0.0 when either vector has zero length. Dividing by a zero
    length would give nan, and nan keys make a sort order unreliable
    because every comparison against nan is False.
    """
    length_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if length_product == 0:
        return 0.0
    return np.dot(v1, v2) / length_product
def all_words_except(md, exclude_set):
    """Return lowercase vocabulary entries with vectors, minus excluded terms.

    md          -- the model; md.vocab is iterated entry by entry
    exclude_set -- strings that are part of the equation (the input words)

    NOTE: this shares its name with the earlier one-word version, so running
    this cell rebinds the name that most_similar looks up.
    """
    def usable(entry):
        if not (entry.has_vector and entry.is_lower):
            return False
        # substring test: an entry is dropped when its lowercase text
        # merely contains one of the excluded terms
        return not any(term in entry.lower_ for term in exclude_set)

    return [entry for entry in md.vocab if usable(entry)]

def find_closest(md, vector, exclude, topn=50):
    """Return the `topn` vocabulary entries whose vectors are closest to `vector`.

    Candidates come from all_words_except(md, exclude) and are ranked by
    cos(vector, entry.vector), best match first.
    """
    def score(entry):
        return cos(vector, entry.vector)

    ranked = list(all_words_except(md, exclude))
    ranked.sort(key=score, reverse=True)
    return ranked[:topn]

def demo_word_math(md):
    """Add the 'math' and 'symbol' vectors and print the closest other word."""
    terms = ['math', 'symbol']
    first, second = terms

    combined = md.vocab[first].vector + md.vocab[second].vector

    # the input words themselves are excluded from the candidates
    matches = find_closest(md, combined, terms)
    print(matches[0].text)

In [0]:
# run to read the next section
ide.reader.view_section(16)

In [0]:
# type&run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(18)

In [0]:
# type&run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(20)

In [0]:
def reduce_dimensions(md, labels):
    """Project the word vectors for `labels` down to two dimensions with t-SNE.

    md     -- the model; md.vocab[word].vector supplies each word vector
    labels -- list of words (strings) to look up

    Returns the array produced by TSNE.fit_transform, one 2-D row per label.
    """
    from sklearn.manifold import TSNE
    import numpy as np

    data = np.array([md.vocab[w].vector for w in labels])

    # t-SNE needs perplexity < number of samples. The library default (30)
    # is too large for a handful of words and is rejected by current
    # scikit-learn, so cap it by the sample count (30 is kept for big inputs).
    perplexity = float(min(30, max(len(data) - 1, 1)))

    # reduce to two
    tsne_model = TSNE(n_components=2, perplexity=perplexity)
    data_2d = tsne_model.fit_transform(data)

    return data_2d

def plot_results(data_2d, labels):
    """Scatter-plot the 2-D points and label each one; return the figure.

    data_2d -- array indexed as [row, column]; columns 0 and 1 are x and y
    labels  -- one text label per row, in the same order
    """
    import matplotlib.pyplot as plt

    xs = data_2d[:, 0]
    ys = data_2d[:, 1]

    figure, ax = plt.subplots()
    ax.scatter(xs, ys, s=100)

    # write each label slightly offset from its point
    for idx, label in enumerate(labels):
        ax.annotate(label, (xs[idx], ys[idx]), xytext=(2, 3), textcoords='offset points')

    ax.grid()
    return figure

def show_vector_space(md):
    """Plot 'king', 'man', 'queen' and 'woman' after reducing them to 2-D."""
    words = ['king', 'man', 'queen', 'woman']
    points = reduce_dimensions(md, words)
    plot_results(points, words)

show_vector_space(nlp)

In [0]:
# run to read the next section
ide.reader.view_section(22)

# One Hot Encoding

In [0]:
# run to read the next section
ide.reader.view_section(23)

# Lesson Assignment

In [0]:
# run to read the next section
ide.reader.view_section(24)

In [0]:
def find_best_fit(md, result, words, topn=15):
    """Assignment stub: find the words that best fit a target vector.

    md     -- the nlp model
    result -- a vector you're trying to get close to
    words  -- a list of spacy objects/words (3 of them) that need to be
              excluded (e.g. nlp.vocab['man'])

    The finished version should return the topn candidates, in order so
    index 0 is the best fit, and it should always have at least one word
    in it. Not implemented yet: an empty list is returned.
    """
    candidates = []
    return candidates
    
def find_analogy(md, three_words):
    """Assignment stub: solve a word analogy.

    md          -- the nlp model
    three_words -- a list of strings (e.g. ['king', 'man', 'woman'])

    The finished version should return the text (a string) for the word
    that meets the analogy, and it should use find_best_fit.
    Not implemented yet: None is returned.
    """
    answer = None
    return answer

In [0]:
# run to read the next section
ide.reader.view_section(26)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(27)

In [0]:
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# once you are ready -- run this 
# ide.tester.download_solution()