# Data, Machines and the 🐍
<img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/lessons/dmap/mlu/word2vec/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson 1•2•3
Each lesson will start with a similar template (given in the course schedule):  
1. **save** to your Google Drive (copy to drive)<br/><img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/copy-to-drive.png"/>
2. **update** the NET_ID to be your netID (no need to include @illinois.edu)
3. **run** the next cell to install the IDE. <img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/play-button.png"/>

In [0]:
LESSON_ID = 'dmap:mlu:word2vec'   # keep this as is
NET_ID    = 'CHANGE_ME' # CHANGE_ME to your netID (keep the quotes)

def install_ide(net_id, lesson_id):
  """Install the codestories package when needed and build the lesson IDE.

  Args:
    net_id: the student's netID, passed straight through to CodeStory.
    lesson_id: the lesson identifier, passed straight through to CodeStory.

  Returns:
    The CodeStory object constructed from (net_id, lesson_id).
  """
  import sys
  # sys.modules only lists modules already imported by this kernel, so in a
  # fresh session the pip install below normally runs on the first call
  if 'codestories' not in sys.modules:
      print('installing modules')
      # IPython shell escape; the command's output is redirected to install.log
      !pip install git+https://mehaberman@bitbucket.org/mehaberman/codestories.git --upgrade &> install.log
  
  # imported here (not at the top of the cell) because the package may only
  # exist after the install step above
  from codestories.cs.CodeStories import CodeStory
  return CodeStory(net_id, lesson_id)

# `ide` is used by the later cells (ide.reader / ide.tester)
ide = install_ide(NET_ID, LESSON_ID)
print(ide.welcome())

# Lesson Word2Vec
(hit ▶ to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

In [0]:
import gzip
import gensim
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import LessonUtil as Util

def build_dataset_raw():
  """Read the gzipped cars CSV line by line and tokenize every line.

  Each raw line of the file (header line included) is passed through
  gensim.utils.simple_preprocess.

  Returns:
    A list with one entry per line of the file; each entry is the list of
    tokens simple_preprocess produced for that line.
  """

  # here's an example of how to use a zipped (compressed) file
  filename = Util.path_for_data('cars.csv.gz')
  # https://www.kaggle.com/CooperUnion/cardataset?select=data.csv

  # the context manager closes the file handle once every line has been
  # consumed (the handle used to be left open)
  with gzip.open(filename, 'rb') as file:
    # clean and tokenize the text
    return [gensim.utils.simple_preprocess(line) for line in file]

def test_raw():
  """Print two tokenized lines of the raw dataset: line 0 and line 10."""
  rows = build_dataset_raw()
  # 0 -> make note of the column names, 10 -> row '9'
  for position in (0, 10):
    print(rows[position])

test_raw()

In [0]:
# run to read the next section
ide.reader.view_section(3)

In [0]:
import LessonUtil as Util
def build_dataset():
  """Load the cars CSV with pandas and turn each row into a list of tokens.

  Only the categorical columns listed in `features` are kept. Every cell is
  converted with str() and split on ',' so a multi-valued cell contributes
  one token per value; the tokens of a row are concatenated in column order.

  Returns:
    (doc, df): `doc` is a list of token lists (one per row) and `df` is the
    DataFrame restricted to the selected feature columns.
  """

  # another way to read compressed data
  csv_path = Util.path_for_data('cars.csv.gz')
  cars = pd.read_csv(csv_path, compression='gzip')

  # feature selection
  # select the fields we want to train word2vec on
  features = ['Market Category','Vehicle Size','Vehicle Style',
              'Engine Fuel Type','Transmission Type','Driven_Wheels']

  selected = cars[features]

  # one inner list per row: flatten the comma-separated pieces of every cell
  doc = [
    [piece for cell in record for piece in str(cell).split(',')]
    for record in selected.values
  ]

  return doc, selected

def test_pd_data():
  """Print the first five tokens of the first row built by build_dataset."""
  rows, _ = build_dataset()
  first_row = rows[0]
  print(first_row[:5])

test_pd_data()

# ==> ['Factory Tuner', 'Luxury', 'High-Performance', 'Compact', 'Coupe']

In [0]:
# run to read the next section
ide.reader.view_section(5)

In [0]:
# type & run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(7)

In [0]:
def evaluate_model(model, df=None):
  """Build a short text report about a trained embedding model.

  The report contains, in order:
    * (only when `df` is given) how many unique df['Make_Model'] values are
      absent from the model's vocabulary,
    * the similarity of 'Toyota Camry' to three other make/model names,
    * the three names most similar to 'Honda Odyssey', comma separated.

  Args:
    model: object exposing `.wv` with membership testing (`key in wv`),
      `similarity(a, b)` and `most_similar(positive=..., topn=...)`.
    df: optional frame with a 'Make_Model' column. A missing column raises
      KeyError to the caller (that lookup is outside the try block).

  Returns:
    The report as a single string. A KeyError raised by the vocabulary
    lookups is caught and appended as "\nError:<message>".
  """

  output = ''
  if df is not None:
    unique_set = df['Make_Model'].unique()
    # `key in model.wv` works on both gensim 3.x and 4.x; the `wv.vocab`
    # dict used previously no longer exists in gensim 4
    missing = sum(1 for mm in unique_set if mm not in model.wv)
    output += "{:d} models are missing of {:d}\n".format(missing, len(unique_set))

  try:
    t = 'Toyota Camry'
    other = ['Honda Accord', 'Nissan Van', 'Mercedes-Benz SLK-Class']
    for o in other:
      output += t + '->' + o + ' ' + "{:0.4f}\n".format(model.wv.similarity(t,o))

    tuples = model.wv.most_similar(positive='Honda Odyssey', topn=3)
    # join leaves no trailing separator, so nothing has to be stripped from
    # the ends of the whole report afterwards
    output += ', '.join(mm for mm, _ in tuples)

  except KeyError as e:
    # a name used above is not in the vocabulary; report it instead of failing
    output += "\nError:" + str(e)

  return output

def test_v0():
  """Train the v0 model on the car corpus and print its evaluation report."""
  corpus, frame = build_dataset()
  # NOTE(review): build_model_v0 is not defined in the visible cells --
  # presumably it is written in an exercise cell; confirm it runs first
  report = evaluate_model(build_model_v0(corpus), frame)
  print(report)

test_v0()

In [0]:
# run to read the next section
ide.reader.view_section(9)

In [0]:
# type & run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(11)

In [0]:
def build_model_v2(doc):
  """Train a Word2Vec model on `doc` with a wider window and more epochs.

  Args:
    doc: iterable of token lists (one list per sentence/row).

  Returns:
    The trained gensim Word2Vec model.
  """
  # gensim 4 renamed the `iter` keyword to `epochs`; pick whichever name the
  # installed release accepts so this cell runs on both 3.x and 4.x
  epochs_kw = 'epochs' if int(gensim.__version__.split('.')[0]) >= 4 else 'iter'
  model = gensim.models.Word2Vec(
          doc,
          min_count=1, # keep every word (only words seen fewer than 1 time are ignored)
          workers=1,   # threads to use
          window=10,   # size of window around the target word
          **{epochs_kw: 15}  # 15 epochs
          )
  return model

def test_v2():
  """Train the v2 model on the car corpus and print its evaluation report."""
  corpus, frame = build_dataset()
  report = evaluate_model(build_model_v2(corpus), frame)
  print(report)

test_v2()

In [0]:
# run to read the next section
ide.reader.view_section(13)

# Two Ways To Train

In [0]:
# run to read the next section
ide.reader.view_section(14)

In [0]:
def build_model_v3(doc):
  """Train a skip-gram Word2Vec model on `doc`.

  Args:
    doc: iterable of token lists (one list per sentence/row).

  Returns:
    The trained gensim Word2Vec model (100-dimensional vectors, sg=1).
  """
  # gensim 4 renamed `iter` -> `epochs` and `size` -> `vector_size`; choose
  # the names the installed release accepts so this runs on 3.x and 4.x
  is_v4 = int(gensim.__version__.split('.')[0]) >= 4
  renamed = {
    ('epochs' if is_v4 else 'iter'): 15,        # how many times to iterate over the corpus (train)
    ('vector_size' if is_v4 else 'size'): 100,  # how big the output vectors (spacy == 300)
  }
  model = gensim.models.Word2Vec(
            doc,
            min_count=1, # keep every word (only words seen fewer than 1 time are ignored)
            workers=1,   # threads to use
            window=10,   # size of window around the target word
            sg=1,        # 0 == CBOW (default) 1 == skip gram
            **renamed
          )
  return model

def test_v3():
  """Train the v3 (skip-gram) model on the car corpus and print its report."""
  corpus, frame = build_dataset()
  report = evaluate_model(build_model_v3(corpus), frame)
  print(report)

In [0]:
# run to read the next section
ide.reader.view_section(16)

# GloVe (2014)

In [0]:
# run to read the next section
ide.reader.view_section(17)

In [0]:
def install_build_glove():
  """Fetch and compile GloVe, then stage a sample text file, via shell escapes.

  Runs three shell commands in order: clone the stanfordnlp/GloVe repository
  into ./glove, run `make` inside it, and copy harryPotter.txt into
  /content/sample_data. Returns nothing.
  """
  # copy the software to the VM
  !git clone https://github.com/stanfordnlp/GloVe.git glove
  # compile the software
  !cd glove && make
  # copy a small dataset for GloVe to use
  # (both paths are absolute /content/... locations, so this step assumes
  #  that directory layout exists on the machine running the notebook)
  !cp /content/info490/INFO490Assets/src/datasets/books/hp/harryPotter.txt /content/sample_data

install_build_glove()

In [0]:
# run to read the next section
ide.reader.view_section(19)

In [0]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

def run_glove():
  !cd glove ; ./demo.sh

def test_glove():
  """Convert the GloVe vectors to word2vec format, load them, and print a sample.

  Prints the value returned by glove2word2vec, the type of the loaded model,
  and the five entries most similar to 'Harry'.
  """
  glove_path, w2v_path = 'glove/vectors.txt', 'vec.word2vec'

  # convert glove to word2vec format
  print('voc. size, vector size', glove2word2vec(glove_path, w2v_path))

  # now read in the format
  hp_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
  print(type(hp_model))
  neighbours = hp_model.most_similar('Harry', topn=5)
  print(neighbours)

run_glove()
test_glove()

In [0]:
# run to read the next section
ide.reader.view_section(21)

In [0]:
import gensim.downloader as api
import time

def load_glove_model(resource):
  """Load a pretrained model through gensim's downloader and report the load time.

  Args:
    resource: name of the downloader resource, e.g. 'glove-wiki-gigaword-50'.

  Returns:
    Whatever api.load(resource) returns for that resource.
  """
  #print(api.info())
  print(api.info(resource))
  # perf_counter is monotonic, so the measured interval cannot be skewed by
  # system clock adjustments the way time.time() differences can
  st = time.perf_counter()

  # download the model, unzip, convert it
  model = api.load(resource)

  print("load time", time.perf_counter() - st)
  return model

#gw50 = load_glove_model('glove-wiki-gigaword-50')   # 66 MB,  about a minute
#gt200 = load_glove_model('glove-twitter-200')       # 758 MB, about 7 minutes

In [0]:
# run to read the next section
ide.reader.view_section(23)

# fastText  (2016)

In [0]:
# run to read the next section
ide.reader.view_section(24)

# Summary

In [0]:
# run to read the next section
ide.reader.view_section(25)

# Lesson Assignment

In [0]:
# run to read the next section
ide.reader.view_section(26)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(27)

In [0]:
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# once you are ready -- run this 
# ide.tester.download_solution()