# Embeddings

### Setup

In [0]:
from google.colab import drive
import sys, os

%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import re
import time
from contextlib import contextmanager
from sklearn.externals import joblib


In [131]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [163]:
os.getcwd()

'/content/gdrive/My Drive/LSTM-light'

In [0]:
# Create a symbolic link to omit issues with whitespace in "My Drive"
!ln -s /content/gdrive/"My Drive"/ /MyDrive

In [0]:
PROJECT_HOME_PATH = os.path.join('/MyDrive', 'LSTM-light')

In [164]:
cd '/MyDrive/LSTM-light'

/content/gdrive/My Drive/LSTM-light


In [0]:
os.chdir(PROJECT_HOME_PATH)

In [0]:
DATA_PATH = os.path.join(PROJECT_HOME_PATH, 'DATA')

In [0]:
# Show the full text of every column. Recent pandas spells "no limit" as None and
# rejects -1; older releases only accept the integer, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [0]:
@contextmanager
def elapsed_time():
  """Context manager that prints how long the enclosed block took, in seconds.

  The duration is printed even when the block raises; the exception then
  propagates unchanged.
  """
  # perf_counter is monotonic, so the measurement is not affected by clock adjustments.
  start_time = time.perf_counter()
  try:
    yield
  finally:
    duration = time.perf_counter() - start_time
    print(f'Time took: {duration}')

### Load dataset

In [0]:
df_data = joblib.load(os.path.join(DATA_PATH, 'interim', 'quora_mod.dat'))

In [169]:
df_data.head()

Unnamed: 0,qid,question_text,question_text_mod,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,quebec nationalists see province nation,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,velocity affect time velocity affect space geometry,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg hemispheres?,otto von guericke used magdeburg hemispheres,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain bike by just changing the tyres?,convert montra helicon mountain bike changing tyres,0


In [0]:
notes = df_data.question_text_mod

In [0]:
notes = list(notes)

In [0]:
len(notes)

1305497

In [0]:
notes[:10]

['quebec nationalists see province nation',
 'adopted dog would encourage people adopt shop',
 'velocity affect time velocity affect space geometry',
 'otto von guericke used magdeburg hemispheres',
 'convert montra helicon mountain bike changing tyres',
 'gaza slowly becoming auschwitz dachau treblinka palestinians',
 'quora automatically ban conservative opinions reported liberal views',
 'crazy wash wipe groceries germs everywhere',
 'thing dressing moderately different dressing modestly',
 'ever phase wherein became ignorant people loved completely disregarding feelings lives get something go way feel temporarily ease things change']

### Subwords

In this step we use the *subword-nmt* (Subword Neural Machine Translation) toolkit to learn byte-pair-encoding (BPE) codes and a subword vocabulary from the notes.

Source: https://github.com/rsennrich/subword-nmt

In [0]:
!pip install subword-nmt

Collecting subword-nmt
  Downloading https://files.pythonhosted.org/packages/26/08/58267cb3ac00f5f895457777ed9e0d106dbb5e6388fa7923d8663b04b849/subword_nmt-0.3.6-py2.py3-none-any.whl
Installing collected packages: subword-nmt
Successfully installed subword-nmt-0.3.6


In [0]:
import collections
import tempfile
from subword_nmt.learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab

In [0]:
# Lightweight stand-in for a file-like argument exposing a ``name`` attribute.
# NOTE(review): presumably ``learn_joint_bpe_and_vocab`` only reads ``.name`` from these — confirm
# against the installed subword-nmt version.
Path = collections.namedtuple('Path', ['name'])


class LearnJointBpeVocabArgs:
    """
    Argument holder passed to ``learn_joint_bpe_and_vocab``.

    ``learn_joint_bpe_and_vocab`` takes a single argparse-style namespace, so
    this class mimics one by exposing every option as an attribute.

    Args:
        input_: Path of the text file to learn from (stored as a one-element list of ``Path``).
        output: Path where the learned codes are written (stored as ``Path``).
        vocab: Path where the vocabulary is written (stored as a one-element list of ``Path``).
        symbols: Number of symbols (merge operations) to learn.
        min_frequency: Minimum pair frequency for a merge to be learned.
        verbose: Whether subword-nmt should print progress.
        separator: Subword separator string.
        total_symbols: Treated by subword-nmt as an on/off flag (any truthy
            value enables it); the default of 1000 is kept for backward compatibility.
        num_workers: Number of worker processes.
            NOTE(review): read by newer subword-nmt releases only — confirm for the pinned version.
    """

    def __init__(self, input_, output, vocab, symbols=10000, min_frequency=5, verbose=False,
                 separator='@@', total_symbols=1000, num_workers=1):
        self.input = [Path(input_)]
        self.output = Path(output)
        self.vocab = [Path(vocab)]
        self.symbols = symbols
        self.min_frequency = min_frequency
        self.verbose = verbose
        self.separator = separator
        self.total_symbols = total_symbols
        self.num_workers = num_workers

        
def generate_subwords_vocab(notes, output_codes_path, output_vocab_path, symbols=10000,
                           min_frequency=5, verbose=False, separator='@@', total_symbols=1000):
    """Learn BPE codes and a subword vocabulary from ``notes``.

    The notes are dumped one per line into a temporary file, because
    ``learn_joint_bpe_and_vocab`` only accepts files. That function writes its
    results to ``output_codes_path`` and ``output_vocab_path`` itself, so
    nothing is returned here.

    Args:
        notes: Iterable of strings, one text per item.
        output_codes_path: Destination of the learned codes.
        output_vocab_path: Destination of the vocabulary.
        symbols, min_frequency, verbose, separator, total_symbols: Forwarded
            unchanged to ``LearnJointBpeVocabArgs``.
    """
    # delete=False: the file has to survive closing so it can be reopened by path.
    notes_temp_file = tempfile.NamedTemporaryFile('wt', encoding='utf-8', delete=False)
    notes_temp_file_path = notes_temp_file.name

    try:
        with notes_temp_file:
            notes_temp_file.writelines(note + '\n' for note in notes)

        # this function automatically saves the result
        learn_joint_bpe_and_vocab(LearnJointBpeVocabArgs(
            input_=notes_temp_file_path,
            output=output_codes_path,
            vocab=output_vocab_path,
            verbose=verbose,
            symbols=symbols,
            min_frequency=min_frequency,
            separator=separator,
            total_symbols=total_symbols
        ))
    finally:
        # Clean up even when writing or learning fails, so no temp file is left behind.
        os.remove(notes_temp_file_path)
  

In [0]:
CODES_PATH = os.path.join(PROJECT_HOME_PATH, 'subwords', 'codes.txt')
VOCAB_PATH = os.path.join(PROJECT_HOME_PATH, 'subwords', 'vocab.txt')

In [0]:
# The output directory has to exist before the codes and vocabulary files can be written.
os.makedirs(os.path.dirname(CODES_PATH), exist_ok=True)
os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)

generate_subwords_vocab(
    notes=notes,
    output_codes_path=CODES_PATH,
    output_vocab_path=VOCAB_PATH,
    verbose=True,
    symbols=10000,
    min_frequency=5,
    # On/off flag in subword-nmt (the previous value 1000 only worked because it is truthy).
    total_symbols=True
)

Number of word-internal characters: 26
Number of word-final characters: 26
Reducing number of merge operations by 52
pair 0: i n -> in (frequency 956914)
pair 1: r e -> re (frequency 579112)
pair 2: a n -> an (frequency 505586)
pair 3: t i -> ti (frequency 480881)
pair 4: e r -> er (frequency 479279)
pair 5: e n -> en (frequency 447372)
pair 6: o n -> on (frequency 439299)
pair 7: in g</w> -> ing</w> (frequency 405597)
pair 8: a r -> ar (frequency 395912)
pair 9: s t -> st (frequency 353751)
pair 10: o r -> or (frequency 352285)
pair 11: l i -> li (frequency 320553)
pair 12: e r</w> -> er</w> (frequency 305731)
pair 13: a t -> at (frequency 305446)
pair 14: e s</w> -> es</w> (frequency 297127)
pair 15: o u -> ou (frequency 292979)
pair 16: e d</w> -> ed</w> (frequency 285813)
pair 17: a l -> al (frequency 254308)
pair 18: o n</w> -> on</w> (frequency 250868)
pair 19: s i -> si (frequency 241751)
pair 20: a c -> ac (frequency 240895)
pair 21: o m -> om (frequency 237898)
pair 22: e s ->

Now we replace the words in our **notes** with the corresponding subwords and save them all as one text file called **corpus**. In the next step we use **corpus** to train GloVe embeddings.

In [0]:
CORPUS_OUTPUT_PATH = os.path.join(DATA_PATH, 'glove', 'corpus.txt')

In [0]:
from subword_nmt import apply_bpe
from subword_nmt.apply_bpe import read_vocabulary

In [0]:
# Minimum frequency a subword must have in the vocabulary to be used.
vocab_threshold = None

with open(VOCAB_PATH, encoding='utf-8') as vocab_file:
    vocab = read_vocabulary(vocab_file, threshold=vocab_threshold)

with open(CODES_PATH, encoding='utf-8') as codes_file:
    bpe = apply_bpe.BPE(codes_file, vocab=vocab)

os.makedirs(os.path.dirname(CORPUS_OUTPUT_PATH), exist_ok=True)

# mode='w' rather than 'x': with 'x' re-running the cell raises FileExistsError
# as soon as the corpus from a previous run is present.
with open(CORPUS_OUTPUT_PATH, mode='w', encoding='utf-8') as output_file:
    output_file.writelines(bpe.process_line(note) + '\n' for note in notes)
  

### Glove embeddings

In [0]:
### Required steps to generate corpus and vocab when not using subwords ###

In [0]:
# Plain-word corpus: every note on its own line, no subword segmentation.
# NOTE: this is the same path the subword corpus is written to, so running this cell replaces it.
CORPUS_OUTPUT_PATH = os.path.join(DATA_PATH, 'glove', 'corpus.txt')

with open(CORPUS_OUTPUT_PATH, mode='w', encoding='utf-8') as corpus_file:
    for note in notes:
        corpus_file.write(note + '\n')
    

In [0]:
VOCAB_PATH = os.path.join(PROJECT_HOME_PATH, 'subwords', 'vocab.txt')

# In this cell the threshold is a vocabulary size (keep the N most frequent words),
# not a minimum frequency.
vocab_threshold = 20000

# str.split() without an argument drops the empty tokens that split(' ') produces for
# empty notes or repeated spaces; those would otherwise be written as a blank vocabulary entry.
# Counting from a generator also avoids materialising every token of the corpus in a list.
word_counts = collections.Counter(word for note in notes for word in note.split())

# most_common() is ordered by descending count, like value_counts().
vocab = pd.Series(dict(word_counts.most_common()), dtype='int64')

os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)
vocab[:vocab_threshold].to_csv(VOCAB_PATH, sep=' ', index=True, header=False)


#### Download GloVe repository

In [56]:
glove_repo_link = 'https://github.com/stanfordnlp/GloVe.git'

print(f'Please download GloVe project from the repository and place it under PROJECT_HOME_PATH: {glove_repo_link}')

Please download GloVe project from the repository and place it under PROJECT_HOME_PATH: https://github.com/stanfordnlp/GloVe.git


In [57]:
os.chdir(PROJECT_HOME_PATH)

/content/gdrive/My Drive/LSTM-light


In [50]:
!git clone https://github.com/stanfordnlp/GloVe.git

Cloning into 'GloVe'...
remote: Enumerating objects: 431, done.[K
remote: Total 431 (delta 0), reused 0 (delta 0), pack-reused 431[K
Receiving objects: 100% (431/431), 176.26 KiB | 867.00 KiB/s, done.
Resolving deltas: 100% (236/236), done.


In [0]:
GLOVE_PATH = os.path.join(PROJECT_HOME_PATH, 'GloVe')

#### Execute make

In [0]:
from subprocess import Popen, PIPE
import subprocess

In [0]:
#TODO: Describe what make is for

In [0]:
def execute_script(file_path, check=False):
    """Run an executable script and return what it wrote to stdout and stderr.

    Args:
        file_path: Path of the script. A relative path is resolved against the
            current working directory; an absolute path is used as given.
        check: When true, raise if the script exits with a non-zero code
            instead of returning silently.

    Returns:
        Tuple ``(stdout, stderr)`` as bytes.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the exit code is non-zero.
    """
    # A bare relative name would be searched on PATH, hence the './' prefix.
    # Prefixing an absolute path would turn it into a broken relative one.
    command = file_path if os.path.isabs(file_path) else './{}'.format(file_path)
    p = Popen([command], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate()
    if check and p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, command, output=output, stderr=err)
    return output, err

In [84]:
# make GloVe: compile the tools in the GloVe checkout with its Makefile.

if not os.path.exists(GLOVE_PATH):
    print(f'Please download GloVe project from the respository and place it under: {GLOVE_PATH}')
else:
    # The working directory is switched to the checkout, as before.
    os.chdir(GLOVE_PATH)
    print('Executing make')
    # Run make directly instead of through a generated shell script, and keep the
    # exit code so a failed build is not reported as finished.
    make_result = subprocess.run(['make'], stdout=PIPE, stderr=PIPE)
    if make_result.returncode != 0:
        print(make_result.stderr.decode('utf-8', errors='replace'))
        raise RuntimeError(f'make failed with exit code {make_result.returncode}')
    print('Finished')


Executing make
./build_glove.sh
Finished


In [0]:
GLOVE_BIN_PATH = os.path.join(PROJECT_HOME_PATH, 'GloVe', 'build')

### Generate co-occurrence statistics

The GloVe model is trained on the non-zero entries of a global word-word co-occurrence matrix, which tabulates how frequently words co-occur with one another in a given corpus. Populating this matrix requires a single pass through the entire corpus to collect the statistics. For large corpora, this pass can be computationally expensive, but it is a one-time up-front cost.

The core training code is separated from these preprocessing steps and can be executed independently.

In [0]:
subwords_symbols = 0
subwords_min_frequency = 0
glove_windows_size = 15
glove_iterations = 15
glove_vector_size = 50

In [0]:
# The parameter values are embedded in the co-occurrence file name.
subwords_params = f'{subwords_symbols}_{subwords_min_frequency}'
cooccur_params = f'{subwords_params}_{glove_windows_size}'
COOCCUR_PATH = os.path.join(DATA_PATH, 'glove', 'cooccurrence_' + cooccur_params)

In [0]:
def run_cooccur(vocab_path, corpus_path, cooccur_output_path, windows_size=15, verbose=False):
    """Run GloVe's ``cooccur`` and ``shuffle`` binaries from ``GLOVE_BIN_PATH``.

    ``cooccur`` reads the corpus on stdin and its output is stored in
    ``<cooccur_output_path>.bin``; ``shuffle`` then reads that file and its
    output is stored in ``<cooccur_output_path>.shuf.bin``.

    Args:
        vocab_path: Vocabulary file passed as ``-vocab-file``.
        corpus_path: Corpus text file fed to ``cooccur``.
        cooccur_output_path: Output path WITHOUT extension.
        windows_size: Value passed as ``-window-size``.
        verbose: Passes ``-verbose 2`` when true, ``-verbose 0`` otherwise.

    Raises:
        ValueError: If ``cooccur_output_path`` has an extension.
        subprocess.CalledProcessError: If one of the binaries exits with a non-zero code.
        OSError: If a binary or an input file cannot be opened.
    """
    if os.path.splitext(cooccur_output_path)[1]:
        raise ValueError(f'cooccur_output_path must not have any extension: {cooccur_output_path}')

    # Both names are derived from the extension-less base; str.replace('.bin', ...) would
    # also rewrite a '.bin' that happens to occur in a directory name.
    cooccur_bin_path = cooccur_output_path + '.bin'
    cooccur_shuf_path = cooccur_output_path + '.shuf.bin'
    verbosity = str(2 if verbose else 0)

    # Argument lists plus explicit stdin/stdout handles instead of a shell string:
    # paths containing spaces or shell metacharacters are passed through untouched.
    with open(corpus_path, 'rb') as corpus_file, open(cooccur_bin_path, 'wb') as cooccur_file:
        subprocess.check_call(
            [
                os.path.join(GLOVE_BIN_PATH, 'cooccur'),
                '-vocab-file', str(vocab_path),
                '-window-size', str(windows_size),
                '-verbose', verbosity,
            ],
            stdin=corpus_file,
            stdout=cooccur_file,
        )

    with open(cooccur_bin_path, 'rb') as cooccur_file, open(cooccur_shuf_path, 'wb') as shuffled_file:
        subprocess.check_call(
            [os.path.join(GLOVE_BIN_PATH, 'shuffle'), '-verbose', verbosity],
            stdin=cooccur_file,
            stdout=shuffled_file,
        )

In [0]:
run_cooccur(VOCAB_PATH, CORPUS_OUTPUT_PATH, COOCCUR_PATH, windows_size=glove_windows_size)

### Train GloVe embeddings

In [0]:
# The training parameters are appended to the co-occurrence parameters in the vectors file name.
glove_params = f'{cooccur_params}_{glove_iterations}_{glove_vector_size}'
VECTORS_PATH = os.path.join(DATA_PATH, 'glove', 'vectors_' + glove_params)

In [0]:
def run_glove(cooccur_path, vocab_path, vectors_output_path, vector_size=50, iterations=15,
             learning_rate=0.05, x_max=100, alpha=0.75, verbose=False, word2vec_format=False):
    """Train embeddings with the ``glove`` binary from ``GLOVE_BIN_PATH``.

    Args:
        cooccur_path: Extension-less co-occurrence path; ``.shuf.bin`` is appended
            to form the ``-input-file`` argument.
        vocab_path: Vocabulary file passed as ``-vocab-file``.
        vectors_output_path: Output path WITHOUT extension, passed as ``-save-file``
            (glove adds the extension itself).
        vector_size: Passed as ``-vector-size``.
        iterations: Passed as ``-iter``.
        learning_rate: Passed as ``-eta``.
        x_max: Passed as ``-x-max``.
        alpha: Passed as ``-alpha``.
        verbose: Passes ``-verbose 2`` when true, ``-verbose 0`` otherwise.
        word2vec_format: Passed as ``-write-header`` (1 when true, 0 otherwise).

    Raises:
        ValueError: If ``vectors_output_path`` has an extension.
        subprocess.CalledProcessError: If glove exits with a non-zero code.
        OSError: If the binary cannot be started.
    """
    if os.path.splitext(vectors_output_path)[1]:
        raise ValueError(f'vectors_output_path must not have any extansions: {vectors_output_path}')

    # os.cpu_count() returns None when the count cannot be determined.
    threads = os.cpu_count() or 1

    # The former ``replace('.txt.', '')`` is gone: the extension check above already
    # guarantees an extension-less name, and the replace could only damage a directory
    # component that happened to contain '.txt.'.
    # An argument list (no shell) keeps paths with spaces or metacharacters intact.
    subprocess.check_call([
        os.path.join(GLOVE_BIN_PATH, 'glove'),
        '-input-file', cooccur_path + '.shuf.bin',
        '-vocab-file', str(vocab_path),
        '-write-header', str(int(word2vec_format)),
        '-vector-size', str(vector_size),
        '-iter', str(iterations),
        '-eta', str(learning_rate),
        '-x-max', str(x_max),
        '-alpha', str(alpha),
        '-threads', str(threads),
        '-save-file', str(vectors_output_path),
        '-verbose', str(2 if verbose else 0),
    ])

In [0]:
run_glove(COOCCUR_PATH, VOCAB_PATH, VECTORS_PATH, vector_size=glove_vector_size)

In [0]:
### Visualize embeddings in Tensorboard

In [0]:
import tensorflow as tf
from tensorboard import main as tb
from tensorflow.contrib.tensorboard.plugins import projector

In [0]:
# Each line of the vectors file is "<word> <v1> <v2> ...": the first token is the word,
# the remaining tokens are the components of its embedding.
VECTORS_FILE_PATH = VECTORS_PATH + '.txt'
words, embeddings = [], []

with open(VECTORS_FILE_PATH, encoding='utf-8') as vectors_file:
    for token, *components in map(str.split, vectors_file):
        words.append(token)
        embeddings.append(np.array(components, dtype=float))

# Stack the per-word vectors into a single array.
embeddings = np.array(embeddings)

In [226]:
print(f'\tembeddings shape: {embeddings.shape}\n\twords len: {len(words)}' )

	embeddings shape: (9992, 50)
	words len: 9992


In [0]:
TENSORBOARD_PATH = os.path.join(DATA_PATH, 'tensorboard')

In [0]:
def save_for_projector(vectors, tensorboard_output_path, metadata, name):
    """Export ``vectors`` and their labels for the TensorBoard embedding projector.

    Writes ``metadata.tsv`` (one label per line) and a ``model.ckpt`` checkpoint
    holding ``vectors`` into ``tensorboard_output_path``, and registers both with
    the projector plugin.

    Args:
        vectors: Array of embeddings stored in the checkpoint.
        tensorboard_output_path: Directory receiving all output files; created if missing.
        metadata: Iterable of label strings — presumably one per row of ``vectors``, in the
            same order (not checked here).
        name: Name of the TensorFlow variable holding the vectors.
    """
    os.makedirs(tensorboard_output_path, exist_ok=True)
    metadata_output_path = os.path.join(tensorboard_output_path, 'metadata.tsv')
    model_checkpoint_path = os.path.join(tensorboard_output_path, 'model.ckpt')

    # if more than one column: first row must be header row
    with open(metadata_output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(word + '\n' for word in metadata)

    # A private graph keeps repeated calls independent: in the global default graph every
    # call would add another variable (renamed to "<name>_1", ...) and the saver would
    # store all variables accumulated so far.
    graph = tf.Graph()
    with graph.as_default():
        with tf.device("/cpu:0"):
            vectors_var = tf.Variable(vectors, name=name, trainable=False)
        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()

    config = projector.ProjectorConfig()
    config.model_checkpoint_path = model_checkpoint_path
    embedding = config.embeddings.add()
    embedding.tensor_name = vectors_var.name
    embedding.metadata_path = metadata_output_path

    writer = tf.summary.FileWriter(tensorboard_output_path)
    try:
        projector.visualize_embeddings(writer, config)
    finally:
        writer.close()

    # The session is closed on exit instead of being left open as an InteractiveSession.
    with tf.Session(graph=graph) as session:
        session.run(init_op)
        saver.save(session, model_checkpoint_path)

In [229]:
len(words)

9992

In [231]:
save_for_projector(embeddings, TENSORBOARD_PATH, words, name='embeddings')



In [0]:
# Run tensorboard

# In terminal:
#$tensorboard --logdir=TENSORBOARD_PATH --port=6009

# If you run tensorboard on a server, use port forwarding when logging in: ssh [login@server] -L [port]:localhost:[port]