#Setting

In [1]:
%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/drive')
# %cd /content/drive/MyDrive/group-1.3-master/group-1.3-master/LeverageJustAFewKeywords/
# %cd /content/drive/MyDrive/LeverageJustAFewKeywords/
%cd /content/drive/MyDrive/group-1.3/LeverageJustAFewKeywords/


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1n0oSoMBR4TlxDwAce51xBgon3LxJjCkE/group-1.3/LeverageJustAFewKeywords


In [2]:
!pip install mittens
import csv
import numpy as np
from mittens import GloVe, Mittens
from sklearn.feature_extraction.text import CountVectorizer
import os
from time import time
import pickle
import tensorflow as tf
from collections import defaultdict
import nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')


Collecting mittens
  Downloading https://files.pythonhosted.org/packages/ce/c0/6e4fce5b3cb88edde2e657bb4da9885c0aeac232981706beed7f43773b00/mittens-0.2-py3-none-any.whl
Installing collected packages: mittens
Successfully installed mittens-0.2
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

#Pre Trained

1.   Find words that are in corpus but not in pre-trained vocab. (called corp_vocab in this method)
2.   create np.random for these words
3.   Merge new embeddings with pre-trained embeddings



In [None]:
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

glove_path = "../glove.6B/glove.6B.300d.txt" # get it from https://nlp.stanford.edu/projects/glove
original_embeddings = glove2dict(glove_path)

In [None]:
def pickle_load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def pickle_save(clean_corpus, filename):
    with open(filename, 'wb') as f:
        pickle.dump(clean_corpus, f)

In [None]:
domain = 'bags_and_cases'
corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
corpus = pickle_load(corpus_file)

In [None]:
vocab = set()
for sentence in corpus:
  for item in sentence:
    vocab.add(item)

In [None]:
corp_vocab = [token for token in vocab if token not in original_embeddings.keys()] # out of glove-vocab words specific to corpus
print(corp_vocab[:10])
print(len(corp_vocab))

['manfrotto', '1080cc', 'jsut', 'tranporting', 'shoudler', 'biggy', 'overnighter', 'mbdedicated', 'shlepping', 'rollaway']
2131


In [None]:
random_embeds = np.random.rand(len(corp_vocab),300)

In [None]:
new_glove = dict(zip(corp_vocab, random_embeds))
# extra_glove['theswissgear']

In [None]:
old_glove = {item:original_embeddings[item] for item in vocab if item in original_embeddings}

In [None]:
pretrained_glove = {**old_glove, **new_glove} #combining old and new

In [None]:
len(pretrained_glove)

15429

In [None]:
 finetune_output_dir = '../wv/oposum_w2v/'
 pickle_save(pretrained_glove, finetune_output_dir + domain + '_glove_pretrained.bin')

#Fine tune for one domain: Method 1


1.   Find words that are in corpus but not in pre-trained vocab. (called corp_vocab in this method)
2.   Create co-occurence matrix for corp_vocab
3.   Train Glove on this co-occurence matrix
4.   Merge new embeddings with pre-trained embeddings



In [None]:
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

glove_path = "../glove.6B/glove.6B.300d.txt" # get it from https://nlp.stanford.edu/projects/glove
original_embeddings = glove2dict(glove_path)

In [None]:
def pickle_load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def pickle_save(clean_corpus, filename):
    with open(filename, 'wb') as f:
        pickle.dump(clean_corpus, f)

In [None]:
domain = 'bags_and_cases'
corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
corpus = pickle_load(corpus_file)

In [None]:
vocab = set()
count = defaultdict(int)
for sentence in corpus:
  for item in sentence:
    vocab.add(item)
    count[item] += 1

In [None]:
print(list(vocab)[:10])
print(len(vocab))

['armour', 'boyscout', 'packrat', 'm6700', 'relative', 'lengthens', 'account', 'hostile', 'blend', 'mare']
15429


In [None]:
corpus_doc = [' '.join(item) for item in corpus]
print(corpus_doc[0])

case look nice plenty pocket stuff carry around 're using something back forth office thinner laptop enough padding protect computer scratching rubbing bought son school past fall one thinner laptop cover soon dented scratched constant rubbing screen touchpad control left little white mark display blame computer design added padding case problem plus side case durable nothing yet ripped torn 's easy spot clean


In [None]:
corp_vocab = [token for token in vocab if token not in original_embeddings.keys()] # out of glove-vocab words specific to corpus
print(corp_vocab[:10])
print(len(corp_vocab))

['boyscout', 'm6700', 'favs', 'overstuff', 'extreemly', 'dakine', 'backpain', '3n1', '2013i', 'biggy']
2131


In [None]:
t0=time()
count_model = CountVectorizer(ngram_range=(1,5),vocabulary=corp_vocab) # unigram to 5-gram. building only for corpus specific vocab
X = count_model.fit_transform(corpus_doc)
X[X > 0] = 1 # to remove within-line cooccurence
Xc = (X.T * X) # co-occurrence matrix in sparse csr format
Xc.setdiag(0) # setting same word cooccurence to 0
# coocc_arr = sparse.lil_matrix(Xc).toarray()
coocc_ar = Xc.toarray()
t1= time()
print(f" cost {t1 - t0:.2f} seconds")

 cost 6.15 seconds


  self._set_arrayXarray(i, j, x)


In [None]:
t0=time()
glove_model = GloVe(n=300, max_iter=500)
new_embeddings = glove_model.fit(coocc_ar)
t1= time()
print(f"\ntraining cost {t1 - t0:.2f} seconds")

Iteration 500: loss: 0.0013275939272716641


training cost 34.44 seconds


In [None]:
new_embeds = dict(zip(corp_vocab, new_embeddings))

In [None]:
old_embeds = {item:original_embeddings[item] for item in vocab if item in original_embeddings}

In [None]:
finetuned_embeddings = {**old_embeds, **new_embeds} #combining old and new

In [None]:
len(finetuned_embeddings)

15429

In [None]:
 finetune_output_dir = '../wv/oposum_w2v/'
 pickle_save(finetuned_embeddings, finetune_output_dir + domain + '_glove_tuned_m1.bin')

# Fine tune for one domain: Method 2
Warning: runs out of RAM in free Colab and notebook crashes after 20 seconds if size of vocab greater than 12.5k tokens

1.   Find words that are in corpus and in pre-trained vocab. (called vocab in this method)
2.   Create co-occurence matrix for vocab
3.   Use Mittens to fine tune on this co-occurence matrix
4.   Save New embeddings

In [None]:
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

glove_path = "../glove.6B/glove.6B.300d.txt" # get it from https://nlp.stanford.edu/projects/glove
original_embeddings = glove2dict(glove_path)

In [None]:
def pickle_load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def pickle_save(clean_corpus, filename):
    with open(filename, 'wb') as f:
        pickle.dump(clean_corpus, f)

In [None]:
domain = 'bags_and_cases'
corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
corpus = pickle_load(corpus_file)

In [None]:
vocab = set()
for sentence in corpus:
  for item in sentence:
    vocab.add(item)

In [None]:
corpus_doc = [' '.join(item) for item in corpus]
print(corpus_doc[0])

case look nice plenty pocket stuff carry around 're using something back forth office thinner laptop enough padding protect computer scratching rubbing bought son school past fall one thinner laptop cover soon dented scratched constant rubbing screen touchpad control left little white mark display blame computer design added padding case problem plus side case durable nothing yet ripped torn 's easy spot clean


Create co-occurence matrix

In [None]:
t0=time()
count_model = CountVectorizer(ngram_range=(1,5),vocabulary=vocab) # unigram to 5-gram.
X = count_model.fit_transform(corpus_doc)
X[X > 0] = 1 # to remove within-line cooccurence
Xc = (X.T * X) # co-occurrence matrix in sparse csr format
Xc.setdiag(0) # setting same word cooccurence to 0
# coocc_arr = sparse.lil_matrix(Xc).toarray()
coocc_ar = Xc.toarray()
t1= time()
print(f" cost {t1 - t0:.2f} seconds")

  self._set_arrayXarray(i, j, x)


 cost 9.41 seconds


Mittens Training - crash here

In [None]:
mittens_model = Mittens(n=300, max_iter=500)
t0 = time()
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=vocab,
    initial_embedding_dict= original_embeddings)
t1=time()
print(f"\ntraining cost {t1 - t0:.2f} seconds")
new_glove = dict(zip(vocab, new_embeddings))



In [None]:
len(new_glove)

In [None]:
finetune_output_dir = '../wv/oposum_w2v/' 
pickle_save(new_glove, finetune_output_dir + domain + '_glove_tuned_m2.bin')


---

Testing difference with Glove - also crashes




In [None]:
t0=time()
glove_model = GloVe(n=300, max_iter=500)
test_embeddings = glove_model.fit(coocc_ar)
t1= time()
print(f"training cost {t1 - t0:.2f} seconds")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


# Fine tune for one domain: Method 3

Reduce domain specific vocab size (under 12.5k) in order to fit in memory. 

This could mean loss a lot of words. Bags and cases is still at 15k vocab after removing tf1 words. 20% reduction in size?

1.   Find words that are in corpus and in pre-trained vocab. (called vocab in this method)
2.   Remove some more words from vocab with low occurence frequency
3.   Create co-occurence matrix for vocab
4.   Train Mittens on this co-occurence matrix


5.   Find words that are in corpus but not in pre-trained vocab. (called corp_vocab in this method)
2.   Create co-occurence matrix for corp_vocab
3.   Train Glove on this co-occurence matrix


4.   Merge fine-tuned embeddings with new embeddings




In [None]:
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

glove_path = "../glove.6B/glove.6B.300d.txt" # get it from https://nlp.stanford.edu/projects/glove
original_embeddings = glove2dict(glove_path)

In [None]:
def pickle_load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def pickle_save(clean_corpus, filename):
    with open(filename, 'wb') as f:
        pickle.dump(clean_corpus, f)

In [None]:
domain = 'bags_and_cases'
corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
corpus = pickle_load(corpus_file)

In [None]:
vocab = set()
count = defaultdict(int)
for sentence in corpus:
  for item in sentence:
    vocab.add(item)
    count[item] += 1

In [None]:
print(list(vocab)[:10])
print(len(vocab))

['armour', 'boyscout', 'packrat', 'm6700', 'relative', 'lengthens', 'account', 'hostile', 'blend', 'mare']
15429


In [None]:
corpus_doc = [' '.join(item) for item in corpus]
print(corpus_doc[0])

case look nice plenty pocket stuff carry around 're using something back forth office thinner laptop enough padding protect computer scratching rubbing bought son school past fall one thinner laptop cover soon dented scratched constant rubbing screen touchpad control left little white mark display blame computer design added padding case problem plus side case durable nothing yet ripped torn 's easy spot clean


In [None]:
corp_vocab = [token for token in vocab if token not in original_embeddings.keys()] # out of glove-vocab words specific to corpus
print(corp_vocab[:10])
print(len(corp_vocab))

['boyscout', 'm6700', 'favs', 'overstuff', 'extreemly', 'dakine', 'backpain', '3n1', '2013i', 'biggy']
2131


Removing words from vocab

In [None]:
known_vocab = list(vocab - set(corp_vocab))
remove_vocab = [token for token in known_vocab if count[token]<3]
reduced_vocab = list(set(known_vocab) - set(remove_vocab[:700])) #change 700 to desired value
remaining_vocab = list(vocab - set(reduced_vocab))
# test_vocab = [token for token in vocab if count[token]>2]
print(reduced_vocab[:5])
# small_vocab = [token for token in vocab if count[token]<3]
print(len(known_vocab))
print(len(reduced_vocab))
print(len(remaining_vocab))
# print(small_vocab[:5])

['packrat', 'relative', 'account', 'hostile', 'blend']
13298
12598
2831


In [None]:
t0=time()
count_model = CountVectorizer(ngram_range=(1,5),vocabulary=reduced_vocab) # unigram to 5-gram. building only for corpus specific vocab
X = count_model.fit_transform(corpus_doc)
X[X > 0] = 1 # to remove within-line cooccurence
Xc = (X.T * X) # co-occurrence matrix in sparse csr format
Xc.setdiag(0) # setting same word cooccurence to 0
# coocc_arr = sparse.lil_matrix(Xc).toarray()
coocc_ar = Xc.toarray()
t1= time()
print(f" cost {t1 - t0:.2f} seconds")

  self._set_arrayXarray(i, j, x)


 cost 7.49 seconds


In [None]:
mittens_model = Mittens(n=300, max_iter=1500)
t0 = time()
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=reduced_vocab,
    initial_embedding_dict= original_embeddings)
t1=time()
print(f"\ntraining cost {t1 - t0:.2f} seconds")
new_glove = dict(zip(reduced_vocab, new_embeddings))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Iteration 1500: loss: 52619.71484375


training cost 4246.57 seconds


Iteration 1500: loss: 52619.71484375
training cost 4246.57 seconds
12598 words. removed 700 words

Now co-occurence for corpus specific words

In [None]:
t0=time()
count_model = CountVectorizer(ngram_range=(1,5),vocabulary=remaining_vocab) # unigram to 5-gram. building only for corpus specific vocab
X = count_model.fit_transform(corpus_doc)
X[X > 0] = 1 # to remove within-line cooccurence
Xc = (X.T * X) # co-occurrence matrix in sparse csr format
Xc.setdiag(0) # setting same word cooccurence to 0
# coocc_arr = sparse.lil_matrix(Xc).toarray()
coocc_ar = Xc.toarray()
t1= time()
print(f" cost {t1 - t0:.2f} seconds")

 cost 6.07 seconds


  self._set_arrayXarray(i, j, x)


In [None]:
mittens_model = Mittens(n=300, max_iter=2000)
t0 = time()
remain_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=remaining_vocab,
    initial_embedding_dict= original_embeddings)
t1=time()
print(f"\ntraining cost {t1 - t0:.2f} seconds")
# new_glove = dict(zip(remaining_vocab, remain_embeddings))

Iteration 2000: loss: 0.00034659053198993206


training cost 138.68 seconds


In [None]:
new_embeds = dict(zip(remaining_vocab, remain_embeddings))

In [None]:
finetuned_embeddings = {**new_glove, **new_embeds}

In [None]:
len(finetuned_embeddings)

15429

In [None]:
 finetune_output_dir = '../wv/oposum_w2v/'
 pickle_save(finetuned_embeddings, finetune_output_dir + domain + '_glove_tuned_m3.bin')

# Fine Tune for all Domains

Need to decide on method and then implement for all domains.

In [3]:
def pickle_save(clean_corpus, filename):
    with open(filename, 'wb') as f:
        pickle.dump(clean_corpus, f)

def pickle_load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

def build_vocab(corpus):
    vocab = set()
    count = defaultdict(int)
    for sentence in corpus:
      for token in sentence:
        vocab.add(token)
        count[token] += 1
    return vocab, count

def build_cooc_matrix(vocab, corpus_doc):
    t0=time()
    count_model = CountVectorizer(ngram_range=(1,5),vocabulary=vocab) # unigram to 5-gram. building only for corpus specific vocab
    X = count_model.fit_transform(corpus_doc)
    X[X > 0] = 1 # to remove within-line cooccurence
    Xc = (X.T * X) # co-occurrence matrix in sparse csr format
    Xc.setdiag(0) # setting same word cooccurence to 0
    # coocc_arr = sparse.lil_matrix(Xc).toarray()
    coocc_ar = Xc.toarray()
    t1= time()
    print(f" Matrix build cost {t1 - t0:.2f} seconds")
    return coocc_ar

def train_mittens(coocc_ar, vocab, original_embeddings):
    mittens_model = Mittens(n=300, max_iter=1000)
    t0 = time()
    new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=vocab,
    initial_embedding_dict= original_embeddings)
    t1=time()
    print(f"\nTraining cost {t1 - t0:.2f} seconds")
    return new_embeddings

In [4]:
all_domains = ['bags_and_cases', 'bluetooth', 'boots', 'keyboards', 'tv', 'vacuums']

In [5]:
glove_path = "../glove.6B/glove.6B.300d.txt" # get it from https://nlp.stanford.edu/projects/glove
original_embeddings = glove2dict(glove_path)

In [6]:
finetune_output_dir = '../wv/oposum_w2v/'

In [9]:
for domain in all_domains:
    t0 = time()
    print(f"for domain {domain}")
    print(f"loading corpus for domain {domain}..")
    corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
    corpus = pickle_load(corpus_file)

    print("building vocab")
    vocab, count = build_vocab(corpus)
    corpus_doc = [' '.join(item) for item in corpus]
    corp_vocab = [token for token in vocab if token not in original_embeddings.keys()]

    random_embeds = np.random.rand(len(corp_vocab),300)
    new_embeds = dict(zip(corp_vocab, random_embeds))
    old_embeds = {item:original_embeddings[item] for item in vocab if item in original_embeddings}
    pretrained_glove = {**old_embeds, **new_embeds} #combining old and new
    print("saving pretrained model ...")
    pickle_save(pretrained_glove, finetune_output_dir + domain + '_glove_pretrained.bin')

    print("start training ...")
    coocc_ar = build_cooc_matrix(corp_vocab, corpus_doc)
    new_embeddings = train_mittens(coocc_ar, corp_vocab, original_embeddings)
    new_embeds = dict(zip(corp_vocab, new_embeddings))
    finetuned_glove = {**old_embeds, **new_embeds} #combining old and new
    print("saving tuned model ...")
    pickle_save(finetuned_glove, finetune_output_dir + domain + '_glove_tuned.bin')

    print(f"finish fine-tuning on domain {domain} in {time() - t0:.2f} seconds!\n\n")

    # t0 = time()
    # print(f"for domain {domain}")
    # print(f"loading corpus for domain {domain}..")
    # corpus_file = '../processed/oposum/' + domain + '_corpus_wotf1.pkl'
    # corpus = pickle_load(corpus_file)

    # vocab = set()
    # for sentence in corpus:
    #   for item in sentence:
    #     vocab.add(item)
      

    # print("loading pre-trained vectors ...")
    # corp_vocab = [token for token in vocab if token not in original_embeddings.keys()]
    # corpus_doc = [' '.join(item) for item in corpus]
    # print("start training ...")
    # count_model = CountVectorizer(ngram_range=(1,5),vocabulary=corp_vocab) # unigram to 5-gram. building only for corpus specific vocab
    # X = count_model.fit_transform(corpus_doc)
    # X[X > 0] = 1 # to remove within-line cooccurence
    # Xc = (X.T * X) # co-occurrence matrix in sparse csr format
    # Xc.setdiag(0) # setting same word cooccurence to 0
    # coocc_ar = Xc.toarray()

    # t1 = time()
    # print(f"co-occurence matrix creation cost {t1 - t0:.2f} seconds")

    # mittens_model = Mittens(n=300, max_iter=500)
    # new_embeddings = mittens_model.fit(
    #     coocc_ar,
    #     vocab=corp_vocab,
    #     initial_embedding_dict= original_embeddings)
    
    # print(f"\ntraining cost {time() - t1:.2f} seconds")

    # print("save fine-tuned word vectors ...")
    # new_embeds = dict(zip(corp_vocab, new_embeddings))
    # old_embeds = {item:original_embeddings[item] for item in vocab if item in original_embeddings}
    # finetuned_embeddings = {**old_embeds, **new_embeds} #combining old and new
    # pickle_save(finetuned_embeddings, finetune_output_dir + domain + '_glove_tuned.bin')

    # print(f"finish fine-tuning on domain {domain} in {time() - t0:.2f} seconds!\n\n")

for domain bags_and_cases
loading corpus for domain bags_and_cases..
building vocab
saving pretrained model ...
start training ...


  self._set_arrayXarray(i, j, x)


 Matrix build cost 6.14 seconds
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Iteration 890: stopping with loss < self.tol


Training cost 62.95 seconds
saving tuned model ...
finish fine-tuning on domain bags_and_cases in 72.66 seconds!


for domain bluetooth
loading corpus for domain bluetooth..
building vocab
saving pretrained model ...
start training ...
 Matrix build cost 14.39 seconds


Iteration 1000: loss: 0.0011104565346613526


Training cost 237.18 seconds
saving tuned model ...
finish fine-tuning on domain bluetooth in 257.49 seconds!


for domain boots
loading corpus for domain boots..
building vocab
saving pretrained model ...
start training ...
 Matrix build cost 9.40 seconds


Iteration 780: loss: 0.00010530331928748637


Training cost 56.87 seconds
saving tuned model ...
finish fine-tuning on domain boots in 69.37 seconds!


for domain keyboards
loading corpus for domain keyboards..
building vocab
saving pretrained model ...
start training ...
 Matrix build cost 6.18 seconds


Iteration 1000: loss: 0.00014414358884096146


Training cost 99.44 seconds
saving tuned model ...
finish fine-tuning on domain keyboards in 108.29 seconds!


for domain tv
loading corpus for domain tv..
building vocab
saving pretrained model ...
start training ...
 Matrix build cost 15.19 seconds


Iteration 1000: loss: 0.0034087328240275383


Training cost 442.96 seconds
saving tuned model ...
finish fine-tuning on domain tv in 463.10 seconds!


for domain vacuums
loading corpus for domain vacuums..
building vocab
saving pretrained model ...
start training ...
 Matrix build cost 13.96 seconds


Iteration 1000: loss: 0.0006194550078362226


Training cost 143.59 seconds
saving tuned model ...
finish fine-tuning on domain vacuums in 162.39 seconds!


