In [136]:
"""Source
https://towardsdatascience.com/fine-tune-glove-embeddings-using-mittens-89b5f3fe4c39
"""

import configparser
import csv
import pickle
import re
from collections import Counter

import nltk
import numpy as np
from mittens import Mittens, GloVe
from nltk.corpus import brown ,stopwords
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Parse the notebook settings from config.ini, looked up relative to the
# current working directory.
config = configparser.ConfigParser()
config.read(filenames="config.ini")

#importing local module 
from models.subclasses import *
from models.utilities import *
from models.train_utils import *
from models.predict import *

In [2]:
# All tunables below are read from the [config] section of config.ini.
_settings = config['config']

# Number of training samples to use (-1 for the maximum).
# Cannot exceed 118286 samples.
sample = int(_settings['train_sample'])

# Train split percentage (80-20).
percentage = float(_settings['percentage'])

# Maximum word count for a caption.
max_length = int(_settings['max_length'])
# Use the top words for the vocabulary.
vocabulary_size = int(_settings['vocabulary_size'])

# getboolean() understands yes/no, true/false, on/off and 1/0. Calling bool()
# on the raw option string is True for every non-empty value, "False" included.
use_glove = config.getboolean('config', 'use_glove')

# Dimensionality of the pretrained GloVe vectors; it also selects the file.
glove_dim = int(_settings['glove_dim'])
glove_path = f"./dataset/glove.6B/glove.6B.{glove_dim}d.txt"

In [3]:
# Load the training split with shuffling disabled: the image paths plus a
# mapping from each image path to its list of captions.
train_image_paths, image_path_to_caption = import_files(
    shuffle=False,
    method="train",
)

118286


In [4]:
# Flatten the per-image caption lists into two parallel lists:
# train_captions[k] is a caption and img_name_vector[k] is the image it
# describes (the path is repeated once per caption).
train_captions = []
img_name_vector = []
for image_path in train_image_paths:
    caption_list = image_path_to_caption[image_path]
    train_captions += caption_list
    img_name_vector += [image_path] * len(caption_list)

In [6]:
def glove2dict(glove_filename):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Each line is split on single spaces; the first field is the token and the
    remaining fields are parsed as floats into a NumPy array. Quote characters
    are treated as ordinary text (``csv.QUOTE_NONE``), so tokens such as ``"``
    are read verbatim.

    Args:
        glove_filename: Path to the UTF-8 encoded GloVe file.

    Returns:
        dict mapping each token to its ``numpy.ndarray`` of floats. If a token
        appears more than once, the last occurrence wins.
    """
    embed = {}
    with open(glove_filename, encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in rows:
            embed[row[0]] = np.array([float(value) for value in row[1:]])
    return embed

In [7]:
# Parse the pretrained GloVe file once; both names refer to the same dict.
embeddings_index = pre_glove = glove2dict(glove_path)

In [8]:
# Tokenize the training captions with the project helper.
# NOTE(review): the meaning of each returned value is inferred from its name
# only -- confirm against the definition of tokenization().
(
    word_to_index,
    index_to_word,
    tokenizer,
    cap_vector,
) = tokenization(train_captions, max_length, vocabulary_size)

In [106]:
# Build one flat token stream from all captions, with a "." appended after
# each caption as a separator.
total_words = []
for sentence in train_captions:
    # Strip the listed punctuation characters, then drop the first and last
    # whitespace-separated tokens (presumably start/end markers added
    # upstream -- TODO confirm).
    sentence = re.sub(r"[!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~]", "", sentence).split()[1:-1]
    total_words.extend(sentence)
    total_words.append(".")
print(len(total_words))

6186698


In [9]:
# Map every token in the tokenizer's vocabulary to its position in that list.
vocabulary = tokenizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

In [115]:
num_tokens = len(vocabulary)
embedding_dim = glove_dim
hits = 0
misses = 0

# One row per vocabulary entry, initialised to zeros. Rows whose word has no
# pretrained vector stay all-zero; only words found in the GloVe index are
# overwritten.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

Converted 21614 words (22431 misses)


In [11]:
# English stop-word list and the first 2000 tokens of the Brown corpus.
sw = list(stopwords.words('english'))
brown_data = brown.words()[:2000]

# Lower-case the Brown sample and drop stop words.
brown_nonstop = [
    lowered
    for lowered in (token.lower() for token in brown_data)
    if lowered not in sw
]

# Remaining tokens that have no pretrained GloVe vector.
oov = [word for word in brown_nonstop if word not in pre_glove]

In [107]:
# Lower-case the caption token stream and drop stop words.
# A set is used for the lookup: the stream has millions of tokens, and testing
# each one against the `sw` list would rescan the whole list every time. Each
# token is also lower-cased once instead of twice. The result is unchanged.
_stopword_lookup = set(sw)
total_words_nonstop = [
    lowered
    for lowered in (token.lower() for token in total_words)
    if lowered not in _stopword_lookup
]

# Caption tokens that have no pretrained GloVe vector (repeats are kept so
# they can be counted later).
oov2 = [token for token in total_words_nonstop if token not in pre_glove]

In [108]:
def get_rareoov(xdict, val):
    """Return the items whose count is at most ``val``.

    Args:
        xdict: Anything accepted by ``collections.Counter`` -- an iterable of
            hashable items (each occurrence counts once) or a mapping of
            item -> count.
        val: Inclusive upper bound on the count for an item to be "rare".

    Returns:
        list of the rare items, in the order the Counter first saw them.
    """
    rare = []
    for item, count in Counter(xdict).items():
        if count <= val:
            rare.append(item)
    return rare
    
# Brown sample: OOV tokens that occur at most once, and the OOV vocabulary
# without them.
oov_rare = get_rareoov(oov, 1)
corp_vocab = list(set(oov) - set(oov_rare))

# Same split for the caption corpus.
# NOTE(review): corp_vocab and corp_vocab2 are both rebuilt later in the
# notebook from the full OOV sets, so the values assigned here are superseded.
oov2_rare = get_rareoov(oov2, 1)
corp_vocab2 = list(set(oov2) - set(oov2_rare))

In [17]:
# Brown sample with the rare OOV tokens removed, joined into a single
# one-element "document" list.
brown_tokens = list(filter(lambda tok: tok not in oov_rare, brown_nonstop))
brown_doc = [' '.join(brown_tokens)]

# NOTE(review): this replaces the earlier corp_vocab (which excluded rare
# tokens) with every unique OOV token -- confirm that is intended.
corp_vocab = list(set(oov))

In [116]:
# Caption token stream with the rare OOV tokens removed, joined into a single
# one-element "document" list.
# The rare tokens are put in a set first: testing millions of tokens against
# the `oov2_rare` list rescans the list for every token, while the set lookup
# gives the same result in constant time per token.
_oov2_rare_lookup = set(oov2_rare)
new_tokens = [token for token in total_words_nonstop if token not in _oov2_rare_lookup]
new_doc = [' '.join(new_tokens)]

# NOTE(review): this replaces the earlier corp_vocab2 (which excluded rare
# tokens) with every unique OOV token, so the vocabulary contains words that
# were just removed from new_doc -- confirm that is intended.
corp_vocab2 = list(set(oov2))

In [132]:
# Number of unique caption tokens without a pretrained GloVe vector; this is
# the vocabulary size used for the co-occurrence matrix below.
len(corp_vocab2)

7334

In [134]:
# Count unigram occurrences of the fixed vocabulary in the caption document.
cv = CountVectorizer(vocabulary=corp_vocab2, ngram_range=(1, 1))
X = cv.fit_transform(new_doc)

# Term-by-term product of the count matrix with itself, used as the
# co-occurrence matrix; the diagonal (a term with itself) is zeroed out.
Xc = X.T @ X
Xc.setdiag(0)

# Dense array form for the Mittens model.
coocc_ar = Xc.toarray()


  self._set_arrayXarray(i, j, x)


In [144]:
# Fit Mittens on the co-occurrence matrix, producing one glove_dim-sized
# vector per entry of corp_vocab2.
# NOTE(review): corp_vocab2 is built only from tokens that are absent from
# pre_glove, so initial_embedding_dict may not supply a starting vector for
# any vocabulary word -- confirm this is the intended setup.
mittens_model = Mittens(n=glove_dim, max_iter=1000)
new_embeddings = mittens_model.fit(coocc_ar, vocab=corp_vocab2, initial_embedding_dict=pre_glove)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Iteration 1000: loss: 68.67653656005868

In [145]:
# Pair every learned vector with its word. The model was fitted with
# vocab=corp_vocab2, so that is the list whose order matches the rows of
# new_embeddings. Zipping with corp_vocab (the Brown vocabulary) would attach
# the vectors to the wrong words and truncate the result.
newglove = dict(zip(corp_vocab2, new_embeddings))

# The f-prefix is needed so {glove_dim} is substituted into the file name,
# mirroring how glove_path is built. The with-block closes the file even if
# pickling fails.
with open(f"./dataset/glove.6B/new_glove.6B.{glove_dim}d.pkl", "wb") as f:
    pickle.dump(newglove, f)