In [None]:
import csv
import pickle
import re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

# You'll generate plots of attention in order to see which parts of an image
# your model focuses on during captioning

import numpy as np
import configparser

# Load the run settings from config.ini, resolved against the current working directory.
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini only surfaces later, when config['config'] is first indexed.
config = configparser.ConfigParser()
config.read("config.ini")

#importing local module 
from models.subclasses import *
from models.utilities import *
from models.train_utils import *
from models.predict import *

In [None]:
# Settings read from the [config] section of config.ini.

# Train sample size (-1 for max);
# can't exceed 118286 samples.
sample = int(config['config']['train_sample'])

# Train split percentage (80-20).
percentage = float(config['config']['percentage'])

# Max word count for a caption.
max_length = int(config['config']['max_length'])
# Use the top words for a vocabulary.
vocabulary_size = int(config['config']['vocabulary_size'])
# getboolean() parses true/false, yes/no, on/off and 1/0 (case-insensitive).
# bool() on the raw string was True for every non-empty value, including "False".
use_glove = config.getboolean('config', 'use_glove')

# Dimensionality of the GloVe vectors; it also selects which file is loaded.
glove_dim = int(config['config']['glove_dim'])
glove_path = f"./dataset/glove.6B/glove.6B.{glove_dim}d.txt"

In [None]:
# Fetch the training image paths and the path -> captions lookup from the
# `import_files` helper (not defined in this notebook; presumably provided by one
# of the star-imported `models.*` modules — verify there what `shuffle`/`method` do).
train_image_paths, image_path_to_caption = import_files(shuffle= False, method = "train")

In [None]:
# Flatten the per-image captions into two parallel lists:
# train_captions[i] is a caption and img_name_vector[i] is the path of its image.
train_captions = [
    caption
    for image_path in train_image_paths
    for caption in image_path_to_caption[image_path]
]
# Repeat each path once per caption so both lists stay aligned index by index.
img_name_vector = [
    image_path
    for image_path in train_image_paths
    for _ in image_path_to_caption[image_path]
]

In [None]:
# Run the project's `tokenization` helper on the training captions, passing the
# configured `max_length` and `vocabulary_size`. It returns two lookups, the
# tokenizer object and `cap_vector`; their exact contents are defined by the helper,
# which is not part of this notebook.
word_to_index, index_to_word, tokenizer, cap_vector = tokenization(train_captions, max_length, vocabulary_size)

In [None]:
def glove2dict(glove_filename):
    """Load a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-empty line is split on single spaces: the first field is the
    word and each remaining field is parsed as a float.

    Parameters
    ----------
    glove_filename : str
        Path to a UTF-8 text file with one ``word v1 v2 ...`` entry per line.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy`` array holding its vector. If a word
        appears more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector field cannot be parsed as a float.
    """
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        # QUOTE_NONE: quote characters are ordinary data, which matters because
        # the vocabulary contains tokens such as '"'.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # csv.reader yields an empty row for a blank line; skip those instead of
        # failing with IndexError on row[0].
        embed = {row[0]: np.array([float(value) for value in row[1:]])
                 for row in reader if row}
    return embed

In [None]:
# Parse the GloVe text file at `glove_path` into a {word: vector} dict.
pre_glove = glove2dict(glove_path)
# Second name bound to the very same dict object (no copy is made).
embeddings_index = pre_glove

In [None]:
# Build one flat list of caption words, with a "." appended after every caption.
# NOTE: inside the character class `,-/` is parsed as a range, i.e. `,`, `-`, `.` and `/`.
punctuation = r"[!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~]"
total_words = []
for caption in train_captions:
    stripped = re.sub(punctuation, "", caption)
    # Keep everything except the first and last whitespace-separated token
    # (presumably the start/end markers added upstream — TODO confirm).
    total_words.extend(stripped.split()[1:-1])
    total_words.append(".")
print(len(total_words))

In [None]:
# Vocabulary reported by the tokenizer, plus a word -> position lookup built from it.
vocabulary = tokenizer.get_vocabulary()
word_index = {word: position for position, word in enumerate(vocabulary)}

In [None]:
# Lower-case every corpus token, then keep the ones that have no entry in the
# GloVe dictionary (out-of-vocabulary tokens, repeats included).
total_words_nonstop = [word.lower() for word in total_words]
oov = [word for word in total_words_nonstop if word not in pre_glove]

In [None]:
# Number of out-of-vocabulary token occurrences (repeats are counted).
len(oov)

In [None]:
def get_rareoov(xdict, val):
    """Return the distinct items of ``xdict`` whose count is at most ``val``.

    ``xdict`` is handed to ``collections.Counter``, so it may be any iterable of
    hashable items (each occurrence counts once) or a mapping of item -> count.
    The result is a list in the Counter's insertion order.
    """
    counts = Counter(xdict)
    rare_items = []
    for item, count in counts.items():
        if count <= val:
            rare_items.append(item)
    return rare_items
    
# OOV words that occur only once in the corpus.
oov_rare = get_rareoov(oov, 1)
# Corpus-specific vocabulary: OOV words seen more than once.
# sorted() rather than list(): iteration order of a set of strings changes between
# kernel restarts (string hash randomization), which made the vocabulary order, and
# every array indexed by it further down, different on each run.
corp_vocab = sorted(set(oov) - set(oov_rare))

In [None]:
# Number of distinct OOV words kept after dropping the rare ones.
len(corp_vocab)

In [None]:
# Start from the corpus-specific OOV words, then pad with GloVe words (in the
# dictionary's iteration order) until the list holds `vocabulary_size` entries.
max_Vocab = corp_vocab.copy()
# One set for O(1) membership tests; scanning the two lists for every GloVe key
# made this loop quadratic.
excluded_words = set(corp_vocab).union(oov_rare)
for k in embeddings_index:
    if len(max_Vocab) >= vocabulary_size:
        break
    if k not in excluded_words:
        max_Vocab.append(k)
# Kept for any later cell that reads the running count under its original name.
max_words_lenght = len(max_Vocab)

In [None]:
# Size of the padded vocabulary (corpus OOV words plus GloVe words).
len(max_Vocab)

In [None]:
# Size of the corpus-specific part of the vocabulary, shown again for comparison
# with the padded size above.
len(corp_vocab)

In [None]:
# Drop the rare OOV words from the lower-cased token stream.
# A set makes each membership test O(1); testing against the `oov_rare` list
# rescanned it for every token in the corpus.
rare_oov_words = set(oov_rare)
new_tokens = [token for token in total_words_nonstop if token not in rare_oov_words]
# Single-element list: the whole corpus joined into one space-separated document.
new_doc = [' '.join(new_tokens)]

In [None]:
# Count unigrams of `new_doc`, restricted to the fixed vocabulary `max_Vocab`
# (column j of X corresponds to max_Vocab[j]).
cv = CountVectorizer(ngram_range=(1,1), vocabulary=max_Vocab)
X = cv.fit_transform(new_doc)
# NOTE(review): `new_doc` holds a single document, so X has one row and X.T * X is
# the outer product of the per-word counts, not a windowed co-occurrence count —
# confirm this is the intended input for the embedding fit.
Xc = (X.T * X)
# Zero the diagonal so no word is paired with itself.
Xc.setdiag(0)
# Densify to a square array with one row/column per vocabulary word.
coocc_ar = Xc.toarray()

In [None]:
# Shape of the dense co-occurrence array.
coocc_ar.shape

In [None]:
from mittens import Mittens

# Fit Mittens on the co-occurrence array with embedding size `glove_dim`, passing
# the loaded GloVe dict as the initial embeddings. max_iter=9000 is a hard-coded
# iteration budget.
mittens_model = Mittens(n=glove_dim, max_iter=9000)
# `vocab=max_Vocab` labels the rows/columns of `coocc_ar`; the result is presumably
# one embedding row per vocabulary word, in that order — confirm against Mittens docs.
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=max_Vocab,
    initial_embedding_dict= pre_glove)

In [None]:
# Number of embedding rows returned by the fit.
len(new_embeddings)

In [None]:
# Pair each corpus-specific word with its embedding row.
# NOTE(review): fit() was given vocab=max_Vocab but the keys here come from
# corp_vocab; zip() stops at the shorter input, so only the leading
# len(corp_vocab) rows are saved. The pairing lines up only because max_Vocab
# starts as a copy of corp_vocab — confirm that dropping the remaining rows is intended.
newglove = dict(zip(corp_vocab, new_embeddings))
# `with` closes the file even if pickle.dump raises.
with open(f"./dataset/glove.6B/new_glove.6B.{glove_dim}d.pkl", "wb") as f:
    pickle.dump(newglove, f)