In [1]:
import configparser
import csv
import pickle
import re
from collections import Counter

# You'll generate plots of attention in order to see which parts of an image
# your model focuses on during captioning
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from mittens import Mittens
from nltk.corpus import stopwords
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail loudly here instead of with a confusing KeyError
# on the first option lookup further down.
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini not found or unreadable")

#importing local module 
from models.subclasses import *
from models.utilities import *
from models.train_utils import *
from models.predict import *

In [2]:
# Every tunable below is read from the [config] section of config.ini.
settings = config['config']

# Train sample size (-1 for max); can't exceed 118286 samples.
sample = int(settings['train_sample'])

# Train split percentage (80-20).
percentage = float(settings['percentage'])

# Max word count for a caption.
max_length = int(settings['max_length'])
# Use the top words for a vocabulary.
vocabulary_size = int(settings['vocabulary_size'])
# bool() of any non-empty string is True (so "False" or "0" would enable the
# flag); getboolean() parses yes/no, on/off, true/false and 1/0 properly and
# raises ValueError on anything else.
use_glove = config.getboolean('config', 'use_glove')

# Dimensionality of the GloVe vectors; it also selects which file is loaded.
glove_dim = int(settings['glove_dim'])
glove_path = f"./dataset/glove.6B/glove.6B.{glove_dim}d.txt"

In [3]:
# Load the training split with shuffling disabled: the list of image paths and
# the mapping from each image path to its captions.
train_image_paths, image_path_to_caption = import_files(
    shuffle=False,
    method="train",
)

118287


In [4]:
# Flatten the per-image caption lists into two parallel lists: entry i of
# img_name_vector is the image path that caption i of train_captions belongs to.
train_captions, img_name_vector = [], []
for image_path in train_image_paths:
    caption_list = image_path_to_caption[image_path]
    for caption in caption_list:
        train_captions.append(caption)
        img_name_vector.append(image_path)

In [5]:
# Run the project's tokenization helper over all training captions.
# NOTE(review): judging by the names it returns word<->index lookups, the
# fitted tokenizer and the vectorised captions - confirm in models.*.
(
    word_to_index,
    index_to_word,
    tokenizer,
    cap_vector,
) = tokenization(train_captions, max_length, vocabulary_size)

In [6]:
def glove2dict(glove_filename):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-empty line is read as a token followed by its space-separated
    float components. Empty lines are skipped.

    Args:
        glove_filename: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D numpy array of floats.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        # QUOTE_NONE: quote characters are ordinary tokens here, not CSV quoting.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # A blank line (e.g. a stray trailing newline) comes back as an empty
        # row; indexing row[0] on it would raise IndexError, so skip it.
        embed = {row[0]: np.array([float(value) for value in row[1:]])
                 for row in reader if row}
    return embed

In [7]:
# Load the pre-trained GloVe vectors once. `embeddings_index` is a second name
# bound to the very same dictionary object (no copy is made).
embeddings_index = pre_glove = glove2dict(glove_path)

In [8]:
# Punctuation removed from every caption before it is split into words.
# The hyphen is escaped so that it is matched literally: unescaped, ",-/" is a
# character *range* (it happened to span exactly , - . /, so the set of removed
# characters is unchanged). Compiled once instead of on every iteration.
PUNCTUATION_RE = re.compile(r"[!\"#$%&\(\)\*\+.,\-/:;=?@\[\\\]^_`{|}~]")

total_words = []
for sentence in train_captions:
    tokens = PUNCTUATION_RE.sub("", sentence).split()
    # Drop the first and last token of each caption.
    # NOTE(review): presumably the start/end markers added upstream - confirm.
    total_words.extend(tokens[1:-1])
    # A "." token is appended after every caption's words.
    total_words.append(".")
print(len(total_words))

6778506


In [9]:
# Map each entry of the tokenizer's vocabulary to its position in that list.
vocabulary = tokenizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

In [10]:
# Lower-case every caption token. Despite the "nonstop" suffix, no stop-word
# filtering is applied here.
total_words_nonstop = [word.lower() for word in total_words]
# Out-of-vocabulary occurrences: tokens without a pre-trained GloVe vector.
# Duplicates are kept so that they can be counted afterwards.
oov = [word for word in total_words_nonstop if word not in pre_glove]

In [11]:
# Number of out-of-vocabulary token occurrences (a repeated token counts each time).
len(oov)

19864

In [12]:
def get_rareoov(xdict, val):
    """Return the items of ``xdict`` whose frequency is at most ``val``.

    Args:
        xdict: Anything ``collections.Counter`` accepts, e.g. an iterable of
            tokens or a mapping of token -> count.
        val: Inclusive upper bound on the frequency of a "rare" item.

    Returns:
        list of the rare items, in the Counter's iteration order.
    """
    frequencies = Counter(xdict)
    rare_items = []
    for item, frequency in frequencies.items():
        if frequency <= val:
            rare_items.append(item)
    return rare_items
    
# OOV tokens that occur only once.
oov_rare = get_rareoov(oov, 1)
# Vocabulary to train on: OOV tokens seen more than once.
# sorted() rather than list(): the iteration order of a set of strings changes
# between interpreter sessions (hash randomisation), which made the vocabulary
# order - and the row/column order of everything built from it - different on
# every kernel restart.
corp_vocab = sorted(set(oov) - set(oov_rare))

In [13]:
# Number of distinct OOV tokens kept, i.e. those that occur more than once.
len(corp_vocab)

1582

In [14]:
# Membership is tested once per corpus token; against the oov_rare *list* each
# test is a linear scan, which makes the whole pass quadratic. A set gives the
# same result with constant-time lookups.
oov_rare_lookup = set(oov_rare)
new_tokens = [token for token in total_words_nonstop if token not in oov_rare_lookup]
# The whole filtered corpus is joined into one single-document list.
new_doc = [' '.join(new_tokens)]
# corp_vocab = list(set(oov))

In [15]:
# Same value as displayed earlier: corp_vocab has not been modified since it was built.
len(corp_vocab)

1582

In [16]:
# Count the occurrences of every corp_vocab entry in the joined document.
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(new_doc)
# NOTE(review): new_doc holds exactly one document, so X has a single row and
# X.T @ X is the outer product of the per-token counts rather than a windowed
# co-occurrence count - confirm this is the intended statistic.
# NOTE(review): CountVectorizer's default token pattern presumably drops
# one-character tokens and punctuation, leaving all-zero rows for any such
# vocabulary entries - verify against the scikit-learn documentation.
# `@` is always a matrix product, whereas `*` depends on the sparse container
# type. Converting to LIL before setdiag() avoids the SparseEfficiencyWarning
# that editing the structure of the CSR/CSC product triggers.
Xc = (X.T @ X).tolil()
Xc.setdiag(0)
coocc_ar = Xc.toarray()

  self._set_arrayXarray(i, j, x)


In [17]:
# Square matrix: one row and one column per corp_vocab entry.
coocc_ar.shape

(1582, 1582)

In [18]:
# Iteration budget for the Mittens fit (previously an inline magic number).
MITTENS_MAX_ITER = 4000

# Mittens is imported with the other dependencies in the first cell.
mittens_model = Mittens(n=glove_dim, max_iter=MITTENS_MAX_ITER)
# NOTE(review): corp_vocab is built only from tokens that are NOT keys of
# pre_glove, so no vocabulary word has an entry in initial_embedding_dict -
# confirm that fitting without any pre-trained anchor vectors is intended.
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict=pre_glove,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Iteration 4000: loss: 29.757682800292973

In [19]:
# Pair every vocabulary word with its learned vector; corp_vocab is the same
# list that was passed as `vocab` to fit().
newglove = dict(zip(corp_vocab, new_embeddings))
# The context manager closes the file even if pickling raises part-way through
# (the manual open()/close() pair leaked the handle in that case).
with open(f"./dataset/glove.6B/new_glove.6B.{glove_dim}d.pkl", "wb") as f:
    pickle.dump(newglove, f)