In [2]:
import pickle
import pandas as pd
import numpy as np
import spacy
import re

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [3]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 999)

## Load and combine humor and romantic captions from FlickrStyle7k

In [4]:
img_id = pickle.load(open( "data/FlickrStyle_v0.9/humor/train.p", "rb" ) )
text = pd.read_csv("data/FlickrStyle_v0.9/humor/funny_train.txt", encoding= 'unicode_escape', names=['caption'])
captions = list(text.caption)
df_funny = pd.DataFrame(zip(img_id, captions), columns=['img_id', 'funny_caption'])

img_id = pickle.load(open( "data/FlickrStyle_v0.9/romantic/train.p", "rb" ) )
text = pd.read_csv("data/FlickrStyle_v0.9/romantic/romantic_train.txt", encoding= 'unicode_escape', names=['caption'])
captions = list(text.caption)
df_romantic = pd.DataFrame(zip(img_id, captions), columns=['img_id', 'romantic_caption'])

df_stylized = df_funny.merge(df_romantic, on='img_id', how='left')

In [8]:
df_stylized.head()

Unnamed: 0,img_id,funny_caption,romantic_caption
0,2513260012_03d33305cf.jpg,two dogs chase each other across the snowy ground in search of gold nuggets .,two dogs in love are playing together in the snow with full joy .
1,2903617548_d3e38d7f88.jpg,a little girl plays croquet next to a truck to amuse her dad .,the child is playing croquette by the truck
2,3338291921_fe7ae0c8f8.jpg,a dog with something pink in its mouth is looking forward to an adventure .,a dog is holding a shirt searching for his lost love in the snow .
3,488416045_1c6d903fe0.jpg,a dog walks on the sand near the water,a brown dog is running along a beach towards his loving master .
4,2644326817_8f45080b87.jpg,a dog is surprised by a red frisbee flying in the air .,a dog drops a red disc


## Preprocess Flickr8k to match FlickrStyle7k

In [9]:
flickr8k_filename = "data/Flickr8k_text/Flickr8k.token.txt"

In [10]:
def load_Flickr8k(filename):
    token = pd.read_csv(filename, delimiter='\n', encoding= 'unicode_escape', names=['line'])
    # parse lines into imd_id, cap_id, caption
    new = token['line'].str.split('#', n=1, expand=True)
    new2 = new[1].str.split('\t', n=1, expand=True)
    token['img_id'] = new[0]
    token['cap_id'] = new2[0]
    token['caption'] = new2[1]
    return token.drop(['line'], axis=1)

In [11]:
def filter_Flickr8k_to_7k(flickr8k, flickr7k):
    return flickr8k.merge(flickr7k, on='img_id', how='inner').drop(['funny_caption', 'romantic_caption'], axis=1)

In [13]:
token = load_Flickr8k(flickr8k_filename)
df_factual = filter_Flickr8k_to_7k(token, df_stylized)

In [14]:
df_factual.img_id.nunique()

7000

In [22]:
X = np.array([tuple(x) for x in df_stylized[['funny_caption', 'romantic_caption']].to_numpy()])
y = df_stylized[['img_id']].to_numpy()

In [23]:
X[0], y[0]

(array(['two dogs chase each other across the snowy ground in search of gold nuggets .',
        'two dogs in love are playing together in the snow with full joy .'],
       dtype='<U141'), array(['2513260012_03d33305cf.jpg'], dtype=object))

In [24]:
len(y)

7000

## Train test split

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Build vocab for all captions, using pre-trained embeddings

In [28]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [29]:
spacy_tok(X_train[0][0]+" "+ X_train[0][1])

['a',
 'large',
 'brown',
 'dog',
 'is',
 'sticking',
 'his',
 'face',
 'in',
 'the',
 'sprinkler',
 'to',
 'catch',
 'it',
 '.',
 'dog',
 'slurps',
 'water',
 'from',
 'sprinkler',
 'on',
 'the',
 'grass',
 'leaving',
 'some',
 'for',
 'the',
 'lawn',
 '.']

In [30]:
def loadGloveModel(gloveFile="data/glove.6B.50d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [31]:
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(spacy_tok(line[0]+" "+line[1]))
        for word in words:
            vocab[word] += 1
    return vocab

In [33]:
word_vecs = loadGloveModel()

In [35]:
word_count = get_vocab(X_train)
print(len(word_vecs.keys()), len(word_count.keys()))

400000 7590


In [36]:
def delete_rare_words(word_vecs, word_count, min_df=4):
    """ Deletes rare words from word_count
    
    Deletes words from word_count if they are not in word_vecs
    and don't have at least min_df occurrencies in word_count.
    """
    words_delete = []
    for word in word_count:
        if word_count[word] < min_df and word not in word_vecs:
            words_delete.append(word)
    for word in words_delete: word_count.pop(word)
    return word_count

In [37]:
word_count = delete_rare_words(word_vecs, word_count)
print(len(word_count.keys()))

7241


In [38]:
def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Creates embedding matrix from word vectors. """
    word_count = delete_rare_words(word_vecs, word_count, min_df)
    V = len(word_count.keys()) + 2
    vocab2index = {}
    W = np.zeros((V, emb_size), dtype="float32")
    vocab = ["", "UNK"]
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    # adding a vector for rare words 
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    vocab2index["UNK"] = 1
    i = 2
    for word in word_count:
        if word in word_vecs:
            W[i] = word_vecs[word]
            vocab2index[word] = i
            vocab.append(word)
            i += 1
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
            vocab2index[word] = i
            vocab.append(word)
            i += 1   
    return W, np.array(vocab), vocab2index

In [39]:
pretrained_weight, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count)

In [40]:
def encode_sentence_no_padding(s, vocab2index):
    return np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])

## Dataset

In [41]:
class FlickrStyle7kDataset(Dataset):
    def __init__(self, X, y):
        self.x = [(encode_sentence_no_padding(pair[0], vocab2index),
                    encode_sentence_no_padding(pair[1], vocab2index)) for pair in X]
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        return x, self.y[idx]
    
train_ds = FlickrStyle7kDataset(X_train, y_train)
valid_ds = FlickrStyle7kDataset(X_val, y_val)

In [42]:
train_ds[0]

((array([12, 17,  8, 24, 19,  3,  2,  6,  4, 22, 15, 10, 13, 25, 23]),
  array([24,  9, 14, 20, 15, 18, 22,  7, 11, 16, 21, 22,  5, 23])),
 array(['1341787777_4f1ebb1793.jpg'], dtype=object))

## Dynamic Padding and Dataloader 

In [43]:
def collate_fn(data):
    """
    Creates mini-batch tensors from the list of nested
    tuples ((q1, q2), labels).
    """
    question, labels = zip(*data)
    q1, q2 = zip(*question)
    q1 = [torch.Tensor(s) for s in q1]
    q2 = [torch.Tensor(s) for s in q2]

    # stack labels
    #labels = torch.Tensor(labels)
    
    # Merge sentences
    length1 = [len(s) for s in q1]
    length2 = [len(s) for s in q2]
   
    # Padding
    sents1 = pad_sequence(q1, batch_first=True, padding_value=0)
    sents2 = pad_sequence(q2, batch_first=True, padding_value=0)
    
    return sents1, sents2, length1, length2 #, labels

In [44]:
collate_fn([train_ds[0], train_ds[1], train_ds[2]])

(tensor([[12., 17.,  8., 24., 19.,  3.,  2.,  6.,  4., 22., 15., 10., 13., 25.,
          23.],
         [12., 26.,  4., 43., 34., 38., 25., 48., 39., 36., 29., 37., 28., 33.,
          23.],
         [12., 26., 57., 22., 55., 61., 62., 28., 49., 52., 23.,  0.,  0.,  0.,
           0.]]),
 tensor([[24.,  9., 14., 20., 15., 18., 22.,  7., 11., 16., 21., 22.,  5., 23.,
           0.,  0.,  0.,  0.],
         [12., 40., 31., 12., 47., 27., 41., 12., 45., 35.,  4., 44., 30., 32.,
          46., 28., 42., 23.],
         [58., 53., 56., 12., 51., 62., 58., 52., 31., 63., 12., 54., 59., 50.,
          62., 28., 60., 23.]]),
 [15, 15, 11],
 [14, 18, 18])