# Introduction

This notebook shows how to train a gensim Doc2Vec model to create paragraph embeddings for the Flickr captions.



In [2]:
# Send log records at INFO level and above to the notebook output, each prefixed
# with a timestamp and its severity. gensim reports its vocabulary-building and
# training progress through this channel.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pandas
import nltk
import pickle
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from multiprocessing import Pool

In [4]:
from dl_utils import *

In [7]:
# Helper that is not defined in this notebook (presumably provided by the
# wildcard import from dl_utils). Judging by its name it prepares the project's
# default folders — TODO confirm against dl_utils before relying on that.
setup_folder_defaults()

### Open the captions file; we will use it to build our corpus and to train the unsupervised Doc2Vec model

In [9]:
# Load the caption table (one row per image/caption pair).
# The path uses forward slashes because they resolve on Windows, Linux and macOS
# alike; a backslash-separated literal is Windows-only and also embeds invalid
# escape sequences (such as \d) that newer Python versions warn about.
df = pandas.read_csv('./data/captions.txt')

In [10]:
# Peek at the first rows to confirm the expected columns are present: `image`
# (file name) and `caption` (text). Several consecutive rows share one image.
df.head(15)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
5,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
6,1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with...
7,1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots a...
8,1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each o...
9,1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .


In [11]:
# Fetch NLTK's "punkt" tokenizer data into the user's nltk_data directory;
# word_tokenize needs it to split captions into tokens.
# NOTE(review): newer NLTK releases appear to look for a `punkt_tab` resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\catkm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [12]:
# Wrap every caption in a TaggedDocument: the words are the NLTK tokens of the
# caption and the single tag is the caption's position in the frame, stored as
# a string (so tag '0' is the first row of df).
tagged = [
    TaggedDocument(words=word_tokenize(caption_text), tags=[str(position)])
    for position, caption_text in enumerate(df.caption)
]

In [14]:
# Sanity check: report how many documents were built, then display the first
# one so the tokenisation and the tag format can be eyeballed.
print(f'The size of tagged is {len(tagged)}')

tagged[0]

The size of tagged is 40455


TaggedDocument(words=['A', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '.'], tags=['0'])

In [15]:
# Hyper-parameters for the caption embedding model.
# max_epochs: number of passes over the corpus, consumed by the training cell.
max_epochs = 100
# vec_size: dimensionality of each learned document vector.
vec_size = 32
# alpha: initial learning rate handed to Doc2Vec.
alpha = 0.025

# dm=0 selects the distributed-bag-of-words (PV-DBOW) variant; min_count=5 drops
# tokens seen fewer than five times when the vocabulary is built.
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_count=5, dm=0)
# Scan the tagged corpus once to collect word counts and document tags.
# This only prepares the vocabulary; the weights are trained in a later cell.
model.build_vocab(tagged)

2022-06-21 13:59:28,185 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d32,n5,mc5,s0.001,t3)', 'datetime': '2022-06-21T13:59:28.185354', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-06-21 13:59:28,186 : INFO : collecting all words and their counts
2022-06-21 13:59:28,187 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-06-21 13:59:28,254 : INFO : PROGRESS: at example #10000, processed 118975 words (1790549/s), 4806 word types, 10000 tags
2022-06-21 13:59:28,310 : INFO : PROGRESS: at example #20000, processed 236504 words (2177684/s), 6812 word types, 20000 tags
2022-06-21 13:59:28,362 : INFO : PROGRESS: at example #30000, processed 352954 words (2251742/s), 8365 word types, 30000 tags
2022-06-21 13:59:28,411 : INFO : PROGRESS: at example #40000, processed 471304 words (2478004/s), 9563 word types, 40000 tags
2022-06-21 1

In [16]:
# Train on the full corpus for max_epochs passes. total_examples is the document
# count recorded by build_vocab; gensim uses it for progress reporting and for
# scheduling the learning-rate decay.
model.train(tagged, total_examples=model.corpus_count, epochs=max_epochs)

2022-06-21 13:59:36,019 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3107 vocabulary and 32 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-06-21T13:59:36.019030', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
2022-06-21 13:59:37,063 : INFO : EPOCH 1 - PROGRESS: at 52.28% examples, 161478 words/s, in_qsize 6, out_qsize 0
2022-06-21 13:59:38,041 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 13:59:38,090 : INFO : EPOCH 1 - PROGRESS: at 97.98% examples, 151251 words/s, in_qsize 1, out_qsize 1
2022-06-21 13:59:38,091 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 13:59:38,095 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 13:59:38,096 : INFO : EPOCH - 1 : training on 476679 raw words (317270 effective words

2022-06-21 14:00:04,184 : INFO : EPOCH 14 - PROGRESS: at 41.66% examples, 130831 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:00:05,207 : INFO : EPOCH 14 - PROGRESS: at 92.40% examples, 144084 words/s, in_qsize 4, out_qsize 0
2022-06-21 14:00:05,258 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:00:05,291 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:00:05,295 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:00:05,295 : INFO : EPOCH - 14 : training on 476679 raw words (317789 effective words) took 2.1s, 149759 effective words/s
2022-06-21 14:00:06,320 : INFO : EPOCH 15 - PROGRESS: at 45.92% examples, 143395 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:00:07,320 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:00:07,370 : INFO : EPOCH 15 - PROGRESS: at 97.98% examples, 150477 words/s, in_qsize 1, out_qsize 1
2022-06-21 14:00:07,371 : INFO : worker thre

2022-06-21 14:00:33,367 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:00:33,373 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:00:33,374 : INFO : EPOCH - 28 : training on 476679 raw words (317578 effective words) took 1.9s, 165353 effective words/s
2022-06-21 14:00:34,403 : INFO : EPOCH 29 - PROGRESS: at 48.05% examples, 149587 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:00:35,392 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:00:35,429 : INFO : EPOCH 29 - PROGRESS: at 98.56% examples, 152944 words/s, in_qsize 1, out_qsize 1
2022-06-21 14:00:35,430 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:00:35,432 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:00:35,433 : INFO : EPOCH - 29 : training on 476679 raw words (317597 effective words) took 2.0s, 154956 effective words/s
2022-06-21 14:00:36,496 : INFO : EPOCH 30 - P

2022-06-21 14:01:02,013 : INFO : EPOCH - 43 : training on 476679 raw words (317747 effective words) took 1.7s, 187795 effective words/s
2022-06-21 14:01:03,078 : INFO : EPOCH 44 - PROGRESS: at 48.05% examples, 145058 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:01:04,063 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:01:04,083 : INFO : EPOCH 44 - PROGRESS: at 98.56% examples, 152178 words/s, in_qsize 1, out_qsize 1
2022-06-21 14:01:04,084 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:01:04,088 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:01:04,089 : INFO : EPOCH - 44 : training on 476679 raw words (317398 effective words) took 2.1s, 153961 effective words/s
2022-06-21 14:01:05,107 : INFO : EPOCH 45 - PROGRESS: at 50.16% examples, 158787 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:01:05,976 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:01:06,02

2022-06-21 14:01:32,203 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:01:32,243 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:01:32,247 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:01:32,248 : INFO : EPOCH - 58 : training on 476679 raw words (317526 effective words) took 1.9s, 171561 effective words/s
2022-06-21 14:01:33,267 : INFO : EPOCH 59 - PROGRESS: at 45.92% examples, 144497 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:01:34,270 : INFO : EPOCH 59 - PROGRESS: at 96.54% examples, 152251 words/s, in_qsize 2, out_qsize 1
2022-06-21 14:01:34,271 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:01:34,299 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:01:34,303 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:01:34,303 : INFO : EPOCH - 59 : training on 476679 raw words (317545 effecti

2022-06-21 14:02:00,339 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:02:00,342 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:02:00,343 : INFO : EPOCH - 72 : training on 476679 raw words (317586 effective words) took 2.1s, 152611 effective words/s
2022-06-21 14:02:01,363 : INFO : EPOCH 73 - PROGRESS: at 48.05% examples, 150571 words/s, in_qsize 6, out_qsize 0
2022-06-21 14:02:02,343 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:02:02,365 : INFO : EPOCH 73 - PROGRESS: at 97.98% examples, 154393 words/s, in_qsize 1, out_qsize 1
2022-06-21 14:02:02,366 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:02:02,373 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:02:02,374 : INFO : EPOCH - 73 : training on 476679 raw words (317507 effective words) took 2.0s, 157012 effective words/s
2022-06-21 14:02:03,486 : INFO : EPOCH 74 - P

2022-06-21 14:02:29,457 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:02:29,471 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:02:29,471 : INFO : EPOCH - 85 : training on 476679 raw words (317658 effective words) took 2.2s, 146305 effective words/s
2022-06-21 14:02:30,486 : INFO : EPOCH 86 - PROGRESS: at 50.16% examples, 159241 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:02:31,424 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-21 14:02:31,427 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-21 14:02:31,431 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-21 14:02:31,432 : INFO : EPOCH - 86 : training on 476679 raw words (317850 effective words) took 1.9s, 163321 effective words/s
2022-06-21 14:02:32,559 : INFO : EPOCH 87 - PROGRESS: at 58.69% examples, 166023 words/s, in_qsize 5, out_qsize 0
2022-06-21 14:02:33,210 : INFO : worker threa

In [17]:
# Persist the trained model. The '.h5' suffix is only a name — gensim writes its
# own pickle-based format, not HDF5 — but the path is kept exactly as is because
# the loading code reads this very file. A plain string literal is used since
# there is nothing to interpolate.
model.save('Models/Flickr-8k-Doc2Vec.h5')

2022-06-21 14:03:07,683 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'Models/Flickr-8k-Doc2Vec.h5', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-21T14:03:07.683012', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'saving'}
2022-06-21 14:03:07,685 : INFO : not storing attribute cum_table
2022-06-21 14:03:07,724 : INFO : saved Models/Flickr-8k-Doc2Vec.h5


In [27]:
# Sanity-check the model: re-infer a vector for a caption it was trained on and
# verify that the caption's own tag ('2') comes back as the nearest neighbour.
test_data = tagged[2].words
# Seed the model's random generator so the inferred vector is repeatable.
model.random.seed(43)
infer = model.infer_vector(test_data, epochs=500, alpha=model.alpha)

# `dv` is the gensim 4 name for the trained document vectors; the old `docvecs`
# alias is deprecated and emits a warning each time it is accessed.
similar = model.dv.most_similar([infer])
similar

  similar = model.docvecs.most_similar([infer])


[('2', 0.9638568162918091),
 ('37582', 0.8113299012184143),
 ('26481', 0.8103573322296143),
 ('8708', 0.8089443445205688),
 ('29192', 0.7855968475341797),
 ('1', 0.7803649306297302),
 ('5993', 0.7718825340270996),
 ('15391', 0.7685743570327759),
 ('26774', 0.7681808471679688),
 ('40316', 0.7672929763793945)]

In [28]:
# Quality check: print the query caption next to its closest neighbour other
# than itself (tag '37582', taken from the most_similar result above).
print(test_data)
print(tagged[37582].words)

['A', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.']
['A', 'little', 'girl', 'sliding', 'into', 'a', 'pool', '.']


In [23]:
# NOTE: importing this module has a side effect — the log output shows it loading
# the saved Doc2Vec model from Models/Flickr-8k-Doc2Vec.h5 at import time, so the
# model file must already exist when this cell runs.
# The wildcard import is presumably what provides infer_vector — TODO confirm.
from dl_inferencing import *

2022-06-21 14:07:30,760 : INFO : loading Doc2Vec object from Models/Flickr-8k-Doc2Vec.h5
2022-06-21 14:07:30,797 : INFO : loading dv recursively from Models/Flickr-8k-Doc2Vec.h5.dv.* with mmap=r
2022-06-21 14:07:30,797 : INFO : loading wv recursively from Models/Flickr-8k-Doc2Vec.h5.wv.* with mmap=r
2022-06-21 14:07:30,798 : INFO : setting ignored attribute cum_table to None
2022-06-21 14:07:30,862 : INFO : Doc2Vec lifecycle event {'fname': 'Models/Flickr-8k-Doc2Vec.h5', 'datetime': '2022-06-21T14:07:30.862056', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'loaded'}


In [29]:
# Infer an embedding for every caption in parallel across 4 worker processes.
# infer_vector is not defined in this notebook (presumably it comes from
# dl_inferencing — confirm). pool.map returns the results in the same order as
# df.caption and blocks until every caption has been processed.
with Pool(processes=4) as pool:
    vectors = pool.map(infer_vector, df.caption)

In [30]:
# Collapse the worker results into a lookup table: element 0 of each result is
# the key and element 1 the value (presumably caption -> vector; verify against
# infer_vector). If a key repeats, the last result for it wins.
features_dict = {result[0]: result[1] for result in vectors}

In [31]:
# Persist the lookup table. The context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way, instead of leaving an
# anonymous open handle for the garbage collector to clean up.
with open('Models/caption_features_dict.pickle', "wb") as features_file:
    pickle.dump(features_dict, features_file)