# Introduction

This notebook shows how to train a gensim Doc2Vec model to create paragraph embeddings for the Flickr captions.



In [27]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pandas
import nltk
import pickle
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from multiprocessing import Pool

In [6]:
from dl_utils import *

In [7]:
# Prepare the project's working folders (the output below reports that `models` was created).
# NOTE(review): setup_folder_defaults presumably comes from the `dl_utils` star import — confirm.
setup_folder_defaults()

Created directory: models ...


### Open the captions file; we will use it to build our corpus and to train the unsupervised Doc2Vec model

In [4]:
# Load the captions table into `df`; later cells read its `caption` column.
# Forward slashes keep the relative path portable: the previous '.\data\...' form
# only resolved on Windows and relied on unrecognised backslash escapes.
df = pandas.read_csv('./data/captions.txt')

In [8]:
# Peek at the first rows to confirm the file loaded with the `image` and `caption` columns.
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [11]:
# Fetch the Punkt tokenizer data that nltk's word_tokenize relies on
# (the output below shows it is left alone when already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bmosher\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Build the training corpus: one TaggedDocument per caption, holding the caption's
# tokens and a single tag equal to the caption's position in df.caption (as a string).
tagged = [
    TaggedDocument(words=word_tokenize(caption_text), tags=[str(position)])
    for position, caption_text in enumerate(df.caption)
]

In [13]:
# Hyperparameters for the caption embedding model.
max_epochs = 100  # number of training passes, read by the training cell
vec_size = 32     # dimensionality of each paragraph vector
alpha = 0.025     # initial learning rate

# dm=0 selects the distributed bag-of-words (PV-DBOW) variant, reported as 'dbow'
# in the log below; min_count=5 drops tokens seen fewer than five times.
model = Doc2Vec(
    dm=0,
    vector_size=vec_size,
    alpha=alpha,
    min_count=5,
)

# Scan the tagged corpus once to build the vocabulary before training.
model.build_vocab(tagged)

2022-06-20 16:45:30,138 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d32,n5,mc5,s0.001,t3)', 'datetime': '2022-06-20T16:45:30.138404', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042-SP0', 'event': 'created'}
2022-06-20 16:45:30,139 : INFO : collecting all words and their counts
2022-06-20 16:45:30,140 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-06-20 16:45:30,178 : INFO : PROGRESS: at example #10000, processed 118975 words (3212701/s), 4806 word types, 10000 tags
2022-06-20 16:45:30,214 : INFO : PROGRESS: at example #20000, processed 236504 words (3223726/s), 6812 word types, 20000 tags
2022-06-20 16:45:30,251 : INFO : PROGRESS: at example #30000, processed 352954 words (3260041/s), 8365 word types, 30000 tags
2022-06-20 16:45:30,289 : INFO : PROGRESS: at example #40000, processed 471304 words (3166724/s), 9563 word types, 40000 tags
2022-06-20 1

In [14]:
# Train over the whole tagged corpus for max_epochs passes; total_examples is the
# document count recorded on the model (model.corpus_count).
model.train(
    tagged,
    total_examples=model.corpus_count,
    epochs=max_epochs,
)

2022-06-20 16:46:00,095 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3107 vocabulary and 32 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-06-20T16:46:00.095168', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042-SP0', 'event': 'train'}
2022-06-20 16:46:01,111 : INFO : EPOCH 1 - PROGRESS: at 94.49% examples, 297001 words/s, in_qsize 3, out_qsize 0
2022-06-20 16:46:01,142 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:01,150 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:01,153 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:01,153 : INFO : EPOCH - 1 : training on 476679 raw words (317555 effective words) took 1.1s, 302236 effective words/s
2022-06-20 16:46:02,156 : INFO : worker thread finished; awaiting finish of

2022-06-20 16:46:17,338 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:17,338 : INFO : EPOCH - 17 : training on 476679 raw words (317240 effective words) took 1.0s, 304394 effective words/s
2022-06-20 16:46:18,361 : INFO : EPOCH 18 - PROGRESS: at 96.54% examples, 301188 words/s, in_qsize 2, out_qsize 1
2022-06-20 16:46:18,363 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:18,365 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:18,371 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:18,372 : INFO : EPOCH - 18 : training on 476679 raw words (317402 effective words) took 1.0s, 308959 effective words/s
2022-06-20 16:46:19,403 : INFO : EPOCH 19 - PROGRESS: at 96.54% examples, 298829 words/s, in_qsize 2, out_qsize 1
2022-06-20 16:46:19,404 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:19,410 : INFO : worker threa

2022-06-20 16:46:34,765 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:34,769 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:34,776 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:34,776 : INFO : EPOCH - 34 : training on 476679 raw words (317738 effective words) took 1.0s, 309420 effective words/s
2022-06-20 16:46:35,779 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:35,790 : INFO : EPOCH 35 - PROGRESS: at 98.56% examples, 310749 words/s, in_qsize 1, out_qsize 1
2022-06-20 16:46:35,790 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:35,796 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:35,797 : INFO : EPOCH - 35 : training on 476679 raw words (317516 effective words) took 1.0s, 313089 effective words/s
2022-06-20 16:46:36,790 : INFO : worker thread finished; awaiting fi

2022-06-20 16:46:52,818 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:52,818 : INFO : EPOCH - 52 : training on 476679 raw words (317760 effective words) took 1.0s, 320045 effective words/s
2022-06-20 16:46:53,821 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:53,825 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:53,835 : INFO : EPOCH 53 - PROGRESS: at 100.00% examples, 314448 words/s, in_qsize 0, out_qsize 1
2022-06-20 16:46:53,836 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:46:53,837 : INFO : EPOCH - 53 : training on 476679 raw words (317540 effective words) took 1.0s, 314041 effective words/s
2022-06-20 16:46:54,821 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:46:54,828 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:46:54,837 : INFO : worker thread finished; awaiting f

2022-06-20 16:47:11,763 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:47:11,763 : INFO : EPOCH - 71 : training on 476679 raw words (317448 effective words) took 1.0s, 322950 effective words/s
2022-06-20 16:47:12,718 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:47:12,725 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:47:12,729 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:47:12,729 : INFO : EPOCH - 72 : training on 476679 raw words (317393 effective words) took 1.0s, 330867 effective words/s
2022-06-20 16:47:13,743 : INFO : EPOCH 73 - PROGRESS: at 96.54% examples, 304281 words/s, in_qsize 2, out_qsize 1
2022-06-20 16:47:13,745 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:47:13,746 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:47:13,749 : INFO : worker thread finished; awaiting fi

2022-06-20 16:47:29,702 : INFO : EPOCH - 89 : training on 476679 raw words (317603 effective words) took 1.0s, 319596 effective words/s
2022-06-20 16:47:30,661 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:47:30,667 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:47:30,670 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:47:30,670 : INFO : EPOCH - 90 : training on 476679 raw words (317374 effective words) took 1.0s, 329858 effective words/s
2022-06-20 16:47:31,674 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-06-20 16:47:31,675 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-06-20 16:47:31,680 : INFO : EPOCH 91 - PROGRESS: at 100.00% examples, 316530 words/s, in_qsize 0, out_qsize 1
2022-06-20 16:47:31,681 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-06-20 16:47:31,681 : INFO : EPOCH - 91 : training on 476679 ra

In [15]:
# Persist the trained model to disk.
# NOTE(review): Doc2Vec.save writes gensim's own pickle-based format, so the '.h5'
# suffix is only a file name, not an HDF5 container — confirm before renaming.
# A plain string literal is used because there is nothing to interpolate.
model.save('Models/Flickr-8k-Doc2Vec.h5')

2022-06-20 16:50:53,359 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'Models/Flickr-8k-Doc2Vec.h5', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-20T16:50:53.359487', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042-SP0', 'event': 'saving'}
2022-06-20 16:50:53,361 : INFO : not storing attribute cum_table
2022-06-20 16:50:53,387 : INFO : saved Models/Flickr-8k-Doc2Vec.h5


In [22]:
# Sanity check: re-infer a vector for one training caption and list the document
# vectors closest to it. In the recorded output the caption's own tag ('6330') ranks first.
test_data = tagged[6330].words
model.random.seed(43)  # seed the model's RNG ahead of inference, aiming for a repeatable result
infer = model.infer_vector(test_data, epochs=500, alpha=model.alpha)

# `dv` is the gensim 4 name for the document vectors; the old `docvecs` alias is
# deprecated and triggers the warning echoed in the previous output.
similar = model.dv.most_similar([infer])
similar

  similar = model.docvecs.most_similar([infer])


[('6330', 0.9842882752418518),
 ('37227', 0.8501433730125427),
 ('21081', 0.8423740267753601),
 ('20106', 0.8168782591819763),
 ('32226', 0.7976151704788208),
 ('29040', 0.7659568786621094),
 ('5296', 0.7647293210029602),
 ('5295', 0.7615588307380676),
 ('15114', 0.7603051066398621),
 ('40156', 0.7435275912284851)]

In [23]:
# Compare the query caption's tokens with those of its closest neighbour other than
# itself (tag '37227' in the similarity output above).
print(test_data)
print(tagged[37227].words)

['A', 'woman', 'in', 'a', 'animal', 'print', 'coat', 'standing', 'next', 'to', 'a', 'smoking', 'woman', '.']
['A', 'man', 'in', 'a', 'black', 'hat', 'is', 'talking', 'to', 'a', 'woman', 'with', 'an', 'animal', 'print', 'shirt', '.']


In [24]:
from dl_inferencing import *

In [26]:
# Map infer_vector over every caption using four worker processes; the pool is
# shut down when the block exits.
# NOTE(review): infer_vector presumably comes from the dl_inferencing star import — confirm.
with Pool(4) as worker_pool:
    vectors = worker_pool.map(infer_vector, df.caption)

In [29]:
# Index the inference results: element 0 of each result is the key, element 1 the value.
# A later result with the same key overwrites an earlier one.
features_dict = {result[0]: result[1] for result in vectors}

In [30]:
# Serialise the feature lookup to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('Models/caption_features_dict.pickle', 'wb') as features_file:
    pickle.dump(features_dict, features_file)