In [2]:
from node2vec import Node2Vec

In [1]:
import json
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

In [2]:
import itertools
from collections import Counter

In [3]:
import networkx as nx

## Data Load

In [4]:
captions=[]

with open("./captions_train2014.json", "r") as st_json:
    data = json.load(st_json)
    
for d in data['annotations']:
    captions.append(d['caption'].rstrip())
    
with open("./captions_val2014.json", "r") as st_json:
    data = json.load(st_json)
    
for d in data['annotations']:
    captions.append(d['caption'].rstrip())
    
with open("./captions_train2017.json", "r") as st_json:
    data = json.load(st_json)
    
for d in data['annotations']:
    captions.append(d['caption'].rstrip())
    
with open("./captions_val2017.json", "r") as st_json:
    data = json.load(st_json)
    
for d in data['annotations']:
    captions.append(d['caption'].rstrip())

## Tokenize

In [5]:
preprocess = []
for s in captions:
    tokens = re.sub(r"[^a-z0-9]+", " ", s.lower())
    preprocess.append(tokens)

token = [word_tokenize(s) for s in preprocess]

## Remove stopwords (like 'a', 'the', 'an')

In [6]:
stop_words = set(stopwords.words('english')) 

In [7]:
text = []
for s in token:
    tmp=[]
    for w in s: 
        if w not in stop_words: 
            tmp.append(w) 
    text.append(tmp)

## Construct word graph

In [8]:
counter=Counter(itertools.chain.from_iterable(text))

In [9]:
counter={w: c for w, c in counter.items()}

In [10]:
def makevocab(text):
    counter=Counter(itertools.chain.from_iterable(text))
    id2word = [w for w, c in sorted(counter.items(), key=lambda x: -x[1])]
    word2id = {w: idx for idx, w in enumerate(id2word)}
    return id2word, word2id

In [11]:
def coocur(text, word2id, window=3):
    co_dict = {}
    for s in text:
        for i, token in enumerate(s):
            left_idx = max(0, i-window)
            right_idx = min(len(s), i+window)
            for n_token in s[left_idx:right_idx]:
                if token != n_token:
                    key = tuple(sorted([token, n_token]))
                    if key in co_dict:
                        co_dict[key] += 1
                    else:
                        co_dict[key] = 1
    return {k: v for k, v in co_dict.items() if v >= 1}

In [12]:
id2word, word2id=makevocab(text)

In [13]:
edge=coocur(text, word2id)

In [14]:
G=nx.Graph()

In [15]:
for i, node in enumerate(id2word):
    G.add_node(i, name=node)

In [16]:
#G.nodes.data()

In [17]:
for (n1, n2), co in edge.items():
    G.add_edge(n1, n2, weight=co)

In [22]:
#G.edges.data()

In [19]:
nx.write_weighted_edgelist(G, './edge')

## Embedding

In [None]:
node2vec = Node2Vec(G, dimensions=64, walk_length=10, num_walks=100, workers=4)

Computing transition probabilities:   0%|          | 0/54342 [00:00<?, ?it/s]

In [None]:
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

### "I read a book"
### "I have to book a restaurant"

In [None]:
model.wv.most_similar('read')

In [None]:
model.wv.most_similar('restaurant')