# DOC2VEC EMBEDDINGS OF THE ABSTRACTS 

The goal of this notebook is to train a Doc2Vec model on the abstracts of the papers, in order to obtain a semantic vector representation (embedding) of each abstract.

After that, we build a similarity graph of the papers from these embeddings.

In [14]:
import os 
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import paths # script with all data paths

##  Doc2Vec embeddings

In [3]:
# Train a Doc2Vec model on the paper abstracts, or reuse the one already saved
# at paths.DOC2VEC_PATH. Either way, `doc2vec_model` is bound when this cell
# finishes, so the cells below do not depend on which branch was taken.
if not os.path.isfile(paths.DOC2VEC_PATH):
    # No saved model on disk: build the corpus and train from scratch.
    nltk.download('stopwords')
    nltk.download('punkt')
    stop_words = set(stopwords.words('english'))

    # Load the abstracts: one "<node id>|--|<abstract text>" record per line.
    abstracts = dict()
    # NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
    with open(paths.ABSTRACTS_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing to unpack
            # Split on the first separator only, so an abstract that itself
            # contains '|--|' does not break the two-way unpacking.
            node, abstract = line.split('|--|', 1)
            abstracts[int(node)] = abstract

    # Process the abstracts: lowercase, tokenize, and keep only purely
    # alphabetic tokens that are not English stop words.
    processed_abstracts = {
        id_: [
            word for word in word_tokenize(text.lower())
            if word.isalpha() and word not in stop_words
        ]
        for id_, text in abstracts.items()
    }

    # Doc2Vec takes as input a list of TaggedDocument, each tagged with the
    # integer node id of its paper.
    # NOTE(review): gensim presumably treats plain-int tags as direct indices
    # into its document-vector array -- confirm the node ids are dense enough
    # for that to be acceptable.
    tagged_data = [
        TaggedDocument(tokens, [id_])
        for id_, tokens in processed_abstracts.items()
    ]

    dim = 64  # Embedding size
    # Train the model.
    # NOTE(review): no seed is set and several worker threads are used, so a
    # re-training will presumably not reproduce the exact same vectors.
    doc2vec_model = Doc2Vec(
        tagged_data, vector_size=dim, window=5,
        dm=1, min_count=2, epochs=100, workers=10
    )
    # Save the model so later runs can skip training.
    doc2vec_model.save(paths.DOC2VEC_PATH)
else:
    print("The model was already trained !")
    # Load the saved model so `doc2vec_model` is defined on a fresh kernel.
    doc2vec_model = Doc2Vec.load(paths.DOC2VEC_PATH)
    

The model was already trained !
