# Import dependencies

We add the repository root directory to `sys.path` so that the project's model files (the `model` package) can be imported from this notebook.

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("../..")))
import importlib
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm
from model.models import *
import torch_geometric
from torch_geometric.utils import structured_negative_sampling, dropout_node
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import networkx as nx
from nltk.stem import WordNetLemmatizer
from gensim.models import doc2vec
import random
from itertools import combinations
from random import sample

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Read config YAML

In [None]:
# Path of the experiment configuration file, relative to this notebook.
config_path = "../dblp.yaml"

# Parse the YAML file into `config`, which every later cell reads from.
with open(config_path, mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Read data

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts whose locations are given in the config file.
graph = _load_pickle(config['saving_paths']['graph'])
all_skills = _load_pickle(config['saving_paths']['all_skills'])
author2id = _load_pickle(config['saving_paths']['authors_id'])
dblp_data = _load_pickle(config['dblp_pickle'])

# Reverse lookup: the values of `author2id` become keys pointing back to the author.
id2author = {author_id: author for author, author_id in author2id.items()}

# Data Preprocessing

In [None]:
# Collect one record per paper: its text (title + abstract) and the ids of
# those authors that are present in `author2id`.
papers = []
for item in tqdm(dblp_data):
    stripped_names = (raw_name.strip() for raw_name in item['authors'])
    known_author_ids = [author2id[name] for name in stripped_names if name in author2id]

    # Papers without any known author are skipped.
    if not known_author_ids:
        continue

    paper_text = item['title'] + " " + item['abstract']
    papers.append({'paper': paper_text, "authors": known_author_ids})

In [None]:
# Parallel lists: preprocessed_papers[k] is the tagged document of a paper and
# preprocessed_authors[k] is the author-id list of that same paper.
preprocessed_papers = []
preprocessed_authors = []
lemmatizer = WordNetLemmatizer()

for paper in tqdm(papers):
    tokens = paper['paper'].strip().split()
    # Keep only the lemmatized tokens that appear in `all_skills`.
    skill_words = [lemma for lemma in map(lemmatizer.lemmatize, tokens) if lemma in all_skills]

    # Documents with fewer than two skill words are dropped.
    if len(skill_words) < 2:
        continue

    # The tag is the document's position in `preprocessed_papers`.
    doc_tag = len(preprocessed_papers)
    preprocessed_papers.append(doc2vec.TaggedDocument(skill_words, [doc_tag]))
    preprocessed_authors.append(paper['authors'])


# Train Doc2Vec model

In [None]:
# Fit a Doc2Vec model with 64-dimensional vectors on the tagged documents.
doc2vec_model = doc2vec.Doc2Vec(preprocessed_papers, vector_size=64)

# Persist the model and the two preprocessed lists to the configured paths.
# Each entry is (config section, config key, object to pickle); the path is
# looked up right before the corresponding file is written.
artifacts_to_save = (
    ('saving_paths', 'doc2vec', doc2vec_model),
    ('train', 'preprocessed_authors_path', preprocessed_authors),
    ('train', 'preprocessed_papers_path', preprocessed_papers),
)
for section, key, artifact in artifacts_to_save:
    with open(config[section][key], 'wb') as f:
        pickle.dump(artifact, f)

# Prepare for model training

In [None]:
# Reload the artifacts written by the Doc2Vec training cell so that the rest
# of the notebook can be run without retraining.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(config['saving_paths']['doc2vec'], 'rb') as f:
    doc2vec_model = pickle.load(f)
with open(config['train']['preprocessed_authors_path'], 'rb') as f:
    preprocessed_authors = pickle.load(f)
# Use the same config key the save cell writes to ('preprocessed_papers_path');
# the previous 'preprocessed_paper_path' spelling did not match it.
with open(config['train']['preprocessed_papers_path'], 'rb') as f:
    preprocessed_papers = pickle.load(f)

For training, we treat papers with more than one author as "good" papers and use them as training "queries".
For each query, we train the model so that the embeddings of its authors are closer to the query embedding than the embeddings of other, randomly sampled individuals.

In [None]:
# Indices (into `preprocessed_authors`) of the papers that have more than one author.
good_papers = [
    paper_index
    for paper_index, paper_authors in enumerate(preprocessed_authors)
    if len(paper_authors) > 1
]
len(good_papers)

## Build tensors from networkx graph of collaboration network

In [None]:
# Convert the networkx graph to a torch_geometric data object and move the
# node features (as floats) and the edge index to the selected device.
graph_vec = torch_geometric.utils.from_networkx(graph)
graph_x = graph_vec.x.float().to(device)
graph_edge_index = graph_vec.edge_index.to(device)

# Maps each graph node id to its position in the iteration order of
# graph.nodes(), i.e. the index used for that node in the created tensors.
mapping = {node_id: position for position, node_id in enumerate(graph.nodes())}

## Model initialization
We initialize the model using the model name and parameters that are provided in the config file.

In [None]:
# Training settings taken from the config file.
model_name = config['train']['model_name']
model_parameters = config['train']['model_params']
learning_rate = config['train']['learning_rate']

# Look the model class up by name in the project's model module.
models_module = importlib.import_module("model.models")
model_class = getattr(models_module, model_name)

# Instantiate with the configured positional parameters and move to the device.
model = model_class(*model_parameters).to(device)

optim = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-3,
)

In [3]:
# Debugging aid: show the module search path, e.g. to check that the directory
# appended in the import cell is present.
print(sys.path)

['/home/kgolazde/exes/preprocessing/train', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/kgolazde/exes/venv/lib/python3.10/site-packages', '/home/kgolazde/exes']


# Training loop

In [None]:
# Training hyper-parameters from the config file.
batch_size = config['train']['batch_size']
num_epochs = config['train']['num_epochs']
num_negative_samples = config['train']['num_negative_samples']
# Cosine loss: label 1 pulls a (query, node) pair together, label -1 pushes it apart.
criterion = torch.nn.CosineEmbeddingLoss(margin=0)

# Materialise the node list once. `graph.nodes` is a view rather than a
# sequence: passing it to random.sample() copies it on every draw on older
# Python versions and is rejected on Python 3.11+.
candidate_nodes = list(graph.nodes)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(range(0, len(good_papers), batch_size)):
        optim.zero_grad()
        # Apply node dropout to the ORIGINAL edge index on every step. The
        # result is stored under a separate name so the dropout does not
        # accumulate and permanently shrink `graph_edge_index`.
        dropped_edge_index, _, _ = dropout_node(graph_edge_index)
        dropped_edge_index = dropped_edge_index.to(device)
        emb = model(graph_x, dropped_edge_index)

        batch_papers = good_papers[batch:batch + batch_size]
        batch_items = []    # Embeddings of the positive and negative samples
        query_emb = []      # Query (paper) embedding repeated once per sample
        batch_labels = []   # 1 for positive and -1 for negative samples
        for paper in batch_papers:
            q_emb = torch.Tensor(doc2vec_model.dv.vectors[paper]).to(device)
            paper_authors = set(preprocessed_authors[paper])
            for author in preprocessed_authors[paper]:
                # Authors that are not nodes of the graph have no embedding.
                if author not in mapping:
                    continue
                batch_items.append(emb[mapping[author]])

                # Draw distinct negatives that are not authors of this paper.
                # Every candidate is a graph node and therefore a key of `mapping`.
                negs = []
                while len(negs) < num_negative_samples:
                    neg = random.choice(candidate_nodes)
                    if neg in paper_authors or neg in negs:
                        continue
                    negs.append(neg)
                batch_items.extend(emb[mapping[neg]] for neg in negs)

                batch_labels.extend([1] + ([-1] * num_negative_samples))
                query_emb.extend([q_emb] * (num_negative_samples + 1))

        # torch.stack fails on an empty list, so skip batches in which no
        # author could be mapped to a graph node.
        if not batch_items:
            continue

        query_emb = torch.stack(query_emb)
        batch_items = torch.stack(batch_items)
        batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
        loss = criterion(query_emb, batch_items, batch_labels)
        total_loss += loss.item()
        loss.backward()
        optim.step()
    # Sum of the batch losses of this epoch.
    print(total_loss)

## Save model

In [None]:
model_saving_dir = config['train']['model_saving_dir']
# torch.save does not create missing directories; make sure the target exists
# so a finished training run is not lost to a missing folder.
if model_saving_dir:
    os.makedirs(model_saving_dir, exist_ok=True)

# Only the parameters (state dict) are stored, named after the model class.
model_saving_path = f"{model_saving_dir}/DBLP_{model_name}.pt"
torch.save(model.state_dict(), model_saving_path)

# Train Link prediction model

We use the GAE model from torch_geometric for Link Prediction.

In [None]:
from torch_geometric.nn import GAE
from torch_geometric.transforms import RandomLinkSplit

# Build the graph auto-encoder around the project's LinkPredictionModel,
# instantiated with the positional parameters from the config file.
link_prediction_model_params = config['link_prediction']['model_params']
link_prediction_encoder = LinkPredictionModel(*link_prediction_model_params)
gae_model = GAE(link_prediction_encoder)
gae_optim = torch.optim.Adam(gae_model.parameters(), lr=1e-2)

# Split the edges into train and test parts (no validation part); with
# split_labels=True positive and negative edges are stored separately.
link_splitter = RandomLinkSplit(num_val=0, split_labels=True)
train_data, _, test_data = link_splitter(graph_vec)

In [None]:
link_prediction_epochs = config['link_prediction']['num_epochs']

# Move the model and every tensor it needs to the selected device once, instead
# of calling .cuda() each epoch (which fails on machines without a GPU).
gae_model.to(device)
train_x = train_data.x.float().to(device)
train_edge_index = train_data.edge_index.to(device)
train_pos_edges = train_data.pos_edge_label_index.to(device)
train_neg_edges = train_data.neg_edge_label_index.to(device)
test_x = test_data.x.float().to(device)
test_edge_index = test_data.edge_index.to(device)
test_pos_edges = test_data.pos_edge_label_index.to(device)
test_neg_edges = test_data.neg_edge_label_index.to(device)

for epoch in range(link_prediction_epochs):
    # One full-batch optimisation step on the training edges.
    gae_model.train()
    gae_optim.zero_grad()
    z = gae_model.encode(train_x, train_edge_index)

    loss = gae_model.recon_loss(z, pos_edge_index=train_pos_edges, neg_edge_index=train_neg_edges)
    print("loss:", loss.item(), end=" ")
    loss.backward()
    gae_optim.step()

    # Evaluate on the held-out edges without tracking gradients; the result
    # of gae_model.test is printed on the same line as the loss.
    gae_model.eval()
    with torch.no_grad():
        z = gae_model.encode(test_x, test_edge_index)
        print(gae_model.test(z, test_pos_edges, test_neg_edges))


## Save Link Prediction model to file

In [None]:
link_prediction_saving_dir = config['link_prediction']['model_saving_dir']
# torch.save does not create missing directories; make sure the target exists
# before writing the trained parameters.
if link_prediction_saving_dir:
    os.makedirs(link_prediction_saving_dir, exist_ok=True)
link_prediction_saving_path = f"{link_prediction_saving_dir}/DBLP_GAE.pt"

# Only the parameters (state dict) of the GAE model are stored.
torch.save(gae_model.state_dict(), link_prediction_saving_path)