# Import dependencies

We add the repository root directory to `sys.path` so that the model files can be imported.

In [None]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("../..")))
import importlib
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm
from model.models import *
import torch_geometric
from torch_geometric.utils import structured_negative_sampling
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import networkx as nx
from nltk.stem import WordNetLemmatizer
from gensim.models import doc2vec
import random
from itertools import combinations
from random import sample
from torch_geometric.utils import dropout_node
from github import Repository
from itertools import combinations

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

# Read config YAML

In [None]:
# Location of the experiment configuration, relative to this notebook.
config_path = "../github.yaml"

# Parse the YAML file into `config`; safe_load restricts parsing to plain YAML data types.
with open(config_path, "r") as file:
    raw_yaml = file.read()
config = yaml.safe_load(raw_yaml)

# Read data

In [None]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts whose locations are listed under `saving_paths` in the config.
saving_paths = config['saving_paths']
graph = _read_pickle(saving_paths['graph'])
all_skills = _read_pickle(saving_paths['all_skills'])
author2id = _read_pickle(saving_paths['authors_id'])

# Reverse lookup: id -> author key.
id2author = {author_id: author for author, author_id in author2id.items()}

# Doc2vec model used later to embed repository tags (path is hard-coded).
doc2vec_model = _read_pickle("../../data/github/doc2vec_github.pkl")
# Per-repository records, indexed by repository title.
contributors = _read_pickle(config['repo_dict_path'])
# Table of repositories that is iterated row by row further down (path is hard-coded).
all_repos = _read_pickle("../../data/github/good_repos.pkl")

# Prepare for model training

## Build tensors from networkx graph of collaboration network

In [None]:
# Convert the networkx graph into a torch_geometric data object.
graph_vec = torch_geometric.utils.from_networkx(graph)

# Node features (cast to float) and edge list, both placed on the selected device.
graph_x = graph_vec.x.float()
graph_x = graph_x.to(device)
graph_edge_index = graph_vec.edge_index
graph_edge_index = graph_edge_index.to(device)

# Maps every graph node id to its position in graph.nodes() iteration order,
# which is used below as the row index into the tensors created above.
mapping = {node: position for position, node in enumerate(graph.nodes())}

In [None]:
# good_repos: one ((row index, title), tags) entry per repository that has a contributors record.
# preprocessed_users: row index -> ids of that repository's contributors found in author2id.
good_repos = []
preprocessed_users = {}
for i, row in all_repos.iterrows():
    if row.title in contributors:
        # Keep only contributors whose login has an id in author2id.
        known_ids = [
            author2id[member.login]
            for member in contributors[row.title]['contributors']
            if member.login in author2id
        ]
        good_repos.append(((i, row.title), row.tags))
        preprocessed_users[i] = known_ids

## Model initialization
We initialize the model using the model name and parameters that are provided in the config file.

In [None]:
# Training settings come from the `train` section of the config.
train_config = config['train']
model_name = train_config['model_name']
model_parameters = train_config['model_params']
learning_rate = train_config['learning_rate']

# Look the model class up by name inside model.models.
models_module = importlib.import_module("model.models")
model_class = getattr(models_module, model_name)

# The configured parameters are passed positionally to the constructor.
model = model_class(*model_parameters).to(device)

# Adam with a fixed L2 weight decay of 1e-3.
optim = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-3,
)

In [None]:
batch_size = config['train']['batch_size']
num_epochs = config['train']['num_epochs']
num_negative_samples = config['train']['num_negative_samples']
# With margin=0 a negative pair contributes loss only while its cosine similarity is positive.
criterion = torch.nn.CosineEmbeddingLoss(margin=0)

# Materialise the candidate pool once. graph.nodes is a set-like view, which
# random.sample rejects on Python 3.11+ (and copies on every call before that).
# Using the keys of `mapping` guarantees every candidate has a row in the embedding.
candidate_nodes = list(mapping)


def _draw_negative_nodes(candidates, excluded, count):
    """Pick `count` distinct nodes from `candidates` that are not in `excluded`.

    Args:
        candidates: list of unique node ids to draw from.
        excluded: set of node ids that must not be returned.
        count: number of distinct nodes to return.

    Returns:
        A list of `count` distinct node ids.

    Raises:
        ValueError: if fewer than `count` eligible nodes exist (rejection
            sampling would otherwise never terminate).
    """
    if len(candidates) - len(excluded) < count:
        # The pool may be too small for rejection sampling; enumerate it exactly.
        eligible = [node for node in candidates if node not in excluded]
        if len(eligible) < count:
            raise ValueError(
                f"Cannot draw {count} negative samples: only {len(eligible)} eligible nodes"
            )
        return random.sample(eligible, count)

    drawn = []
    while len(drawn) < count:
        node = random.choice(candidates)
        if node not in excluded and node not in drawn:
            drawn.append(node)
    return drawn


for _ in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(range(0, len(good_repos), batch_size)):
        batch_papers = good_repos[batch:batch + batch_size]

        item_nodes = []     # Graph nodes of the positive and negative samples
        query_emb = []      # Query (doc2vec) embedding repeated once per sample
        batch_labels = []   # 1 for positive and -1 for negative samples
        for (ind, _title), paper in batch_papers:
            q_emb = torch.Tensor(doc2vec_model.infer_vector(paper)).to(device)
            positives = set(preprocessed_users[ind])
            for author in preprocessed_users[ind]:
                if author not in mapping:
                    continue
                # One positive followed by its freshly drawn negatives.
                negs = _draw_negative_nodes(candidate_nodes, positives, num_negative_samples)
                item_nodes.append(author)
                item_nodes.extend(negs)
                batch_labels.extend([1] + ([-1] * num_negative_samples))
                query_emb.extend([q_emb] * (num_negative_samples + 1))

        if not item_nodes:
            # No contributor of this batch is in the graph; torch.stack would
            # fail on an empty list and there is nothing to learn from.
            continue

        optim.zero_grad()
        # graph_edge_index, _, _ = dropout_node(graph_edge_index)
        emb = model(graph_x, graph_edge_index)

        query_emb = torch.stack(query_emb)
        batch_items = torch.stack([emb[mapping[node]] for node in item_nodes])
        batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
        loss = criterion(query_emb, batch_items, batch_labels)
        total_loss += loss.item()
        loss.backward()
        optim.step()
    # Sum of the per-batch losses for this epoch.
    print(total_loss)


## Save model

In [None]:
# Persist the trained weights as <model_saving_dir>/Github_<model_name>.pt.
model_saving_dir = config['train']['model_saving_dir']
model_saving_path = f"{model_saving_dir}/Github_{model_name}.pt"
# Create the target directory first so a finished training run is not lost
# to a missing folder when the file is opened for writing.
if model_saving_dir:
    os.makedirs(model_saving_dir, exist_ok=True)
torch.save(model.state_dict(), model_saving_path)

# Train Link prediction model

We use the GAE model from torch_geometric for Link Prediction.

In [None]:
from torch_geometric.nn import GAE
from torch_geometric.transforms import RandomLinkSplit

# The encoder is built from the positional parameters in the `link_prediction` config section.
link_prediction_model_params = config['link_prediction']['model_params']
link_encoder = LinkPredictionModel(*link_prediction_model_params)
gae_model = GAE(link_encoder)
gae_optim = torch.optim.Adam(gae_model.parameters(), lr=1e-2)

# Split the edges into train and test parts with no validation edges;
# split_labels=True stores positive and negative label edges separately.
link_splitter = RandomLinkSplit(num_val=0, split_labels=True)
train_data, _, test_data = link_splitter(graph_vec)

In [None]:
link_prediction_epochs = config['link_prediction']['num_epochs']

# Use the notebook-wide `device` instead of a hard-coded .cuda(), so the cell
# also runs on CPU-only machines. The model and all tensors are moved once,
# outside the epoch loop, because none of them change between epochs.
gae_model.to(device)

train_x = train_data.x.float().to(device)
train_edge_index = train_data.edge_index.to(device)
train_pos_edges = train_data.pos_edge_label_index.to(device)
train_neg_edges = train_data.neg_edge_label_index.to(device)

test_x = test_data.x.float().to(device)
test_edge_index = test_data.edge_index.to(device)
test_pos_edges = test_data.pos_edge_label_index.to(device)
test_neg_edges = test_data.neg_edge_label_index.to(device)

for _ in range(link_prediction_epochs):
    # One full-graph optimisation step on the reconstruction loss.
    gae_model.train()
    gae_optim.zero_grad()
    z = gae_model.encode(train_x, train_edge_index)

    loss = gae_model.recon_loss(z, pos_edge_index=train_pos_edges, neg_edge_index=train_neg_edges)
    # end=" " keeps the evaluation result printed below on the same line.
    print("loss:", loss.item(), end=" ")
    loss.backward()
    gae_optim.step()

    # Evaluate on the held-out edges after every epoch.
    gae_model.eval()
    with torch.no_grad():
        z = gae_model.encode(test_x, test_edge_index)
        print(gae_model.test(z, test_pos_edges, test_neg_edges))


## Save Link Prediction model to file

In [None]:
# Persist the trained GAE weights as <model_saving_dir>/Github_GAE.pt.
link_prediction_saving_dir = config['link_prediction']['model_saving_dir']
link_prediction_saving_path = f"{link_prediction_saving_dir}/Github_GAE.pt"

# Create the target directory first so the save does not fail on a missing folder.
if link_prediction_saving_dir:
    os.makedirs(link_prediction_saving_dir, exist_ok=True)
torch.save(gae_model.state_dict(), link_prediction_saving_path)