In [None]:
!pip install torch
! pip install PyYAML
!pip install pydantic
!pip install rdflib
!pip install torchmetrics

In [None]:
import os
from os.path import join

import numpy as np
import pandas as pd

from scipy.spatial import distance_matrix

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

try:
    import dgl
except:
    !pip install dgl
    import dgl
    
import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial

class RGCNLayer(nn.Module):
    """One relational graph convolution (R-GCN) layer.

    Each relation ``r`` owns an ``(in_feat, out_feat)`` weight matrix.  When
    ``num_bases < num_rels`` these matrices are not free parameters but
    linear combinations of ``num_bases`` shared basis matrices (equation (3)
    of the R-GCN formulation referenced by the original comments).

    Args:
        in_feat: Input feature size (the number of nodes for an input layer).
        out_feat: Output feature size.
        num_rels: Number of relation types.
        num_bases: Number of basis matrices; values ``<= 0`` or ``> num_rels``
            fall back to ``num_rels`` (no decomposition).
        bias: Truthy to add a learnable bias of size ``out_feat``.
        activation: Optional callable applied to the aggregated node features.
        is_input_layer: When true, messages are looked up by source-node id
            (``edges.src['id']``) instead of multiplying ``edges.src['h']``.

    The graph passed to :meth:`forward` must carry edge data ``'rel_type'``
    and ``'norm'`` and node data ``'id'`` (input layer) or ``'h'`` (other
    layers).  The result is written to node data ``'h'``.
    """

    def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
                 activation=None, is_input_layer=False):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels

        # weight bases in equation (3)
        self.weight = nn.Parameter(
            torch.empty(self.num_bases, self.in_feat, self.out_feat))
        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(
                torch.empty(self.num_rels, self.num_bases))

        # ``self.bias`` is either a Parameter or None.  It must never be
        # truth-tested: bool() of a multi-element tensor raises.
        if bias:
            # Zero init: xavier initialisation is undefined for 1-D tensors.
            self.bias = nn.Parameter(torch.zeros(out_feat))
        else:
            self.register_parameter('bias', None)

        # init trainable parameters
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.weight, gain=relu_gain)
        if self.num_bases < self.num_rels:
            nn.init.xavier_uniform_(self.w_comp, gain=relu_gain)

    def _relation_weights(self):
        """Return the ``(num_rels, in_feat, out_feat)`` per-relation weights."""
        if self.num_bases < self.num_rels:
            # W_r = sum_b w_comp[r, b] * weight[b]: flatten every basis matrix
            # to a row, mix the rows, then restore the matrix shape.
            flat_bases = self.weight.view(self.num_bases,
                                          self.in_feat * self.out_feat)
            mixed = torch.matmul(self.w_comp, flat_bases)
            return mixed.view(self.num_rels, self.in_feat, self.out_feat)
        return self.weight

    def forward(self, g):
        """Run one round of message passing on ``g``; updates ``g.ndata['h']``."""
        weight = self._relation_weights()

        if self.is_input_layer:
            def message_func(edges):
                # for input layer, matrix multiply can be converted to be
                # an embedding lookup using source node id
                embed = weight.view(-1, self.out_feat)
                index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                return {'msg': embed[index] * edges.data['norm']}
        else:
            def message_func(edges):
                w = weight[edges.data['rel_type']]
                # squeeze only the singleton row axis added by unsqueeze(1);
                # a bare squeeze() would also drop the edge axis for a single
                # edge, or the feature axis when out_feat == 1.
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze(1)
                msg = msg * edges.data['norm']
                return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if self.bias is not None:
                h = h + self.bias
            if self.activation is not None:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

In [None]:
class Model(nn.Module):
    """Stack of R-GCN layers for node classification.

    The stack is: one input layer (``num_nodes -> h_dim``, ReLU),
    ``num_hidden_layers`` hidden layers (``h_dim -> h_dim``, ReLU) and one
    output layer (``h_dim -> out_dim``).

    The output layer applies **no** activation: :meth:`forward` returns raw
    class scores.  Losses such as ``F.cross_entropy`` apply log-softmax
    themselves, so feeding them already-normalised probabilities would
    apply softmax twice.  Use ``softmax(dim=1)`` on the result if
    probabilities are needed; ``argmax`` is unaffected.

    Args:
        num_nodes: Number of graph nodes (also the input feature size).
        h_dim: Hidden feature size.
        out_dim: Number of output scores per node.
        num_rels: Number of relation types.
        num_bases: Number of bases forwarded to every ``RGCNLayer``.
        num_hidden_layers: Number of hidden-to-hidden layers.
    """

    def __init__(self, num_nodes, h_dim, out_dim, num_rels,
                 num_bases=-1, num_hidden_layers=1):
        super(Model, self).__init__()
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.num_hidden_layers = num_hidden_layers

        # create rgcn layers
        self.build_model()

        # create initial features.  Registered as a buffer so they follow
        # ``model.to(device)``; non-persistent so ``state_dict()`` keeps the
        # same keys as before.
        self.register_buffer('features', self.create_features(),
                             persistent=False)

    def build_model(self):
        """Create ``self.layers``: input, hidden and output layers in order."""
        self.layers = nn.ModuleList()
        # input to hidden
        self.layers.append(self.build_input_layer())
        # hidden to hidden
        for _ in range(self.num_hidden_layers):
            self.layers.append(self.build_hidden_layer())
        # hidden to output
        self.layers.append(self.build_output_layer())

    # initialize feature for each node
    def create_features(self):
        """Return the node ids ``0 .. num_nodes - 1`` used as input features."""
        return torch.arange(self.num_nodes)

    def build_input_layer(self):
        """Input layer: looks messages up by source-node id."""
        return RGCNLayer(self.num_nodes, self.h_dim, self.num_rels, self.num_bases,
                         activation=F.relu, is_input_layer=True)

    def build_hidden_layer(self):
        """Hidden layer: ``h_dim -> h_dim`` with ReLU."""
        return RGCNLayer(self.h_dim, self.h_dim, self.num_rels, self.num_bases,
                         activation=F.relu)

    def build_output_layer(self):
        """Output layer: ``h_dim -> out_dim``, emitting raw class scores."""
        return RGCNLayer(self.h_dim, self.out_dim, self.num_rels, self.num_bases,
                         activation=None)

    def forward(self, g):
        """Run every layer on ``g`` and return (and remove) ``g.ndata['h']``."""
        if self.features is not None:
            g.ndata['id'] = self.features
        for layer in self.layers:
            layer(g)
        return g.ndata.pop('h')

In [None]:
import dgl
from dgl.data import DGLDataset
import torch

class CustomDataset(DGLDataset):
    """Toy dataset holding a single hard-coded graph with random features.

    The graph has 10 nodes and 8 edges forming the chain 0 -> 1 -> ... -> 8
    (node 9 has no edges).  Every item is the pair ``(graph, tensor([0]))``.
    """

    def __init__(self):
        super().__init__(name='custom')

    def process(self):
        """Build the graph and store it on ``self._graph``.

        NOTE(review): presumably invoked by the ``DGLDataset`` base class
        during construction -- confirm against the DGL version in use.
        """
        # NOTE(review): DGL recommends ``dgl.graph(...)`` over constructing
        # ``DGLGraph`` directly; consider switching when touching this code.
        g = dgl.DGLGraph()
        g.add_nodes(10)
        g.add_edges([0, 1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7, 8])
        g.ndata['feat'] = torch.randn(10, 5)   # 5 random features per node
        g.edata['feat'] = torch.randn(8, 3)    # 3 random features per edge

        # Save the processed graph
        self._graph = g

    def __getitem__(self, idx):
        """Return ``(graph, label)`` for item ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this, plain ``for item in dataset`` iteration (which
                relies on ``IndexError`` to stop) would never terminate.
        """
        if not -len(self) <= idx < len(self):
            raise IndexError(
                "CustomDataset index {} out of range".format(idx))
        return self._graph, torch.tensor([0])

    def __len__(self):
        """Number of graphs in the dataset (always 1)."""
        return 1

In [None]:
import dgl
# Directory handed to dgl.data.CSVDataset.  It can be overridden with the
# UNFOLLOW_DATASET_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
path = os.environ.get(
    "UNFOLLOW_DATASET_DIR",
    "/home/leanna/unfollow_prediction/Markov_chain/dataset",
)
ds = dgl.data.CSVDataset(path)

In [None]:
data = ds[0]
import numpy as np

# Fraction of the items that goes into the training split (0.8 -> 80%).
train_percentage = 0.8

# Size of the population being split: whatever len() reports for `data`.
# NOTE(review): presumably the number of nodes of the loaded graph -- confirm.
num_nodes = len(data)
num_train_samples = int(num_nodes * train_percentage)

# Random order of all indices 0 .. num_nodes - 1 (shuffled in place).
# NOTE(review): the RNG is not seeded, so the split differs between runs.
all_indices = np.arange(num_nodes)
np.random.shuffle(all_indices)

# The leading `num_train_samples` shuffled indices form the training set.
train_idx = all_indices[:num_train_samples]

In [None]:
# load graph data
# NOTE(review): `data.num_nodes` is read without being called; if `data` is a
# DGLGraph this binds a method object rather than a node count -- confirm.
num_nodes = data.num_nodes
num_rels = 1  # number of relation (edge) types
num_classes = 3 # number of target classes (passed to Model as out_dim)
# NOTE(review): `labels` is an int here, but `torch.from_numpy(labels)` at the
# end of this cell requires a numpy.ndarray and will raise.  This looks like a
# placeholder for the real per-node label array -- confirm the intended source.
labels = 6
# NOTE(review): this overwrites the `train_idx` produced by the random split in
# the previous cell; it also assumes `data` exposes a `train_idx` attribute.
train_idx = data.train_idx
# split training and validation set: the first fifth of `train_idx` becomes the
# validation set, the remainder stays in the training set
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]

# edge type and normalization factor
# NOTE(review): assumes `data.edge_type` / `data.edge_norm` are numpy arrays
# with one entry per edge -- confirm against the loaded dataset.
edge_type = torch.from_numpy(data.edge_type)
# unsqueeze(1) appends an axis; presumably so the per-edge factor broadcasts
# over the feature dimension of the messages in RGCNLayer -- confirm.
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)

# flatten the labels to 1-D
labels = torch.from_numpy(labels).view(-1)

In [None]:
# configurations
n_hidden = 12        # number of hidden units
n_bases = 3          # requested number of bases (RGCNLayer falls back to num_rels when this exceeds it)
n_hidden_layers = 4  # hidden-to-hidden layers placed between the input and output layers
n_epochs = 50        # epochs to train
lr = 0.01            # learning rate
l2norm = 0.001       # L2 norm coefficient

# create graph from the source/destination node arrays and attach the
# per-edge relation type and normalization factor
g = DGLGraph((data.edge_src, data.edge_dst))
g.edata['rel_type'] = edge_type
g.edata['norm'] = edge_norm

# create model
model = Model(
    num_nodes=g.num_nodes(),
    h_dim=n_hidden,
    out_dim=num_classes,
    num_rels=num_rels,
    num_bases=n_bases,
    num_hidden_layers=n_hidden_layers,
)

In [None]:
import matplotlib.pyplot as plt

# optimizer (weight_decay applies the L2 penalty)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)

# per-epoch history, used for the plots below
train_acc_list = []
train_loss_list = []
val_acc_list = []
val_loss_list = []

print("start training...")
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    # Call the module itself rather than .forward so nn.Module hooks run.
    # NOTE(review): F.cross_entropy applies log-softmax internally, so it
    # expects raw (unnormalised) class scores from the model.
    logits = model(g)
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    loss.backward()

    optimizer.step()

    # Metrics are bookkeeping only: compute them without autograd tracking.
    # They are derived from the same full-graph forward pass as the loss.
    # Accuracy uses mean() instead of sum()/len() so an empty index set
    # yields nan rather than a ZeroDivisionError.
    with torch.no_grad():
        train_hits = logits[train_idx].argmax(dim=1) == labels[train_idx]
        train_acc = train_hits.float().mean().item()

        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_hits = logits[val_idx].argmax(dim=1) == labels[val_idx]
        val_acc = val_hits.float().mean().item()

    train_acc_list.append(train_acc)
    train_loss_list.append(loss.item())
    val_acc_list.append(val_acc)
    val_loss_list.append(val_loss.item())

    print("Epoch {:05d} | ".format(epoch) +
          "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
              train_acc, loss.item()) +
          "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
              val_acc, val_loss.item()))

# plot training and validation accuracies over epochs
plt.plot(range(1, n_epochs+1), train_acc_list, label='Training Accuracy')
plt.plot(range(1, n_epochs+1), val_acc_list, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# plot training and validation losses over epochs
plt.plot(range(1, n_epochs+1), train_loss_list, label='Training Loss')
plt.plot(range(1, n_epochs+1), val_loss_list, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


## After running the above for 14 weeks, feed the results into an LSTM

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, constraints
import numpy as np
import matplotlib.pyplot as plt

# define the LSTM model with output constraint
class LSTM(keras.Model):
    """Keras model: a single LSTM layer followed by a Dense head.

    The Dense layer's kernel carries a ``UnitNorm`` constraint.

    Args:
        input_size: Feature size used in the LSTM's ``input_shape``.
        hidden_size: Number of LSTM units.
        output_size: Number of units of the Dense head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # recurrent encoder over sequences of `input_size`-dim steps
        self.lstm = layers.LSTM(hidden_size, input_shape=(None, input_size))
        # projection of the LSTM output to `output_size` values
        self.linear = layers.Dense(
            output_size,
            kernel_constraint=constraints.UnitNorm(),
        )

    def call(self, inputs):
        """Apply the LSTM, then the Dense head, and return the result."""
        return self.linear(self.lstm(inputs))

# set the hyperparameters
input_size = 1
hidden_size = 100
output_size = 1
lr = 0.001
n_epochs = 100

# create the LSTM model
model = LSTM(input_size, hidden_size, output_size)

# define the loss function and optimizer
criterion = keras.losses.MeanSquaredError()
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and no longer accepted by newer Keras releases.
optimizer = optimizers.Adam(learning_rate=lr)

# NOTE(review): `embdd` is not defined in any visible cell; it must already
# exist in the kernel (presumably the embeddings produced earlier -- confirm),
# otherwise this line raises NameError on a fresh Restart & Run All.
data = embdd
# next-step prediction: each element is the input, its successor the target
x = data[:-1]
y = data[1:]

# split the data into training and validation sets (first 80% / last 20%)
train_size = int(len(x) * 0.8)
train_x, train_y = x[:train_size], y[:train_size]
val_x, val_y = x[train_size:], y[train_size:]

# create the datasets
train_ds = tf.data.Dataset.from_tensor_slices((train_x, train_y)).batch(32)
val_ds = tf.data.Dataset.from_tensor_slices((val_x, val_y)).batch(32)

# define the training and validation steps
# (the step functions are looked up when the lambdas are called, so they may
# be defined further down in this cell)
train_step = lambda inputs, labels: train_step_fn(model, inputs, labels, criterion, optimizer)
val_step = lambda inputs, labels: val_step_fn(model, inputs, labels, criterion)

@tf.function
def train_step_fn(model, inputs, labels, criterion, optimizer):
    """Run one optimisation step on a batch and return its loss.

    Args:
        model: Callable model; invoked with ``training=True``.
        inputs: Batch of model inputs.
        labels: Targets for ``inputs``.
        criterion: Loss callable, called as ``criterion(labels, preds)``.
        optimizer: Optimizer whose ``apply_gradients`` updates the model.

    Returns:
        The loss computed for this batch.
    """
    # record the forward pass so gradients can be taken afterwards
    with tf.GradientTape() as tape:
        batch_loss = criterion(labels, model(inputs, training=True))

    # differentiate w.r.t. the trainable variables and apply the update
    params = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(batch_loss, params), params))

    return batch_loss

@tf.function
def val_step_fn(model, inputs, labels, criterion):
    """Return the loss of ``model`` on one batch without updating it.

    The model is invoked with ``training=False`` and the loss is computed as
    ``criterion(labels, preds)``.
    """
    return criterion(labels, model(inputs, training=False))

# train the model
# The *_loss_list variables keep one entry per batch (step) across all epochs
# and feed the per-step plot below.
train_loss_list = []
val_loss_list = []
for epoch in range(n_epochs):
    # train the model
    epoch_train_losses = []
    for train_x_batch, train_y_batch in train_ds:
        train_loss = train_step(train_x_batch, train_y_batch)
        epoch_train_losses.append(train_loss.numpy())
    train_loss_list.extend(epoch_train_losses)

    # evaluate the model on the validation set
    epoch_val_losses = []
    for val_x_batch, val_y_batch in val_ds:
        val_loss = val_step(val_x_batch, val_y_batch)
        epoch_val_losses.append(val_loss.numpy())
    val_loss_list.extend(epoch_val_losses)

    # Report the mean over THIS epoch's batches.  Averaging the cumulative
    # lists would blend in every earlier epoch and hide recent progress.
    # An empty dataset yields nan instead of a numpy warning.
    epoch_train_mean = np.mean(epoch_train_losses) if epoch_train_losses else float('nan')
    epoch_val_mean = np.mean(epoch_val_losses) if epoch_val_losses else float('nan')

    # print the progress
    print('Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch+1, n_epochs, epoch_train_mean, epoch_val_mean))

# plot the losses
plt.plot(range(1, len(train_loss_list)+1), train_loss_list, label='Training Loss')
plt.plot(range(1, len(val_loss_list)+1), val_loss_list, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.legend()
plt.show()
