In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx
import random
import os
from collections import defaultdict
from sklearn.metrics import f1_score

In [None]:
os.environ['KAGGLE_USERNAME'] = "parvathisankar214" # username from the json file
os.environ['KAGGLE_KEY'] = "6ad0a24e1cf702c8069fd273e296474b" # key from the json file
!kaggle datasets download -d ellipticco/elliptic-data-set
!unzip elliptic-data-set.zip

In [None]:
# Load the three CSV files of the data set into notebook-level frames.
# The edge list is read with its own header row; the feature and class files
# are read with header=None, so their first line becomes data row 0.
edgelist_df = pd.read_csv(
    '/content/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv',
)
class_df = pd.read_csv(
    '/content/elliptic_bitcoin_dataset/elliptic_txs_classes.csv',
    header=None,
)
feature_df = pd.read_csv(
    '/content/elliptic_bitcoin_dataset/elliptic_txs_features.csv',
    header=None,
)

In [None]:
# Show the first feature row. Only row 0 is converted to a Python list; turning
# the whole frame into nested lists just to look at one row is needlessly slow
# and memory hungry.
print(feature_df.iloc[0].tolist())

In [None]:
def load_data(features=None, classes=None, edges=None):
    """Build the feature matrix, label vector and adjacency table of the graph.

    Parameters
    ----------
    features : pandas.DataFrame, optional
        One row per node, without a header row. Column 0 is the node id,
        column 1 is not used as a feature, every remaining column is a numeric
        feature. Defaults to the notebook-level ``feature_df``.
    classes : pandas.DataFrame, optional
        Frame whose row 0 is skipped (it holds the header line when the file is
        read with ``header=None``). Row ``i + 1`` is paired *by position* with
        row ``i`` of ``features``; the last column is the class value.
        Defaults to the notebook-level ``class_df``.
    edges : pandas.DataFrame, optional
        The first two columns hold the node ids of the two endpoints of an
        edge. Defaults to the notebook-level ``edgelist_df``.

    Returns
    -------
    m_feature : numpy.ndarray, shape (num_nodes, num_features), float
    labels : numpy.ndarray, shape (num_nodes, 1), float
        Class indices; classes are numbered in order of first appearance.
    adj : collections.defaultdict(set)
        Undirected adjacency table: node index -> set of neighbour indices.

    Raises
    ------
    KeyError
        If an edge refers to a node id that is not present in ``features``.
    """
    if features is None:
        features = feature_df
    if classes is None:
        classes = class_df
    if edges is None:
        edges = edgelist_df

    # One matrix row per record. No spare row is allocated: an extra all-zero
    # row would act as a phantom node with label 0 in every later split.
    # The feature width is taken from the frame instead of being hard-coded.
    m_feature = features.iloc[:, 2:].to_numpy(dtype=float, copy=True)
    nodes_num = m_feature.shape[0]

    # node id -> row index in m_feature / labels
    node2idx = {
        node_id: idx
        for idx, node_id in enumerate(features.iloc[:, 0].tolist())
    }

    # class value -> class index, assigned in order of first appearance
    label2idx = {}
    labels = np.zeros((nodes_num, 1))
    class_column = classes.iloc[1:, -1].tolist()  # row 0 is the header line
    for idx, class_ in zip(range(nodes_num), class_column):
        labels[idx] = label2idx.setdefault(class_, len(label2idx))

    # Every edge is stored in both directions.
    adj = defaultdict(set)
    sources = edges.iloc[:, 0].tolist()
    targets = edges.iloc[:, 1].tolist()
    for src, dst in zip(sources, targets):
        a, b = node2idx[src], node2idx[dst]
        adj[a].add(b)
        adj[b].add(a)

    return m_feature, labels, adj


In [None]:
class Aggregator(nn.Module):
    """Aggregate the features of (sampled) neighbours for a batch of nodes.

    Parameters
    ----------
    features : callable
        Maps a ``LongTensor`` of node indices to a 2-D tensor with one feature
        row per index (e.g. an ``nn.Embedding``).
    aggr_method : str
        ``"mean"`` averages the neighbour features; any other value leaves the
        plain sum.
    """

    def __init__(self, features, aggr_method="mean"):
        super(Aggregator, self).__init__()
        self.features = features
        self.aggr_method = aggr_method

    def forward(self, nodes, neighs, num_sample=10):
        """Return one aggregated feature row per entry of ``neighs``.

        Parameters
        ----------
        nodes : sequence
            The batch nodes. Not used by the computation; kept for the callers.
        neighs : sequence of collections
            ``neighs[i]`` holds the neighbour indices of the i-th batch node.
        num_sample : int
            Neighbourhoods with at least this many members are randomly
            down-sampled (without replacement) to exactly ``num_sample``.

        Returns
        -------
        torch.Tensor of shape (len(neighs), feature_dim). A node without any
        neighbour gets an all-zero row.
        """
        sampled = []
        for adj_node in neighs:
            if len(adj_node) >= num_sample:
                sampled.append(set(random.sample(list(adj_node), num_sample)))
            else:
                sampled.append(set(adj_node))

        # Starting from an empty set keeps the union valid for an empty batch.
        unique_nodes_list = list(set().union(*sampled))
        unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

        # mask[r, c] == 1  <=>  unique node c is a sampled neighbour of row r
        mask = torch.zeros(len(sampled), len(unique_nodes_list))
        column_indices = [unique_nodes[n] for chosen in sampled for n in chosen]
        row_indices = [row for row, chosen in enumerate(sampled) for _ in chosen]
        if column_indices:
            mask[row_indices, column_indices] = 1

        if self.aggr_method == "mean":
            # Clamp the neighbour count so isolated nodes yield zeros, not NaN.
            num_neigh = mask.sum(1, keepdim=True).clamp(min=1)
            mask = mask.div(num_neigh)

        embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
        return mask.mm(embed_matrix)

In [None]:
class GraphSage(nn.Module):
    """Single GraphSAGE-style layer: concat(self, aggregated neighbours) -> linear -> ReLU.

    Parameters
    ----------
    features : callable
        Maps a ``LongTensor`` of node indices to their feature rows.
    adj_table : mapping
        Node index -> collection of neighbour indices.
    input_dim : int
        Width of one feature row.
    embed_dim : int
        Width of the produced embedding.
    aggregator : nn.Module
        Called as ``aggregator(nodes, neighs, num_sample)``; must return one
        row of width ``input_dim`` per node.
    num_sample : int
        Forwarded to the aggregator.
    """

    def __init__(self, features, adj_table, input_dim, embed_dim, aggregator, num_sample=10):
        super(GraphSage, self).__init__()

        self.features = features
        self.aggregator = aggregator
        self.num_sample = num_sample
        self.adj_table = adj_table
        self.input_dim = input_dim
        self.embed_dim = embed_dim

        # The weight acts on the concatenation [self features | neighbour features].
        self.weight = nn.Parameter(torch.empty(embed_dim, 2 * input_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """Embed ``nodes``; the result has shape (embed_dim, len(nodes)).

        Note that nodes run along the *second* axis of the returned tensor.
        """
        # .get() treats nodes missing from the table as isolated and never
        # inserts new keys into the shared adjacency table.
        neighs = [self.adj_table.get(node, set()) for node in nodes]

        # Call the module itself (not .forward) so registered hooks are honoured.
        n_feature = self.aggregator(nodes, neighs, self.num_sample)
        self_feature = self.features(torch.LongTensor(nodes))
        combined = torch.cat([self_feature, n_feature], dim=1)

        return F.relu(self.weight.mm(combined.t()))

In [None]:
def data_split(num_nodes):
    """Randomly split the indices ``0 .. num_nodes-1`` into a 90 % / 10 % pair.

    Returns ``(train, val)``: two index arrays taken from one random
    permutation, the first holding ``int(num_nodes / 10 * 9)`` entries and the
    second the remainder.
    """
    shuffled = np.random.permutation(num_nodes)
    cut = int(num_nodes / 10 * 9)
    train_part = shuffled[:cut]
    val_part = shuffled[cut:]
    return train_part, val_part

In [None]:
class predict(nn.Module):
    """Linear classification head on top of an embedding model.

    Parameters
    ----------
    model : nn.Module
        Must expose ``embed_dim`` and, when called with a batch of nodes,
        return a tensor of shape (embed_dim, batch).
    num_class : int
        Number of output classes.
    """

    def __init__(self, model, num_class):
        super(predict, self).__init__()
        self.model = model
        # Size the head from the constructor argument, not from a notebook global.
        self.weight = nn.Parameter(torch.empty(num_class, model.embed_dim))

        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """Return class scores of shape (num_class, batch) for ``nodes``."""
        hidden = self.model(nodes)
        return self.weight.mm(hidden)

In [None]:
def early_stop(accuracy_list, index, difference):
    """Tell whether training should stop after epoch ``index``.

    Returns True when the accuracy recorded at ``index`` is lower than the one
    recorded at ``index - 1`` by more than ``difference``; the very first epoch
    (``index`` below 1) never triggers a stop.
    """
    if index < 1:
        return False
    drop = accuracy_list[index - 1] - accuracy_list[index]
    return bool(drop > difference)

In [None]:
def accuracy(labels, predict):
    """Fraction of positions at which ``labels`` and ``predict`` agree.

    ``labels`` must expose ``shape[0]`` (e.g. a numpy array); ``predict`` is
    indexed at the same positions.
    """
    total = labels.shape[0]
    hits = sum(1 for pos in range(total) if labels[pos] == predict[pos])
    return hits / total

In [None]:
# ---- Hyper-parameters ------------------------------------------------------
batch_size=300
embedding_dim=128
learning_rate=0.001
difference=0.05   # early stop when validation accuracy drops by more than this between two epochs
epoch=200         # maximum number of training iterations (one mini-batch each)
num_classes=3
seed=42

# Seed every RNG involved (batch shuffling, neighbour sampling, weight init,
# train/val split) so a fresh "Restart & Run All" reproduces the same numbers.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# ---- Data and model --------------------------------------------------------
features_data,labels,adj_table=load_data()
(a,b)=features_data.shape

# Frozen lookup table holding the raw node features.
features=nn.Embedding(a,b)
features.weight=nn.Parameter(torch.FloatTensor(features_data),requires_grad=False)

aggregator=Aggregator(features,aggr_method="sum")
Layer=GraphSage(features,adj_table,b,embedding_dim,aggregator)

predict_classes=predict(Layer, num_classes)

loss_fn=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(predict_classes.parameters(),lr=learning_rate)

train,val=data_split(int(a))
val_labels=labels[val].squeeze()


def _predict_labels(nodes):
    """Return the arg-max class index per node, without tracking gradients."""
    with torch.no_grad():
        # The model output is (num_classes, batch), hence argmax over axis 0.
        return predict_classes(nodes).numpy().argmax(axis=0)


# ---- Training loop ---------------------------------------------------------
epoch_record=[]
loss_record=[]
accuracy_record=[]
micro_f1=[]
illicit_f1_arr = []
for i in range(epoch):
    # In-place shuffle at C speed; the first batch_size entries form the batch.
    np.random.shuffle(train)
    nodes=train[:batch_size]
    optimizer.zero_grad()
    predict_result=predict_classes(nodes).t()
    loss=loss_fn(predict_result,torch.LongTensor(labels[nodes].squeeze()))
    loss.backward()
    optimizer.step()
    print("epoch:",i,"loss:",loss.item())
    epoch_record.append(i)
    loss_record.append(loss.item())

    # Evaluate the validation set once per epoch: neighbour sampling is random,
    # so separate forward passes would give accuracy and F1 computed on
    # different predictions (and cost a full pass each).
    val_predict=_predict_labels(val)
    accuracy_record.append(accuracy(val_labels,val_predict))
    #illicit_f1_arr.append(f1_score(val_labels,val_predict,average=None))
    micro_f1.append(f1_score(val_labels,val_predict,average="micro"))
    if early_stop(accuracy_record,i,difference):
        break

# ---- Final evaluation ------------------------------------------------------
ground_truth=labels[val].squeeze()
final_predict=_predict_labels(val)
print("Accuracy:",accuracy(ground_truth,final_predict))
print("F1 score:",f1_score(ground_truth,final_predict,average="micro"))
print("Current epoch:",i)

In [None]:
def _show_curve(values, y_label, legend_label, **line_style):
    """Plot ``values`` against the recorded epochs as one stand-alone figure."""
    plt.plot(epoch_record, values, **line_style)
    plt.xlabel("epoch")
    plt.ylabel(y_label)
    plt.legend([legend_label])
    plt.show()


# One figure per recorded series: training loss, validation accuracy, micro F1.
_show_curve(loss_record, "loss of training set", "loss", marker='o', color='green')
_show_curve(accuracy_record, "accuracy", "accuracy", color='black')
_show_curve(micro_f1, "F1", "F1", color='orange')
