### Mounting Googel Drive

In [10]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Importing Dependencies

In [11]:
import os
!pip install dgl-cu111 -f https://data.dgl.ai/wheels/repo.html

import numpy as np
import dgl
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import urllib.request
import pandas as pd
import dgl.data
import dgl
from dgl.data import DGLDataset
import torch
import os
import itertools
import dgl.nn as dglnn
from dgl.nn import GraphConv



from scipy.spatial import Delaunay
from sklearn.metrics import f1_score
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

Looking in links: https://data.dgl.ai/wheels/repo.html


### Reding CSV files defining the classes, edges and node feaures respectively. More details can be found at: https://www.kaggle.com/ellipticco/elliptic-data-set

NOTE: Please change the path of the CSV files according to your directory structure.

In [12]:
classes = pd.read_csv('/content/gdrive/MyDrive/Fall_21/BC_DL/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
edges = pd.read_csv('/content/gdrive/MyDrive/Fall_21/BC_DL/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
features = pd.read_csv('/content/gdrive/MyDrive/Fall_21/BC_DL/elliptic_bitcoin_dataset/elliptic_txs_features.csv',header=None).set_index(0,verify_integrity=True)

### Filtering entries with unknown classes.

In [13]:
classes_filtered = classes
classes_filtered = classes_filtered[classes_filtered['class'] != 'unknown']

### Spliting features into 2 sections: i) all entries with 1st feature value below 35 would be used for training ii) all entries with 2nd feature value above 35 would be used for testing.

In [14]:
features_train = features[features[1]<35]
features_test = features[features[1]>=35]

### Creating Training & testing dataset

In [15]:
train_x = []
train_y = []

for index, row in features_train.iterrows():
  
  if (len(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) != 0):
    train_x.append(row.to_numpy())
    if int(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) == 1:
      val = 1
    elif int(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) == 2:
      val = 0

    train_y.append(val)


In [16]:
test_x = []
test_y = []

for index, row in features_test.iterrows():
  
  if (len(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) != 0):
    test_x.append(row.to_numpy())
    if int(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) == 1:
      val = 1
    elif int(classes_filtered.loc[classes_filtered['txId']==index]['class'].values) == 2:
      val = 0

    test_y.append(val)


### Fitting a Random Forest Classifier

In [24]:
clf = RandomForestClassifier(n_estimators=50, max_features=50)
clf.fit(train_x, train_y)
pred_rf = clf.predict(test_x)
f1 = f1_score(pred_rf, test_y, average=None)
f1m = f1_score(pred_rf, test_y, average='micro')
print("Final F1 score:",(f1[0]+f1[1])/2)
print("Final MicroAvg F1 score:",f1[0])

Final F1 score: 0.8973295230329974
Final MicroAvg F1 score: 0.9880577051686251


### Fitting a Logistic Regression Classifier

In [25]:
clf = LogisticRegression().fit(train_x, train_y)
pred_lr = clf.predict(test_x)
f1 = f1_score(pred_lr, test_y, average=None)
f1m = f1_score(pred_rf, test_y, average='micro')
print("Final F1 score:",(f1[0]+f1[1])/2)
print("Final MicroAvg F1 score:",f1[0])

Final F1 score: 0.6755566167407704
Final MicroAvg F1 score: 0.9374273858921162


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Creating a pytorch Dataset and Dataloader for the given bitcoin dataset

In [26]:
class DSet(Dataset):
    def __init__(self, feat, label):
        self.labels = label 
        self.feat = feat 
       

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        x = self.feat[idx]
        y = self.labels[idx]
        
        return x, y

### Creating Pytorch dataset and dataloaders

In [27]:
train_ds = DSet(train_x,train_y)
test_ds = DSet(test_x,test_y)
train_dataloader = DataLoader(train_ds, batch_size=1000, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=1000, shuffle=True)

### Defing a evaluation function for MLP

In [34]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    f1_micro = 0
    f1_net = 0
    cnt = 0
    with torch.no_grad():
        for X, y in dataloader:
            y = torch.unsqueeze(y,1)
            X, y = X.float(), y.float()
            pred = model(X)
            test_loss += loss_fn(pred, y).item()


            pred = pred.argmax(1)
            y = torch.squeeze(y,1)
            
          

            f1_m = f1_score(pred, y, average='micro')
            f1 = f1_score(pred, y, average=None)

            f1_micro += f1[0]
            f1_net += (f1[0]+f1[1])/2
            cnt += 1



    print("Final F1 score:",f1_net/cnt)
    print("Final MicroAvg F1 score:",f1_micro/cnt)          



### Define a simple MLP

In [35]:
class MLP(nn.Module):
   
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(n_inputs, 50)
        self.layer2 = nn.Linear(50, 1)
        self.activation = nn.Sigmoid()

    
    def forward(self, X):
        X = self.layer1(X)
        X = self.layer2(X)
        X = self.activation(X)
        return X

### Train and Evaluate MLP

In [36]:
model = MLP(train_x[0].shape[0])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

for epoch in range(200):

    for i, (x, y) in enumerate(train_dataloader):

      y = torch.unsqueeze(y,1)
      x = x.float()
      y = y.float()
      optimizer.zero_grad()

      yhat = model(x)
      loss = criterion(yhat, y)

      print("LOSS:", loss)

      loss.backward()
      optimizer.step()

test(test_dataloader, model, criterion)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
LOSS: tensor(0.1278, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1501, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1304, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1593, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1307, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1090, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1300, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1426, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1321, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1388, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1295, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1495, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1196, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1178, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.2097, grad_fn=<BinaryCrossEntropyBackward0>)
LOSS: tensor(0.1247, grad_fn=<Binar