In [1]:
from use_dataset import ProofDataset
import torch
import numpy as np
from torch_geometric.data import Batch
from torch_geometric.loader import DataLoader, NeighborLoader
import random
from sklearn.metrics import f1_score as f1
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Dropout
from torch_geometric.nn import GCNConv

torch.manual_seed(0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# use file_limit=5000 to only load and verify the first 5000 graphs (~60 MB)

file_limit = 5000    # desired number of graphs to work with
vocab_size = 1598   # number of characters in our vocabulary

pf_data = ProofDataset(root="data/",read_name="5000_relabeled_data.json" , write_name="5000_relabeled_data.pt" ,file_limit=file_limit)  

Processing...


processing graph 0


  x_emb = torch.tensor([x_emb]).float()


processing graph 1000
processing graph 2000
processing graph 3000
processing graph 4000
class_corr dict is of form (old_label, new_label): {0: 0, 5: 1, 6: 2, 7: 3, 9: 4, 11: 5, 14: 6, 15: 7, 16: 8, 17: 9, 18: 10, 20: 11, 22: 12, 24: 13, 25: 14, 29: 15, 31: 16, 32: 17, 34: 18, 35: 19, 36: 20, 42: 21, 47: 22, 52: 23, 57: 24, 63: 25, 65: 26, 77: 27, 82: 28, 83: 29, 86: 30, 115: 31, 119: 32, 121: 33, 123: 34, 136: 35, 141: 36, 144: 37, 147: 38, 149: 39, 155: 40, 157: 41, 185: 42, 189: 43, 200: 44, 212: 45, 215: 46, 218: 47, 219: 48, 220: 49, 221: 50, 223: 51, 225: 52, 226: 53, 227: 54, 228: 55, 229: 56, 231: 57, 232: 58, 233: 59, 234: 60, 235: 61, 236: 62, 237: 63, 238: 64, 245: 65, 246: 66, 247: 67, 248: 68, 251: 69, 252: 70, 253: 71, 254: 72, 255: 73, 256: 74, 260: 75, 261: 76, 264: 77, 265: 78, 268: 79, 270: 80, 274: 81, 275: 82, 276: 83, 278: 84, 279: 85, 280: 86, 281: 87, 282: 88, 284: 89, 285: 90, 286: 91, 288: 92, 290: 93, 292: 94, 293: 95, 295: 96, 299: 97, 300: 98, 301: 99, 302: 1

Done!


In [23]:
# make train/val/test for GCN
# set seed for random # generation
random.seed(10)
length = file_limit
total_indices = [i for i in range(file_limit)]

# create index vectors to filter dataset
train_indices = random.sample(total_indices, int(length*.8))
train_indices.sort()

val_index_options = [x for x in total_indices if x not in train_indices]
val_indices = random.sample(val_index_options, int(length*.1))
val_indices.sort()

test_index_options = [x for x in total_indices if x not in train_indices if x not in val_indices]
test_indices = random.sample(test_index_options, int(length*.1))
test_indices.sort()

# Create training, validation, and test sets
train_dataset = pf_data[train_indices]
val_dataset = pf_data[val_indices]
test_dataset = pf_data[test_indices]

print(f'Training set   = {len(train_dataset)} graphs')
print(f'Validation set = {len(val_dataset)} graphs')
print(f'Test set       = {len(test_dataset)} graphs')

Training set   = 4000 graphs
Validation set = 500 graphs
Test set       = 500 graphs


In [86]:
# Create mini-batches
# Shuffling for now; probably will remove shuffling later
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True,num_workers=0)
val_loader   = DataLoader(val_dataset, batch_size=1000, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [25]:
print('\nTrain loader:')
for i, batch in enumerate(train_loader):
    print(f' - Batch {i}: {batch}')

print('\nValidation loader:')
for i, batch in enumerate(val_loader):
    print(f' - Batch {i}: {batch}')

print('\nTest loader:')
for i, batch in enumerate(test_loader):
    print(f' - Batch {i}: {batch}')


Train loader:
 - Batch 0: DataBatch(x=[3344, 512], edge_index=[2, 2844], y=[3344], batch=[3344], ptr=[501])
 - Batch 1: DataBatch(x=[2824, 512], edge_index=[2, 2324], y=[2824], batch=[2824], ptr=[501])
 - Batch 2: DataBatch(x=[3009, 512], edge_index=[2, 2509], y=[3009], batch=[3009], ptr=[501])
 - Batch 3: DataBatch(x=[2992, 512], edge_index=[2, 2492], y=[2992], batch=[2992], ptr=[501])
 - Batch 4: DataBatch(x=[3145, 512], edge_index=[2, 2645], y=[3145], batch=[3145], ptr=[501])
 - Batch 5: DataBatch(x=[3002, 512], edge_index=[2, 2502], y=[3002], batch=[3002], ptr=[501])
 - Batch 6: DataBatch(x=[3255, 512], edge_index=[2, 2755], y=[3255], batch=[3255], ptr=[501])
 - Batch 7: DataBatch(x=[2979, 512], edge_index=[2, 2479], y=[2979], batch=[2979], ptr=[501])

Validation loader:
 - Batch 0: DataBatch(x=[3207, 512], edge_index=[2, 2707], y=[3207], batch=[3207], ptr=[501])

Test loader:
 - Batch 0: DataBatch(x=[3037, 512], edge_index=[2, 2537], y=[3037], batch=[3037], ptr=[501])


In [26]:
# make a dictionary to record label frequency

# get max label used in pf_data
max_label = 0

for i in range(file_limit):
    for j in pf_data.get(i).y:
        if j > max_label:
            max_label = j.to(int).item()

# initialize histogram for labels used in pf_data            
label_count = {}

for i in range(file_limit):
    for j in range(max_label+1):        
        label_count[j] = 0

for i in range(file_limit):
    for j in pf_data.get(i).y:
        label_count[j.to(int).item()] += 1

step_count = 0
max = 0
max_freq_index = None   #find the most frequently used index
labels_never_used = 0
labels_used_once = 0
labels_used_twice = 0


for k,v in label_count.items():
    step_count += v
    if v > max:     
        max = v
        max_freq_index = k

    if v == 0:
        labels_never_used += 1
    if v == 1:
        labels_used_once += 1
    if v ==2:
        labels_used_twice += 1

In [27]:
print(f"total number of steps is:", step_count)
print(f"highest frequency label is {max_freq_index} and occurs {max} times")
print(f"final label used is {len(label_count)-1}")
print(label_count)
print(len(label_count),"unique labels are used")
print(labels_never_used,"unique labels never used")
print(labels_used_once, "unique labels used once")
print(labels_used_twice, "unique labels used twice")

total number of steps is: 30794
highest frequency label is 557 and occurs 7631 times
final label used is 557
{0: 5140, 1: 545, 2: 52, 3: 10, 4: 23, 5: 269, 6: 13, 7: 39, 8: 41, 9: 389, 10: 25, 11: 27, 12: 244, 13: 22, 14: 56, 15: 11, 16: 20, 17: 107, 18: 46, 19: 74, 20: 13, 21: 17, 22: 25, 23: 16, 24: 12, 25: 17, 26: 16, 27: 11, 28: 14, 29: 49, 30: 17, 31: 12, 32: 13, 33: 11, 34: 41, 35: 11, 36: 19, 37: 12, 38: 13, 39: 17, 40: 21, 41: 28, 42: 82, 43: 24, 44: 11, 45: 126, 46: 59, 47: 26, 48: 82, 49: 129, 50: 99, 51: 23, 52: 23, 53: 53, 54: 114, 55: 28, 56: 24, 57: 37, 58: 58, 59: 77, 60: 58, 61: 22, 62: 28, 63: 124, 64: 40, 65: 43, 66: 10, 67: 16, 68: 11, 69: 31, 70: 14, 71: 18, 72: 20, 73: 24, 74: 18, 75: 19, 76: 32, 77: 51, 78: 24, 79: 12, 80: 12, 81: 25, 82: 15, 83: 16, 84: 473, 85: 38, 86: 73, 87: 171, 88: 72, 89: 19, 90: 19, 91: 66, 92: 20, 93: 60, 94: 23, 95: 20, 96: 27, 97: 44, 98: 106, 99: 14, 100: 15, 101: 38, 102: 241, 103: 36, 104: 11, 105: 112, 106: 23, 107: 10, 108: 59, 109

In [34]:
# make array of label frequencies for sklearn compute_class_weight using entire dataset
# really should be doing this for train set (otherwise, data leakage...)
# however, train set may not include certain labels, which leads to error in compute_class_weight

# make array of unique classes
class_num_arr = [i for i in range(len(label_count))]
class_num_arr = np.array(class_num_arr)

# make array of all data points with labels
lbl_arr = np.array([])
for i in range(file_limit):
    for y in pf_data.get(i).y:
        lbl_arr = np.append(lbl_arr,[y.numpy()],axis=0).astype(int)

class_weights = compute_class_weight(class_weight="balanced",classes = class_num_arr, y=lbl_arr)
class_weights = torch.from_numpy(class_weights).float().to(device)

In [43]:
# Make class for GCN model

class GCN(torch.nn.Module):
    """GCN"""
    def __init__(self, dim_h):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(pf_data.num_features, dim_h)
        self.conv2 = GCNConv(dim_h, dim_h)
        #self.conv3 = GCNConv(dim_h, dim_h)
        #self.conv4 = GCNConv(dim_h, dim_h)
        self.lin = Linear(dim_h, len(class_num_arr))

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = h.relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.conv2(h, edge_index)        
        h = h.relu()
        h = F.dropout(h, p=0.8, training=self.training)
        h = self.lin(h)
        
        return F.log_softmax(h, dim=1)

In [96]:
# GCN model training

def train(model, loader, lr):
    criterion = torch.nn.CrossEntropyLoss()
    # commented out code is to use class weights to account for imbalanced dataset
    #criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    epochs = 4000

    model.train()
    for epoch in range(epochs+1):
        total_loss = 0
        acc = 0
        val_loss = 0
        val_acc = 0
        cur_graph = 0   # used to keep track of current statement to enforce preds of only PREVIOUS labels in training

        # Train on batches
        for data in loader:
            cur_graph_batch = data.batch + cur_graph            
            cur_graph += torch.max(data.batch) + 1
            data = data.to(device, non_blocking=True)
            data.y = data.y.to(torch.float).to(device, non_blocking=True)
            optimizer.zero_grad()
            length = len(loader)
            out = model(data.x, data.edge_index.long())
            data.y = data.y.type(torch.LongTensor).to(device, non_blocking=True)
            out = out.type(torch.float32).to(device, non_blocking=True)
            loss = criterion(out, data.y)
            total_loss += loss / length

            # commented out code below is meant to enforce predictions to only come from previous theorems
            # for dict to be properly created, you must delete and recreate test.pt by rerunning 
            # pf_data = ProofDataset(root="data/",file_limit=file_limit)
            
            #dict = pf_data.class_corr
            #dict_keys = [k for k in dict.keys()]            

            #def return_next_lowest_idx(num):
                #if num in dict_keys:
                    #return dict[num]
                #while num not in dict_keys:
                    #try:
                        #num -= 1
                        #return dict[num]
                    #except:
                        #pass

            #with torch.no_grad():
                #cur_graph_batch.apply_(return_next_lowest_idx)
                #masked_lbls = (torch.arange(out.size(1)) < (cur_graph_batch[..., None]+1))*(out.cpu())
                #try:
                    #masked_lbls = (torch.arange(out.size(1)) < (cur_graph_batch[..., None]+1))*(out.cpu())
                    #masked_lbls = torch.where(masked_lbls==0,np.nan,masked_lbls)
                    #masked_lbls = masked_lbls.detach().numpy()                
                    #pred = np.nanargmax(masked_lbls,axis=1)
                    #pred = torch.from_numpy(pred)
                    #acc += accuracy(pred, data.y.cpu()) / length
                #except Exception as e:
                    #print("a lil error")
                    #out = out + .00000001
                    #masked_lbls = (torch.arange(out.size(1)) < (cur_graph_batch[..., None]+1))*(out.cpu())
                    #masked_lbls = torch.where(masked_lbls==0,np.nan,masked_lbls)
                    #masked_lbls = masked_lbls.detach().numpy()
                    #pred = np.nanargmax(masked_lbls,axis=1)
                    #pred = torch.from_numpy(pred)
                    #acc += accuracy(pred, data.y.cpu()) / length
                    #out = out - .00000001

            #comment out the follow pred and acc lines if enforcing predictions as described above
            pred = out.argmax(dim=1)
            acc += accuracy(pred, data.y) / length

            loss.backward()
            optimizer.step()

            # run model on validation set
            val_loss, val_acc, val_f1 = test(model, val_loader)

        # Print metrics every epoch
        if(epoch % 10 == 0):
            print(f'Epoch {epoch:>3} | Train Loss: {total_loss:.2f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}% | F Score: {val_f1:.2f}')
            
    return model

@torch.no_grad()
def test(model, loader):
    
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()
    loss = 0
    acc = 0
    fscore = 0
    
    for data in loader:
        data = data.to(device, non_blocking=True)
        length = len(loader)
        out = model(data.x, data.edge_index.long())
        data.y = data.y.type(torch.LongTensor).to(device, non_blocking=True)
        loss += criterion(out, data.y) / length
        pred = out.argmax(dim=1)
        acc += accuracy(pred, data.y) / length
        fscore += f1(pred.cpu(), data.y.cpu(), average='macro')    # micro looks better, but macro prob more accurate

    return loss, acc, fscore

def accuracy(pred_y, y):
    """Calculate accuracy."""
    return ((pred_y == y).sum() / len(y)).item()

In [105]:
# initialize (and reset weights of) model
# dim_h is hyperparameter of number of hidden layers

gcn_trained = None
gcn = None
gcn = GCN(dim_h=3200).to(device)
gcn

GCN(
  (conv1): GCNConv(512, 3200)
  (conv2): GCNConv(3200, 3200)
  (lin): Linear(in_features=3200, out_features=558, bias=True)
)

In [106]:
# reset weights and train model
gcn_trained = None
gcn_trained = train(gcn, train_loader,lr=.0001)

Epoch   0 | Train Loss: 5.67 | Train Acc: 18.61% | Val Loss: 4.64 | Val Acc: 25.60% | F Score: 0.00
Epoch  10 | Train Loss: 4.11 | Train Acc: 32.63% | Val Loss: 4.22 | Val Acc: 32.93% | F Score: 0.00
Epoch  20 | Train Loss: 3.74 | Train Acc: 34.46% | Val Loss: 3.90 | Val Acc: 34.21% | F Score: 0.00
Epoch  30 | Train Loss: 3.49 | Train Acc: 34.33% | Val Loss: 3.69 | Val Acc: 33.30% | F Score: 0.01
Epoch  40 | Train Loss: 3.30 | Train Acc: 35.47% | Val Loss: 3.53 | Val Acc: 34.21% | F Score: 0.01
Epoch  50 | Train Loss: 3.17 | Train Acc: 35.93% | Val Loss: 3.43 | Val Acc: 35.02% | F Score: 0.02
Epoch  60 | Train Loss: 3.06 | Train Acc: 36.50% | Val Loss: 3.34 | Val Acc: 35.42% | F Score: 0.03
Epoch  70 | Train Loss: 3.00 | Train Acc: 36.24% | Val Loss: 3.28 | Val Acc: 35.36% | F Score: 0.04
Epoch  80 | Train Loss: 2.91 | Train Acc: 37.03% | Val Loss: 3.24 | Val Acc: 35.27% | F Score: 0.05
Epoch  90 | Train Loss: 2.82 | Train Acc: 38.89% | Val Loss: 3.15 | Val Acc: 37.54% | F Score: 0.05


KeyboardInterrupt: 

In [90]:
%%capture cap

lr = [.1,.05,.01,.005,.001,.00005,.00001]
h = [200,400,800,1600,3200]

for rate in lr:
    for hidden in h:
        print(rate,hidden)
        gcn_trained = None
        gcn = None
        gcn = GCN(dim_h=hidden).to(device)
        gcn

        # reset weights and train model
        gcn_trained = None
        train(gcn, train_loader,lr=.0001)

with open('h_'+str(h)+'lr_'+str(lr)+'.txt', 'w') as f:
        f.write(cap.stdout)   

KeyboardInterrupt: 

In [91]:
with open('h.txt', 'w') as f:
        f.write(cap.stdout)   

In [None]:
test_loss, test_acc, test_f1 = test(gcn_trained, test_loader)
print(f'Test Loss: {test_loss:.2f} | Test Acc: {test_acc*100:.2f}% | F Score: {test_f1:.2f}')
print()

Test Loss: 14.20 | Test Acc: 45.04% | F Score: 0.22

