In [23]:
#from our_dataset import ProofDataset
from our_dataset_with_labels import ProofDatasetWithLabels
from statement_embedding import emb_to_stmt, num_to_label
from torch_geometric.transforms import NormalizeFeatures

import pandas as pd
import csv
import torch

In [24]:
# WARNING!!!!
# running ProofDataset(root="data/", file_limit=None) currently has a high initial cost:
# it requires ~63 GB of space, and it took my computer about 2 hours to run.
# However, after an initial run, the dataset doesn't need to be created again;
# secondary runs merely verify that all files are in place (about 15s on my computer).
# WARNING!!!!

# To avoid the cost described above, keep file_limit small: e.g. file_limit=8000 only
# loads and verifies the first 8000 graphs (<4GB).
# If the entire graph dataset is desired, use file_limit=None.
file_limit = 100    # desired number of graphs to work with
vocab_size = 1598   # number of characters in our vocabulary
label_size = file_limit  # number of labels in dataset; all labels is 45332; defaults to file_limit in pf_data

# Pass label_size itself (rather than file_limit a second time) so that the value given to
# the dataset can never drift from the one used later to un-normalize labels (y * label_size).
pf_data = ProofDatasetWithLabels(root="data/", file_limit=file_limit, label_size=label_size)  # normalize with transform = NormalizeFeatures()

In [25]:
def get_thm_lbl(num, path="../Assets/tag_proof.csv"):
    """Return the ``tag`` column of the ``num``-th data row of a tag/proof CSV file.

    Rows are counted from 0 and the header line is not counted (it is consumed
    by ``csv.DictReader`` as the field names). In this notebook the result is
    used as the theorem label of graph number ``num``.

    Parameters
    ----------
    num : int
        Zero-based index of the row to look up.
    path : str, optional
        Location of the CSV file; it must have a ``tag`` column.
        Defaults to the project's ``../Assets/tag_proof.csv``.

    Returns
    -------
    str or None
        The value of the ``tag`` field, or ``None`` when the file has no row
        with index ``num``.
    """
    # newline="" is what the csv module asks for, so that quoted fields containing
    # line breaks are parsed as one record instead of being split.
    with open(path, "r", newline="") as file:
        for idx, row in enumerate(csv.DictReader(file)):
            if idx == num:
                return row['tag']
    return None

In [26]:
# let's explore graph 2, i.e. mp2, as our current graph, looking at proof step `step_num`
graph_num = 2
step_num = 0
graph_lbl = get_thm_lbl(graph_num)
cur_graph = pf_data.get(graph_num)
cur_step = cur_graph.x[step_num]

# normalized statement feature vector at step `step_num`, with all zero entries dropped
# (presumably the zeros are trailing padding -- TODO confirm against the dataset builder)
nonzero_feats = cur_step[cur_step != 0]
cur_step_vec_norm = nonzero_feats.tolist()

# unnormalized version of above
# round() before the integer cast: .to(int) truncates, so a product that lands a hair
# below the true id because of float error (e.g. 1283.9999) would otherwise decode wrongly
cur_step_vec = (nonzero_feats * vocab_size).round().to(int).tolist()

# convert feature vector back to statement
# (a leading 0 is prepended before calling emb_to_stmt, as everywhere else in this cell)
stmt_list = cur_step_vec.copy()
stmt_list.insert(0, 0)

# let's get the label of our current step (same round-then-cast as for the features)
cur_label = (cur_graph.y[step_num] * label_size).round().to(int).item()

# get statement dependencies of current step:
# every edge whose target (row 1 of edge_index) is this step contributes its source (row 0)
incoming_edges = cur_graph.edge_index[1] == step_num
req_stmt_num = list(cur_graph.edge_index[0][incoming_edges].to(int))
req_stmt = []
for num in req_stmt_num:
    dependency = (cur_graph.x[num] * vocab_size).round().to(int).tolist()
    dependency.insert(0, 0)
    req_stmt.append(emb_to_stmt(dependency))



In [27]:
# --- summary of the selected graph ---
print(f"Our current graph is the proof of {graph_lbl}, which has {cur_graph.num_nodes} node(s) and {cur_graph.num_edges} edge(s).")
print(f"Each node has {cur_graph.num_features} feature(s) and {torch.numel(cur_graph.y[0])} label(s)")
print(f"All of the above info. is captured in {cur_graph}") # edge_index always begins with 2, in case you were curious
print()

# --- feature vector of the selected step, normalized and unnormalized ---
print(f"For step {step_num} in the proof, the truncated normalized feature vector is")
print(f"{cur_step_vec_norm},")
print(f"and the truncated unnormalized feature vector is ")
print(f"{cur_step_vec}.")
print()

# --- decoded statement and label ---
print(f"The corresponding proof statement is {emb_to_stmt(stmt_list)}")
print(f"The corresponding label is {num_to_label(cur_label)}")
print()

# --- statements the selected step depends on, one per line ---
print(f"Step {step_num} depends on {len(req_stmt)} statements:")
if req_stmt:
    print(*req_stmt, sep="\n")
else:
    print("No dependencies")


Our current graph is the proof of mp2, which has 5 node(s) and 4 edge(s).
Each node has 11547 feature(s) and 1 label(s)
All of the above info. is captured in Data(x=[5, 11547], edge_index=[2, 4], y=[5])

For step 0 in the proof, the truncated normalized feature vector is
[0.8035044074058533],
and the truncated unnormalized feature vector is 
[1284].

The corresponding proof statement is ps
The corresponding label is $e

Step 0 depends on 0 statements:
No dependencies


In [28]:
# Reproducibility setup.
# cuDNN: request deterministic algorithms and switch off the benchmark auto-tuner.
cudnn_backend = torch.backends.cudnn
cudnn_backend.benchmark = False
cudnn_backend.deterministic = True

# Seeds: 11 through torch.manual_seed, then 0 for the current and for all CUDA devices.
torch.manual_seed(11)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [29]:
from torch_geometric.loader import DataLoader

# Create training, validation, and test sets: a positional 80% / 10% / 10% split of pf_data
n_graphs = len(pf_data)
train_end = int(n_graphs*0.8)   # first index after the training slice
val_end = int(n_graphs*0.9)     # first index after the validation slice

train_dataset = pf_data[:train_end]
val_dataset = pf_data[train_end:val_end]
test_dataset = pf_data[val_end:]

print(f'Training set   = {len(train_dataset)} graphs')
print(f'Validation set = {len(val_dataset)} graphs')
print(f'Test set       = {len(test_dataset)} graphs')


Training set   = 80 graphs
Validation set = 10 graphs
Test set       = 10 graphs


In [30]:
# Create mini-batches: every split uses the same loader settings
# Shuffling for now; probably will remove shuffling later
loader_opts = dict(batch_size=8, shuffle=True)

train_loader = DataLoader(train_dataset, **loader_opts)
val_loader = DataLoader(val_dataset, **loader_opts)
test_loader = DataLoader(test_dataset, **loader_opts)

In [31]:
# Show every mini-batch of each loader, in train / validation / test order
for header, loader in (
    ('\nTrain loader:', train_loader),
    ('\nValidation loader:', val_loader),
    ('\nTest loader:', test_loader),
):
    print(header)
    for i, batch in enumerate(loader):
        print(f' - Batch {i}: {batch}')


Train loader:
 - Batch 0: DataBatch(x=[31, 11547], edge_index=[2, 23], y=[31], batch=[31], ptr=[9])
 - Batch 1: DataBatch(x=[31, 11547], edge_index=[2, 23], y=[31], batch=[31], ptr=[9])
 - Batch 2: DataBatch(x=[32, 11547], edge_index=[2, 24], y=[32], batch=[32], ptr=[9])
 - Batch 3: DataBatch(x=[29, 11547], edge_index=[2, 21], y=[29], batch=[29], ptr=[9])
 - Batch 4: DataBatch(x=[33, 11547], edge_index=[2, 25], y=[33], batch=[33], ptr=[9])
 - Batch 5: DataBatch(x=[31, 11547], edge_index=[2, 23], y=[31], batch=[31], ptr=[9])
 - Batch 6: DataBatch(x=[30, 11547], edge_index=[2, 22], y=[30], batch=[30], ptr=[9])
 - Batch 7: DataBatch(x=[29, 11547], edge_index=[2, 21], y=[29], batch=[29], ptr=[9])
 - Batch 8: DataBatch(x=[27, 11547], edge_index=[2, 19], y=[27], batch=[27], ptr=[9])
 - Batch 9: DataBatch(x=[26, 11547], edge_index=[2, 18], y=[26], batch=[26], ptr=[9])

Validation loader:
 - Batch 0: DataBatch(x=[23, 11547], edge_index=[2, 15], y=[23], batch=[23], ptr=[9])
 - Batch 1: DataBat