In [None]:
import pandas as pd
from pathlib import Path
from datatype_recovery.models.dataset import TypeSequenceDataset
import torch
from torch_geometric.loader import DataLoader
from torch.utils.data import Subset

from datatype_recovery.models.dataset.encoding import ToFixedLengthTypeSeq

# ---------------------------------------------------------------------------
# Notebook configuration - every tunable value lives here
# ---------------------------------------------------------------------------

# Optimizer: learning rate handed to torch.optim.Adam
LEARN_RATE = 1e-3

# Model: values forwarded to StructuralTypeSeqModel / the dataset encoding
MAX_HOPS = 3
MAX_SEQ_LEN = 4
HIDDEN_CHANNELS = 128

# Training: loop length, batch layout and DataLoader options
NUM_EPOCHS = 500
BATCH_SIZE = 64
TRAIN_SPLIT = 0.7       # fraction of the dataset used for training
SHUFFLE = True
PIN_MEMORY = False

# Seed torch's global RNG so index shuffling is repeatable across runs
torch.manual_seed(233)

# Parameters handed to TypeSequenceDataset: the experiment run folders to read
# from, plus the 'copy_data' flag (passed through as-is).
data_params = dict(
    experiment_runs=[
        '/home/cls0027/exp_builds/astera.exp/rundata/run1',
    ],
    copy_data=False,
)

dataset = TypeSequenceDataset('trainset_astera', data_params, max_hops=MAX_HOPS)

# The transform is attached after construction (rather than passed to the
# constructor) so that it can be removed again later if desired.
transform = ToFixedLengthTypeSeq(MAX_SEQ_LEN)
dataset.transform = transform

print(f'Warning: only computing accuracy on fixed-length size of {MAX_SEQ_LEN}')
print('TODO: compute accuracy based on raw type sequence')

# Import the submodule explicitly. A bare `import rich` only binds the package;
# `rich.console` is then available only if something else happened to import
# it first, so `rich.console.Console()` could raise AttributeError on a fresh
# kernel.
import rich.console
console = rich.console.Console()

# Number of leading samples to keep for a quick overfitting sanity check.
# Set to None to use the full dataset.
OVERFIT_SIZE = 4096

if OVERFIT_SIZE is not None:
    dataset = dataset[:OVERFIT_SIZE]    # TEMP: overfit on tiny subset
    console.rule(f'Training with small subset of {OVERFIT_SIZE:,} samples')

# Divide into train/test sets, rounding each down to a whole number of batches.
train_size = int(len(dataset)*TRAIN_SPLIT/BATCH_SIZE) * BATCH_SIZE
test_size = int((len(dataset) - train_size)/BATCH_SIZE) * BATCH_SIZE

# Draw BOTH splits from a single random permutation so they cannot overlap.
# (Previously the test set was the contiguous index range right after
# train_size, which overlapped the randomly-permuted training indices and
# leaked training samples into the test metrics.)
shuffled_indices = torch.randperm(len(dataset)).tolist()
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:train_size + test_size]
assert set(train_indices).isdisjoint(test_indices), 'train/test splits overlap'

train_set = Subset(dataset, train_indices)
test_set = Subset(dataset, test_indices)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=SHUFFLE, pin_memory=PIN_MEMORY)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=SHUFFLE, pin_memory=PIN_MEMORY)

# Report how the dataset was divided and how many samples were dropped by the
# batch-size rounding above.
total_usable = len(train_set)+len(test_set)
non_batch_aligned = len(dataset)-total_usable
print()
print(f'Train set: {len(train_set):,} samples ({len(train_set)/len(dataset)*100:.2f}%)')
print(f'Test set: {len(test_set):,} samples ({len(test_set)/len(dataset)*100:.2f}%)')
print(f'Batch size: {BATCH_SIZE}')
print(f'Total usable dataset size (batch-aligned): {total_usable:,}')
print(f'Loss due to batch alignment: {non_batch_aligned:,} ({non_batch_aligned/len(dataset)*100:.2f}%)')

In [None]:
from datatype_recovery.models.dataset.encoding import decode_typeseq
import pandas as pd


##########################################################################
# TODO: PLOT DATASET BALANCE (as above) BUT USE ORIGINAL DATA FOR SPEED
##########################################################################
# --> I want to be able to hand it a Dataset object though, so I know I'm
#     only computing results based on THIS DATA
# --> Extract varids for the Dataset/DataLoader
# --> for each varid, pull its pandas data from the source csvs (or copied ones)
# --> Create a new dataframe from this with the columns above
#
# ...then generate the plot

# NOTE: this works great, but is too slow! port this code over...
# -----------
# train_typesequences = [decode_typeseq(sample.y) for sample in train_set]
# test_typesequences = [decode_typeseq(sample.y) for sample in test_set]

# train_df = pd.DataFrame({
#     'TypeSeq': train_typesequences,
#     'TypeSeqLen': [len(x) for x in train_typesequences],
#     'FirstType': [x[0] for x in train_typesequences],
#     'Varid': [sample.varid for sample in train_set],
#     'Split': ['Train']*len(train_set),
#     'Train': [True]*len(train_set),
# })

# test_df = pd.DataFrame({
#     'TypeSeq': test_typesequences,
#     'TypeSeqLen': [len(x) for x in test_typesequences],
#     'FirstType': [x[0] for x in test_typesequences],
#     'Varid': [sample.varid for sample in test_set],
#     'Split': ['Test']*len(test_set),
#     'Test': [True]*len(test_set),
# })

# df = pd.concat([train_df, test_df])

# df.groupby(['FirstType'])[['Train','Test']].count().sort_values('Train') \
#     .plot(kind='barh')

# dataset_classes = df.groupby(['FirstType'])[['Train','Test']].count()
# dataset_classes.loc[:,'TrainPcnt'] = dataset_classes.Train/dataset_classes.Train.sum()*100
# dataset_classes.loc[:,'TestPcnt'] = dataset_classes.Test/dataset_classes.Test.sum()*100
# dataset_classes
# df.TypeSeqLen.max()

In [None]:
# List the CUDA devices torch can see, or say so when there are none.
if torch.cuda.is_available():
    num_devices = torch.cuda.device_count()
    print(f'Device count: {num_devices}')
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f'Device {i}: {device_name}')
else:
    print('CUDA NOT AVAILABLE!')

# Index of the CUDA device intended for training
CUDA_DEVICE = 0

In [None]:
from tqdm.auto import trange
from datatype_recovery.models.structural_model import StructuralTypeSeqModel
from datatype_recovery.models.dataset.encoding import *
from datatype_recovery.models.training import *

# Checkpoint file in the current working directory; the training loop saves
# the whole model object here after every epoch.
model_path = Path.cwd()/'structural_model.pt'
model = StructuralTypeSeqModel(dataset, MAX_SEQ_LEN, HIDDEN_CHANNELS, num_hops=MAX_HOPS)

if model_path.exists():
    print(f'Loading existing model state @ {model_path}')
    # The checkpoint is a fully pickled model object (see torch.save(model, ...)),
    # so it replaces the freshly constructed model above, hyperparameters included.
    #  - map_location='cpu': lets a checkpoint saved from a GPU run be resumed on
    #    a machine without CUDA (assumes TrainContext moves the model to `device`,
    #    as it must already do for the freshly constructed model - TODO confirm).
    #  - weights_only=False: recent PyTorch releases default to weights_only=True,
    #    which refuses to unpickle a full module.
    # NOTE(security): this unpickles arbitrary code - only load checkpoints that
    # were written by this notebook.
    model = torch.load(model_path, map_location='cpu', weights_only=False)

print(model)

# Loss and optimizer are created after the optional checkpoint load so that the
# optimizer tracks the parameters of the model that will actually be trained.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

import wandb
wandb.login()

# One short label per experiment run: the stem of the folder two levels above
# the run directory, comma-joined.
dataset_names = ','.join(
    Path(run_dir).parent.parent.stem
    for run_dir in data_params['experiment_runs']
)

# Start the Weights & Biases run, recording the hyperparameters and run
# metadata alongside it.
wandb.init(
    project="StructuralModel",
    config=dict(
        learning_rate=LEARN_RATE,
        architecture="GATConv",
        max_hops=MAX_HOPS,
        max_seq_len=MAX_SEQ_LEN,
        hidden_channels=HIDDEN_CHANNELS,
        dataset=dataset_names,
        dataset_size=len(dataset),
        train_split=TRAIN_SPLIT,
        epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
    ),
)

# once I fix this above...
# wandb.log({
#     'Dataset/classes': wandb.Table(dataframe=dataset_classes \
#                         .sort_values('Train',ascending=False) \
#                         .reset_index()),
#     'Dataset/plot': wandb.Image(
#         dataset_classes[['TrainPcnt','TestPcnt']].sort_values('TrainPcnt').plot(kind='barh')
#     ),
# })

# with torch.cuda.device(CUDA_DEVICE):
# Only query the current CUDA device when CUDA is actually usable:
# torch.cuda.current_device() raises on a machine without CUDA, which used to
# make the 'cpu' fallback unreachable.
cuda_dev = torch.cuda.current_device() if torch.cuda.is_available() else None
device = f'cuda:{cuda_dev}' if cuda_dev is not None else 'cpu'

print(f'Training for {NUM_EPOCHS} epochs')

with TrainContext(model, device, optimizer, criterion, MAX_SEQ_LEN) as ctx:

    # Baseline metrics before any training step is taken
    train_acc_bin, train_acc_weight, train_loss = ctx.eval(train_loader)
    test_acc_bin, test_acc_weight, test_loss = ctx.eval(test_loader)
    print(f'Train loss = {train_loss:.4f}, train accuracy = {train_acc_bin*100:,.2f}%')
    print(f'Test loss = {test_loss:.4f}, test accuracy = {test_acc_bin*100:,.2f}%')

    for epoch in trange(NUM_EPOCHS):
        ctx.train_one_epoch(train_loader)

        # Re-evaluate both splits after each epoch and log the metrics to wandb
        train_acc_bin, train_acc_weight, train_loss = ctx.eval(train_loader)
        test_acc_bin, test_acc_weight, test_loss = ctx.eval(test_loader)
        wandb.log({
            'train/loss': train_loss,
            'train/acc': train_acc_bin,
            'train/acc_weighted': train_acc_weight,
            'test/loss': test_loss,
            'test/acc': test_acc_bin,
            'test/acc_weighted': test_acc_weight,
        })

        # Checkpoint the whole model every epoch so the run can be resumed
        torch.save(model, model_path)

    # Final metrics. Use ctx.eval like everywhere else in this cell: a bare
    # eval(loader, device) resolves to Python's builtin eval() (unless a star
    # import shadows it) and fails when handed a DataLoader.
    train_acc_bin, train_acc_weight, train_loss = ctx.eval(train_loader)
    test_acc_bin, test_acc_weight, test_loss = ctx.eval(test_loader)
    print(f'Train loss = {train_loss:.4f}, train accuracy = {train_acc_bin*100:,.2f}%')
    print(f'Test loss = {test_loss:.4f}, test accuracy = {test_acc_bin*100:,.2f}%')
    wandb.finish()