In [None]:
import os
import os.path
# Set CUDA_VISIBLE_DEVICES to '0' for this process, so CUDA-based libraries
# imported further down only see the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Checkpoints

In [None]:
# Checkpoint names ("<run>.<numeric suffix>") for the 9ZB link-edges run and
# its 9ZB2 "4xlowerlr" variant; each run lists its largest suffix first.
model_B = [
    f'{run}.{suffix}'
    for run in ('9ZB-000-link-edges', '9ZB2-000-link-edges-4xlowerlr')
    for suffix in (660000, 560000, 460000)
]

In [None]:
# Checkpoint names for the 9ZF ablation-study family.  Every list holds three
# checkpoints of one run, largest numeric suffix first.
model_F = [
    f'9ZF-000-ablation-study-high-batch-size-2.{suffix}'
    for suffix in (660000, 560000, 460000)
]

# The F2 / F3 / F5 runs share the same three suffixes.
_ablation_suffixes = (206250, 176250, 146250)

model_F2 = [
    f'9ZF2-000-ablation-study-high-batch-size-augment-really-this-time.{suffix}'
    for suffix in _ablation_suffixes
]

model_F3 = [
    f'9ZF3-000-ablation-study-remove-global-state.{suffix}'
    for suffix in _ablation_suffixes
]

model_F5 = [
    f'9ZF5-000-ablation-study-no-pairs-embeddings-and-one-preprocessing-edge-pairs.{suffix}'
    for suffix in _ablation_suffixes
]

In [None]:
# Checkpoint names for the 9ZG "vanilla-deep-radam" run, largest suffix first.
model_G = [
    f'9ZG-000-vanilla-deep-radam.{suffix}'
    for suffix in (3300000, 3000000, 2700000)
]

In [None]:
# Checkpoint names for the 9ZG2 "vanilla-deep-radam-300-10" run, largest suffix first.
model_G2 = [
    f'9ZG2-000-vanilla-deep-radam-300-10.{suffix}'
    for suffix in (1006500, 856500, 706500)
]

In [None]:
# Checkpoint names for the 9ZG4 "vanilla-deep-radam" run, largest suffix first.
model_G4 = [
    f'9ZG4-000-vanilla-deep-radam.{suffix}'
    for suffix in (660000, 560000, 460000)
]

In [None]:
# Checkpoint names for the 9ZG5 "vanilla-deep-radam-mse-clip" run, largest suffix first.
model_G5 = [
    f'9ZG5-000-vanilla-deep-radam-mse-clip.{suffix}'
    for suffix in (660000, 560000, 460000)
]

In [None]:
# Checkpoint names for the 9ZI "distributionnal-loss" run, largest suffix first.
# NOTE(review): the first entry carries run id 007 while the other two carry
# 005 -- confirm that mixing the two ids is intentional.
model_I = [
    f'9ZI-{run_id}-distributionnal-loss.{suffix}'
    for run_id, suffix in (('007', 660000), ('005', 560000), ('005', 460000))
]

# Submit / valid

## Utils

In [None]:
from common import *
from dataset.dataset_9ZB_117_edge_link import EdgeBasedDataset, DataLoader
from sklearn.model_selection import train_test_split
from torch_geometric.data import Batch
from tensorboardX import SummaryWriter
from scheduler_superconvergence_09J import *
from torch_geometric.data import DataListLoader
from torch_scatter import scatter_add
from importancer import get_tags, select_tags

In [None]:
def init_dataset():
    """Build the module-level data loaders that match the current `action`.

    With ``action == 'train'`` the global `dataset` is split into training and
    validation indices (taken from the checkpoint dict `to_load` when it is
    truthy, otherwise a fixed-seed split holding out 5000 items) and the
    globals `train_loader` and `valid_loader` are set; on the single-GPU path
    `train_small_loader` (5000 items sampled from the training subset) is set
    as well.

    With any other `action`, the global `submit_loader` is set to an
    unshuffled loader over the whole `dataset` that keeps the last partial
    batch.

    Raises:
        ValueError: for a non-train `action` when `parallel_gpu` is truthy.
    """
    global train_loader, train_small_loader, valid_loader
    global train_indices, valid_indices
    global submit_loader

    # --- prediction mode: one ordered loader over everything -------------
    if action != 'train':
        if parallel_gpu:
            raise ValueError
        submit_loader = DataLoader(dataset, batch_size = batch_size * valid_batch_size_factor, drop_last = False, shuffle = False, follow_batch=['edge_attr_numeric'], num_workers=num_workers)
        return

    # --- training mode ----------------------------------------------------
    if to_load:
        train_indices = to_load['train_indices']
        valid_indices = to_load['valid_indices']
    else:
        every_index = list(range(len(dataset)))
        train_indices, valid_indices = train_test_split(every_index, test_size = 5000, random_state = 1234)

    # These are positions *inside* the training subset, not dataset indices.
    _, train_small_indices = train_test_split(list(range(len(train_indices))), test_size = 5000, random_state = 1234)

    train = torch.utils.data.Subset(dataset, train_indices)
    valid = torch.utils.data.Subset(dataset, valid_indices)

    if parallel_gpu:
        train_loader = DataListLoader(train, batch_size = batch_size, shuffle = True, num_workers=num_workers)
        valid_loader = DataListLoader(valid, batch_size = batch_size * valid_batch_size_factor, shuffle = True, num_workers=num_workers)
        return

    train_small = torch.utils.data.Subset(train, train_small_indices)
    eval_batch_size = batch_size * valid_batch_size_factor
    shared_options = dict(drop_last = True, shuffle = True, follow_batch=['edge_attr_numeric'], num_workers=num_workers)

    train_loader = DataLoader(train, batch_size = batch_size, **shared_options)
    train_small_loader = DataLoader(train_small, batch_size = eval_batch_size, **shared_options)
    valid_loader = DataLoader(valid, batch_size = eval_batch_size, **shared_options)
            
def init_model():
    """Instantiate the global `model` from the currently selected `MEGNetList`.

    The constructor arguments are all read from module-level globals
    (`layer_count`, the embedding counts, the input sizes, `hidden`,
    `target_means`, `target_stds`).  When the checkpoint dict `to_load` is
    truthy its ``'model'`` entry is loaded as the state dict.  The model is
    finally moved to ``'cuda:0'`` if `parallel_gpu` is set, else to `device`.
    """
    global model

    embedding_counts = (atom_embedding_count, bond_ebedding_count, global_embedding_count)
    input_sizes = (atom_input_size, bond_input_size, global_input_size)

    model = MEGNetList(
        layer_count,
        *embedding_counts,
        *input_sizes,
        hidden,
        target_means, target_stds)

    if to_load:
        model.load_state_dict(to_load['model'])

    model = model.to('cuda:0' if parallel_gpu else device)
        
        
def batch_submit():
    """Run the global `model` on the global `batch` and return its predictions.

    The batch is moved to `device` (and the global is rebound to the moved
    batch), the forward pass runs without gradient tracking, and axis 1 of the
    output is squeezed before converting to a NumPy array on the CPU.

    Note: a later cell of this notebook defines another `batch_submit`; once
    that cell has run, this definition is no longer the one `submit` calls.
    """
    global batch

    with torch.no_grad():
        batch = batch.to(device)

        # Positional arguments, in the order the model's forward expects them.
        forward_args = (
            # node features
            [batch.x_numeric],
            batch.x_embeddings,
            # edge features
            [batch.edge_attr_numeric],
            batch.edge_attr_embeddings,
            # global features
            [batch.u_numeric],
            batch.u_embeddings,
            # connectivity and batch-assignment vectors
            batch.edge_index,
            batch.batch,
            batch.edge_attr_numeric_batch,
            batch.y_types,
            batch.cycles_edge_index,
            batch.cycles_id,
            batch.edges_connectivity_ids,
            batch.edges_connectivity_features,
        )
        out = model.forward(*forward_args)

        return out.squeeze(1).cpu().numpy()

def submit(loader):
    """Predict every batch of `loader` and collect the results in one frame.

    The result starts from `dataset.bond_descriptors`, restricted to the
    molecules listed in `dataset.molecules_ids`, with a NaN `prediction`
    column and a `molecule_name` column, indexed by `molecule_id`.  For each
    batch, the output of `batch_submit()` is written into the `prediction`
    cells of the rows whose index is one of the batch's `molecule_ids`.

    Args:
        loader: iterable of batches; each batch is published through the
            global `batch` so that `batch_submit` can read it.

    Returns:
        The filled predictions DataFrame, or the string ``"escape"`` when a
        KeyboardInterrupt is raised while a batch is being processed.
    """
    global batch
    model.eval()

    kept_columns = ['type', 'edge_index', 'atom_index_0', 'atom_index_1', 'molecule_id']
    predictions = dataset.bond_descriptors.reset_index()[kept_columns]
    # Explicit copy: columns are added below, and assigning to a filtered
    # slice is what triggers pandas' SettingWithCopyWarning.
    predictions = predictions.loc[predictions['molecule_id'].isin(dataset.molecules_ids)].copy()

    predictions['prediction'] = np.nan

    molecule_map = dict(zip(dataset.molecules_ids, dataset.molecules))
    predictions['molecule_name'] = predictions['molecule_id'].map(molecule_map)

    predictions = predictions.set_index('molecule_id')

    for batch in tqdm.tqdm_notebook(loader):
        try:
            molecule_ids = batch.molecule_ids.numpy()

            prediction = batch_submit()

            # NOTE(review): relies on the model emitting one value per selected
            # row, in the same order as the rows of `predictions` -- confirm.
            predictions.loc[molecule_ids, 'prediction'] = prediction

        except KeyboardInterrupt:
            print("Escaping")
            return "escape"

    return predictions

In [None]:
def _load_prediction_dataset(dataset_name):
    """Load `EdgeBasedDataset(name=dataset_name)` and publish derived globals.

    Sets the following module-level globals:
      * `action` -- always ``'submit'``;
      * `dataset` -- the freshly constructed dataset;
      * `target_means` / `target_stds` -- per-`type_id` median and standard
        deviation of `scalar_coupling_constant`, computed over bond rows whose
        type is not ``'VOID'`` and whose index value is a `molecule_id` flagged
        ``'train'`` in `dataset.dataset` (note that `target_means` holds
        medians, not means);
      * the three `*_embedding_count` values copied from the dataset;
      * the three `*_numeric_count` values, read from axis 1 of the first
        sample's numeric tensors;
      * the three `*_input_size` lists, each ``[(numeric_count, hidden)]``.

    The first sample is printed as a quick sanity check.
    """
    global dataset

    global global_embedding_count
    global atom_embedding_count
    global bond_ebedding_count
    global global_numeric_count
    global bond_numeric_count
    global atom_numeric_count
    global target_means
    global target_stds
    global atom_input_size
    global bond_input_size
    global global_input_size
    global action

    action = 'submit'
    dataset = EdgeBasedDataset(name = dataset_name)

    # Normalisation statistics always come from the training molecules,
    # whichever dataset is being predicted.
    bonds = dataset.bond_descriptors
    train_molecule_ids = dataset.dataset.loc[dataset.dataset['dataset'] == 'train', 'molecule_id']
    usable_rows = (bonds['type'] != 'VOID') & bonds.index.isin(train_molecule_ids)
    target_stats = bonds.loc[usable_rows].groupby('type_id')['scalar_coupling_constant'].agg(['std', 'median'])

    target_means = target_stats['median'].values
    target_stds = target_stats['std'].values

    # Inputs

    sample = dataset[0]
    print(sample)

    global_embedding_count = dataset.global_embedding_count
    atom_embedding_count = dataset.atom_embedding_count
    bond_ebedding_count = dataset.bond_ebedding_count

    global_numeric_count = sample.u_numeric.size(1)
    bond_numeric_count = sample.edge_attr_numeric.size(1)
    atom_numeric_count = sample.x_numeric.size(1)

    atom_input_size = [(atom_numeric_count, hidden)]
    bond_input_size = [(bond_numeric_count, hidden)]
    global_input_size = [(global_numeric_count, hidden)]


def load_submit_dataset():
    """Load the ``'test'`` dataset in ``'submit'`` mode (see `_load_prediction_dataset`)."""
    _load_prediction_dataset('test')


def load_train_dataset():
    """Load the ``'train'`` dataset, still in ``'submit'`` mode.

    `action` is deliberately set to ``'submit'`` here as well: this loader is
    used to *predict* on training data, not to train.
    """
    _load_prediction_dataset('train')
    
    
def package_submit_prediction(submit_predictions):
    """Turn per-edge test predictions into a submission-shaped DataFrame.

    Rows of ``data/test.csv`` are matched to `submit_predictions` on
    (`molecule_name`, `atom_index_0`, `atom_index_1`); rows left unmatched are
    matched again with the two atom indices swapped.

    Args:
        submit_predictions: DataFrame with `molecule_name`, `atom_index_0`,
            `atom_index_1` and `prediction` columns; it must hold exactly
            7223027 non-null predictions.

    Returns:
        DataFrame with columns `id` and `scalar_coupling_constant`; asserted
        to contain 2505542 rows, none of them null.
    """
    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    swapped_key = ['molecule_name', 'atom_index_1', 'atom_index_0']

    assert submit_predictions['prediction'].notnull().sum() == 7223027

    test = pd.read_csv('data/test.csv')

    wanted = test[pair_key]
    available = submit_predictions[pair_key + ['prediction']]

    same_order = pd.merge(wanted, available, left_on = pair_key, right_on = pair_key, how = 'left')['prediction']
    swapped_order = pd.merge(wanted, available, left_on = pair_key, right_on = swapped_key, how = 'left')['prediction']

    # Prefer the direct match; fall back on the swapped-atoms match.
    test['prediction'] = same_order
    test.loc[test['prediction'].isnull(), 'prediction'] = swapped_order

    test = test[['id', 'prediction']].rename(columns = {'prediction' : 'scalar_coupling_constant'})

    assert test['scalar_coupling_constant'].isnull().sum() == 0
    assert test['scalar_coupling_constant'].notnull().sum() == 2505542

    return test

def package_train_prediction(train_predictions):
    """Attach per-edge predictions to ``data/train.csv`` and tag the validation rows.

    Rows are matched on (`molecule_name`, `atom_index_0`, `atom_index_1`) and,
    for rows left unmatched, on the same key with the atom indices swapped.
    A `dataset` column is added: ``'valid'`` for molecules in the fixed-seed
    5000-item hold-out of the global `dataset`, ``'train'`` for all others.

    Args:
        train_predictions: DataFrame with `molecule_name`, `atom_index_0`,
            `atom_index_1` and `prediction` columns; it must hold exactly
            13432555 non-null predictions.

    Returns:
        The train frame with `prediction` and `dataset` columns appended;
        asserted to hold 4658147 predictions, none of them null, and 5000
        distinct validation molecules.
    """
    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    swapped_key = ['molecule_name', 'atom_index_1', 'atom_index_0']

    assert train_predictions['prediction'].notnull().sum() == 13432555

    train = pd.read_csv('data/train.csv')

    wanted = train[pair_key]
    available = train_predictions[pair_key + ['prediction']]

    same_order = pd.merge(wanted, available, left_on = pair_key, right_on = pair_key, how = 'left')['prediction']
    swapped_order = pd.merge(wanted, available, left_on = pair_key, right_on = swapped_key, how = 'left')['prediction']

    # Prefer the direct match; fall back on the swapped-atoms match.
    train['prediction'] = same_order
    train.loc[train['prediction'].isnull(), 'prediction'] = swapped_order

    assert train['prediction'].isnull().sum() == 0
    assert train['prediction'].notnull().sum() == 4658147

    # Re-derive the hold-out with the same seed and size used when splitting.
    _, valid_indices = train_test_split(list(range(len(dataset))), test_size = 5000, random_state = 1234)
    valid_molecules = [dataset.molecules[position] for position in valid_indices]

    train['dataset'] = 'train'
    train.loc[train['molecule_name'].isin(valid_molecules), 'dataset'] = 'valid'
    assert train.loc[train['dataset'] == 'valid', 'molecule_name'].nunique() == 5000

    return train

def analyze(train):
    """Print (and return) the validation score of a packaged prediction frame.

    The score is the mean, over the values of the `type` column, of the log of
    the mean absolute error between `prediction` and
    `scalar_coupling_constant`, computed on rows whose `dataset` is ``'valid'``.

    Args:
        train: DataFrame with `dataset`, `type`, `prediction` and
            `scalar_coupling_constant` columns.  It is not modified.

    Returns:
        The score that was printed.
    """
    valid = train.loc[train['dataset'] == 'valid']

    # Kept as a standalone Series: writing an 'mae' column onto the filtered
    # slice is what raised pandas' SettingWithCopyWarning.
    absolute_error = (valid['prediction'] - valid['scalar_coupling_constant']).abs()
    mae_per_type = absolute_error.groupby(valid['type']).mean()

    score = np.log(mae_per_type).mean()
    print(score)
    return score
    #print(np.log(train.groupby('type')['mae'].mean()))

In [None]:
def init_valid_dataset():
    """Build data loaders like `init_dataset`, but predict on the hold-out only.

    The train/validation split of the global `dataset` is resolved first
    (from the checkpoint dict `to_load` when truthy, otherwise a fixed-seed
    split holding out 5000 items) and stored in the globals `train_indices`
    and `valid_indices`.

    With ``action == 'train'`` the globals `train_loader` and `valid_loader`
    are set, plus `train_small_loader` on the single-GPU path.  With any other
    `action`, the global `submit_loader` is set to an unshuffled loader over
    the validation subset that keeps the last partial batch.

    Raises:
        ValueError: for a non-train `action` when `parallel_gpu` is truthy.
    """
    global train_loader, train_small_loader, valid_loader
    global train_indices, valid_indices
    global submit_loader

    # Both modes start from the same split, so resolve it once up front.
    if to_load:
        train_indices = to_load['train_indices']
        valid_indices = to_load['valid_indices']
    else:
        every_index = list(range(len(dataset)))
        train_indices, valid_indices = train_test_split(every_index, test_size = 5000, random_state = 1234)

    # These are positions *inside* the training subset, not dataset indices.
    _, train_small_indices = train_test_split(list(range(len(train_indices))), test_size = 5000, random_state = 1234)

    train = torch.utils.data.Subset(dataset, train_indices)
    train_small = torch.utils.data.Subset(train, train_small_indices)
    valid = torch.utils.data.Subset(dataset, valid_indices)

    # --- prediction mode: ordered loader over the validation subset -------
    if action != 'train':
        if parallel_gpu:
            raise ValueError
        submit_loader = DataLoader(valid, batch_size = batch_size * valid_batch_size_factor, drop_last = False, shuffle = False, follow_batch=['edge_attr_numeric'], num_workers=num_workers)
        return

    # --- training mode ------------------------------------------------------
    if parallel_gpu:
        train_loader = DataListLoader(train, batch_size = batch_size, shuffle = True, num_workers=num_workers)
        valid_loader = DataListLoader(valid, batch_size = batch_size * valid_batch_size_factor, shuffle = True, num_workers=num_workers)
        return

    eval_batch_size = batch_size * valid_batch_size_factor
    shared_options = dict(drop_last = True, shuffle = True, follow_batch=['edge_attr_numeric'], num_workers=num_workers)

    train_loader = DataLoader(train, batch_size = batch_size, **shared_options)
    train_small_loader = DataLoader(train_small, batch_size = eval_batch_size, **shared_options)
    valid_loader = DataLoader(valid, batch_size = eval_batch_size, **shared_options)

In [None]:
def package_valid_prediction(train_predictions):
    """Attach hold-out predictions to ``data/train.csv`` and keep the predicted rows.

    Rows are matched on (`molecule_name`, `atom_index_0`, `atom_index_1`) and,
    for rows left unmatched, on the same key with the atom indices swapped.
    A `dataset` column is added: ``'valid'`` for molecules in the fixed-seed
    5000-item hold-out of the global `dataset`, ``'train'`` for all others.

    Args:
        train_predictions: DataFrame with `molecule_name`, `atom_index_0`,
            `atom_index_1` and `prediction` columns; it must hold exactly
            792626 non-null predictions.

    Returns:
        Only the rows that received a prediction (asserted to be 275505, with
        4382642 rows left without one), with `prediction` and `dataset`
        columns appended.
    """
    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    swapped_key = ['molecule_name', 'atom_index_1', 'atom_index_0']

    assert train_predictions['prediction'].notnull().sum() == 792626

    train = pd.read_csv('data/train.csv')

    wanted = train[pair_key]
    available = train_predictions[pair_key + ['prediction']]

    same_order = pd.merge(wanted, available, left_on = pair_key, right_on = pair_key, how = 'left')['prediction']
    swapped_order = pd.merge(wanted, available, left_on = pair_key, right_on = swapped_key, how = 'left')['prediction']

    # Prefer the direct match; fall back on the swapped-atoms match.
    train['prediction'] = same_order
    train.loc[train['prediction'].isnull(), 'prediction'] = swapped_order

    assert train['prediction'].isnull().sum() == 4382642
    assert train['prediction'].notnull().sum() == 275505

    # Re-derive the hold-out with the same seed and size used when splitting.
    _, valid_indices = train_test_split(list(range(len(dataset))), test_size = 5000, random_state = 1234)
    valid_molecules = [dataset.molecules[position] for position in valid_indices]

    train['dataset'] = 'train'
    train.loc[train['molecule_name'].isin(valid_molecules), 'dataset'] = 'valid'

    assert train.loc[train['dataset'] == 'valid', 'molecule_name'].nunique() == 5000

    return train.loc[train['prediction'].notnull()]

## Classical

In [None]:
import layers.layers_09ZB_link_edge
import layers.layers_09ZF_ablation_study
import layers.layers_09ZF3_ablation_study_remove_global_state
import layers.layers_09ZF5_ablation_study_no_edge_pairs_embeddings_and_one_preprocessing
import layers.layers_09ZI_distributionnal_loss

In [None]:
# One entry per model family: the checkpoint names to predict with, the
# hyper-parameters needed to rebuild the network, and the layers module that
# provides its `MEGNetList` class.
_model_fields = ('names', 'hidden', 'layer_count', 'batch_size', 'module')

models_data = [
    dict(zip(_model_fields, row))
    for row in (
        # names     hidden  layers  batch  module
        (model_B,   300,    6,      20,    layers.layers_09ZB_link_edge),
        (model_F,   300,    6,      20,    layers.layers_09ZF_ablation_study),
        (model_F2,  200,    6,      64,    layers.layers_09ZF_ablation_study),
        (model_F3,  200,    6,      64,    layers.layers_09ZF3_ablation_study_remove_global_state),
        (model_F5,  200,    6,      64,    layers.layers_09ZF5_ablation_study_no_edge_pairs_embeddings_and_one_preprocessing),
        (model_G,   500,    10,     4,     layers.layers_09ZB_link_edge),
        (model_G2,  300,    10,     20,    layers.layers_09ZB_link_edge),
        (model_G4,  300,    6,      20,    layers.layers_09ZB_link_edge),
        (model_G5,  300,    6,      20,    layers.layers_09ZB_link_edge),
        (model_I,   300,    6,      20,    layers.layers_09ZI_distributionnal_loss),
    )
]

device = 'cuda'

# Bin centres used to collapse a binned model output into a single value:
# 1040 bins of width 1/4 whose left edges run from -40 to 219.75.
bin_count = 260 * 4
_bin_left_edges = np.linspace(-40, 220 - 1 / 4, bin_count)
delta = (_bin_left_edges[1] - _bin_left_edges[0]) / 2  # half a bin width

# Shaped (1, bin_count) so it broadcasts against a (rows, bin_count) output.
centers = torch.tensor((_bin_left_edges + delta).reshape(1, -1), dtype = torch.float32).to(device)

def batch_submit():
    """Run the global `model` on the global `batch` and return one value per row.

    The batch is moved to `device` (and the global is rebound to the moved
    batch) and the forward pass runs without gradient tracking.  The output
    is then reduced to a 1-D NumPy array on the CPU:

      * an output with a single column is squeezed on axis 1;
      * any other output is multiplied by the global `centers` row vector and
        summed over axis 1 (presumably each row holds per-bin probabilities,
        which would make this the expected value -- confirm against the
        distributional model).

    The single-column case is handled explicitly because this definition
    replaces the earlier `batch_submit` for every entry of `models_data`;
    without it a one-column output would broadcast against `centers` and come
    back multiplied by the sum of all bin centres.
    NOTE(review): assumes the non-distributional models return exactly one
    column, as the earlier `squeeze(1)` version implies -- confirm.
    """
    global batch

    with torch.no_grad():
        # BATCH
        batch = batch.to(device)

        # PREDICT
        out = model.forward(
            [batch.x_numeric],
            batch.x_embeddings,

            [batch.edge_attr_numeric],
            batch.edge_attr_embeddings,

            [batch.u_numeric],
            batch.u_embeddings,

            batch.edge_index,

            batch.batch,
            batch.edge_attr_numeric_batch,

            batch.y_types,

            batch.cycles_edge_index,
            batch.cycles_id,

            batch.edges_connectivity_ids,
            batch.edges_connectivity_features,
        )

        if out.dim() == 2 and out.size(1) == 1:
            # Scalar head: nothing to aggregate.
            return out.squeeze(1).cpu().numpy()

        # Binned head: weight every bin centre by the model output.
        out = out * centers

        return out.sum(dim = 1).cpu().numpy()

In [None]:
num_workers = 7
device = 'cuda'
parallel_gpu = False

# Config

# For every checkpoint without a submission file yet: fetch the weights,
# predict the validation hold-out (saved to `valid_path`), predict the test
# set (saved to `sub_path`), then delete the local copy of the weights.
# The names assigned inside the loop (hidden, layer_count, batch_size,
# MEGNetList, valid_batch_size_factor, to_load) are read as globals by the
# init_* / load_* helpers.
for model_data in models_data:
    for model_name in model_data['names']:
        sub_path = f'submissions/submission_{model_name}.csv'
        valid_path = f'submissions/train_{model_name}.csv'
        model_file = f'model.{model_name}.bin'
        model_path = f'model_data/{model_file}'

        if os.path.isfile(sub_path):
            print("already done", model_name)
            continue

        print("predict", model_name)
        hidden = model_data['hidden']
        layer_count = model_data['layer_count']
        batch_size = model_data['batch_size']
        MEGNetList = model_data['module'].MEGNetList

        valid_batch_size_factor = 5

        if not os.path.isfile(model_path):
            exit_status = os.system(f'aws s3 cp s3://grjhuard-eu-west-1/model_data/{model_file} model_data/{model_file}')
            if exit_status != 0:
                # Fail here with a clear message instead of inside torch.load.
                raise RuntimeError(f'download of {model_file} failed (exit status {exit_status})')
        # NOTE(review): torch.load unpickles the file -- only load checkpoints
        # that come from a trusted location.
        to_load = torch.load(model_path, map_location = 'cpu')

        # Predict the validation hold-out
        load_train_dataset()
        init_model()
        init_valid_dataset()
        train_predictions = submit(submit_loader)
        if isinstance(train_predictions, str):
            # submit() returns the string "escape" after Ctrl-C; stop the run
            # rather than handing that sentinel to the packaging code.
            raise KeyboardInterrupt(f'prediction interrupted for {model_name}')
        train = package_valid_prediction(train_predictions)
        train.to_csv(valid_path, index = False)

        analyze(train)

        # Predict the test set
        load_submit_dataset()
        init_model()
        init_dataset()
        submit_predictions = submit(submit_loader)
        if isinstance(submit_predictions, str):
            raise KeyboardInterrupt(f'prediction interrupted for {model_name}')

        test = package_submit_prediction(submit_predictions)
        test.to_csv(sub_path, index = False)

        os.remove(model_path)