In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os.path as osp
from tqdm.autonotebook import tqdm
import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU, GRU

import torch_geometric.transforms as T
from torch_geometric.data import DataLoader



In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split, GroupShuffleSplit

In [5]:
from torch.utils.data import Subset

In [6]:
from kaggle_champs import constants

# Load and preprocess data

## Load data

In [7]:
# Raw competition tables, read from paths relative to the notebook's directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [8]:
y_mean = train.scalar_coupling_constant.mean()

In [9]:
y_std = train.scalar_coupling_constant.std()

In [10]:
# Baseline score: for every coupling type, take the mean absolute deviation of
# the target from that type's own mean, then the log of it. This is the
# per-type log-MAE of a model that always predicts the type mean.
np.log((train.scalar_coupling_constant - train.type.map(train.groupby('type').scalar_coupling_constant.mean())).abs().groupby(train.type).mean())

type
1JHC    2.548219
1JHN    2.275415
2JHC    0.999041
2JHH    0.983063
2JHN    1.086673
3JHC    0.911788
3JHH    1.122420
3JHN   -0.033818
dtype: float64

## Split train/valid

In [11]:
molecules = train.molecule_name.drop_duplicates().sort_values()

In [12]:
# Split positions into `molecules` (not individual coupling rows), holding out
# 5000 molecules for validation; the fixed random_state keeps the split stable.
train_ind, valid_ind = train_test_split(np.arange(len(molecules)),
                                        test_size=5000,
                                        random_state=1234)

In [13]:
assert not set(train_ind).intersection(valid_ind)

In [14]:
len(train_ind), len(valid_ind)

(80003, 5000)

## Create train/valid subsets

In [15]:
# Check reproducibility
rs = np.random.RandomState(seed=1234)
print(rs.choice(train_ind, 10))
print(rs.choice(valid_ind, 10))

[19669 26783  1698 47278 33476 59113 40999 64242 25723 71229]
[55624 62327 36561 67391 19447 20288 70596 59541 32479 52121]


In [16]:
# Select coupling rows by molecule name, using the positional split indices
# into the sorted, de-duplicated `molecules` series.
train_data = train.loc[train.molecule_name.isin(molecules.iloc[train_ind])]
val_data = train.loc[train.molecule_name.isin(molecules.iloc[valid_ind])]

## Create dataset

In [17]:
from kaggle_champs.dataset import ChampsDataset

In [18]:
import os
import numpy as np
import openbabel
import torch

from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm.autonotebook import tqdm
from kaggle_champs.dataset import mol_to_data_v2

In [19]:
class MoleculeDataset(Dataset):
    """Dataset yielding one molecular graph per molecule listed in ``metadata``.

    Each item is built by reading ``<base_dir>/<molecule_name>.xyz`` through
    Open Babel, converting the molecule with ``mol_to_data_v2`` and attaching
    the coupling targets of that molecule in both directions.

    Args:
        metadata: DataFrame with at least the columns ``molecule_name``,
            ``atom_index_0``, ``atom_index_1`` and ``type``. The column
            ``scalar_coupling_constant`` is optional; targets are zeros when
            it is absent.
        base_dir: Directory containing the ``.xyz`` structure files.
        transform: Optional callable applied to every item before it is
            returned. It can rely on ``data.graph`` being present.
    """

    def __init__(self, metadata=None, base_dir=None, transform=None):
        # Molecule names in order of first appearance; item `index` maps to
        # `self.molecules[index]`.
        self.molecules = metadata.molecule_name.unique()
        # Per-molecule slice of the metadata, keyed by molecule name.
        self.metadata = dict([
            (ind, df) for ind, df in tqdm(metadata.groupby('molecule_name'))
        ])
        self.base_dir = base_dir
        self.transform = transform
        self.conversion = openbabel.OBConversion()
        self.conversion.SetInAndOutFormats("xyz", "mdl")

    def __len__(self):
        """Return the number of molecules."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Build the graph for the ``index``-th molecule.

        Raises:
            FileNotFoundError: if the molecule's ``.xyz`` file is missing.
        """
        mol = openbabel.OBMol()
        mol_name = self.molecules[index]

        xyz_file = os.path.join(self.base_dir, f'{mol_name}.xyz')
        if not os.path.exists(xyz_file):
            raise FileNotFoundError(f'Expecting file {xyz_file} not found')
        self.conversion.ReadFile(mol, xyz_file)

        data = mol_to_data_v2(mol)
        data.mol_ind = torch.tensor([[index]], dtype=torch.long)

        data = self._add_targets(data, metadata=self.metadata[mol_name])

        # Temporary networkx view of the edges in `data.edge_index`, made
        # available to the transforms. `nx` must be imported at module level
        # before items are fetched.
        data.graph = nx.Graph()
        data.graph.add_edges_from(data.edge_index.transpose(1, 0).cpu().numpy())

        if self.transform:
            data = self.transform(data)

        # The graph is only a helper for the transforms; drop it from the item.
        if hasattr(data, 'graph'):
            del data.graph
        return data

    def _add_inverse_couple(self, couples):
        """Return ``couples`` plus every row with its two atom indices swapped.

        The result is sorted by ``(atom_index_0, atom_index_1)``.
        """
        inverse_direction = couples.rename(
            {'atom_index_1': 'atom_index_0',
             'atom_index_0': 'atom_index_1'},
            axis=1)

        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` aligns the
        # swapped columns by name in the same way.
        couples = pd.concat([couples, inverse_direction], sort=False)
        couples = couples.sort_values(['atom_index_0',
                                       'atom_index_1'])

        return couples

    def _add_y(self, data, couples):
        """Set ``data.y`` to an ``(n_couples, 1)`` float tensor of targets.

        Falls back to zeros when ``couples`` has no
        ``scalar_coupling_constant`` column.
        """
        if 'scalar_coupling_constant' in couples.columns:
            data.y = torch.tensor(
                couples['scalar_coupling_constant'].values,
                dtype=torch.float).view(-1, 1)
        else:
            data.y = torch.zeros((len(couples), 1), dtype=torch.float)
        return data

    def _add_targets(self, data, metadata):
        """Attach ``couples_ind``, ``y``, ``type`` and ``sample_weight`` to ``data``."""
        couples = metadata.copy()
        couples = self._add_inverse_couple(couples)

        data.couples_ind = torch.tensor(
            couples[['atom_index_0',
                     'atom_index_1']].values,
            dtype=torch.long)

        data = self._add_y(data, couples)

        # Coupling type as an integer code, looked up in constants.TYPES_DICT.
        data.type = torch.tensor(
            couples['type'].map(constants.TYPES_DICT).values,
            dtype=torch.long)

        # Per-row weight, looked up by type in constants.TYPES_WEIGHTS.
        data.sample_weight = torch.tensor(
            couples['type'].map(constants.TYPES_WEIGHTS).values,
            dtype=torch.float)

        return data

In [20]:
from kaggle_champs.preprocessing import RandomRotation, AddVirtualEdges, AddEdgeDistanceAndDirection, SortTarget

In [21]:
import networkx as nx

In [22]:
class AddEdgeDistanceAndDirection:
    """Append Gaussian-expanded edge lengths and edge directions to ``edge_attr``.

    Args:
        dist_noise: Standard deviation of the multiplicative Gaussian noise
            applied to the distances; no noise is applied when it is ``0``.
        gauss_base_max: Largest Gaussian centre.
        gauss_base_steps: Number of Gaussian centres, spaced evenly from
            ``gauss_base_max / gauss_base_steps`` up to ``gauss_base_max``.
        keep: When true, also store the raw ``dist`` and ``direction``
            tensors on the data object.
    """

    def __init__(self, dist_noise=0., gauss_base_max=4, gauss_base_steps=20, keep=True):
        self.dist_noise = dist_noise
        self.gauss_base_max = gauss_base_max
        self.gauss_base_steps = gauss_base_steps
        # Honour the argument; it was previously hard-coded to True.
        self.keep = keep

    def __call__(self, data):
        """Return ``data`` with distance and direction columns added to ``edge_attr``."""
        (row, col), pos, edge_attr = data.edge_index, data.pos, data.edge_attr

        offset = pos[col] - pos[row]
        dist = torch.norm(offset, p=2, dim=-1).view(-1, 1)

        if self.dist_noise > 0:
            noise = 1 + torch.randn_like(dist, dtype=dist.dtype) * self.dist_noise
            dist = dist * noise

        # Divided by the (possibly noisy) distance, as in the original code.
        direction = offset / dist
        if self.keep:
            data.dist = dist
            data.direction = direction

        base = torch.linspace(self.gauss_base_max / self.gauss_base_steps,
                              self.gauss_base_max,
                              self.gauss_base_steps,
                              dtype=torch.float).view(1, -1)    # shape 1xn for broadcasting

        # One Gaussian response per centre: (E, 1) against (1, n) -> (E, n).
        dist = torch.exp(-(dist - base) ** 2 / 0.5 ** 2)

        edge_attr = edge_attr.view(-1, 1) if edge_attr.dim() == 1 else edge_attr
        data.edge_attr = torch.cat(
                [edge_attr,
                 dist.type_as(edge_attr),
                 direction.type_as(edge_attr)],
                dim=-1)

        return data

In [23]:
class AddBondLinks:
    """Enumerate ordered pairs of bond edges that chain head-to-tail.

    A pair ``(a, b)`` of entries of ``data.bonds_edge_ind`` is kept when edge
    ``a`` ends on the node where edge ``b`` starts, and ``b`` does not simply
    lead back to the start of ``a``. The result is stored as
    ``data.bonds_links_edge_ind`` with one pair per row.
    """

    def __call__(self, data):
        bond_edges = data.bonds_edge_ind
        n_bond_edges = len(bond_edges)

        # Cartesian product of the bond edges with themselves:
        # `first` varies slowest, `second` varies fastest.
        first = bond_edges.view(-1, 1).repeat(1, n_bond_edges).view(-1)
        second = bond_edges.view(-1).repeat(1, n_bond_edges).view(-1)
        candidates = torch.stack([first, second], dim=1)

        first_src, first_dst = data.edge_index[:, first]
        second_src, second_dst = data.edge_index[:, second]

        shares_middle_node = first_dst == second_src
        does_not_turn_back = first_src != second_dst

        data.bonds_links_edge_ind = candidates[shares_middle_node & does_not_turn_back]
        return data

In [24]:
class AddCounts:
    """Store the node, edge and couple counts as ``(1, 1)`` long tensors."""

    @staticmethod
    def _as_count(value):
        # A (1, 1) shape gives one row per graph once items are collated.
        return torch.tensor([[value]], dtype=torch.long)

    def __call__(self, data):
        data.count_nodes = self._as_count(data.num_nodes)
        data.count_edges = self._as_count(data.num_edges)
        data.count_couples = self._as_count(data.couples_ind.size(0))
        return data

In [25]:
class AddGlobalAttr:
    """Attach a zero-valued ``(1, 1)`` float ``global_attr`` to the item."""

    def __call__(self, data):
        data.global_attr = torch.zeros(1, 1, dtype=torch.float)
        return data

In [26]:
class SortTarget:
    """Re-order the targets to follow edge order and record each couple's edge.

    Positions are computed with the closed form used in ``_get_index``, which
    assumes edges are laid out as ``(source, target)`` sorted pairs of a
    graph where every node is connected to every other node (no self loops).
    The final assertion checks that ``couples_ind`` matches the selected
    columns of ``edge_index``.
    """

    def _get_index(self, data, row, col):
        # Each source node owns (num_nodes - 1) consecutive slots; targets
        # past the skipped diagonal entry shift down by one.
        flat = row * (data.num_nodes - 1) + col
        past_diagonal = row < col
        flat[past_diagonal] -= 1
        return flat

    def __call__(self, data):
        n_edges = data.num_edges
        y_per_edge = torch.zeros((n_edges, data.y.size(1)), dtype=torch.float)
        weight_per_edge = torch.zeros((n_edges), dtype=torch.float)
        is_couple = torch.zeros((n_edges), dtype=torch.bool)
        type_per_edge = torch.zeros((n_edges), dtype=torch.long)

        src, dst = data.couples_ind.t()
        slots = self._get_index(data, src, dst)

        # Scatter the per-couple values to their edge slots ...
        is_couple[slots] = True
        weight_per_edge[slots] = data.sample_weight
        y_per_edge[slots] = data.y
        type_per_edge[slots] = data.type

        # ... then read them back in edge order.
        data.y = y_per_edge[is_couple]
        data.sample_weight = weight_per_edge[is_couple]
        data.type = type_per_edge[is_couple]

        assert torch.equal(data.couples_ind, data.edge_index[:, is_couple].transpose(1, 0))
        all_edges = torch.arange(n_edges, dtype=torch.long)
        data.couples_edge_ind = all_edges[is_couple].view(-1, 1)
        return data

In [27]:
class AddBondPath:
    """Attach, for every couple, the shortest path between its two atoms.

    Requires ``data.graph`` (a networkx graph) and ``data.couples_ind``.
    Adds:

    * ``paths_index``: node indices along each path, one column per couple,
      zero-padded to at least 4 rows.
    * ``paths_edge_ind``: edge positions of the first three hops of each
      path, one row per couple. Hops that fall into the zero padding are
      still converted, so they point at whatever edge the padded node pair
      maps to.
    """

    def __call__(self, data):
        # suffix _index to get node index adjustment
        data.paths_index = self.find_paths(data).transpose(1, 0)
        data.paths_edge_ind = torch.cat(
            [self._nodes_to_edge_ind(data, data.paths_index[i], data.paths_index[i+1]) for i in range(3)],
            dim=1)
        return data

    def _nodes_to_edge_ind(self, data, node_from, node_to):
        """Map ``(node_from, node_to)`` pairs to edge positions as an ``(n, 1)`` tensor.

        Uses the same closed form as ``SortTarget._get_index``: each source
        node owns ``num_nodes - 1`` consecutive slots with the diagonal
        entry skipped.
        """
        edge_ind = node_from * (data.num_nodes-1) + node_to
        edge_ind[node_from < node_to] = edge_ind[node_from < node_to] - 1
        return edge_ind.view(-1, 1)

    def find_paths(self, data):
        """Return an ``(n_couples, L)`` long tensor of zero-padded paths, ``L >= 4``."""
        assert hasattr(data, 'couples_ind')
        assert hasattr(data, 'graph')

        all_paths = nx.shortest_path(data.graph)
        paths = []
        for (from_, to_) in data.couples_ind.numpy():
            path = torch.tensor(all_paths[from_][to_], dtype=torch.long).view(-1, 1)
            paths.append(path)

        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also collapse the couple dimension when there is a single couple,
        # and `paths.size(1)` below would then fail.
        paths = torch.nn.utils.rnn.pad_sequence(paths, batch_first=True).squeeze(-1)
        if paths.size(1) < 4:
            paths = torch.nn.functional.pad(paths, (0, 4 - paths.size(1)))
        return paths

In [28]:
class AddInverseCouples:
    """Record, for every couple ``(a, b)``, the row position of ``(b, a)``.

    The positions are stored in ``data.inverse_couple_ind`` as a long tensor
    aligned with ``data.couples_ind``.
    """

    def find_inverse_couple_position(self, node_from, node_to):
        """Return an array whose i-th entry is the row holding couple i reversed.

        Asserts that the self-merge yields exactly one match per couple.
        """
        forward = pd.DataFrame({'from': node_from, 'to': node_to}).reset_index()
        # Swapping the column names turns (a, b) into (b, a), so a plain
        # merge on both columns pairs every couple with its reverse.
        backward = forward.rename(columns={'from': 'to', 'to': 'from'})
        paired = forward.merge(backward, on=['from', 'to'], suffixes=('', '_inverse'))
        assert paired.shape[0] == forward.shape[0]
        return paired.sort_values('index').index_inverse.values

    def __call__(self, data):
        assert hasattr(data, 'couples_ind')
        couples = data.couples_ind
        positions = self.find_inverse_couple_position(couples[:, 0].numpy(),
                                                      couples[:, 1].numpy())
        data.inverse_couple_ind = torch.tensor(positions, dtype=torch.long)
        return data

In [29]:
def find_inverse_couple_position(self, node_from, node_to):
    """Return, for each couple ``(a, b)``, the row position of ``(b, a)``.

    Standalone variant of ``AddInverseCouples.find_inverse_couple_position``.

    Args:
        self: Unused; kept so existing calls keep working.
        node_from: Sequence of source node indices, one per couple.
        node_to: Sequence of target node indices, one per couple.

    Returns:
        A pandas Series of inverse positions, ordered like the input couples.
    """
    df = pd.DataFrame({
        'from': node_from,
        'to': node_to,
    }).reset_index()
    inverse = df.rename({
        'from': 'to',
        'to': 'from',
    }, axis=1)
    # The renamed frame already has its endpoints swapped, so merge on the
    # same column names. Swapping the keys a second time (left_on/right_on)
    # matched every row with itself and returned the identity.
    merged = pd.merge(df, inverse, on=['from', 'to'], suffixes=('', '_inverse'))
    assert merged.shape[0] == df.shape[0]
    return merged.sort_values('index').index_inverse

In [30]:
def correct_batch_edge_ind(batch):
    """Shift per-graph edge positions so they index the batched edge list.

    For every ``*_edge_ind`` attribute present on ``batch``, each row gets the
    number of edges of all preceding graphs added, using the matching
    ``<name>_batch`` vector to look up the row's graph.

    The batch is modified in place and returned. Calling this twice on the
    same batch applies the offsets twice.
    """
    edge_counts = batch.count_edges
    # Exclusive prefix sum: graph g is shifted by the edges of graphs 0..g-1.
    offset_edge_ind = torch.zeros_like(edge_counts)
    offset_edge_ind[1:] = edge_counts[:-1].cumsum(dim=0)

    edge_keys = ('bonds_edge_ind', 'bonds_links_edge_ind',
                 'paths_edge_ind', 'couples_edge_ind')
    for key in edge_keys:
        if not hasattr(batch, key):
            continue
        graph_of_row = batch[key + '_batch']
        batch[key] = batch[key] + offset_edge_ind[graph_of_row]
    return batch

In [31]:
def correct_inverse_couples_ind(batch):
    """Shift per-graph inverse-couple positions to batch-wide positions.

    Each entry of ``batch.inverse_couple_ind`` gets the number of couples of
    all preceding graphs added. The batch is modified in place and returned;
    the final assertion checks that applying the inverse twice is the
    identity on ``couples_edge_ind``.
    """
    couple_counts = batch.count_couples
    # Exclusive prefix sum of the per-graph couple counts.
    offset = torch.zeros_like(couple_counts)
    offset[1:] = couple_counts[:-1].cumsum(dim=0)

    per_row_offset = offset[batch.inverse_couple_ind_batch].view(-1)
    batch.inverse_couple_ind = batch.inverse_couple_ind + per_row_offset

    inverse = batch.inverse_couple_ind
    round_trip = batch.couples_edge_ind[inverse][inverse]
    assert torch.equal(round_trip, batch.couples_edge_ind)
    return batch

In [32]:
# Training set: same pipeline as validation plus RandomRotation.
# AddBondPath runs first and needs the temporary `data.graph` built by
# MoleculeDataset.__getitem__. SortTarget, AddBondLinks and AddCounts run
# after AddVirtualEdges.
# NOTE(review): base_dir is an absolute, machine-specific path.
train_dataset = MoleculeDataset(metadata=train_data,
                                base_dir='/home/ubuntu/datalab/data/structures_xyz/',
                                transform=T.Compose([
                                    AddBondPath(),
                                    AddVirtualEdges(),
                                    RandomRotation(),
                                    AddEdgeDistanceAndDirection(dist_noise=0.),
                                    AddGlobalAttr(),
                                    SortTarget(),
                                    AddBondLinks(),
                                    AddCounts(),
                                    AddInverseCouples(),
                                ]))

# Validation set: identical transforms, without the random rotation.
val_dataset = MoleculeDataset(metadata=val_data,
                                base_dir='/home/ubuntu/datalab/data/structures_xyz/',
                              transform=T.Compose([
                                  AddBondPath(),
                                  AddVirtualEdges(),
                                  AddEdgeDistanceAndDirection(dist_noise=0.),
                                  AddGlobalAttr(),
                                  SortTarget(),
                                  AddBondLinks(),
                                  AddCounts(),
                                  AddInverseCouples(),
                              ]))

HBox(children=(IntProgress(value=0, max=80003), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [33]:
data = train_dataset[10]
data

FP16_Data(bonds_edge_ind=[20, 1], bonds_links_edge_ind=[36, 2], count_couples=[1, 1], count_edges=[1, 1], count_nodes=[1, 1], couples_edge_ind=[86, 1], couples_ind=[86, 2], direction=[110, 3], dist=[110, 1], edge_attr=[110, 34], edge_index=[2, 110], global_attr=[1, 1], inverse_couple_ind=[86], mol_ind=[1, 1], paths_edge_ind=[86, 3], paths_index=[4, 86], pos=[11, 3], sample_weight=[86], type=[86], x=[11, 28], y=[86, 1])

In [34]:
torch.equal(data.inverse_couple_ind[data.inverse_couple_ind], torch.arange(86))

True

# Model

In [35]:
from kaggle_champs.modelling import MegNetBlock, create_mlp_v2, MegNetBlock_v2, MegNetBlock_v3

In [36]:
from torch import nn

In [37]:
from torch_scatter import scatter_add

In [38]:
def gather_embedding(data, x_out, edge_out, u_out, couple_type):
    """Assemble the output-head input for all couples of one coupling type.

    For each couple whose ``data.type`` equals
    ``constants.TYPES_DICT[couple_type]`` the row is the concatenation of:

    1. the global embedding of the couple's graph,
    2. the embedding of the direct edge between the two atoms (only when the
       type spans more than one bond),
    3. the embeddings of the first ``n_bonds + 1`` nodes of the bond path,
    4. the embeddings of the first ``n_bonds`` edges of the bond path.

    ``n_bonds`` is the leading digit of ``couple_type`` (e.g. ``'2JHC'`` -> 2).
    """
    n_bonds = int(couple_type[0])
    is_type = data.type == constants.TYPES_DICT[couple_type]

    # Edge joining the two coupled atoms directly, for the selected couples.
    direct_edges = data.couples_edge_ind.view(-1)[is_type]
    source_nodes = data.edge_index[0][direct_edges]

    pieces = [u_out[data.batch[source_nodes]]]
    if n_bonds > 1:
        pieces.append(edge_out[direct_edges])

    path_nodes = data.paths_index.transpose(1, 0)[is_type]
    pieces.extend(x_out[path_nodes[:, hop]] for hop in range(n_bonds + 1))

    path_edges = data.paths_edge_ind[is_type]
    pieces.extend(edge_out[path_edges[:, hop]] for hop in range(n_bonds))

    return torch.cat(pieces, dim=1)

In [39]:
class OutputLayer_new(torch.nn.Module):
    """Prediction head for a single coupling type.

    The head gathers the embeddings relevant to each couple of
    ``couple_type`` (see ``gather_embedding``), feeds them to an MLP and maps
    the scalar output through a frozen affine layer ``out * y_std + y_mean``.

    Args:
        rep_dim: Not used by this class.
        dim: Width of each gathered embedding.
        y_mean: Bias of the frozen output scaling.
        y_std: Weight of the frozen output scaling.
        couple_type: Coupling type name; its first character is the number
            of bonds between the two atoms.
    """

    def __init__(self, rep_dim, dim, y_mean, y_std, couple_type):
        super().__init__()

        # Fixed (non-trainable) de-normalisation of the MLP output.
        self.scaling = torch.nn.Linear(1, 1)
        frozen_bias = torch.tensor(y_mean, dtype=torch.float)
        frozen_weight = torch.tensor([[y_std]], dtype=torch.float)
        self.scaling.bias = torch.nn.Parameter(frozen_bias, requires_grad=False)
        self.scaling.weight = torch.nn.Parameter(frozen_weight, requires_grad=False)

        self.couple_type = couple_type
        n_bonds = int(couple_type[0])

        # Path edges + path nodes + global vector, plus the direct edge for
        # every type that is not a one-bond coupling.
        n_embeddings = n_bonds + (n_bonds + 1) + 1
        if n_bonds != 1:
            n_embeddings += 1
        input_dim = dim * n_embeddings

        hidden = input_dim // 2
        self.mlp = create_mlp_v2(
            input_dim=input_dim,
            output_dim=1,
            hidden_dims=[hidden, hidden, hidden],
            normalization_cls=torch.nn.LayerNorm,
            activation_cls=torch.nn.ELU,
            dropout_cls=torch.nn.Dropout,
            dropout_prob=0.
        )

    def forward(self, data, x_out, edge_out, u_out):
        """Return one prediction row per couple of this head's type."""
        features = gather_embedding(data, x_out, edge_out, u_out, self.couple_type)
        return self.scaling(self.mlp(features))

In [40]:
from torch_scatter import scatter_mean

In [41]:
class EdgeAgg(torch.nn.Module):
    """Gated sum-aggregation of edge embeddings.

    Every edge embedding is encoded to ``2 * dim`` features, projected back
    to ``dim`` values and multiplied by a sigmoid gate computed from the same
    encoding. The gated values are then summed per target index.
    """

    def __init__(self, dim=32):
        super().__init__()
        hidden = dim * 2

        encoder = create_mlp_v2(
            input_dim=dim,
            output_dim=hidden,
            hidden_dims=[hidden],
            normalization_cls=torch.nn.LayerNorm,
            activation_cls=torch.nn.ELU,
            dropout_cls=torch.nn.Dropout,
            dropout_prob=0.)
        self.body_mlp = nn.Sequential(encoder, nn.LayerNorm(hidden))

        self.value_out = nn.Linear(hidden, dim)
        self.gating = nn.Sequential(nn.Linear(hidden, dim), nn.Sigmoid())

    def forward(self, edge_out, edges_ind):
        """Sum the gated edge values into rows selected by ``edges_ind``."""
        encoded = self.body_mlp(edge_out)
        gated = self.value_out(encoded) * self.gating(encoded)
        return scatter_add(gated, edges_ind, dim=0)

In [42]:
class MegNetBlock(torch.nn.Module):
    """One message-passing block updating edge, node and global embeddings.

    The block first passes each input through its own dense MLP
    (``edge_dense``, ``node_dense``, ``global_dense``), then updates edges,
    nodes and the global vector in that order, each optionally with a
    residual connection.

    Args:
        edge_dim, x_dim, u_dim: Input widths of the edge, node and global
            features.
        dim: Width of every embedding produced by the block.
        layer_norm: When true, forces ``normalization_cls`` to ``nn.LayerNorm``.
        normalization_cls, activation_cls, dropout_cls, dropout_prob:
            Forwarded to ``create_mlp_v2`` for every MLP built here.
        residual: Whether to add the residual base to each update.
        pooling: Stored on the instance; not read anywhere in this class.
    """
    def __init__(self, edge_dim, x_dim, u_dim, dim=32, layer_norm=False,
                 normalization_cls=None, activation_cls=nn.ReLU,
                 dropout_cls=nn.Dropout, dropout_prob=0., residual=True, pooling='mean'):
        super(MegNetBlock, self).__init__()
        self.dim = dim
        self.residual = residual
        self.pooling = pooling

        if layer_norm:
            normalization_cls = nn.LayerNorm
        # Shared MLP options for every sub-network of the block.
        kwargs = dict(
            normalization_cls=normalization_cls,
            activation_cls=activation_cls,
            dropout_cls=dropout_cls,
            dropout_prob=dropout_prob)
        
        self.edge_dense = create_mlp_v2(
            input_dim=edge_dim, output_dim=dim, hidden_dims=[dim * 2], **kwargs)
        
        self.edge_agg = EdgeAgg(dim=dim)
        
        self.node_dense = create_mlp_v2(
            input_dim=x_dim, output_dim=dim, hidden_dims=[dim * 2], **kwargs)
        self.global_dense = create_mlp_v2(
            input_dim=u_dim, output_dim=dim, hidden_dims=[dim * 2], **kwargs)

        # Message networks: the edge update sees 4 embeddings (source node,
        # target node, edge, global); node and global updates see 3 each.
        self.edge_msg = create_mlp_v2(
            input_dim=dim * 4, output_dim=dim, hidden_dims=[dim*2, dim*2], **kwargs)
        self.node_msg = create_mlp_v2(
            input_dim=dim * 3, output_dim=dim, hidden_dims=[dim*2, dim*2], **kwargs)
        self.global_msg = create_mlp_v2(
            input_dim=dim * 3, output_dim=dim, hidden_dims=[dim*2, dim*2], **kwargs)
        

    def edge_model(self, src, dest, edge_attr, u, batch):
        """Update edges from their two endpoint nodes, themselves and the global vector."""
        # source, target: [E, F_x], where E is the number of edges.
        # edge_attr: [E, F_e]
        # u: [B, F_u], where B is the number of graphs.
        # batch: [E] with max entry B - 1.
        out = torch.cat([src, dest, edge_attr, u[batch]], 1)
        out = self.edge_msg(out)
        return out

    def node_model(self, x, edge_index, edge_attr, u, batch):
        """Update nodes from the gated sum of their outgoing edges, themselves and the global vector."""
        # x: [N, F_x], where N is the number of nodes.
        # edge_index: [2, E] with max entry N - 1.
        # edge_attr: [E, F_e]
        # u: [B, F_u]
        # batch: [N] with max entry B - 1.
        row, _ = edge_index
        # Edges are aggregated on their source node (first row of edge_index).
        out = self.edge_agg(edge_attr, row)
        out = torch.cat([out, x, u[batch]], dim=1)
        out = self.node_msg(out)
        return out

    def global_model(self, x, edge_index, edge_attr, u, batch):
        """Update the global vector from itself and the per-graph means of nodes and edges."""
        # x: [N, F_x], where N is the number of nodes.
        # edge_index: [2, E] with max entry N - 1.
        # edge_attr: [E, F_e]
        # u: [B, F_u]
        # batch: [N] with max entry B - 1.
        row, _ = edge_index
        # An edge belongs to the graph of its source node.
        edge_mean = scatter_mean(edge_attr, batch[row], dim=0)
        out = torch.cat(
            [u, scatter_mean(x, batch, dim=0), edge_mean], dim=1)
        out = self.global_msg(out)
        return out
    
    def forward(self, x, edge_index, edge_attr, u, batch, first_block=False):
        """Run one block and return ``(x_out, edge_out, u_out)``.

        When ``first_block`` is true the residual is taken from the outputs
        of the dense layers; otherwise it is taken from the raw inputs.
        """

        # first block
        edge_out = self.edge_dense(edge_attr)
        x_out = self.node_dense(x)
        u_out = self.global_dense(u)

        x_res_base = x_out if first_block else x
        edge_res_base = edge_out if first_block else edge_attr
        u_res_base = u_out if first_block else u

        row, col = edge_index        

        edge_out = self.edge_model(x_out[row], x_out[col], edge_out, u_out,
                                   batch[row])
        if self.residual:
            edge_out = edge_res_base + edge_out

        # The node update uses the freshly updated edges.
        x_out = self.node_model(x_out, edge_index, edge_out, u_out, batch)
        if self.residual:
            x_out = x_res_base + x_out

        # The global update uses the freshly updated nodes and edges.
        u_out = self.global_model(x_out, edge_index, edge_out, u_out, batch)
        if self.residual:
            u_out = u_res_base + u_out

        return x_out, edge_out, u_out

In [43]:
class MultiplicativeGaussianNoise(torch.nn.Module):
    """Multiply the input by ``1 + N(0, 1) * scale`` while training.

    In evaluation mode the input is returned unchanged.
    """

    def __init__(self, scale=0.):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        if self.training:
            return x * (1 + torch.randn_like(x) * self.scale)
        return x

In [44]:
class MegNetModel_new(torch.nn.Module):
    """Stack of MegNet blocks followed by one output head per coupling type.

    Args:
        edge_dim, x_dim, u_dim: Widths of the raw edge, node and global
            features.
        dim: Embedding width used inside the MegNet blocks.
        head_dim: Embedding width assumed by the output heads.
        n_megnet_blocks: Number of stacked ``MegNetBlock`` modules.
        y_mean, y_std: Per-type target mean and standard deviation, indexed
            by position in ``constants.TYPES_LIST``.
        layer_norm: Accepted for compatibility; not used by this class.
    """

    def __init__(self,
                 edge_dim,
                 x_dim,
                 u_dim,
                 dim=32,
                 head_dim=32,
                 n_megnet_blocks=3,
                 y_mean=0,
                 y_std=1,
                 layer_norm=False):
        super(MegNetModel_new, self).__init__()
        self.dim = dim
        self.n_megnet_blocks = n_megnet_blocks

        # Linear projections of the raw features to the common width.
        self.node_proj = torch.nn.Linear(x_dim, dim)
        self.edge_proj = torch.nn.Linear(edge_dim, dim)
        self.global_proj = torch.nn.Linear(u_dim, dim)

        self.megnet_blocks = torch.nn.ModuleList([
            MegNetBlock(dim,
                        dim,
                        dim,
                        dim,
                        normalization_cls=torch.nn.LayerNorm,
                        activation_cls=torch.nn.ELU,
                        dropout_cls=torch.nn.Dropout,
                        dropout_prob=0.,
                        residual=True) for i in range(n_megnet_blocks)
        ])

        # One head per coupling type, in constants.TYPES_LIST order.
        self.out_mlp = torch.nn.ModuleList([
            OutputLayer_new(
                dim,
                head_dim,
                y_mean=y_mean[i],
                y_std=y_std[i],
                couple_type=type_,
            ) for i, type_ in enumerate(constants.TYPES_LIST)
        ])

        self.noise = MultiplicativeGaussianNoise(scale=0.05)

    def forward(self, data, add_noise=False):
        """Return one prediction per couple, aligned with ``data.type``.

        ``data`` is modified in place: its edge and inverse-couple indices
        are shifted to batch-wide positions, so the same batch object must
        not be passed through ``forward`` twice.

        Args:
            data: Collated batch carrying the attributes produced by the
                dataset transforms.
            add_noise: When true, apply multiplicative noise to the
                embeddings after every block (active in training mode only).
        """
        data = correct_batch_edge_ind(data)
        data = correct_inverse_couples_ind(data)

        if not hasattr(data, 'global_attr'):
            data.global_attr = torch.zeros((data.num_graphs, 1),
                                           dtype=torch.float,
                                           device=data.x.device)
        x_out, edge_out, u_out = self.node_proj(data.x), self.edge_proj(data.edge_attr), self.global_proj(data.global_attr)

        for i in range(self.n_megnet_blocks):
            x_out, edge_out, u_out = self.megnet_blocks[i](
                x_out,
                data.edge_index,
                edge_out,
                u_out,
                data.batch,
                first_block=(i == 0))

            if add_noise:
                x_out = self.noise(x_out)
                edge_out = self.noise(edge_out)
                u_out = self.noise(u_out)

        pred = torch.zeros_like(data.type,
                                dtype=torch.float,
                                device=x_out.device)
        # Iterate over the heads that exist instead of a hard-coded count,
        # so the loop follows constants.TYPES_LIST.
        for type_, head in enumerate(self.out_mlp):
            type_mask = data.type == type_
            if type_mask.any():
                pred[type_mask] = head(data, x_out, edge_out, u_out).view(-1)
        return pred

# Training

In [45]:
from kaggle_champs.metrics import MeanLogGroupMAE, AverageMetric

In [46]:
from kaggle_champs.training import train_epoch

In [47]:
def cycle(iterable):
    """Yield the items of ``iterable`` endlessly, re-iterating it on each pass."""
    while True:
        yield from iterable

In [48]:
import torch
from tqdm.autonotebook import tqdm

def train_epoch(global_iteration, epoch, model, device, optimizer, 
                train_loader, tb_logger, gradient_accumulation_steps=1, swa=False, noise=False):
    """Train ``model`` for one pass over ``train_loader`` with an L1 loss.

    Gradients are accumulated over ``gradient_accumulation_steps`` batches
    before each optimizer step. ``global_iteration`` counts optimizer steps
    and is the x-axis of the per-batch ``loss`` scalar written to
    ``tb_logger``.

    Args:
        global_iteration: Optimizer-step counter carried across epochs.
        epoch: Not used by this function.
        model: Called as ``model(data, add_noise=noise)``.
        device: Device each batch is moved to.
        optimizer: Optimizer; must provide ``update_swa()`` when ``swa`` is true.
        train_loader: Iterable of batches.
        tb_logger: Object exposing ``add_scalar``.
        gradient_accumulation_steps: Number of batches per optimizer step.
        swa: Whether to call ``optimizer.update_swa()`` after each step.
        noise: Forwarded to the model as ``add_noise``.

    Returns:
        ``(average loss, MeanLogGroupMAE accumulator, updated global_iteration)``.
    """
    model.train()
    running_loss = AverageMetric()
    running_logmae = MeanLogGroupMAE()
    criterion = torch.nn.L1Loss(reduction='mean')

    progress = tqdm(train_loader)
    for batch_number, data in enumerate(progress, start=1):
        data = data.to(device)

        pred = model(data, add_noise=noise)
        flat_pred = pred.view(-1)
        flat_target = data.y.view(-1)

        loss = criterion(flat_pred, flat_target)
        loss.backward()

        # Only step once a full accumulation window has been seen.
        if batch_number % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

            global_iteration += 1
            if swa:
                optimizer.update_swa()

        loss_value = loss.item()
        tb_logger.add_scalar('loss', loss_value, global_iteration)

        running_loss.update(loss_value * data.num_graphs, data.num_graphs)
        running_logmae.update(flat_pred, flat_target, data.type)

        progress.set_postfix_str(f'loss: {running_loss.compute():.4f}')
    return running_loss.compute(), running_logmae, global_iteration

In [49]:
def test_model(model, loader):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Batches are moved to the module-level ``device``, which must be defined
    before this function is called.

    Returns:
        ``(average L1 loss, MeanLogGroupMAE accumulator)``.
    """
    model.eval()
    logmae_metric = MeanLogGroupMAE()
    loss_metric = AverageMetric()
    criterion = torch.nn.L1Loss(reduction='mean')

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            flat_pred = model(data).view(-1)
            flat_target = data.y.view(-1)

            batch_loss = criterion(flat_pred, flat_target)
            loss_metric.update(batch_loss.item() * data.num_graphs, data.num_graphs)

            logmae_metric.update(flat_pred, flat_target, data.type.view(-1))

    return loss_metric.compute(), logmae_metric


def make_log(epoch, lr, loss, tr_logmae, val_logmae):
    """Build the flat log record of one epoch.

    The record holds ``epoch``, ``lr``, ``loss``, the overall train and
    validation metric values, and every individual metric returned by
    ``compute_individuals()`` prefixed with ``tr_`` or ``val_``.
    """
    results = dict(
        epoch=epoch,
        lr=lr,
        loss=loss,
        tr_logmae=tr_logmae.compute(),
        val_logmae=val_logmae.compute(),
    )
    for prefix, metric in (('tr_', tr_logmae), ('val_', val_logmae)):
        individuals = metric.compute_individuals()
        results.update({prefix + name: value for name, value in individuals.items()})
    return results


def save_checkpoint(dir_path, model, optimizer, scheduler, epoch):
    """Save the state dicts of model, optimizer and scheduler for ``epoch``.

    File names are appended to ``dir_path`` by plain string concatenation,
    so ``dir_path`` must already end with a path separator.
    """
    targets = {
        f'model_epoch_{epoch}.pth': model,
        f'optimizer_epoch_{epoch}.pth': optimizer,
        f'scheduler_epoch_{epoch}.pth': scheduler,
    }
    for file_name, stateful in targets.items():
        torch.save(stateful.state_dict(), dir_path + file_name)

# Run

In [50]:
from tensorboardX import SummaryWriter

In [51]:
import shutil

In [52]:
OUTPUT_DIR = './models/megnet_256x10_new_arch_3/'

In [None]:
!mkdir -p {OUTPUT_DIR}

In [60]:
tb_logger = SummaryWriter(OUTPUT_DIR+'tb_log/')
global_iteration = 0

In [61]:
SAVE_INTERVAL = 10

In [62]:
MAX_EPOCH = 150

In [63]:
# `follow_batch` makes the loader emit a `<key>_batch` vector for each listed
# attribute; correct_batch_edge_ind / correct_inverse_couples_ind use those
# vectors to shift per-graph positions to batch-wide ones.
val_loader = DataLoader(val_dataset,
                        batch_size=64,
                        shuffle=False,
                        num_workers=8,
                        follow_batch=[
                            'bonds_edge_ind',
                            'bonds_links_edge_ind',
                            'paths_edge_ind',
                            'couples_edge_ind',
                            'inverse_couple_ind',
                        ])
# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(train_dataset,
                          batch_size=32,
                          num_workers=8,
                          shuffle=True,
                          follow_batch=[
                              'bonds_edge_ind',
                              'bonds_links_edge_ind',
                              'paths_edge_ind',
                              'couples_edge_ind',
                              'inverse_couple_ind',
                          ], drop_last=True)

In [64]:
batch = next(iter(val_loader))

In [65]:
batch = correct_batch_edge_ind(batch)

In [66]:
batch = correct_inverse_couples_ind(batch)

In [67]:
batch

Batch(batch=[895], bonds_edge_ind=[1758, 1], bonds_edge_ind_batch=[1758], bonds_links_edge_ind=[3076, 2], bonds_links_edge_ind_batch=[3076], count_couples=[64, 1], count_edges=[64, 1], count_nodes=[64, 1], couples_edge_ind=[5146, 1], couples_edge_ind_batch=[5146], couples_ind=[5146, 2], direction=[12162, 3], dist=[12162, 1], edge_attr=[12162, 34], edge_index=[2, 12162], global_attr=[64, 1], inverse_couple_ind=[5146], inverse_couple_ind_batch=[5146], mol_ind=[64, 1], paths_edge_ind=[5146, 3], paths_edge_ind_batch=[5146], paths_index=[4, 5146], pos=[895, 3], sample_weight=[5146], type=[5146], x=[895, 28], y=[5146, 1])

In [68]:
((batch.y[batch.inverse_couple_ind] - batch.y) != 0).any()

tensor(0, dtype=torch.uint8)

In [69]:
# Per-type target mean and standard deviation, ordered by the integer type
# code from constants.TYPES_DICT. These arrays replace the global scalar
# y_mean / y_std computed near the top of the notebook.
y_mean = train.groupby(train.type.map(
    constants.TYPES_DICT)).scalar_coupling_constant.mean().sort_index().values
y_std = train.groupby(train.type.map(
    constants.TYPES_DICT)).scalar_coupling_constant.std().sort_index().values

In [70]:
# Feature widths are read from `data`, the sample item fetched earlier with
# train_dataset[10]; u_dim=1 matches the (1, 1) global_attr set by AddGlobalAttr.
# NOTE(review): hard-codes CUDA; this cell fails on a CPU-only machine.
device = torch.device('cuda')
model = MegNetModel_new(edge_dim=data.edge_attr.size()[1],
                    x_dim=data.x.size()[1],
                    u_dim=1,
                    dim=300,
                    head_dim=300,
                    n_megnet_blocks=10,
                    y_mean=y_mean,
                    y_std=y_std,
                    layer_norm=False).to(device)

In [71]:
from kaggle_champs.optimizer import RAdam

In [65]:
# RAdam at lr=2e-4. With gamma=1. the StepLR scheduler multiplies the learning
# rate by 1 every 20 epochs, i.e. the rate stays constant; the scheduler object
# is still stepped each epoch and handed to save_checkpoint.
optimizer = RAdam(model.parameters(), lr=2e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=1.)

In [None]:
# train loop
# First training phase: epochs 1-144 with gradients accumulated over 2 steps.
# global_iteration, tb_logger, SAVE_INTERVAL, OUTPUT_DIR, train_epoch,
# test_model, make_log and save_checkpoint are defined outside this section.
logs = []
for epoch in range(1, 145):
    # Read the learning rate used for this epoch before the scheduler advances.
    lr = scheduler.optimizer.param_groups[0]['lr']
    tr_loss, tr_logmae, global_iteration = train_epoch(global_iteration,
                                     epoch,
                                     model,
                                     device,
                                     optimizer,
                                     train_loader,
                                     tb_logger,
                                     gradient_accumulation_steps=2)
    #optimizer.update_swa()
    scheduler.step()
    
    val_loss, val_logmae = test_model(model, val_loader)
    
    

    # Append this epoch's record and rewrite the whole log file, since `logs`
    # holds every epoch of this loop so far.
    epoch_log = make_log(epoch, lr, tr_loss, tr_logmae, val_logmae)
    logs.append(epoch_log)
    pd.DataFrame(logs).to_csv(OUTPUT_DIR + 'log.csv')
    print('Epoch: {epoch:03d}, LR: {lr:7f}, Loss: {loss:.7f}, \
         Train LogMAE: {tr_logmae:.7f}, Val LogMAE: {val_logmae:.7f}'.format(
        **epoch_log))    
    
    # Disabled SWA evaluation, left for reference.
    #optimizer.swap_swa_sgd()
    #val_loss_swa, val_logmae_swa = test_model(model, val_loader)
    #optimizer.swap_swa_sgd()
    #print(f'Val LogMAE SWA: {val_logmae_swa.compute():.7f}')

    if epoch % SAVE_INTERVAL == 0:
        save_checkpoint(OUTPUT_DIR, model, optimizer, scheduler, epoch)

    # TensorBoard: scalars are indexed by the global iteration counter
    # returned from train_epoch, not by epoch number.
    tb_logger.add_scalar('lr', lr, global_iteration)
    tb_logger.add_scalar('val_loss', val_loss, global_iteration)
    tb_logger.add_scalars('global_logmae', {
        'tr_logmae': epoch_log['tr_logmae'],
        'val_logmae': epoch_log['val_logmae']
    }, global_iteration)

    # One train/validation chart per coupling type; epoch_log is expected to
    # carry 'tr_<type>' and 'val_<type>' keys for every entry of TYPES_LIST.
    for type_ in constants.TYPES_LIST:
        tb_logger.add_scalars(
            type_, {
                'tr_' + type_: epoch_log['tr_' + type_],
                'val_' + type_: epoch_log['val_' + type_]
            }, global_iteration)

In [72]:
# Save the state reached after the first training phase. It is labelled as
# epoch 145 although the loop above ends at epoch 144.
save_checkpoint(OUTPUT_DIR, model, optimizer, scheduler, epoch=145)

In [69]:
# Lower the learning rate to 1e-4 for fine-tuning by editing the optimizer's
# first parameter group in place.
optimizer.param_groups[0]['lr'] = 1e-4

In [70]:
# Replace the constant schedule: halve the learning rate every 3 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

In [None]:
# train loop
# Fine-tuning phase: epochs 146-167 without gradient accumulation, using the
# optimizer and scheduler configured in the cells above.
#
# The first training phase writes its records to the same log.csv. Starting
# from an empty list here would wipe those epochs on the first to_csv call, so
# the earlier records are reloaded and this phase is appended to them.
log_path = OUTPUT_DIR + 'log.csv'
first_epoch = 146
if osp.exists(log_path):
    previous_log = pd.read_csv(log_path, index_col=0)
    # Keep only earlier epochs so re-running this cell does not duplicate rows.
    logs = previous_log[previous_log['epoch'] < first_epoch].to_dict('records')
else:
    logs = []

for epoch in range(first_epoch, 168):
    # Read the learning rate used for this epoch before the scheduler advances.
    lr = scheduler.optimizer.param_groups[0]['lr']
    tr_loss, tr_logmae, global_iteration = train_epoch(global_iteration,
                                     epoch,
                                     model,
                                     device,
                                     optimizer,
                                     train_loader,
                                     tb_logger,
                                     gradient_accumulation_steps=1)
    scheduler.step()

    val_loss, val_logmae = test_model(model, val_loader)

    epoch_log = make_log(epoch, lr, tr_loss, tr_logmae, val_logmae)
    logs.append(epoch_log)
    pd.DataFrame(logs).to_csv(log_path)
    print('Epoch: {epoch:03d}, LR: {lr:7f}, Loss: {loss:.7f}, \
         Train LogMAE: {tr_logmae:.7f}, Val LogMAE: {val_logmae:.7f}'.format(
        **epoch_log))

    if epoch % SAVE_INTERVAL == 0:
        save_checkpoint(OUTPUT_DIR, model, optimizer, scheduler, epoch)

    # TensorBoard: scalars are indexed by the global iteration counter
    # returned from train_epoch, not by epoch number.
    tb_logger.add_scalar('lr', lr, global_iteration)
    tb_logger.add_scalar('val_loss', val_loss, global_iteration)
    tb_logger.add_scalars('global_logmae', {
        'tr_logmae': epoch_log['tr_logmae'],
        'val_logmae': epoch_log['val_logmae']
    }, global_iteration)

    # One train/validation chart per coupling type.
    for type_ in constants.TYPES_LIST:
        tb_logger.add_scalars(
            type_, {
                'tr_' + type_: epoch_log['tr_' + type_],
                'val_' + type_: epoch_log['val_' + type_]
            }, global_iteration)

HBox(children=(IntProgress(value=0, max=2500), HTML(value='')))

In [73]:
# Save the fine-tuned state, labelled as epoch 168 (the loop above ends at
# epoch 167). The prediction cells below load model_epoch_168.pth -
# NOTE(review): confirm that is the file name save_checkpoint writes.
save_checkpoint(OUTPUT_DIR, model, optimizer, scheduler, epoch=168)

# Make submission

In [72]:
def merge_direction(df):
    """Average each couple's value with the value of its reversed couple.

    Every row (molecule, a, b) is matched with the row (molecule, b, a) of the
    same frame and ``scalar_coupling_constant`` is replaced by the mean of the
    two. The match is an inner join, so rows without a reversed counterpart
    are dropped.

    Args:
        df: DataFrame with ``molecule_name``, ``atom_index_0``,
            ``atom_index_1`` and ``scalar_coupling_constant`` columns.

    Returns:
        A new DataFrame with the same columns as ``df``; ``df`` is not modified.
    """
    keys = ['molecule_name', 'atom_index_0', 'atom_index_1']
    swap_atoms = {
        'atom_index_0': 'atom_index_1',
        'atom_index_1': 'atom_index_0',
    }
    # Same rows, seen from the other end of each couple.
    flipped = df.rename(columns=swap_atoms)
    paired = df.merge(flipped, on=keys, suffixes=('', '_bis'))
    forward = paired['scalar_coupling_constant']
    backward = paired['scalar_coupling_constant_bis']
    paired['scalar_coupling_constant'] = (forward + backward) / 2
    return paired.drop(columns='scalar_coupling_constant_bis')

In [78]:
def predict(model, input_data, checkpoint_path):
    """Load a checkpoint and predict a coupling constant for every input row.

    Uses the module-level ``device`` for inference.

    Args:
        model: network whose weights are replaced by the state dict stored at
            ``checkpoint_path``; it is left in eval mode.
        input_data: DataFrame with at least ``id``, ``molecule_name``,
            ``atom_index_0`` and ``atom_index_1`` columns.
        checkpoint_path: path handed to ``torch.load``.

    Returns:
        tuple: ``(submission, pred)``. ``submission`` is indexed by ``id`` and
        has a single ``scalar_coupling_constant`` column. ``pred`` is the
        per-couple frame after averaging both directions of each couple.

    Raises:
        AssertionError: if sorting a batch changes its molecule order, or if
            the number of merged rows without missing values differs from the
            number of input rows.
    """
    model.load_state_dict(torch.load(checkpoint_path))
    pred_dataset = MoleculeDataset(
        metadata=input_data,
        base_dir=constants.STRUCT_DATA_PATH,
        transform=T.Compose([
            AddBondPath(),
            AddVirtualEdges(),
            # No distance noise at inference time.
            AddEdgeDistanceAndDirection(dist_noise=0.),
            AddGlobalAttr(),
            SortTarget(),
            AddBondLinks(),
            AddCounts(),
            AddInverseCouples(),
        ]))
    pred_loader = DataLoader(pred_dataset,
                             batch_size=64,
                             shuffle=False,
                             num_workers=8,
                             follow_batch=[
                                 'bonds_edge_ind', 'bonds_links_edge_ind',
                                 'paths_edge_ind', 'couples_edge_ind', 'inverse_couple_ind'
                             ])
    model.eval()
    preds = []
    for data in tqdm(pred_loader):
        with torch.no_grad():
            data = data.to(device)
            batch_pred = model(data).detach().cpu().numpy()
            # Dataset-level molecule index of every couple in the batch.
            ind = data.mol_ind[data.couples_edge_ind_batch].detach().cpu().numpy()
            couple_ind = data.couples_ind.cpu().numpy()

        # Attach the predictions BEFORE sorting. Assigning the raw array after
        # the sort is positional, so any reordering would pair predictions
        # with the wrong atom couples without raising an error.
        df = pd.DataFrame({
            'molecule_name': pred_dataset.molecules[ind].ravel(),
            'molecule_ind': ind.ravel(),
            'atom_index_0': couple_ind[:, 0].ravel(),
            'atom_index_1': couple_ind[:, 1].ravel(),
            'scalar_coupling_constant': batch_pred.ravel(),
        })
        df = df.sort_values(['molecule_ind', 'atom_index_0', 'atom_index_1'],
                            ascending=True)
        # Molecules must already appear in ascending order within the batch.
        np.testing.assert_array_equal(df.molecule_ind, ind.ravel())
        preds.append(df.drop('molecule_ind', axis=1))

    pred = pd.concat(preds)
    # Average the two directions of every couple.
    pred = merge_direction(pred)
    merged = pd.merge(input_data,
                      pred,
                      on=['molecule_name', 'atom_index_0', 'atom_index_1'],
                      how='left', suffixes=('_truth', ''))
    # Every input row must have received a prediction.
    assert merged.dropna().shape[0] == input_data.shape[0]
    return merged.loc[:, ['id', 'scalar_coupling_constant']].set_index('id'), pred

In [79]:
# Predict the held-out validation rows with the epoch-168 checkpoint; `p` is
# the per-couple prediction frame returned alongside the id-indexed result.
pred_val, p = predict(model, val_data, f'{OUTPUT_DIR}/model_epoch_168.pth')

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))




In [80]:
# Preview the validation predictions.
pred_val.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
1582,-1.005834
1583,0.456203
1584,0.455955
1585,14.321383
1586,92.641945


In [81]:
def score(pred, ref_data):
    """Compute the per-type log-MAE of ``pred`` against ``ref_data``.

    Args:
        pred: DataFrame indexed by ``id`` with a ``scalar_coupling_constant``
            column holding the predictions.
        ref_data: DataFrame with ``id``, ``type`` and the true
            ``scalar_coupling_constant``.

    Returns:
        tuple: ``(mean_log_mae, per_type)`` where ``per_type`` maps each type
        to the log of its mean absolute error and ``mean_log_mae`` is the mean
        of those values.
    """
    joined = ref_data.merge(pred,
                            how='left',
                            left_on='id',
                            right_index=True,
                            suffixes=('', '_pred'))
    truth = joined['scalar_coupling_constant']
    estimate = joined['scalar_coupling_constant_pred']
    joined['abs_error'] = (truth - estimate).abs()
    mae_per_type = joined.groupby('type')['abs_error'].mean()
    # Floor the error at 1e-9 so a perfectly predicted type gives a finite log.
    log_mae = pd.Series(np.log(np.maximum(mae_per_type.values, 1e-9)),
                        index=mae_per_type.index)
    return log_mae.mean(), log_mae.to_dict()

In [82]:
# Validation score: overall mean log-MAE followed by the per-type breakdown.
score(pred_val, val_data)

(-2.7105617144537737,
 {'1JHC': -1.759798528496549,
  '1JHN': -1.7263141403697904,
  '2JHC': -2.724144449858338,
  '2JHH': -3.1877947458752223,
  '2JHN': -3.036704517513776,
  '3JHC': -2.688196383634799,
  '3JHH': -3.2627320895307594,
  '3JHN': -3.2988088603509516})

In [83]:
# Reload the test set (the same file is already read at the top of the notebook).
test = pd.read_csv('../data/test.csv')

In [84]:
# Predict the test set with the same checkpoint; the per-couple frame is not
# needed here and is discarded.
sub, _ = predict(model, test, f'{OUTPUT_DIR}/model_epoch_168.pth')

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))




HBox(children=(IntProgress(value=0, max=716), HTML(value='')))




In [85]:
# Preview the test predictions.
sub.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,19.008881
4658148,189.977356
4658149,11.732316
4658150,189.975189
4658151,19.008661


In [86]:
# Create the submission directory if it does not exist yet.
!mkdir -p subs/lam_03_v1/

In [87]:
# Write the test submission and the validation predictions; index=True keeps
# the `id` index as the first CSV column.
sub.to_csv('./subs/lam_03_v1/sub.csv', index=True)
pred_val.to_csv('./subs/lam_03_v1/pred_val.csv', index=True)