In [1]:
import os
import numpy as np
import gc
import joblib
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.linear_model import HuberRegressor
from ogb.lsc import PCQM4Mv2Dataset
import torch

In [2]:
# Point to competition dataset directory
ROOT = '../data'
dataset = PCQM4Mv2Dataset(root=ROOT, only_smiles=True)

# Convert to DataFrame: smiles + target.
# Each dataset[i] is unpacked once into (smiles, target) instead of indexing
# the dataset twice per molecule.
smiles_col, target_col = zip(*(dataset[i] for i in range(len(dataset))))
train = pd.DataFrame({
    'smiles': list(smiles_col),
    'target': list(target_col),
})
# 'id' is the positional index of the molecule inside the dataset.
train['id'] = np.arange(train.shape[0])

# Custom split file: the keys 'valid_0' .. 'valid_3' are read below.
splits = torch.load(f'{ROOT}/new_split_dict.pt')
split_dict = dataset.get_idx_split()
train_idx = split_dict['train'] # numpy array storing indices of training molecules
valid_idx = split_dict['valid'] # numpy array storing indices of validation molecules
testdev_idx = split_dict['test-dev'] # numpy array storing indices of test-dev molecules
testchallenge_idx = split_dict['test-challenge'] # numpy array storing indices of test-challenge molecules

# Membership in the two test sets does not depend on the fold, so compute the
# masks once instead of on every loop iteration.
is_testdev = train['id'].isin(testdev_idx)
is_testchallenge = train['id'].isin(testchallenge_idx)

# Label every molecule for each of the 4 custom folds:
#   0 = default (none of the sets below), 1 = in splits['valid_<f>'],
#   2 = test-dev, 3 = test-challenge.
# Assignment order matters: later labels overwrite earlier ones.
for f in range(4):
    train[f'fold{f}'] = 0
    train.loc[train['id'].isin(splits[f'valid_{f}']), f'fold{f}'] = 1
    train.loc[is_testdev, f'fold{f}'] = 2
    train.loc[is_testchallenge, f'fold{f}'] = 3

train.head()


Unnamed: 0,smiles,target,id,fold0,fold1,fold2,fold3
0,O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C,3.047675,0,0,0,0,0
1,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966,1,1,0,0,0
2,C=CCN(C(=O)C)/C=C/c1ccccc1C,4.639541,2,0,0,0,0
3,C=CCN(C(=O)C)/C=C/c1ccccc1F,4.4926,3,0,0,0,0
4,C=CCN(C(=O)C)/C=C/c1ccccc1Cl,4.61233,4,0,0,0,0


In [3]:
# Directory holding one sub-folder of saved predictions per model run.
BASEPATH = 'models_oofs/predictions/'

# Run-name templates for the six models listed for every fold (slots 0-5).
_SHARED_RUNS = (
    'alexandrem_tm18_nonodepred_400_fold{fold}',
    'cpmp_256_bs64_lr1e-4_fold{fold}',
    'jiwei_b29872e_fold{fold}_valid',
    'sajad_blv1_18l_454_3407_val_fold{fold}',
    'giba_resnet_352_fold{fold}',
    'alexandrem_tm18forreal_withnodepred_400_2_fold{fold}',
)

# Run-name templates for slots 6-9. Each fold lists only three of these four;
# the remaining slot holds the 'dummy' placeholder.
_ROTATING_RUNS = (
    'sajad_blv2-0.2_18l_val_fold{fold}',
    'sajad_blv2_r_0.1_18l_454_3407_val_fold{fold}',
    'sajad_blv4_no_dihedral_18l_val_fold{fold}',
    'sajad_blv5__18l_454_3407_val_fold{fold}',
)

# fold -> position inside _ROTATING_RUNS that is replaced by 'dummy'.
_DUMMY_SLOT = {0: 3, 1: 0, 2: 1, 3: 2}

# oof_files[fold] is the ordered list of 10 run names (or 'dummy') for that fold.
oof_files = [
    [name.format(fold=fold) for name in _SHARED_RUNS]
    + [
        'dummy' if slot == _DUMMY_SLOT[fold] else name.format(fold=fold)
        for slot, name in enumerate(_ROTATING_RUNS)
    ]
    for fold in range(4)
]

# Build one frame of validation rows (VALID) and one of test-challenge rows
# (TEST), stacking the 4 splits on top of each other. For split n, column
# 'oof<k>' holds the predictions of the k-th entry of oof_files[n].
valid_parts = []
test_parts = []
for n, files in enumerate(oof_files):
    print(f'Split: {n}')
    # Rows labelled 1 in fold<n> are that split's validation molecules.
    valid = train.loc[train[f"fold{n}"]==1].reset_index(drop=True)
    valid['split'] = n

    # Rows labelled 3 are the test-challenge molecules (same for every split).
    test = train.loc[train[f"fold{n}"]==3].reset_index(drop=True)
    test['split'] = n

    for f, fn in enumerate(files):
        # 'dummy' marks a model slot with no prediction file for this fold;
        # fill it with zeros so every split has the same columns.
        if fn != 'dummy':
            # Assigning the array fails loudly if its length does not match
            # the number of rows selected above.
            oof = np.load(BASEPATH + fn + '/valid.npy').flatten()
            valid[f'oof{f}'] = oof.astype('float32')
            oof = np.load(BASEPATH + fn + '/testchallenge.npy').flatten()
            test[f'oof{f}'] = oof.astype('float32')
        else:
            valid[f'oof{f}'] = 0.
            test[f'oof{f}'] = 0.

    valid_parts.append(valid)
    test_parts.append(test)
    print()

# ignore_index=True gives the stacked frames a unique 0..N-1 row index.
# Without it every split restarts at 0 and row labels repeat, which makes
# later label-based .loc assignments fragile.
VALID = pd.concat(valid_parts, ignore_index=True)
TEST = pd.concat(test_parts, ignore_index=True)

len(VALID), len(TEST)

Split: 0

Split: 1

Split: 2

Split: 3



(575360, 589728)

In [4]:
# Preview the stacked validation frame: columns oof0..oof9 hold the per-model
# predictions, with 0.0 in slots whose file-list entry is 'dummy'.
VALID.head()

Unnamed: 0,smiles,target,id,fold0,fold1,fold2,fold3,split,oof0,oof1,oof2,oof3,oof4,oof5,oof6,oof7,oof8,oof9
0,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966,1,1,0,0,0,0,4.402257,4.386719,4.393664,4.388084,4.426374,4.376553,4.427334,4.40016,4.396652,0.0
1,O[C@@H]1CCN(C[C@H]1O)[C@H](c1ccccc1)C,5.994668,21,1,0,0,0,0,5.921789,5.953125,5.991448,5.92324,5.955681,5.923428,5.950772,5.919691,5.923996,0.0
2,S=C1[N]C2=N[C]3[C@H](N2N1)C=CC=C3,2.397323,27,1,0,0,0,0,2.435461,2.365234,2.335269,2.302146,2.23333,2.368741,2.417569,2.36891,2.336105,0.0
3,CC[C@@H]1C[C@@H](O)[C@@H](C(=O)C1)c1ccccc1,5.825958,37,1,0,0,0,0,5.968664,5.90625,6.003646,5.938865,5.75247,5.888272,5.903897,5.904066,5.896652,0.0
4,CC[C@H](CC(OC)OC)/C=C/c1ccccc1,5.099414,39,1,0,0,0,0,5.085851,5.089844,5.094391,5.087303,5.085454,5.083584,5.091397,5.08766,5.095871,0.0


In [5]:
# Preview the stacked test-challenge frame; 'target' is NaN in the displayed
# rows and the oof0..oof9 columns hold the models' test-challenge predictions.
TEST.head()

Unnamed: 0,smiles,target,id,fold0,fold1,fold2,fold3,split,oof0,oof1,oof2,oof3,oof4,oof5,oof6,oof7,oof8,oof9
0,C[C@H]1CC[C@H]2C=CC[C@H]2[CH][N]N[C]([N]1)S,,3378615,3,3,3,3,0,5.511632,5.515625,5.676093,5.532615,5.599783,5.360928,5.548428,5.505629,5.525558,0.0
1,COCC/N=C(\N/N=C/1\C[C@H]2[C@@H]1CC=C2)/S,,3378616,3,3,3,3,0,5.531164,5.5,5.530355,5.524803,5.504106,5.505459,5.497647,5.52516,5.525558,0.0
2,CC(C[C@@H](/N=C(\c1cccc(c1)Cl)/O)C)C,,3378620,3,3,3,3,0,5.542882,5.527344,5.546458,5.552146,5.675616,5.548428,5.548428,5.55641,5.548996,0.0
3,CC(C[C@H](/N=C(\c1cccc(c1)Cl)/O)C)C,,3378621,3,3,3,3,0,5.538976,5.53125,5.547314,5.552146,5.676899,5.548428,5.548428,5.55641,5.548996,0.0
4,N#CSc1ccc(c(c1)N(=O)=O)/N=C(/O)\C,,3378622,3,3,3,3,0,4.28507,4.179688,4.249443,4.317771,4.131842,4.255459,4.360928,4.333754,4.298996,0.0


In [6]:
# oof10 = blend of the four rotating model slots (oof6..oof9).
# Only strictly positive predictions are added, so the zero-filled 'dummy'
# slot contributes nothing. The sum is divided by a fixed 3 because each
# split lists three real models and one 'dummy' among these four slots.
# The accumulator is float from the start: initialising the column with the
# integer 0 and then writing floats into it through .loc relies on an implicit
# int -> float upcast that recent pandas versions deprecate.
oof10_sum = np.zeros(len(VALID), dtype='float64')
for slot_col in ['oof6', 'oof7', 'oof8', 'oof9']:
    slot_pred = VALID[slot_col].to_numpy()
    # Same accumulation order as adding the columns one after another.
    oof10_sum += np.where(slot_pred > 0, slot_pred, 0.0)
VALID['oof10'] = oof10_sum / 3.

In [7]:
# oof10 = blend of the four rotating model slots (oof6..oof9) on the
# test-challenge rows. Only strictly positive predictions are added, so the
# zero-filled 'dummy' slot contributes nothing. The sum is divided by a fixed 3
# because each split lists three real models and one 'dummy' among these slots.
# The accumulator is float from the start: initialising the column with the
# integer 0 and then writing floats into it through .loc relies on an implicit
# int -> float upcast that recent pandas versions deprecate.
oof10_sum = np.zeros(len(TEST), dtype='float64')
for slot_col in ['oof6', 'oof7', 'oof8', 'oof9']:
    slot_pred = TEST[slot_col].to_numpy()
    # Same accumulation order as adding the columns one after another.
    oof10_sum += np.where(slot_pred > 0, slot_pred, 0.0)
TEST['oof10'] = oof10_sum / 3.

In [8]:
# Stack the model predictions with a HuberRegressor, cross-validated over the
# 4 splits: for each split, fit on the validation rows of the other three
# splits, score on this split's validation rows, and predict this split's
# test-challenge rows.
valid_list = []
test_list = []
features = ['oof0','oof1','oof2','oof3','oof4','oof5','oof10']

for fold in range(4):
    print(f"Fold: {fold}")
    # Fold-local names: the original loop reassigned the global `train` (the
    # full molecule table), which breaks re-running the earlier loading cell.
    fold_train = VALID.loc[VALID['split'] != fold].reset_index(drop=True)
    fold_valid = VALID.loc[VALID['split'] == fold].reset_index(drop=True)
    fold_test = TEST.loc[TEST['split'] == fold].reset_index(drop=True)

    model = HuberRegressor(epsilon=1.00, max_iter=1000, alpha=0.0001, fit_intercept=True)
    model.fit(fold_train[features], fold_train['target'])
    fold_valid['ensemble'] = model.predict(fold_valid[features])

    # Signed residual per molecule; its mean absolute value is the fold score.
    fold_valid['error'] = fold_valid['target'] - fold_valid['ensemble']
    score = fold_valid['error'].abs().mean()
    # A fresh frame is built on every iteration, so no defensive copy is needed.
    valid_list.append(fold_valid)
    print(f'{score:.4f}', model.coef_, model.intercept_)
    print()

    # Overwrite the test rows' 'target' with this fold's stacked prediction.
    fold_test['target'] = model.predict(fold_test[features])
    test_list.append(fold_test)
    gc.collect()

# Row indices restart at 0 for each fold, as in the per-fold frames.
oof = pd.concat(valid_list)
test = pd.concat(test_list)
print(oof.shape, test.shape)

# Overall cross-validation MAE across all four folds.
oof['error'].abs().mean()
# Value obtained at competition submission time: 0.0714507

Fold: 0
0.0718 [ 0.09252322  0.08548885  0.04259026  0.25306489  0.02558166 -0.03121383
  0.52967504] 0.010067839766331924

Fold: 1
0.0714 [ 0.09856722  0.09744477  0.03947157  0.24483952  0.02598596 -0.0413258
  0.53282842] 0.009604319209242719

Fold: 2
0.0713 [ 0.1033944   0.09281448  0.03963878  0.23697751  0.02636753 -0.03173228
  0.53029205] 0.009852617208348152

Fold: 3
0.0713 [ 0.103241    0.09381118  0.03595609  0.25656416  0.02618661 -0.03737402
  0.51944072] 0.009471517123061305

(575360, 21) (589728, 19)


0.07144981246413012

We reran the code, and the results are slightly different from what they were when we created the submission file for the competition. The cross-validation MAE is now 0.0714498, compared with 0.0714507 at submission time. Given that this is a very small change, we think this is fine.

In [9]:
# `test` holds one stacked prediction per molecule per fold model; average them
# per molecule id and return the result ordered by id with a clean 0..N-1 index.
submission = (
    test.groupby('id', as_index=False)['target']
    .mean()
    .sort_values('id')
    .reset_index(drop=True)
)
submission

Unnamed: 0,id,target
0,3378615,5.512239
1,3378616,5.517787
2,3378620,5.546752
3,3378621,5.547624
4,3378622,4.302431
...,...,...
147427,3746606,7.115709
147428,3746613,7.977321
147429,3746614,7.354993
147430,3746615,5.444017


In [10]:
# Create the output directory from Python: unlike `!mkdir`, this does not
# error when the directory already exists (e.g. on a re-run) and does not
# depend on a shell being available.
os.makedirs('blend1', exist_ok=True)
# Test-challenge predictions, in the row order of `submission`.
np.save('blend1/testchallenge.npy', submission['target'].values)
# Out-of-fold rows, including the 'ensemble' and 'error' columns.
oof.to_parquet('blend1/validation.parquet')

The weights in the next cell are the ones we obtained when computing the competition submission.

In [11]:
def _mean_left_to_right(values):
    """Return the arithmetic mean of `values`, summing strictly left to right."""
    running = values[0]
    for value in values[1:]:
        running = running + value
    return running / len(values)


# Four per-fold weights recorded at submission time, averaged.
# NOTE(review): presumably the stacker coefficients of 'oof3' (w4) and
# 'oof10' (w13) across the 4 folds -- confirm against the submission run.
w4 = _mean_left_to_right((0.23556576, 0.24467376, 0.2564096, 0.26574933))
w13 = _mean_left_to_right((0.53824641, 0.53299903, 0.51858129, 0.51714297))
# The second weight is additionally split three ways.
w13 = w13 / 3
w4, w13

(0.2505996125, 0.17558080833333334)

In [12]:
# Load the stacked blend and the three full-train model predictions for the
# test-challenge set. Every array is flattened, as is done where the per-fold
# prediction files are loaded, so a file saved with shape (N, 1) cannot
# silently broadcast to (N, N) when the arrays are combined element-wise.
blend1 = np.load('blend1/testchallenge.npy').flatten()
model14 = np.load(BASEPATH + 'sajad_blv1_full_train/testchallenge.npy').flatten()
model15 = np.load(BASEPATH + 'sajad_blv2_full_train/testchallenge.npy').flatten()
model16 = np.load(BASEPATH + 'sajad_blv2-0.2_full_train/testchallenge.npy').flatten()

# Element-wise blending requires identical shapes, not just equal lengths.
assert blend1.shape == model14.shape == model15.shape == model16.shape, (
    blend1.shape, model14.shape, model15.shape, model16.shape
)
len(blend1), len(model14), len(model15), len(model16)

(147432, 147432, 147432, 147432)

In [13]:
# Weighted average of the stacked blend (weight 1) and the three full-train
# models (weights w4, w13, w13), normalised by the sum of the weights.
# The terms are accumulated in the same left-to-right order as a single
# expression would evaluate them.
weighted_sum = blend1 + w4 * model14
weighted_sum = weighted_sum + w13 * model15
weighted_sum = weighted_sum + w13 * model16
total_weight = 1 + w4 + w13 + w13
ytest = weighted_sum / total_weight
np.save('blend1/testchallenge_fulltrain.npy', ytest)