In [None]:
# Todo
# Create features out of fold
# Train per type
# Add common atom interactions (radii and interaction formulas)

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor
import lightgbm as lgb

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap the module-level hook so that code calling ``warnings.warn(...)`` from
# here on reaches the no-op above instead of emitting a warning.
warnings.warn = warn

In [2]:
# Keyword arguments that do_cross_validation unpacks into lgb.LGBMRegressor.
# do_cross_validation also reads the 'metric' and 'verbose' entries directly
# and forwards them to model.fit(), so both keys must stay present.
lightgbm_params = dict(
    num_leaves=128,
    min_child_samples=79,
    objective='regression',
    max_depth=9,
    learning_rate=0.2,
    boosting_type='gbdt',
    subsample_freq=1,
    subsample=0.9,
    bagging_seed=11,
    metric='mae',
    # NOTE(review): LightGBM appears to treat 'verbose' as an alias of
    # 'verbosity'; both are set here with different values - confirm which
    # one is meant to win.
    verbosity=-1,
    reg_alpha=0.1,
    reg_lambda=0.3,
    colsample_bytree=1.0,
    verbose=100,

    # Play with these
    early_stopping_rounds=200,
    n_estimators=10000,
)

# Evaluation - see https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    """Mean over groups of the log of the per-group mean absolute error.

    The three inputs are matched by position, so any mix of pandas Series,
    NumPy arrays or lists is accepted regardless of the Series' index.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same shape
        Targets and predictions. The absolute error is symmetric, so swapping
        the two gives the same result.
    groups : array-like, same shape as ``y_true``
        Group label of every sample.
    floor : float, default 1e-9
        Lower bound applied to each group's MAE before taking the log, so a
        perfect group does not produce ``-inf``.

    Returns
    -------
    float
        ``mean_g( log( max(MAE_g, floor) ) )``; NaN when the input is empty.

    Raises
    ------
    ValueError
        If the three inputs do not share the same shape.
    """
    true_values = np.asarray(y_true, dtype=float)
    pred_values = np.asarray(y_pred, dtype=float)
    group_keys = np.asarray(groups)
    if not (true_values.shape == pred_values.shape == group_keys.shape):
        raise ValueError(
            "y_true, y_pred and groups must have the same shape, got "
            f"{true_values.shape}, {pred_values.shape} and {group_keys.shape}"
        )

    # Work on plain arrays so that nothing is re-ordered or turned into NaN by
    # pandas index alignment.
    abs_errors = pd.Series(np.abs(true_values - pred_values))
    maes = abs_errors.groupby(group_keys).mean()
    return np.log(maes.clip(lower=floor)).mean()



def _take_rows(data, positions):
    """Return the rows of ``data`` at the given integer positions.

    Objects exposing ``.iloc`` (pandas) are sliced positionally, so the result
    does not depend on their index labels; anything else is indexed as a NumPy
    array.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def do_cross_validation(
    train_df
    , y
    , test_df
    , n_folds = 5
    , n_reshuffles = 1
    , shuffle = True
    , random_state=42
    , estimator = 'lightgbm'
    , metric = 'group_mean_log_mae'
    , **kwargs
):
    """Run (repeated) K-fold cross-validation and predict the test set per fold.

    Parameters
    ----------
    train_df : DataFrame or array-like
        Training features; converted once with ``np.array``.
    y : Series or array-like
        Training targets, in the same row order as ``train_df``.
    test_df : DataFrame or array-like
        Test features, handed to ``model.predict`` as they are.
    n_folds : int
        Number of K-fold splits per reshuffle.
    n_reshuffles : int
        How many times the whole K-fold procedure is repeated.
    shuffle : bool
        Whether rows are shuffled before splitting.
    random_state : int, RandomState or None
        Seed for the shuffling. An integer seed is offset by the reshuffle
        number so every reshuffle gets a different split (reshuffle 0 uses the
        seed unchanged). Ignored when ``shuffle`` is False.
    estimator : str
        Only ``'lightgbm'`` is supported.
    metric : str
        ``'group_mean_log_mae'`` fills the score arrays using
        ``group_mean_log_mae``; for any other value the scores stay at 0.
    **kwargs
        ``lightgbm_params`` (dict, required): unpacked into
        ``lgb.LGBMRegressor``; its ``'metric'`` and ``'verbose'`` entries are
        also passed to ``fit``.
        ``train_types`` (required for ``'group_mean_log_mae'``): group label
        per training row.

    Returns
    -------
    train_score, val_score : ndarray of shape (n_folds * n_reshuffles,)
    predictions : ndarray of shape (n_test_rows, n_folds * n_reshuffles)
        One column of test predictions per fitted model.

    Raises
    ------
    ValueError
        If ``estimator`` is not supported.
    KeyError
        If a required keyword argument is missing.
    """
    if estimator != 'lightgbm':
        # Previously an unknown estimator only failed later with a NameError.
        raise ValueError(f"Unsupported estimator: {estimator!r}")

    lightgbm_params = kwargs['lightgbm_params']
    use_group_metric = metric == 'group_mean_log_mae'
    if use_group_metric:
        train_types = kwargs['train_types']

    # Convert once instead of three times per fold.
    X_all = np.array(train_df)

    n_runs = n_folds * n_reshuffles
    predictions = np.zeros([test_df.shape[0], n_runs])
    train_score = np.zeros([n_runs])
    val_score = np.zeros([n_runs])

    for s in range(n_reshuffles):
        print('Reshuffle: ', s)
        if shuffle:
            # A fixed integer seed would reproduce the very same folds in every
            # reshuffle, so derive a distinct seed per reshuffle.
            if isinstance(random_state, (int, np.integer)):
                split_seed = random_state + s
            else:
                split_seed = random_state
            kf = KFold(n_splits=n_folds, shuffle=True, random_state=split_seed)
        else:
            # Newer scikit-learn rejects a random_state when shuffle is False.
            kf = KFold(n_splits=n_folds, shuffle=False)

        for fold, (train_index, val_index) in enumerate(kf.split(X_all)):
            print('Training fold: ', fold)
            run = s * n_folds + fold

            X_train, X_val = X_all[train_index], X_all[val_index]
            # Positional selection: correct even if y / train_types carry a
            # non-default index.
            y_train, y_val = _take_rows(y, train_index), _take_rows(y, val_index)

            model = lgb.LGBMRegressor(
                        **lightgbm_params
                        , nthread=-1)

            model.fit(X_train,y_train,
                          eval_set=[(X_train, y_train),(X_val, y_val)],
                          eval_metric=lightgbm_params['metric'],
                          verbose=lightgbm_params['verbose'])

            y_train_pred = model.predict(X_train)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(test_df)

            if use_group_metric:
                X_train_types = _take_rows(train_types, train_index)
                X_val_types = _take_rows(train_types, val_index)
                train_score[run] = group_mean_log_mae(y_train, y_train_pred, X_train_types)
                val_score[run] = group_mean_log_mae(y_val, y_val_pred, X_val_types)

            predictions[:, run] = y_pred

            # Report the fold that just finished (index -1 is only filled by
            # the very last fold).
            print("Training score: ", train_score[run])
            print("Validiation score: ", val_score[run])

    return train_score, val_score, predictions
    

In [3]:
# Load the three raw tables: coupling pairs for train and test, plus the
# per-atom structures table.
train_df, test_df, structures_df = (
    pd.read_csv('../champs-scalar-coupling/train.csv'),
    pd.read_csv('../champs-scalar-coupling/test.csv'),
    pd.read_csv('../champs-scalar-coupling/structures.csv'),
)

# Quick sanity check of what was read.
print("Shape of training sample: ", train_df.shape)
print("Shape of testing sample: ", test_df.shape)
print("Shape of structures sample: ", structures_df.shape)

Shape of training sample:  (4658147, 6)
Shape of testing sample:  (2505542, 5)
Shape of structures sample:  (2358657, 6)


In [4]:
# Joining structure data onto train/test set
def attach_atom_info(pairs_df, structures, atom_idx):
    """Left-join the structure row of one atom of each pair onto ``pairs_df``.

    Parameters
    ----------
    pairs_df : DataFrame
        Needs ``molecule_name`` and ``atom_index_<atom_idx>`` columns.
    structures : DataFrame
        Needs ``molecule_name``, ``atom_index``, ``atom``, ``x``, ``y``, ``z``.
    atom_idx : int
        Which atom of the pair to look up (0 or 1).

    Returns
    -------
    DataFrame
        A new frame with the added columns renamed to ``atom_<i>``, ``x_<i>``,
        ``y_<i>`` and ``z_<i>``; rows without a matching structure get NaN.
    """
    suffix = f'_{atom_idx}'
    merged = (pairs_df
                  .merge(structures,
                         left_on=['molecule_name', f'atom_index{suffix}'],
                         right_on=['molecule_name', 'atom_index'],
                         how='left')
                  .drop('atom_index', axis=1)
              )
    return merged.rename(columns={col: col + suffix for col in ('atom', 'x', 'y', 'z')})


def attach_both_atoms(pairs_df, structures):
    """Attach the structure info of atom 0 and then atom 1 to every pair."""
    with_atom0 = attach_atom_info(pairs_df, structures, 0)
    return attach_atom_info(with_atom0, structures, 1)


train_df = attach_both_atoms(train_df, structures_df)
test_df = attach_both_atoms(test_df, structures_df)

train_df.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


In [5]:
# Drop the reference to the structures table: its atom and coordinate columns
# have already been merged into train_df and test_df.
del structures_df

In [6]:
# Computing distances
def add_pair_geometry(df):
    """Add distance and coupling-type columns to ``df`` in place.

    Expects the coordinate columns ``x_0``..``z_0`` and ``x_1``..``z_1`` and a
    string column ``type``. The columns are added in this order:

    - ``dist``: Euclidean distance between the two atoms.
    - ``dist_x``, ``dist_y``, ``dist_z``: *squared* coordinate differences.
    - ``type_0``: first character of ``type``.
    - ``type_1``: the remaining characters of ``type``.

    Returns the same (mutated) DataFrame so calls can be chained.
    """
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)

    for axis in ('x', 'y', 'z'):
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2

    # Vectorised string slicing instead of a Python-level lambda per row.
    df['type_0'] = df['type'].str[0]
    df['type_1'] = df['type'].str[1:]
    return df


train_df = add_pair_geometry(train_df)
test_df = add_pair_geometry(test_df)

# Some more distances related to average molecule and type distances
# Freely adapted after https://www.kaggle.com/artgor/brute-force-feature-engineering
_MOLECULE = 'molecule_name'
_BY_MOLECULE = (_MOLECULE,)
_BY_ATOM_INDEX_0 = (_MOLECULE, 'atom_index_0')
_BY_ATOM_INDEX_1 = (_MOLECULE, 'atom_index_1')
_BY_ATOM_1 = (_MOLECULE, 'atom_1')
_BY_TYPE_0 = (_MOLECULE, 'type_0')
_BY_TYPE_1 = (_MOLECULE, 'type_1')
_BY_TYPE = (_MOLECULE, 'type')

# One row per aggregated feature, in output-column order:
# (output column, group-by keys, source column, statistic, derived suffixes)
# 'diff' adds `<output>_diff = output - source`,
# 'div'  adds `<output>_div  = output / source`.
_GROUP_FEATURE_SPECS = (
    ('molecule_couples', _BY_MOLECULE, 'id', 'count', ()),
    ('molecule_dist_mean', _BY_MOLECULE, 'dist', 'mean', ()),
    ('molecule_dist_min', _BY_MOLECULE, 'dist', 'min', ()),
    ('molecule_dist_max', _BY_MOLECULE, 'dist', 'max', ()),
    ('atom_0_couples_count', _BY_ATOM_INDEX_0, 'id', 'count', ()),
    ('atom_1_couples_count', _BY_ATOM_INDEX_1, 'id', 'count', ()),

    ('molecule_atom_index_0_x_1_std', _BY_ATOM_INDEX_0, 'x_1', 'std', ()),
    ('molecule_atom_index_0_y_1_mean', _BY_ATOM_INDEX_0, 'y_1', 'mean', ('diff', 'div')),
    ('molecule_atom_index_0_y_1_max', _BY_ATOM_INDEX_0, 'y_1', 'max', ('diff',)),
    ('molecule_atom_index_0_y_1_std', _BY_ATOM_INDEX_0, 'y_1', 'std', ()),
    ('molecule_atom_index_0_z_1_std', _BY_ATOM_INDEX_0, 'z_1', 'std', ()),

    ('molecule_atom_index_0_dist_mean', _BY_ATOM_INDEX_0, 'dist', 'mean', ('diff', 'div')),
    ('molecule_atom_index_0_dist_max', _BY_ATOM_INDEX_0, 'dist', 'max', ('diff', 'div')),
    ('molecule_atom_index_0_dist_min', _BY_ATOM_INDEX_0, 'dist', 'min', ('diff', 'div')),
    ('molecule_atom_index_0_dist_std', _BY_ATOM_INDEX_0, 'dist', 'std', ('diff', 'div')),

    ('molecule_atom_index_1_dist_mean', _BY_ATOM_INDEX_1, 'dist', 'mean', ('diff', 'div')),
    ('molecule_atom_index_1_dist_max', _BY_ATOM_INDEX_1, 'dist', 'max', ('diff', 'div')),
    ('molecule_atom_index_1_dist_min', _BY_ATOM_INDEX_1, 'dist', 'min', ('diff', 'div')),
    ('molecule_atom_index_1_dist_std', _BY_ATOM_INDEX_1, 'dist', 'std', ('diff', 'div')),

    ('molecule_atom_1_dist_mean', _BY_ATOM_1, 'dist', 'mean', ()),
    ('molecule_atom_1_dist_min', _BY_ATOM_1, 'dist', 'min', ('diff', 'div')),
    ('molecule_atom_1_dist_std', _BY_ATOM_1, 'dist', 'std', ('diff',)),

    ('molecule_type_0_dist_std', _BY_TYPE_0, 'dist', 'std', ('diff',)),
    ('molecule_type_1_dist_std', _BY_TYPE_1, 'dist', 'std', ('diff',)),

    ('molecule_type_dist_mean', _BY_TYPE, 'dist', 'mean', ('diff', 'div')),
    ('molecule_type_dist_max', _BY_TYPE, 'dist', 'max', ()),
    ('molecule_type_dist_min', _BY_TYPE, 'dist', 'min', ()),
    ('molecule_type_dist_std', _BY_TYPE, 'dist', 'std', ('diff',)),
)


def create_features(df):
    """Add per-molecule group statistics to ``df`` in place and return it.

    For every entry of ``_GROUP_FEATURE_SPECS`` the source column is aggregated
    within its group and broadcast back to the rows (``groupby().transform``).
    Where requested, the difference and/or ratio between that statistic and the
    row's own value are added as ``<name>_diff`` / ``<name>_div``.

    ``df`` must contain ``id``, ``molecule_name``, ``atom_index_0``,
    ``atom_index_1``, ``atom_1``, ``type``, ``type_0``, ``type_1``, ``x_1``,
    ``y_1``, ``z_1`` and ``dist``.

    Notes
    -----
    * ``std`` is NaN for groups with a single row.
    * ``_div`` columns are plain divisions; a source value of 0 yields
      inf/NaN rather than an error.
    """
    # Building the group-by (key factorisation) is the expensive part, so do it
    # once per distinct key set rather than once per feature. Only columns are
    # added to df below, never rows, so the cached groupings stay valid.
    groupers = {}
    for name, keys, column, stat, derived in _GROUP_FEATURE_SPECS:
        if keys not in groupers:
            groupers[keys] = df.groupby(list(keys))
        df[name] = groupers[keys][column].transform(stat)
        if 'diff' in derived:
            df[f'{name}_diff'] = df[name] - df[column]
        if 'div' in derived:
            df[f'{name}_div'] = df[name] / df[column]

    return df

# Set aside the regression target and the coupling type of every row; both are
# passed to do_cross_validation separately from the feature matrix.
y = train_df['scalar_coupling_constant']
train_types, test_types = train_df['type'], test_df['type']

# Add the group statistics, then remove the identifier columns (and, for the
# training frame, the target) so they are not used as features.
train_df = create_features(train_df).drop(
    columns=['id', 'molecule_name', 'scalar_coupling_constant']
)
test_df = create_features(test_df).drop(columns=['id', 'molecule_name'])

In [7]:
# One-hot encode the categorical columns and append the indicator columns to
# the remaining (numeric) features.
categorical_cols = ['atom_0', 'atom_1', 'type', 'type_0', 'type_1']

# sparse_threshold=0 makes the transformer always return a dense array, so the
# code no longer depends on the output happening to be sparse (calling
# .toarray() on an ndarray would fail).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    sparse_threshold=0)

# Fit and encode the training rows in one pass; reuse the fitted encoder for
# the test rows.
train_encoded = preprocessor.fit_transform(train_df)
test_encoded = preprocessor.transform(test_df)

# Prefer the original accessor so column names stay as before; fall back to
# get_feature_names_out on scikit-learn versions where it has been removed.
_get_encoded_names = getattr(preprocessor, 'get_feature_names', None) or preprocessor.get_feature_names_out
encoded_cols = list(_get_encoded_names())


def _with_one_hot(df, encoded):
    """Return ``df`` without the categorical columns plus the encoded ones.

    The encoded frame is built on ``df``'s own index so rows line up even if
    the index is not a default 0..n-1 range.
    """
    encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
    return pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)


train_df_final = _with_one_hot(train_df, train_encoded)
test_df_final = _with_one_hot(test_df, test_encoded)

# The dense intermediates are large; release them once they have been copied.
del train_encoded, test_encoded

In [8]:
# Preview the first rows of the encoded training features.
train_df_final.head()

Unnamed: 0,atom_index_0,atom_index_1,x_0,y_0,z_0,x_1,y_1,z_1,dist,dist_x,...,cat__x2_2JHN,cat__x2_3JHC,cat__x2_3JHH,cat__x2_3JHN,cat__x3_1,cat__x3_2,cat__x3_3,cat__x4_JHC,cat__x4_JHH,cat__x4_JHN
0,1,0,0.00215,-0.006031,0.001976,-0.012698,1.085804,0.008001,1.091953,0.00022,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,2,0.00215,-0.006031,0.001976,1.011731,1.463751,0.000277,1.78312,1.019253,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1,3,0.00215,-0.006031,0.001976,-0.540815,1.447527,-0.876644,1.783147,0.294812,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1,4,0.00215,-0.006031,0.001976,-0.523814,1.437933,0.906397,1.783157,0.276638,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,2,0,1.011731,1.463751,0.000277,-0.012698,1.085804,0.008001,1.091952,1.049455,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [9]:
# Cross-validate on the encoded features. The extra keyword arguments are
# collected by do_cross_validation's **kwargs.
train_score, val_score, predictions = do_cross_validation(
    train_df_final,
    y,
    test_df_final,
    train_types=train_types,
    test_types=test_types,
    lightgbm_params=lightgbm_params,
)

Reshuffle:  0
Training fold:  0
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 1.39342	valid_1's l1: 1.40612
[200]	training's l1: 1.26879	valid_1's l1: 1.29189
[300]	training's l1: 1.19478	valid_1's l1: 1.22653
[400]	training's l1: 1.14358	valid_1's l1: 1.18299
[500]	training's l1: 1.10296	valid_1's l1: 1.15036
[600]	training's l1: 1.06914	valid_1's l1: 1.12382
[700]	training's l1: 1.04091	valid_1's l1: 1.10259
[800]	training's l1: 1.01574	valid_1's l1: 1.08442
[900]	training's l1: 0.993118	valid_1's l1: 1.06828
[1000]	training's l1: 0.972813	valid_1's l1: 1.05442
[1100]	training's l1: 0.953677	valid_1's l1: 1.04147
[1200]	training's l1: 0.936119	valid_1's l1: 1.02963
[1300]	training's l1: 0.92049	valid_1's l1: 1.01985
[1400]	training's l1: 0.906171	valid_1's l1: 1.01113
[1500]	training's l1: 0.891385	valid_1's l1: 1.00158
[1600]	training's l1: 0.878292	valid_1's l1: 0.993718
[1700]	training's l1: 0.865437	valid_1's l1: 0.985787
[1800]	training's l1

KeyboardInterrupt: 