In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor
import lightgbm as lgb

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

We will start with the basic files provided, test.csv, train.csv and structures.csv.

In [2]:
train_df = pd.read_csv('champs-scalar-coupling/train.csv')
test_df = pd.read_csv('champs-scalar-coupling/test.csv')
structures_df = pd.read_csv('champs-scalar-coupling/structures.csv')

print("Shape of training sample: ",train_df.shape)
print("Shape of testing sample: ",test_df.shape)
print("Shape of structures sample: ",structures_df.shape)

Shape of training sample:  (4658147, 6)
Shape of testing sample:  (2505542, 5)
Shape of structures sample:  (2358657, 6)


We use an exploratory analysis from here: https://www.kaggle.com/artgor/molecular-properties-eda-and-models

The basic features provided in the training set are only the atom indices and the coupling type between them, however by enriching the Dataframe with info from the structures file, we can add a couple of more features:

- The atom types
- The distance between the atoms, in 3 dimensions and combined
- The distance relative to the average type distance

Let's give this a go!

In [3]:
train_df.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [4]:
structures_df.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [5]:
# Joining structure data onto train/test set
tmp_with_atom0_info = (train_df
                           .merge( structures_df, left_on = ['molecule_name','atom_index_0'], right_on = ['molecule_name','atom_index'], how = 'left' )
                           .drop('atom_index', axis=1)
                       )
tmp_with_atom0_info = tmp_with_atom0_info.rename(columns={ 'atom' : 'atom_0', 'x' : 'x_0', 'y' : 'y_0', 'z' : 'z_0'})

tmp_with_atom1_info = (tmp_with_atom0_info
                           .merge( structures_df, left_on = ['molecule_name','atom_index_1'], right_on = ['molecule_name','atom_index'], how = 'left' )
                           .drop('atom_index', axis=1)
                       )
tmp_with_atom1_info = tmp_with_atom1_info.rename(columns={ 'atom' : 'atom_1', 'x' : 'x_1', 'y' : 'y_1', 'z' : 'z_1'})

train_df = tmp_with_atom1_info

tmp_with_atom0_info = (test_df
                           .merge( structures_df, left_on = ['molecule_name','atom_index_0'], right_on = ['molecule_name','atom_index'], how = 'left' )
                           .drop('atom_index', axis=1)
                       )
tmp_with_atom0_info = tmp_with_atom0_info.rename(columns={ 'atom' : 'atom_0', 'x' : 'x_0', 'y' : 'y_0', 'z' : 'z_0'})

tmp_with_atom1_info = (tmp_with_atom0_info
                           .merge( structures_df, left_on = ['molecule_name','atom_index_1'], right_on = ['molecule_name','atom_index'], how = 'left' )
                           .drop('atom_index', axis=1)
                       )
tmp_with_atom1_info = tmp_with_atom1_info.rename(columns={ 'atom' : 'atom_1', 'x' : 'x_1', 'y' : 'y_1', 'z' : 'z_1'})

test_df = tmp_with_atom1_info

train_df.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


In [6]:
del structures_df

In [7]:
# Computing distances

train_p_0 = train_df[['x_0', 'y_0', 'z_0']].values
train_p_1 = train_df[['x_1', 'y_1', 'z_1']].values
test_p_0 = test_df[['x_0', 'y_0', 'z_0']].values
test_p_1 = test_df[['x_1', 'y_1', 'z_1']].values

train_df['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test_df['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
train_df['dist_x'] = (train_df['x_0'] - train_df['x_1']) ** 2
test_df['dist_x'] = (test_df['x_0'] - test_df['x_1']) ** 2
train_df['dist_y'] = (train_df['y_0'] - train_df['y_1']) ** 2
test_df['dist_y'] = (test_df['y_0'] - test_df['y_1']) ** 2
train_df['dist_z'] = (train_df['z_0'] - train_df['z_1']) ** 2
test_df['dist_z'] = (test_df['z_0'] - test_df['z_1']) ** 2

train_df['type_0'] = train_df['type'].apply(lambda x: x[0])
test_df['type_0'] = test_df['type'].apply(lambda x: x[0])

train_df['type_1'] = train_df['type'].apply(lambda x: x[1:])
test_df['type_1'] = test_df['type'].apply(lambda x: x[1:])

# Some more distances related to average molecule and type distances
# Freely adapted after https://www.kaggle.com/artgor/brute-force-feature-engineering
def create_features(df):
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_1_dist_std'] = df.groupby(['molecule_name', 'type_1'])['dist'].transform('std')
    df[f'molecule_type_1_dist_std_diff'] = df[f'molecule_type_1_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

In [8]:
test_df.shape

(2505542, 74)

In [9]:
# Converting categorical columns
# All categories are present in both train/test so we don't have to worry about data leakage.

# Also doing train/test split

X = train_df.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1)
X_test = test_df.drop(['id', 'molecule_name'], axis=1)
y = train_df['scalar_coupling_constant']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

del train_df, test_df
del X, y

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['atom_0', 'atom_1', 'type', 'type_0', 'type_1'])
    ], remainder = 'passthrough' )

X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)
X_test_processed = preprocessor.transform(X_test)

del X_train

In [1]:
# # Creating XGB model

# verbose=10000 
# early_stopping_rounds=200
# n_estimators=1

# model = XGBRegressor(verbose=verbose, early_stopping_rounds=early_stopping_rounds, n_estimators=n_estimators, nthread=-1)

# model.fit(X_train_processed,y_train, 
#               eval_set=[(X_valid_processed, y_valid)], 
#               eval_metric='mae',
#               verbose=True)
# y_pred_valid = model.predict(X_valid_processed)
# y_pred = model.predict(X_test_processed)

# # Evaluation - see https://www.kaggle.com/uberkinder/efficient-metric
# def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
#     maes = (y_true-y_pred).abs().groupby(groups).mean()
#     return np.log(maes.map(lambda x: max(x, floor))).mean()

# group_mean_log_mae(y_valid, y_pred_valid, X_valid['type'])

In [13]:
# Creating LGBM model, single fold and iteration for now

params = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'max_depth': 9,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

verbose=100 
early_stopping_rounds=200
n_estimators=10000

model = lgb.LGBMRegressor(**params, verbose=verbose, early_stopping_rounds=early_stopping_rounds, n_estimators=n_estimators, nthread=-1)

model.fit(X_train_processed,y_train, 
              eval_set=[(X_train_processed, y_train),(X_valid_processed, y_valid)], 
              eval_metric='mae',
              verbose=verbose)
y_pred_valid = model.predict(X_valid_processed)
y_pred = model.predict(X_test_processed)

# Evaluation - see https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

group_mean_log_mae(y_valid, y_pred_valid, X_valid['type'])

Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 1.39461	valid_1's l1: 1.4081
[200]	training's l1: 1.26479	valid_1's l1: 1.28812
[300]	training's l1: 1.19291	valid_1's l1: 1.2249
[400]	training's l1: 1.14349	valid_1's l1: 1.18391
[500]	training's l1: 1.10456	valid_1's l1: 1.15287
[600]	training's l1: 1.06977	valid_1's l1: 1.12522
[700]	training's l1: 1.04122	valid_1's l1: 1.10398
[800]	training's l1: 1.01616	valid_1's l1: 1.08542
[900]	training's l1: 0.993311	valid_1's l1: 1.06919
[1000]	training's l1: 0.973141	valid_1's l1: 1.05543
[1100]	training's l1: 0.953642	valid_1's l1: 1.0418
[1200]	training's l1: 0.936898	valid_1's l1: 1.03095
[1300]	training's l1: 0.919211	valid_1's l1: 1.01896
[1400]	training's l1: 0.905512	valid_1's l1: 1.01098
[1500]	training's l1: 0.891099	valid_1's l1: 1.00195
[1600]	training's l1: 0.877934	valid_1's l1: 0.994187
[1700]	training's l1: 0.865195	valid_1's l1: 0.986451
[1800]	training's l1: 0.853789	valid_1's l1: 0.980228


-0.33464410445620046

In [14]:
#Submitting
file_name = "solution.csv"
message = "XGBoost unoptimised with distance features"
header = ['id','scalar_coupling_constant']



pd.DataFrame(
    data=list(zip([x for x in test_df['id'].tolist()], [int(x) for x in predictions.tolist()]))
).to_csv('{}'.format(file_name), index=False, header=header)

NameError: name 'X_test' is not defined

In [None]:
%%bash -s "$file_name" "$message"
kaggle competitions submit -c home-data-for-ml-course -f $1 -m "$2"