### The Data
This book builds on previous notebooks regarding feature enginnering and feature selection however in this book we will train a model per 1JHC and 1JHN coupling types. We will additionally feature enhance those datasets by more details about the eletronic structure of the molecules.

Note: this book will only consider data from the following csv files: train, structures and molecule_structures


In [1]:
import pandas as pd

input_folder = './input'

train = pd.read_csv(f'{input_folder}/train.csv')
structures = pd.read_csv(f'{input_folder}/structures.csv')
molecular_structures = pd.read_csv(f'{input_folder}/molecule_structures.csv')


In [2]:
f'Train shape: {train.shape}'

'Train shape: (4658147, 6)'

In [3]:
f'Structures shape: {structures.shape}'

'Structures shape: (2358657, 6)'

In [4]:
f'molecular structures shape: {molecular_structures.shape}'

'molecular structures shape: (1586325, 32)'

In [5]:
print(f"There are {train['type'].nunique()} unique coupling types: {train['type'].unique()}")

There are 8 unique coupling types: ['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']


#### Merging training and structure data sources into one training data

Code for joining data is from this kernel: https://www.kaggle.com/seriousran/just-speed-up-calculate-distance-from-benchmark

In [6]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

In [7]:
train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

In [8]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


#### Merging new training and molecule_structures data sources into one training data

In [9]:
def map_molecule_info(df):
    df = pd.merge(df, molecular_structures, how = 'left',
                  left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1'],
                  right_on = ['molecule_name',  'atom_0', 'atom_1'])
    
    df = df.drop('atom_0_y', axis=1)
    df = df.drop('atom_1_y', axis=1)
    df = df.drop('atom_1_1_level_NB', axis=1)
    df = df.drop('atom_1_2_level_NB', axis=1)
    
    df = df.rename(columns={'atom_0_x': 'atom_0',
                            'atom_1_x': 'atom_1'})
    return df

In [10]:
train = map_molecule_info(train)

In [11]:
print(train.shape)

(4658147, 41)


In [12]:
import pandas

pandas.set_option('display.max_columns', None)

train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.091953
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.091952
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,H,1.011731,1.463751,0.000277,H,-0.540815,1.447527,-0.876644,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,H,1.011731,1.463751,0.000277,H,-0.523814,1.437933,0.906397,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,C,-0.012698,1.085804,0.008001,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.091946
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,H,-0.540815,1.447527,-0.876644,H,-0.523814,1.437933,0.906397,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,C,-0.012698,1.085804,0.008001,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.091948


In [24]:
import numpy as np

train.iloc[:,14:40] = train.iloc[:,14:40].fillna(0.0).astype(np.int8)
train.iloc[:,2:4] = train.iloc[:,2:4].astype(np.int8)
train.head()


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,abs_dx,abs_dy,abs_dz,atom_0_H,atom_1_C,atom_1_H,atom_1_N
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,0.014849,1.091835,0.006025,1,1,0,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,1.00958,1.469782,0.0017,1,0,1,0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.542965,1.453558,0.87862,1,0,1,0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.525964,1.443964,0.904421,1,0,1,0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,1.024429,0.377947,0.007724,1,1,0,0


In [14]:
train.loc[train['molecule_name'] == 'dsgdb9nsd_000050']

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
696,696,dsgdb9nsd_000050,5,0,1JHN,59.301,H,-0.832432,1.927727,0.02123,N,-0.008297,1.353628,0.00996,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.004448
697,697,dsgdb9nsd_000050,5,1,2JHC,2.82625,H,-0.832432,1.927727,0.02123,C,1.280333,1.824574,-0.000225,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
698,698,dsgdb9nsd_000050,5,2,3JHC,6.73783,H,-0.832432,1.927727,0.02123,C,2.122166,0.736989,-0.014456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
699,699,dsgdb9nsd_000050,5,3,3JHC,6.73789,H,-0.832432,1.927727,0.02123,C,1.308499,-0.431096,-0.012842,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
700,700,dsgdb9nsd_000050,5,4,2JHC,2.82632,H,-0.832432,1.927727,0.02123,C,-0.003428,-0.018369,0.002357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
701,701,dsgdb9nsd_000050,5,6,3JHH,0.689723,H,-0.832432,1.927727,0.02123,H,1.484718,2.883126,0.003543,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
702,702,dsgdb9nsd_000050,5,9,3JHH,0.689683,H,-0.832432,1.927727,0.02123,H,-0.925516,-0.576971,0.008394,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
703,703,dsgdb9nsd_000050,6,0,2JHN,3.80559,H,1.484718,2.883126,0.003543,N,-0.008297,1.353628,0.00996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
704,704,dsgdb9nsd_000050,6,1,1JHC,125.638,H,1.484718,2.883126,0.003543,C,1.280333,1.824574,-0.000225,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.07811
705,705,dsgdb9nsd_000050,6,2,2JHC,5.96429,H,1.484718,2.883126,0.003543,C,2.122166,0.736989,-0.014456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,


### Basic Feature engineering
#### Distances between atoms

In [15]:
def add_distances(df):
    df['abs_dx']=(df['x_1']-df['x_0']).abs()
    #df['abs_dx']=df['abs_dx'].abs()
    df['abs_dy']=(df['y_1']-df['y_0']).abs()
    #df['abs_dx']=df['abs_dx'].abs()
    df['abs_dz']=(df['z_1']-df['z_0']).abs()
    #df['abs_dx']=df['abs_dx'].abs()
    #df['dx2']=df['dx']**2
    #df['dy2']=df['dy']**2
    #df['dz2']=df['dz']**2
    #df['distance']=(df['dx2'] + df['dy2'] + df['dz2'])**(1/2)
    return df

In [16]:
train=add_distances(train)

In [17]:
train.shape

(4658147, 44)

#### Handling Category data

For the above columns I will consider atom_index's ordinal data. I will argue that atom's are pure labels, that is, nominal data.

LabelEncoder could be used for the ordinal data but as they are already ordered integers we will leave them be. One Hot Encoding is used for nominal data.

In [18]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,abs_dx,abs_dy,abs_dz
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,0.014849,1.091835,0.006025
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,1.00958,1.469782,0.0017
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.542965,1.453558,0.87862
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.525964,1.443964,0.904421
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,1.024429,0.377947,0.007724


In [19]:
train_atom_0 = pd.get_dummies(train['atom_0'], prefix='atom_0')

train = pd.concat([train, train_atom_0], axis=1)

train_atom_1 = pd.get_dummies(train['atom_1'], prefix='atom_1')

train = pd.concat([train, train_atom_1], axis=1)

In [20]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,abs_dx,abs_dy,abs_dz,atom_0_H,atom_1_C,atom_1_H,atom_1_N
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,0.014849,1.091835,0.006025,1,1,0,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,1.00958,1.469782,0.0017,1,0,1,0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.542965,1.453558,0.87862,1,0,1,0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.525964,1.443964,0.904421,1,0,1,0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,1.024429,0.377947,0.007724,1,1,0,0


In [25]:
train.dtypes

id                            int64
molecule_name                object
atom_index_0                   int8
atom_index_1                   int8
type                         object
scalar_coupling_constant    float64
atom_0                       object
x_0                         float64
y_0                         float64
z_0                         float64
atom_1                       object
x_1                         float64
y_1                         float64
z_1                         float64
atom_0_CH3                     int8
atom_1_CH3                     int8
atom_0_CH2                     int8
atom_1_CH2                     int8
atom_0_CH1                     int8
atom_1_CH1                     int8
atom_0_CH0                     int8
atom_1_CH0                     int8
atom_0_NH2                     int8
atom_1_NH2                     int8
atom_0_NH1                     int8
atom_1_NH1                     int8
atom_0_NH0                     int8
atom_1_NH0                  

#### Remove not useful columns

In [None]:
train = train.drop('id', axis=1)
train = train.drop('molecule_name', axis=1)
train = train.drop('atom_0', axis=1)
train = train.drop('atom_1', axis=1)

#### Let's split the data into different datasets based on coupling types

In [None]:
train_map={}
selected_types = ['1JHC', '1JHN']
for coupling_type, df_by_type in train.groupby('type'):
    df_by_type = df_by_type.drop('type', axis=1)
    if coupling_type in selected_types:
        train_map[coupling_type] = df_by_type

In [None]:
train_map.keys()

In [None]:
[df.shape for df in train_map.values()]

### Data splitting

In [None]:
train_map['1JHC'].columns

In [None]:
from sklearn.model_selection import train_test_split

train_map_split = {}
for key, dataset in train_map.items():
    y = dataset['scalar_coupling_constant']
    X = dataset.drop('scalar_coupling_constant', axis=1)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 13, test_size = 0.25)
    train_map_split[key] = (X_train, X_valid, y_train, y_valid)

In [None]:
train_map_split['1JHC'][0].head()

#### Cleanup - we need the resources

In [None]:
import gc

del train
del structures

for key, dataset in train_map.items():
    del dataset
train_map.clear()

gc.collect()

### The model

In [None]:
from catboost import CatBoostRegressor

model_map = {}
for key, datasets in train_map_split.items():
    f'Training model on coupling type: {key}'
    model = CatBoostRegressor(iterations=2000, depth= 13, random_seed = 23, task_type = "GPU", devices='0')
    model.fit(datasets[0], datasets[2], eval_set=(datasets[1], datasets[3]), logging_level='Verbose')
    model_map[key] = model

#### Lets look at the result per coupling type: '1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'

#### First a few helper functions to compare with:

In [None]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

def show_scores(y_valid,y_pred):
    print("Variance_Score(cat_Regressor)\t:"+str(explained_variance_score(y_valid,y_pred)))
    print("Mean_Absolute_Error(cat_Regressor)\t:"+str(mean_absolute_error(y_valid,y_pred)))
    print("Mean_Squared_Error(cat_Regressor)\t:"+str(mean_squared_error(y_valid,y_pred)))
    print("R2-Score(cat_Regressor)\t:"+str(r2_score(y_valid,y_pred)))
    
def show_feature_importance(model, columns):
    fea_imp = pd.DataFrame({'col': columns, 'imp': model.feature_importances_})
    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
    plt.title('CatBoost - Feature Importance')
    plt.ylabel('Features')
    plt.xlabel('Importance')

#### 1JHC

In [None]:
coupling_type = '1JHC'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 1JHN

In [None]:
coupling_type = '1JHN'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 2JHC

In [None]:
coupling_type = '2JHC'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 2JHH

In [None]:
coupling_type = '2JHH'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 2JHN

In [None]:
coupling_type = '2JHN'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 3JHC

In [None]:
coupling_type = '3JHC'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 3JHH

In [None]:
coupling_type = '3JHH'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

#### 3JHN

In [None]:
coupling_type = '3JHN'
model = model_map[coupling_type]
y_pred = model.predict(train_map_split[coupling_type][1])

In [None]:
show_scores(train_map_split[coupling_type][3], y_pred)

In [None]:
show_feature_importance(model, train_map_split[coupling_type][0].columns)

### Let's look at the test data and generate a score for submission

In [None]:
test = pd.read_csv(f'{input_folder}/test.csv')
structures = pd.read_csv(f'{input_folder}/structures.csv')

In [None]:
test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

In [None]:
test=add_distances(test)

In [None]:
test=additional_features(test)

In [None]:
test_atom_0 = pd.get_dummies(test['atom_0'], prefix='atom_0')
test = pd.concat([test, test_atom_0], axis=1)
test_atom_1 = pd.get_dummies(test['atom_1'], prefix='atom_1')
test = pd.concat([test, test_atom_1], axis=1)

In [None]:
test = test.drop('id', axis=1)
test = test.drop('molecule_name', axis=1)
test = test.drop('atom_0', axis=1)
test = test.drop('atom_1', axis=1)

In [None]:
test.columns

In [None]:
# '1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN'
#model_1JHC = model_map['1JHC']
#model_2JHH = model_map['2JHH']
#model_1JHN = model_map['1JHN']
#model_2JHN = model_map['2JHN']
#model_2JHC = model_map['2JHC']
#model_3JHH = model_map['3JHH']
#model_3JHC = model_map['3JHC']
#model_3JHN = model_map['3JHN']

predictions = []

def score_data(row):
    coupling_type = row['type']
    #print(coupling_type)
    row = row.drop(labels=['type'])
    row = row.tolist()
    #print(row)

    model = model_map[coupling_type]
    y_pred = model.predict(row)
    #print(y_pred)
    predictions.append(y_pred)

In [None]:
for index, row in test.iterrows():
    if index % 10000 == 0:
        print("Index: " + str(index)) 
    score_data(row)

In [None]:
# OK
import numpy as np

print(f'Mean: {np.mean(predictions)}')
print(f'std: {np.std(predictions)}')
print(f'Median: {np.median(predictions)}')
print(f'Min: {np.amin(predictions, axis = 0)}')
print(f'Max: {np.amax(predictions, axis = 0)}')


Label Encoding
- Mean: 15.88686737376035
- std: 34.660710108935
- Median: 2.629095976719184
- Min: -21.341563433735075
- Max: 220.7949596659139
<BR>
  
One Hot Encoding
- Mean: 15.88702962893186
- std: 34.66088572045097
- Median: 2.625737756448414
- Min: -21.282020727288177
- Max: 218.22497992595754

In [None]:
# OK
rows_to_compare = [0, 1, 89, 368, 3434, 12345, 100000, 2500000]
print(predictions[368])

Label Encoding
<BR>
[  2.59968011 156.2923657    3.26403273  83.85696981  86.61881557
   2.80670743  -8.41252018   0.27499807]
   
One Hot Encoding
<BR>
[  2.75261998 146.15409166   3.17457195  84.17543889  86.48490323
   2.85781352  -8.41580391   0.15665672]

### Submit scores!


-0.409 on Kaggle

In [None]:
import pandas as pd

submission = pd.read_csv('./input/sample_submission.csv')
submission['scalar_coupling_constant'] = predictions
submission.to_csv('many_features_split_type_CAT.csv', index=False)

In [None]:
import numpy as np

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max and c_prec == np.finfo(np.float16).precision:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max and c_prec == np.finfo(np.float32).precision:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def additional_features(df):
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['distance'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['distance'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['distance'].transform('max')
    df['molecule_dist_std'] = df.groupby('molecule_name')['distance'].transform('std')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')

    num_cols = ['distance'] # 'dx', 'dy', 'dz'
    cat_cols = ['atom_index_0', 'atom_index_1']
    aggs = ['mean', 'std', 'max', 'min']
    for col in cat_cols:
        df[f'molecule_{col}_count'] = df.groupby('molecule_name')[col].transform('count')

    for cat_col in cat_cols:
        for num_col in num_cols:
            for agg in aggs:
                df[f'molecule_{cat_col}_{num_col}_{agg}'] = df.groupby(['molecule_name', cat_col])[num_col].transform(agg)
                df[f'molecule_{cat_col}_{num_col}_{agg}_diff'] = df[f'molecule_{cat_col}_{num_col}_{agg}'] - df[num_col]
                df[f'molecule_{cat_col}_{num_col}_{agg}_div'] = df[f'molecule_{cat_col}_{num_col}_{agg}'] / df[num_col]

    df = reduce_mem_usage(df)
    return df