In [2]:
import pandas as pd
import numpy as np

from chemistry import Molecule

In [3]:
structures = pd.read_feather('../data/structures.feather')
labelled = pd.read_feather('../data/train.feather')
unlabelled = pd.read_feather('../data/test.feather')

In [4]:
def merge(data):
    data.copy()
    
    m0 = data.merge(structures, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'], suffixes=('0', '0'))
    m1 = data.merge(structures, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], suffixes=('1', '1'))
    
    l0 = m0[['x', 'y', 'z']]
    l1 = m1[['x', 'y', 'z']]
    d = l0 - l1
    d2 = d * d
    dist2 = d2.x + d2.y + d2.z
    dist = dist2.apply(np.sqrt)
    dist.name = 'distance'
    
    merged = data.join(dist)
    merged['atom_0'] = m0.atom
    merged['atom_1'] = m1.atom
    
    return merged

In [5]:
labelled_merged = merge(labelled)
unlabelled_merged = merge(unlabelled)

In [6]:
min_dist = min(labelled_merged.distance.min(), unlabelled_merged.distance.min())
max_dist = max(labelled_merged.distance.max(), unlabelled_merged.distance.max())

min_coeff = labelled_merged.scalar_coupling_constant.min()
max_coeff = labelled_merged.scalar_coupling_constant.max()

In [7]:
offset = min_coeff
scale = max_coeff - min_coeff

labelled_merged['norm_distance'] = (labelled_merged.distance - min_dist) / (max_dist - min_dist)
unlabelled_merged['norm_distance'] = (unlabelled_merged.distance - min_dist) / (max_dist - min_dist)

labelled_merged['norm_scc'] = (labelled_merged.scalar_coupling_constant - min_coeff) / (max_coeff - min_coeff)

In [8]:
types = labelled.type.unique()
atoms = structures.atom.unique()

index = 0
type_index = {}
for t in types:
    type_index[t] = index
    index += 1

atom_index = {}
for a in atoms:
    atom_index[a] = index
    index += 2

dist_index = index
index += 1

columns = index

In [9]:
def partition_labelled(data, count=None, train_frac=0.7):
    n_labelled = count if count is not None else len(labelled)
    n_train = int(n_labelled * train_frac)
    n_test = n_labelled - n_train
    indices = np.arange(0, n_labelled)
    np.random.shuffle(indices)
    
    train_indices = indices[0:n_train]
    test_indices = indices[n_train:]
    
    train = data.iloc[train_indices, :]
    test = data.iloc[test_indices, :]

    return train, test

def make_input(data, columns, type_index, atom_index, dist_index):
    n = len(data)
    input = np.zeros((columns, n), dtype='float32')
    
    for t in type_index:
        input[type_index[t], data.type == t] = 1
        
    for a in atom_index:
        input[atom_index[a], data.atom_0 == a] = 1
        input[atom_index[a] + 1, data.atom_1 == a] = 1
        
    input[dist_index] = data.norm_distance
        
    return input.T

def make_output(data):
    n = len(data)
    output = np.zeros(n, dtype='float32')
    output[:] = data.norm_scc
        
    return output

In [10]:
from sklearn.svm import SVR,LinearSVR

In [11]:
labelled_merged.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'distance', 'atom_0', 'atom_1',
       'norm_distance', 'norm_scc'],
      dtype='object')

In [12]:
labelled_train, labelled_test = partition_labelled(labelled_merged, 30000)

labelled_train_input = make_input(labelled_train, columns, type_index, atom_index, dist_index)
labelled_train_output = make_output(labelled_train)

labelled_test_input = make_input(labelled_test, columns, type_index, atom_index, dist_index)
labelled_test_output = make_output(labelled_test)

labelled_train_input.shape, labelled_train_output.shape

((21000, 19), (21000,))

In [15]:
model = LinearSVR(max_iter=10000)
model.fit(labelled_train_input, labelled_train_output)
print('')
#print(model.predict(test_input))
#print(test_output)
model.score(labelled_test_input, labelled_test_output)





Liblinear failed to converge, increase the number of iterations.



0.9230842077202915

In [16]:
labelled_train_input = make_input(labelled_merged, columns, type_index, atom_index, dist_index)
labelled_train_output = make_output(labelled_merged)

labelled_train_input.shape, labelled_train_output.shape

((4658147, 19), (4658147,))

In [17]:
model = LinearSVR()
model.fit(labelled_train_input, labelled_train_output)


Liblinear failed to converge, increase the number of iterations.



LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [18]:
unlabelled_input = make_input(unlabelled_merged, columns, type_index, atom_index, dist_index)

In [19]:
unlabelled_output = model.predict(unlabelled_input)

In [20]:
unlabelled_output

array([0.1457753 , 0.51826588, 0.16547869, ..., 0.16524162, 0.14791092,
       0.52040151])

In [21]:
unlabelled_output = unlabelled_output * scale + offset

In [22]:
unlabelled_output

array([-1.07238079, 88.73457885,  3.67808031, ...,  3.62092189,
       -0.55748415, 89.24947549])

In [23]:
output_df = pd.DataFrame({'id':unlabelled.id, 'scalar_coupling_constant':pd.Series(unlabelled_output, index=unlabelled.index)})

In [24]:
output_df

Unnamed: 0,id,scalar_coupling_constant
0,4658147,-1.072381
1,4658148,88.734579
2,4658149,3.678080
3,4658150,88.734579
4,4658151,-0.557484
5,4658152,88.659480
6,4658153,3.030926
7,4658154,-11.058669
8,4658155,-11.058669
9,4658156,88.659480


In [25]:
output_df.to_csv('../data/pred.csv',index=False)