In [1]:
%load_ext autoreload
%autoreload 2

%load_ext line_profiler

In [2]:
import pandas as pd
import numpy as np
import pickle

from chemistry import Molecule

In [3]:
with open('../data/molecules.pickle', 'rb') as f:
    molecules = pickle.load(f)

In [4]:
structures = pd.read_feather('../data/structures.feather')
labelled = pd.read_feather('../data/train.feather')
unlabelled = pd.read_feather('../data/test.feather')

In [5]:
molecules['dsgdb9nsd_000001']

Name: dsgdb9nsd_000001
Atoms:
  C 0: [-0.01269814  1.0858041   0.008001  ]
  H 1: [ 0.00215042 -0.00603132  0.00197612]
  H 2: [1.0117308e+00 1.4637512e+00 2.7657481e-04]
  H 3: [-0.54081506  1.4475266  -0.8766437 ]
  H 4: [-0.5238136  1.4379326  0.9063973]
Bonds:
  C(0) - H(1)
  C(0) - H(2)
  C(0) - H(3)
  C(0) - H(4)


In [6]:
molecules['dsgdb9nsd_000001'].bonds

{(0, 1): Bond(dist=1.0919529, valency=1, strength=411),
 (0, 2): Bond(dist=1.0919516, valency=1, strength=411),
 (0, 3): Bond(dist=1.0919464, valency=1, strength=411),
 (0, 4): Bond(dist=1.0919476, valency=1, strength=411)}

In [7]:
labelled.head(n=10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807602
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.807404
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,7,dsgdb9nsd_000001,3,0,1JHC,84.809303
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,9,dsgdb9nsd_000001,4,0,1JHC,84.809502


# Inputs

* bond type
  * One hot encoding
* bond length
* atom1 position
* atom2 position
* atom3 position
* atom4 position

def merge(data):
    data.copy()
    
    m0 = data.merge(structures, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'], suffixes=('0', '0'))
    m1 = data.merge(structures, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], suffixes=('1', '1'))
    
    l0 = m0[['x', 'y', 'z']]
    l1 = m1[['x', 'y', 'z']]
    d = l0 - l1
    d2 = d * d
    dist2 = d2.x + d2.y + d2.z
    dist = dist2.apply(np.sqrt)
    dist.name = 'distance'
    
    merged = data.join(dist)
    merged['atom_0'] = m0.atom
    merged['atom_1'] = m1.atom
    
    return merged

labelled_merged = merge(labelled)
unlabelled_merged = merge(unlabelled)

min_dist = min(labelled_merged.distance.min(), unlabelled_merged.distance.min())
max_dist = max(labelled_merged.distance.max(), unlabelled_merged.distance.max())

min_coeff = labelled_merged.scalar_coupling_constant.min()
max_coeff = labelled_merged.scalar_coupling_constant.max()

offset = min_coeff
scale = max_coeff - min_coeff

labelled_merged['norm_distance'] = (labelled_merged.distance - min_dist) / (max_dist - min_dist)
unlabelled_merged['norm_distance'] = (unlabelled_merged.distance - min_dist) / (max_dist - min_dist)

labelled_merged['norm_scc'] = (labelled_merged.scalar_coupling_constant - min_coeff) / (max_coeff - min_coeff)

In [8]:
min_coeff = labelled.scalar_coupling_constant.min()
max_coeff = labelled.scalar_coupling_constant.max()

offset = min_coeff
scale = max_coeff - min_coeff

labelled['norm_scc'] = (labelled.scalar_coupling_constant - min_coeff) / (max_coeff - min_coeff)

In [9]:
types = labelled.type.unique()
atoms = structures.atom.unique()

index = 0
type_index = {}
for t in types:
    type_index[t] = index
    index += 1

index = 0
atom_index = {}
for a in atoms:
    atom_index[a] = index
    index += 1

In [10]:
def partition_labelled(data, count=None, train_frac=0.7):
    n_labelled = count if count is not None else len(labelled)
    n_train = int(n_labelled * train_frac)
    n_test = n_labelled - n_train
    indices = np.arange(0, n_labelled)
    np.random.shuffle(indices)
    
    train_indices = indices[0:n_train]
    test_indices = indices[n_train:]
    
    train = data.iloc[train_indices, :]
    test = data.iloc[test_indices, :]

    return train, test

def make_input(data):
    n = len(data)
    

    coupling_input = np.zeros((len(types), n), dtype='float32')
    atom_input = [None] * 4
    for i in range(len(atom_input)):
        atom_input[i] = np.zeros((len(atoms), n), dtype='float32')

    bond_input = [None] * 3
    for i in range(len(bond_input)):
        bond_input[i] = np.zeros((3, n), dtype='float32')

    for t in type_index:
        coupling_input[type_index[t], data.type == t] = 1

    for i, row in enumerate(data.itertuples()):
        # coupling_input[type_index[row.type], i] = 1
        
        m = molecules[row.molecule_name]
        bonds = m.bonds
        
        path = m.compute_path(row.atom_index_0, row.atom_index_1)
        syms = [m.symbols[idx] for idx in path]
        
        atom_input[0][atom_index[syms[0]], i]  = 1
        
        try:
            i0 = path[0]
            for j, i1 in enumerate(path[1:]):
                b = bonds.get((i0, i1), None)
                if b is None:
                    b = bonds.get((i1, i0), None)
                if b is None:
                    print(f'Unable to resolve bond - path = {path}, bond = {(i0, i1)})')
                    i0 = i1
                    continue

                j2 = j + 1

                bond_input[j][:, i] = [b.dist, b.valency, b.strength]            
                atom_input[j2][atom_index[syms[j2]], i] = 1

                i0 = i1
        except:
            pass
            
    return (coupling_input, atom_input, bond_input)

def make_output(data):
    n = len(data)
    output = np.zeros(n, dtype='float32')
    output[:] = data.norm_scc
        
    return output

def combine_inputs(inputs_tuple):
    coupling_input, atom_input, bond_input = inputs_tuple
    
    inputs = [coupling_input]
    inputs.extend(atom_input)
    inputs.extend(bond_input)
    
    w, h = coupling_input.shape
    for a in atom_input:
        w += a.shape[0]
    for a in bond_input:
        w += a.shape[0]
    
    input = np.empty((w, h), dtype='float32')
    i = 0
    for a in inputs:
        w, _ = a.shape
        input[i:i + w, :] = a
        i += w
    
    return input.T

In [11]:
labelled_train, labelled_test = partition_labelled(labelled, 1000)

labelled_train_input = combine_inputs(make_input(labelled_train))
labelled_train_output = make_output(labelled_train)

labelled_test_input = combine_inputs(make_input(labelled_test))
labelled_test_output = make_output(labelled_test)

In [12]:
from sklearn.svm import SVR,LinearSVR

In [13]:
model = LinearSVR()
model.fit(labelled_train_input, labelled_train_output)
print('')
#print(model.predict(test_input))
#print(test_output)
model.score(labelled_test_input, labelled_test_output)





Liblinear failed to converge, increase the number of iterations.



0.7044131948504228

In [14]:
labelled_train_input = combine_inputs(make_input(labelled))
labelled_train_output = make_output(labelled)

labelled_train_input.shape, labelled_train_output.shape

((4658147, 37), (4658147,))

In [15]:
model = LinearSVR()
model.fit(labelled_train_input, labelled_train_output)


Liblinear failed to converge, increase the number of iterations.



LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [16]:
unlabelled_input = make_input(unlabelled)

In [17]:
unlabelled_output = model.predict(unlabelled_input)

ValueError: Expected 2D array, got 1D array instead:
array=[array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
 list([array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[1., 1., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[1., 0., 1., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)])
 list([array([[  1.0620991,   1.0620991,   1.0620991, ...,   1.0804824,
          1.0804824,   1.0804824],
       [  1.       ,   1.       ,   1.       , ...,   1.       ,
          1.       ,   1.       ],
       [411.       , 411.       , 411.       , ..., 411.       ,
        411.       , 411.       ]], dtype=float32), array([[  1.199079 ,   0.       ,   1.199079 , ...,   1.5032722,
          1.5032722,   0.       ],
       [  3.       ,   0.       ,   3.       , ...,   1.       ,
          1.       ,   0.       ],
       [835.       ,   0.       , 835.       , ..., 346.       ,
        346.       ,   0.       ]], dtype=float32), array([[  0.       ,   0.       ,   1.0620991, ...,   1.5311819,
          0.       ,   0.       ],
       [  0.       ,   0.       ,   1.       , ...,   1.       ,
          0.       ,   0.       ],
       [  0.       ,   0.       , 411.       , ..., 346.       ,
          0.       ,   0.       ]], dtype=float32)])].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
unlabelled_output

In [None]:
unlabelled_output = unlabelled_output * scale + offset

In [None]:
unlabelled_output

In [None]:
output_df = pd.DataFrame({'id':unlabelled.id, 'scalar_coupling_constant':pd.Series(unlabelled_output, index=unlabelled.index)})

In [None]:
output_df

In [None]:
output_df.to_csv('../data/pred.csv',index=False)