In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

from chemistry import Atom, Bond, Molecule

In [2]:
# Load the three feather tables: per-atom coordinates, the labelled atom
# pairs used for fitting, and the unlabelled pairs that need predictions.
structures, labelled, unlabelled = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../data/structures.feather',
        '../data/train.feather',
        '../data/test.feather',
    )
)

  labels, = index.labels


In [3]:
def merge(data, structure_table=None):
    """Return a copy of ``data`` with a ``distance`` column appended.

    For every row, the coordinates of the atoms referenced by
    ``atom_index_0`` and ``atom_index_1`` (within ``molecule_name``) are
    looked up and the Euclidean distance between them is computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``molecule_name``, ``atom_index_0`` and ``atom_index_1``.
        It is not modified.
    structure_table : pandas.DataFrame, optional
        Coordinate table with ``molecule_name``, ``atom_index``, ``x``, ``y``
        and ``z`` columns.  Defaults to the notebook-level ``structures``.

    Returns
    -------
    pandas.DataFrame
        Same rows, same index and same row order as ``data``, plus
        ``distance`` (replacing any existing column of that name).  Rows
        whose atoms are missing from the coordinate table get NaN.

    Raises
    ------
    pandas.errors.MergeError
        If ``(molecule_name, atom_index)`` is not unique in the coordinate
        table, since the lookup would then duplicate rows.
    """
    # Fall back to the notebook-level table when no explicit one is passed.
    coords = structures if structure_table is None else structure_table
    coords = coords[['molecule_name', 'atom_index', 'x', 'y', 'z']]

    def atom_positions(atom_column):
        # A left merge keeps exactly one output row per input row, in input
        # order, so the result can be used positionally.  (An inner merge
        # would drop unmatched rows and shift everything after them.)
        located = data[['molecule_name', atom_column]].merge(
            coords,
            how='left',
            left_on=['molecule_name', atom_column],
            right_on=['molecule_name', 'atom_index'],
            validate='many_to_one',
        )
        return located[['x', 'y', 'z']].values

    delta = atom_positions('atom_index_0') - atom_positions('atom_index_1')
    distance = np.sqrt((delta * delta).sum(axis=1))

    # Assign positionally rather than joining on the index: the merge result
    # has a fresh 0..n-1 index that need not match ``data.index``.
    return data.assign(distance=distance)

In [4]:
# Attach the inter-atom distance to both the labelled and unlabelled tables.
labelled_merged, unlabelled_merged = (
    merge(pair_table) for pair_table in (labelled, unlabelled)
)

In [9]:
# Distance bounds span BOTH tables so one normalisation fits either of them.
distance_columns = (labelled_merged.distance, unlabelled_merged.distance)
min_dist = min(column.min() for column in distance_columns)
max_dist = max(column.max() for column in distance_columns)

# Target bounds can only come from the labelled table.
coupling = labelled_merged.scalar_coupling_constant
min_coeff, max_coeff = coupling.min(), coupling.max()

In [18]:
# Affine map for the target; kept so normalised predictions can be mapped
# back to coupling constants later in the notebook.
offset = min_coeff
scale = max_coeff - min_coeff
dist_range = max_dist - min_dist

# Min-max scale the distance in both tables using the shared bounds.
for merged_table in (labelled_merged, unlabelled_merged):
    merged_table['norm_distance'] = (merged_table.distance - min_dist) / dist_range

# Min-max scale the target (labelled rows only).
labelled_merged['norm_scc'] = (labelled_merged.scalar_coupling_constant - offset) / scale

In [19]:
types = labelled.type.unique()

# One one-hot slot per coupling type, numbered in order of first appearance.
type_index = dict(zip(types, range(len(types))))

# The distance feature occupies the slot right after the one-hot block.
dist_index = len(types)

# Total number of feature slots; `index` keeps its role as the next free slot.
columns = index = dist_index + 1

In [20]:
def partition_labelled(data, count=None, train_frac=0.7, seed=None):
    """Randomly split ``data`` into a training frame and a held-out frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; rows are selected positionally with ``iloc``.
    count : int, optional
        Split only the first ``count`` rows of ``data``.  Defaults to all
        rows of ``data`` (previously this read the length of the global
        ``labelled`` frame, which is wrong for any other input).
    train_frac : float
        Fraction of the used rows placed in the training frame; the training
        size is truncated to an integer.
    seed : int, optional
        When given, the shuffle uses a private ``RandomState(seed)`` so the
        split is reproducible.  When omitted, NumPy's global RNG is used, as
        before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; together they contain each used row exactly once.
    """
    n_rows = len(data) if count is None else count
    n_train = int(n_rows * train_frac)

    # `np.random` (module) and `RandomState` both expose an in-place shuffle.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = np.arange(n_rows)
    rng.shuffle(order)

    train = data.iloc[order[:n_train], :]
    test = data.iloc[order[n_train:], :]
    return train, test

def make_input(data, columns, type_index, dist_index):
    """Build the float32 feature matrix for ``data``.

    Each sample gets a one-hot encoding of ``data.type`` (slot numbers taken
    from ``type_index``) plus ``data.norm_distance`` in slot ``dist_index``.
    Types that are absent from ``type_index`` leave the one-hot part all zero.

    Returns an array of shape ``(len(data), columns)``.
    """
    # Filled feature-major, then transposed to sample-major on return.
    features = np.zeros((columns, len(data)), dtype='float32')

    for coupling_type, slot in type_index.items():
        is_this_type = (data.type == coupling_type).values
        features[slot, is_this_type] = 1

    features[dist_index, :] = data.norm_distance.values

    return features.T

def make_output(data, columns, type_index, dist_index):
    """Return ``data.norm_scc`` as a new 1-D float32 array.

    ``columns``, ``type_index`` and ``dist_index`` are not used; they are
    accepted so the signature mirrors ``make_input``.
    """
    # np.array copies, so the result never aliases the DataFrame's storage.
    return np.array(data.norm_scc, dtype='float32')

In [21]:
from sklearn.svm import SVR,LinearSVR

In [22]:
# Inspect which columns the merge and normalisation cells have added.
labelled_merged.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'distance', 'norm_distance', 'norm_scc'],
      dtype='object')

In [23]:
# Seed NumPy's global RNG before splitting: partition_labelled shuffles with
# it, so without a seed the split (and every score derived from it) changes
# on each Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

labelled_train, labelled_test = partition_labelled(labelled_merged)

# Feature matrices and normalised targets for the fitting and held-out rows.
labelled_train_input = make_input(labelled_train, columns, type_index, dist_index)
labelled_train_output = make_output(labelled_train, columns, type_index, dist_index)

labelled_test_input = make_input(labelled_test, columns, type_index, dist_index)
labelled_test_output = make_output(labelled_test, columns, type_index, dist_index)

# Sanity check: sample count must agree between features and targets.
labelled_train_input.shape, labelled_train_output.shape

((3260702, 9), (3260702,))

In [24]:
# Peek at the normalised targets of the training and held-out splits.
labelled_train_output,labelled_test_output

(array([0.15613064, 0.19356978, 0.14963348, ..., 0.14872889, 0.59662145,
        0.14208329], dtype=float32),
 array([0.15984714, 0.96508896, 0.54827154, ..., 0.16897884, 0.11056755,
        0.18812917], dtype=float32))

In [25]:
# Fit a linear support-vector regressor on the normalised training split
# (fit returns the estimator itself, so it can be bound in one step).
model = LinearSVR().fit(labelled_train_input, labelled_train_output)
print('')

# Score on the held-out split; targets here are still in normalised units.
model.score(labelled_test_input, labelled_test_output)






0.9424511846976048

In [26]:
# Build the feature matrix for the rows that need predictions, using the
# same slot layout as the training features.
unlabelled_input = make_input(
    data=unlabelled_merged,
    columns=columns,
    type_index=type_index,
    dist_index=dist_index,
)

In [27]:
# Predict for the unlabelled pairs; the model was fitted on norm_scc, so
# these values are in normalised units.
unlabelled_output = model.predict(unlabelled_input)

In [28]:
# Inspect the raw (still normalised) predictions.
unlabelled_output

array([0.14660773, 0.51835309, 0.16558849, ..., 0.16350243, 0.14605477,
       0.51780013])

In [29]:
# Map predictions from normalised units back to coupling constants
# (inverse of the norm_scc scaling).  The value is recomputed from the model
# instead of rescaling the variable in place, so re-running this cell cannot
# apply scale/offset a second time.
unlabelled_output = model.predict(unlabelled_input) * scale + offset

In [30]:
# Inspect the predictions after conversion back to coupling-constant units.
unlabelled_output

array([-0.87168158, 88.75560426,  3.70455341, ...,  3.20160632,
       -1.0049999 , 88.62228594])

In [31]:
# Submission table: the id of every unlabelled row next to its prediction.
# The prediction array is attached positionally, keeping unlabelled's index.
output_df = unlabelled[['id']].assign(scalar_coupling_constant=unlabelled_output)

In [32]:
# Preview the submission table (id plus predicted coupling constant).
output_df

Unnamed: 0,id,scalar_coupling_constant
0,4658147,-0.871682
1,4658148,88.755604
2,4658149,3.704553
3,4658150,88.755604
4,4658151,-1.005000
5,4658152,88.687279
6,4658153,3.266599
7,4658154,-11.038856
8,4658155,-11.038856
9,4658156,88.687279


In [33]:
# Write the predictions as CSV, leaving out the DataFrame index column.
output_df.to_csv('../data/pred.csv',index=False)