In [2]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

from chemistry import Atom, Bond, Molecule

In [3]:
# Load the three feather tables the notebook works with, in this order:
# per-atom coordinates, the labelled (train) rows and the unlabelled (test) rows.
structures, labelled, unlabelled = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../data/structures.feather',
        '../data/train.feather',
        '../data/test.feather',
    )
)

  labels, = index.labels


In [12]:
def merge(data, structure_table=None):
    """Return a copy of ``data`` with a ``distance`` column appended.

    ``distance`` is the Euclidean distance between the two atoms referenced by
    each row (``atom_index_0`` and ``atom_index_1`` within ``molecule_name``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``molecule_name``, ``atom_index_0`` and ``atom_index_1``.
    structure_table : pandas.DataFrame, optional
        Atom coordinates with columns ``molecule_name``, ``atom_index``, ``x``,
        ``y`` and ``z``. Defaults to the notebook-level ``structures`` frame.

    Returns
    -------
    pandas.DataFrame
        ``data`` (same rows, same index, same order) plus ``distance``. Rows
        whose atoms cannot be found in ``structure_table`` get ``NaN``.

    Raises
    ------
    pandas.errors.MergeError
        If ``(molecule_name, atom_index)`` is not unique in ``structure_table``.
    """
    if structure_table is None:
        structure_table = structures
    coords = structure_table[['molecule_name', 'atom_index', 'x', 'y', 'z']]

    def _atom_xyz(index_column):
        # A left merge keeps exactly one output row per input row, in input
        # order, so the two coordinate blocks stay positionally aligned even
        # when an atom is missing (an inner merge would drop the row and shift
        # everything after it). `validate` rejects duplicated structure keys,
        # which would otherwise multiply rows and break the same alignment.
        looked_up = data[['molecule_name', index_column]].merge(
            coords,
            how='left',
            left_on=['molecule_name', index_column],
            right_on=['molecule_name', 'atom_index'],
            validate='many_to_one',
        )
        return looked_up[['x', 'y', 'z']].to_numpy()

    delta = _atom_xyz('atom_index_0') - _atom_xyz('atom_index_1')

    # Re-attach the caller's index: the merge result always carries a fresh
    # 0..n-1 index, which only matches `data` when `data` has a default index.
    distance = pd.Series(
        np.sqrt((delta * delta).sum(axis=1)),
        index=data.index,
        name='distance',
    )
    return data.join(distance)

In [13]:
# Add the atom-pair distance to the labelled rows ...
labelled_merged = merge(labelled)
# ... and write the result to disk as a feather file.
labelled_merged.to_feather('../data/train_with_dist.feather')

In [14]:
# Every distinct value of the `type` column gets its own feature column,
# numbered in the order returned by `unique()`.
types = labelled.type.unique()
type_index = {t: position for position, t in enumerate(types)}

# The distance feature occupies the column right after the type columns.
dist_index = len(types)

# `index` ends one past the last assigned column, i.e. the total feature count.
index = dist_index + 1
columns = index

In [21]:
def partition_labelled(data, count=None, train_frac=0.7, seed=None):
    """Randomly split rows of ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position.
    count : int, optional
        Only the first ``count`` rows take part in the split. Defaults to all
        rows of ``data``.
    train_frac : float, default 0.7
        Fraction of the participating rows placed in the train part
        (``int(n * train_frac)`` rows; the remainder goes to the test part).
    seed : int, optional
        When given, the shuffle uses its own ``RandomState(seed)`` so the split
        is reproducible. When omitted, NumPy's global random state is used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, disjoint and together covering the participating rows.
    """
    # Size the split from the frame that was actually passed in, not from a
    # notebook-level global that merely happens to have the same length.
    n_labelled = len(data) if count is None else count
    n_train = int(n_labelled * train_frac)

    indices = np.arange(n_labelled)
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    train = data.iloc[indices[:n_train]]
    test = data.iloc[indices[n_train:]]

    return train, test

def make_input(data, columns, type_index, dist_index):
    """Build the float32 feature matrix for ``data``.

    The result has one row per row of ``data`` and ``columns`` columns: a
    one-hot block in which column ``type_index[t]`` is 1 where ``data.type``
    equals ``t``, and ``data.distance`` in column ``dist_index``. Types absent
    from ``type_index`` leave the one-hot block at zero.
    """
    n_rows = len(data)
    # Filled feature-major (one row per feature) and transposed on return.
    features = np.zeros((columns, n_rows), dtype='float32')

    for label, row in type_index.items():
        features[row, data.type == label] = 1

    features[dist_index] = data.distance

    return features.T

def make_output(data, columns, type_index, dist_index):
    """Return ``data.scalar_coupling_constant`` as a new 1-D float32 array.

    ``columns``, ``type_index`` and ``dist_index`` are not used; they are
    accepted so the signature mirrors ``make_input``.
    """
    targets = np.empty(len(data), dtype='float32')
    np.copyto(targets, data.scalar_coupling_constant, casting='unsafe')
    return targets

In [16]:
from sklearn.svm import SVR,LinearSVR

In [17]:
# Show the column names of the merged frame (confirms `distance` was added).
labelled_merged.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'distance'],
      dtype='object')

In [62]:
# Random train/test split of the labelled rows (with their distances).
labelled_train, labelled_test = partition_labelled(labelled_merged)

# Feature matrices for both parts ...
labelled_train_input = make_input(labelled_train, columns, type_index, dist_index)
labelled_test_input = make_input(labelled_test, columns, type_index, dist_index)

# ... and the matching target vectors.
labelled_train_output = make_output(labelled_train, columns, type_index, dist_index)
labelled_test_output = make_output(labelled_test, columns, type_index, dist_index)

# Display the training shapes as a quick sanity check.
labelled_train_input.shape, labelled_train_output.shape

((3260702, 9), (3260702,))

In [63]:
# Fit a LinearSVR with its default settings on the training part of the split.
model = LinearSVR()
model.fit(labelled_train_input, labelled_train_output)
print('')
# Disabled debug output. NOTE(review): `test_input` / `test_output` are not
# defined in the visible cells; the held-out arrays are named
# `labelled_test_input` / `labelled_test_output`.
#print(model.predict(test_input))
#print(test_output)
# Score on the held-out part (R^2 for scikit-learn regressors, per its docs);
# as the last expression this is what the cell displays.
model.score(labelled_test_input, labelled_test_output)




0.9415817040944513

In [28]:
# Add the atom-pair distance to the unlabelled rows, as was done for the labelled ones.
unlabelled_merged = merge(unlabelled)

In [29]:
# Build the feature matrix with the same column layout used for training.
unlabelled_input = make_input(unlabelled_merged, columns, type_index, dist_index)

In [64]:
# Predict one value per unlabelled row with the fitted model.
unlabelled_output = model.predict(unlabelled_input)

In [65]:
# Display the predictions (NumPy abbreviates long arrays in the repr).
unlabelled_output

array([-0.97277427, 88.200912  ,  3.60078676, ...,  2.73325698,
       -1.07779466, 88.09589161])

In [66]:
# Two-column frame: the `id` of each unlabelled row next to its prediction.
# The predictions are attached positionally, on the index of `unlabelled`.
output_df = unlabelled[['id']].assign(scalar_coupling_constant=unlabelled_output)

In [67]:
# Display the prediction frame for a visual check before writing it out.
output_df

Unnamed: 0,id,scalar_coupling_constant
0,4658147,-0.972774
1,4658148,88.200912
2,4658149,3.600787
3,4658150,88.200912
4,4658151,-1.077795
5,4658152,88.147089
6,4658153,2.784455
7,4658154,-11.391395
8,4658155,-11.391395
9,4658156,88.147089


In [68]:
# Write the predictions as CSV; index=False keeps only the two data columns.
output_df.to_csv('../data/pred.csv',index=False)