In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

from chemistry import Atom, Bond, Molecule

In [2]:
# Read the three feather tables: atom structures, the labelled (train)
# atom pairs and the unlabelled (test) atom pairs.
structures, labelled, unlabelled = map(
    pd.read_feather,
    (
        '../data/structures.feather',
        '../data/train.feather',
        '../data/test.feather',
    ),
)

  labels, = index.labels


In [3]:
def merge(data, structures_df=None):
    """Return a copy of `data` with pair distance and atom symbols attached.

    For every row, both atoms of the pair (`atom_index_0` and `atom_index_1`
    within `molecule_name`) are looked up in the structures table. The result
    has three extra columns: `distance` (Euclidean distance between the two
    atoms), `atom_0` and `atom_1` (the `atom` value of each atom).

    Parameters
    ----------
    data : DataFrame with `molecule_name`, `atom_index_0`, `atom_index_1`.
    structures_df : DataFrame with `molecule_name`, `atom_index`, `atom`,
        `x`, `y`, `z`. Defaults to the notebook-level `structures` table.

    Returns
    -------
    DataFrame with the same rows, order and index as `data`; `data` itself is
    not modified. Rows whose atoms are missing from the structures table get
    NaN in the new columns instead of being dropped.

    Raises
    ------
    pandas.errors.MergeError if (`molecule_name`, `atom_index`) is not unique
    in the structures table.
    """
    if structures_df is None:
        structures_df = structures

    keys = ['molecule_name', 'atom_index']
    coords = ['x', 'y', 'z']
    lookup = structures_df[keys + ['atom'] + coords]

    def _atoms_for(atom_column):
        # A left join keeps exactly one output row per input row, in input
        # order, so the result can be compared with `data` row by row.
        # `validate` guards against duplicated structure keys, which would
        # silently multiply rows.
        return data[['molecule_name', atom_column]].merge(
            lookup,
            how='left',
            left_on=['molecule_name', atom_column],
            right_on=keys,
            validate='many_to_one',
        )

    m0 = _atoms_for('atom_index_0')
    m1 = _atoms_for('atom_index_1')

    # Work on raw arrays so nothing depends on index alignment.
    delta = m0[coords].values - m1[coords].values
    squared = delta * delta

    merged = data.copy()
    merged['distance'] = np.sqrt(squared[:, 0] + squared[:, 1] + squared[:, 2])
    merged['atom_0'] = m0['atom'].values
    merged['atom_1'] = m1['atom'].values

    return merged

In [4]:
# Attach distance / atom columns to both the labelled and unlabelled pairs.
labelled_merged, unlabelled_merged = (
    merge(frame) for frame in (labelled, unlabelled)
)

In [5]:
# Distance range taken over BOTH datasets so they share one normalisation.
min_dist = min(frame['distance'].min() for frame in (labelled_merged, unlabelled_merged))
max_dist = max(frame['distance'].max() for frame in (labelled_merged, unlabelled_merged))

# Target range; only the labelled data carries the coupling constant.
min_coeff = labelled_merged['scalar_coupling_constant'].min()
max_coeff = labelled_merged['scalar_coupling_constant'].max()

In [6]:
# offset/scale describe the min-max map applied to the target below.
offset, scale = min_coeff, max_coeff - min_coeff

# Min-max normalise the distances of both frames with the shared range.
labelled_merged['norm_distance'] = (
    labelled_merged['distance'].sub(min_dist).div(max_dist - min_dist)
)
unlabelled_merged['norm_distance'] = (
    unlabelled_merged['distance'].sub(min_dist).div(max_dist - min_dist)
)

# Min-max normalise the target.
labelled_merged['norm_scc'] = (
    labelled_merged['scalar_coupling_constant'].sub(min_coeff).div(max_coeff - min_coeff)
)

In [7]:
types = labelled.type.unique()
atoms = structures.atom.unique()

# Feature-column layout:
#   [0, len(types))          one column per coupling type
#   next 2 * len(atoms)      two consecutive columns per atom symbol
#   last column              normalised distance
type_index = {name: position for position, name in enumerate(types)}

atom_index = {
    symbol: len(types) + 2 * position
    for position, symbol in enumerate(atoms)
}

dist_index = len(types) + 2 * len(atoms)

# `index` ends up one past the last used column, i.e. the total width.
index = dist_index + 1
columns = index

In [8]:
def partition_labelled(data, count=None, train_frac=0.7):
    """Randomly split the first rows of `data` into train and test frames.

    Parameters
    ----------
    data : DataFrame to split.
    count : number of leading rows of `data` to use; `None` uses every row.
        Values larger than `len(data)` are clamped to `len(data)`.
    train_frac : fraction of the used rows that goes into the train frame
        (the row count is truncated with `int`).

    Returns
    -------
    (train, test) : two disjoint DataFrames that together contain exactly the
        first `count` rows of `data`, in shuffled order.

    Shuffling uses NumPy's global random state; seed it with `np.random.seed`
    for a reproducible split.
    """
    available = len(data)
    # Size the split from the frame that was passed in, not from a global.
    n_labelled = available if count is None else min(count, available)
    n_train = int(n_labelled * train_frac)

    indices = np.arange(n_labelled)
    np.random.shuffle(indices)

    train = data.iloc[indices[:n_train], :]
    test = data.iloc[indices[n_train:], :]

    return train, test

def make_input(data, columns, type_index, atom_index, dist_index):
    """Encode `data` as a float32 feature matrix of shape (len(data), columns).

    Parameters
    ----------
    data : DataFrame with `type`, `atom_0`, `atom_1` and `norm_distance`.
    columns : total number of feature columns.
    type_index : dict mapping a `type` value to its one-hot column.
    atom_index : dict mapping an atom symbol to the first of two consecutive
        columns; column `k` flags `atom_0 == symbol`, column `k + 1` flags
        `atom_1 == symbol`.
    dist_index : column that receives `norm_distance`.

    Returns
    -------
    numpy.ndarray (float32) with one row per row of `data`.
    """
    n = len(data)
    # Built feature-major and transposed on return.
    features = np.zeros((columns, n), dtype='float32')

    pair_type = data['type'].values
    first_atom = data['atom_0'].values
    second_atom = data['atom_1'].values

    for t, row in type_index.items():
        features[row, pair_type == t] = 1

    # Compare against the atom symbol `a`, not the leftover type value.
    for a, row in atom_index.items():
        features[row, first_atom == a] = 1
        features[row + 1, second_atom == a] = 1

    features[dist_index] = data['norm_distance'].values

    return features.T

def make_output(data):
    """Return the `norm_scc` column of `data` as a new 1-D float32 array."""
    # np.array copies, so the result never aliases the frame's storage.
    return np.array(data.norm_scc, dtype='float32')

In [None]:
from sklearn.svm import SVR,LinearSVR

In [None]:
# Inspect which columns the merged labelled frame carries.
labelled_merged.columns

In [41]:
# Hold-out experiment on a 30000-row subset of the labelled pairs.
labelled_train, labelled_test = partition_labelled(labelled_merged, count=30000)

labelled_train_input = make_input(
    data=labelled_train,
    columns=columns,
    type_index=type_index,
    atom_index=atom_index,
    dist_index=dist_index,
)
labelled_train_output = make_output(data=labelled_train)

labelled_test_input = make_input(
    data=labelled_test,
    columns=columns,
    type_index=type_index,
    atom_index=atom_index,
    dist_index=dist_index,
)
labelled_test_output = make_output(data=labelled_test)

labelled_train_input.shape, labelled_train_output.shape

((21000, 19), (21000,))

In [34]:
# Baseline: linear support-vector regression fitted on the training split.
model = LinearSVR().fit(labelled_train_input, labelled_train_output)
print('')
# Evaluate on the held-out split; the value is shown as the cell output.
model.score(labelled_test_input, labelled_test_output)

NameError: name 'LinearSVR' is not defined

In [35]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.utils import Sequence

In [36]:
def make_nn_model():
    """Build, compile and print a dense regression network.

    The network takes `columns` input features, passes them through six
    hidden Dense layers (64, 64, 32, 32, 16, 16 units, alternating relu and
    sigmoid activations) and ends in a single linear output unit. It is
    compiled with mean-absolute-error loss and the adam optimizer, and its
    summary is printed before the model is returned.
    """
    hidden_layers = (
        (64, 'relu'), (64, 'sigmoid'),
        (32, 'relu'), (32, 'sigmoid'),
        (16, 'relu'), (16, 'sigmoid'),
    )

    inputs = Input(shape=(columns,))

    hidden = inputs
    for units, activation in hidden_layers:
        hidden = Dense(units, activation=activation)(hidden)

    outputs = Dense(1)(hidden)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(loss='mean_absolute_error', optimizer='adam')
    model.summary()

    return model

In [37]:
class Batch(Sequence):
    """Serve (input, output) arrays to Keras in consecutive fixed-size slices.

    Parameters
    ----------
    input_data : 2-D array, one row per sample.
    output_data : array with one entry per sample, same order as `input_data`.
    batch_size : positive number of samples per batch.

    Raises
    ------
    ValueError if `batch_size` is not positive.
    """

    def __init__(self, input_data, output_data, batch_size):
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')

        self.input_data = input_data
        self.output_data = output_data
        self.batch_size = batch_size

        # Total number of samples.
        self.l = len(self.output_data)

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        # Ceiling division: flooring would never serve the last
        # `l % batch_size` samples, and would report zero batches when the
        # data is shorter than one batch.
        return (self.l + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return batch `idx` as an (inputs, outputs) pair."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        # Slicing past the end simply yields a shorter final batch.
        return self.input_data[start:stop, :], self.output_data[start:stop]

In [46]:
# Train the dense network on the current training arrays in batches of 1024.
nn_model = make_nn_model()
history = nn_model.fit_generator(
    generator=Batch(
        input_data=labelled_train_input,
        output_data=labelled_train_output,
        batch_size=1024,
    ),
    steps_per_epoch=128,
    epochs=150,
    verbose=1,
    workers=8,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 19)                0         
_________________________________________________________________
dense_112 (Dense)            (None, 64)                1280      
_________________________________________________________________
dense_113 (Dense)            (None, 64)                4160      
_________________________________________________________________
dense_114 (Dense)            (None, 32)                2080      
_________________________________________________________________
dense_115 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_116 (Dense)            (None, 16)                528       
_________________________________________________________________
dense_117 (Dense)            (None, 16)                272       
__________

KeyboardInterrupt: 

In [47]:
# Re-encode using every labelled row (this replaces the subset arrays).
labelled_train_input = make_input(
    data=labelled_merged,
    columns=columns,
    type_index=type_index,
    atom_index=atom_index,
    dist_index=dist_index,
)
labelled_train_output = make_output(data=labelled_merged)

labelled_train_input.shape, labelled_train_output.shape

((4658147, 19), (4658147,))

In [None]:
# Fit the linear SVR on the current training arrays.
model = LinearSVR().fit(labelled_train_input, labelled_train_output)

In [48]:
# Encode the unlabelled pairs with the same column layout as the training data.
unlabelled_input = make_input(
    data=unlabelled_merged,
    columns=columns,
    type_index=type_index,
    atom_index=atom_index,
    dist_index=dist_index,
)

In [None]:
# Predict the normalised target for the unlabelled pairs with the SVR model.
unlabelled_output = model.predict(X=unlabelled_input)

In [None]:
# Train a fresh network on the current training arrays, then predict the
# normalised target for the unlabelled pairs.
nn_model = make_nn_model()
history = nn_model.fit_generator(
    generator=Batch(
        input_data=labelled_train_input,
        output_data=labelled_train_output,
        batch_size=1024,
    ),
    steps_per_epoch=128,
    epochs=150,
    verbose=1,
    workers=8,
)
unlabelled_output = nn_model.predict(unlabelled_input)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 19)                0         
_________________________________________________________________
dense_119 (Dense)            (None, 64)                1280      
_________________________________________________________________
dense_120 (Dense)            (None, 64)                4160      
_________________________________________________________________
dense_121 (Dense)            (None, 32)                2080      
_________________________________________________________________
dense_122 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_123 (Dense)            (None, 16)                528       
_________________________________________________________________
dense_124 (Dense)            (None, 16)                272       
__________

In [None]:
# Show the raw (still normalised) predictions.
unlabelled_output

In [None]:
# Undo the target normalisation (x * scale + offset).
# NOTE: this overwrites its own input, so run the cell exactly once per
# prediction; re-running it rescales the values a second time.
unlabelled_output = offset + scale * unlabelled_output

In [None]:
# Show the predictions after rescaling.
unlabelled_output

In [None]:
# Build the submission table: one prediction per unlabelled row id.
output_df = pd.DataFrame({
    'id': unlabelled.id,
    # np.ravel flattens the (n, 1) array returned by the network's predict
    # (its last layer is Dense(1)); pd.Series rejects 2-D data. Predictions
    # that are already 1-D pass through unchanged.
    'scalar_coupling_constant': pd.Series(np.ravel(unlabelled_output), index=unlabelled.index),
})

In [None]:
# Display the submission table before writing it out.
output_df

In [None]:
# Write the submission file without the DataFrame index column.
output_df.to_csv(path_or_buf='../data/pred.csv', index=False)