In [1]:
%load_ext autoreload
%autoreload 2

%load_ext line_profiler

In [2]:
import pandas as pd
import numpy as np

import  matplotlib.pyplot as plt
import pickle

from chemistry import Molecule
from util import score
from pprint import pprint


In [3]:
with open('../data/molecules_enh.pickle', 'rb') as f:
    molecules = pickle.load(f)

m = molecules['dsgdb9nsd_000002']
m.compute_path(2, 3)
m

m = molecules['dsgdb9nsd_000014']
i0, i1 = 2, 3
p0, p1 = m.positions[i0], m.positions[i1]
mid = (p0 + p1) * 0.5
other_atoms = list(range(m.n_atoms))
other_atoms.remove(i0)
other_atoms.remove(i1)
p = m.positions[other_atoms]
diff = p - mid
dist = np.linalg.norm(diff, axis=1)
print(dist)
print(dist.argsort()[0:2])
print(dist[dist.argsort()[0:2]])

In [4]:
structures = pd.read_feather('../data/structures_enh.feather')
molecules_df = pd.read_feather('../data/molecules.feather')
labelled = pd.read_feather('../data/train.feather')
unlabelled = pd.read_feather('../data/test.feather')

In [5]:
labelled_enh = labelled.merge(molecules_df, left_on='molecule_name', right_on='molecule_name')
#labelled_enh.head(32)

In [6]:
unlabelled_enh = unlabelled.merge(molecules_df, left_on='molecule_name', right_on='molecule_name')
#unlabelled_enh.head(32)

In [14]:
from models import LGBModel
from models import partition_data
import util
from hyperopt import fmin, tpe, hp, STATUS_OK

#coupling_types = sorted(labelled_enh.type.unique())[0:1]
coupling_types = ['3JHC', '3JHH']
models = {}
def run_test(data, count, args):
    
    num_leaves, min_child_samples, min_data_in_leaf, reg_alpha, reg_lambda, bagging_fraction, bagging_freq = args
    
    lgb_args = dict(n_jobs=8,
                    max_depth=16,
                    boosting_type='gbdt',
                    num_leaves=num_leaves,
                    min_child_samples=int(min_child_samples),
                    min_data_in_leaf=int(min_data_in_leaf),
                    learning_rate=0.1,
                    n_estimators=1000,
                    reg_alpha=reg_alpha,
                    reg_lambda=reg_lambda,
                    bagging_fraction = bagging_fraction,
                    bagging_freq = int(bagging_freq),
                    num_iterations=300)
    lgb_fit_args = dict(early_stopping_rounds=20,
                        verbose=False)
    
    out_df = None
    
    try:
        for i, t in enumerate(coupling_types, 1):    
            data_df = data[data.type == t].sample(count)

            train_df, valid_df, test_df = partition_data(data_df)

            if len(train_df) < 10 or len(test_df) < 10:
                continue

            model = LGBModel(dict(molecules=molecules, 
                                  structures=structures),
                             lgb_args, lgb_fit_args)
            models[t] = model

            model.fit(train_df, train_df, valid_df, valid_df)
            output = model.predict(test_df)

            id = test_df['id']
            out_df_coupling = pd.DataFrame(data={'id':id, 'out_scc':output}, index=test_df.index)
            out_df_coupling['type'] = test_df.type
            out_df_coupling['ref_scc'] = test_df.scalar_coupling_constant

            if out_df is None:
                out_df = out_df_coupling
            else:
                out_df = out_df.append(out_df_coupling).sort_index()

        return util.score(out_df, out_df.ref_scc, out_df.out_scc)
    except:
        return 100000

def run_opt(data, count):
    fn = lambda args: run_test(data, count, args)
    
    space = [
        hp.choice('num_leaves', [128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]),
        hp.quniform('min_child_samples', 10, 100, 1),
        hp.quniform('min_data_in_leaf', 50, 200, 1),
        hp.uniform('reg_alpha', 0.01, 5),
        hp.uniform('reg_lambda', 0.01, 5),
        hp.uniform('bagging_fraction', 0.9, 1),
        hp.choice('bagging_freq', [1, 3, 5]),
    ]
    
    best = fmin(fn,
                space=space,
                algo=tpe.suggest,
                max_evals=10)
    return best
    
 
run_opt(labelled_enh, 50000)
#run_test(labelled_enh.head(200))

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]


Found `num_iterations` in params. Will use it instead of argument




 10%|█         | 1/10 [01:22<12:24, 82.75s/it, best loss: -0.49589307632330837]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 20%|██        | 2/10 [03:21<12:27, 93.43s/it, best loss: -0.5311903023803697] 


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 30%|███       | 3/10 [04:46<10:37, 91.04s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 40%|████      | 4/10 [06:20<09:11, 91.95s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 50%|█████     | 5/10 [07:40<07:21, 88.35s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 60%|██████    | 6/10 [08:47<05:27, 81.82s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 70%|███████   | 7/10 [10:15<04:11, 83.83s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 80%|████████  | 8/10 [11:23<02:37, 78.99s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




 90%|█████████ | 9/10 [12:34<01:16, 76.65s/it, best loss: -0.5311903023803697]


Found `num_iterations` in params. Will use it instead of argument



Found `num_iterations` in params. Will use it instead of argument




100%|██████████| 10/10 [14:42<00:00, 92.13s/it, best loss: -0.5462469105383353]


{'bagging_fraction': 0.9921343868390777,
 'bagging_freq': 0,
 'min_child_samples': 70.0,
 'min_data_in_leaf': 68.0,
 'num_leaves': 9,
 'reg_alpha': 3.3985058768448306,
 'reg_lambda': 0.6552719703561464}