In [1]:
%load_ext autoreload
%autoreload 2

%load_ext line_profiler

In [2]:
import pandas as pd
import numpy as np

import  matplotlib.pyplot as plt
import pickle

from chemistry import Molecule
from util import score
from pprint import pprint


In [3]:
# Load the pickled `molecules` object that is later handed to the model
# constructors (SKModel / LGBModel).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a pickle file produced by this project.
with open('../data/molecules_enh.pickle', 'rb') as f:
    molecules = pickle.load(f)

In [4]:
# Read the feather inputs: the structures table, the per-molecule table, and
# the coupling rows from train.feather (labelled) and test.feather (unlabelled).
structures = pd.read_feather('../data/structures_enh.feather')
molecules_df = pd.read_feather('../data/molecules.feather')
labelled = pd.read_feather('../data/train.feather')
unlabelled = pd.read_feather('../data/test.feather')

In [5]:
# Attach the molecules_df columns to every labelled coupling row, joining on
# the shared `molecule_name` column.
# validate='many_to_one' makes pandas raise MergeError if `molecule_name` is
# duplicated in molecules_df, which would otherwise silently multiply rows.
labelled_enh = labelled.merge(molecules_df, on='molecule_name', validate='many_to_one')
#labelled_enh.head(32)

In [6]:
# Attach the molecules_df columns to every unlabelled coupling row, joining on
# the shared `molecule_name` column.
# validate='many_to_one' makes pandas raise MergeError if `molecule_name` is
# duplicated in molecules_df, which would otherwise silently multiply rows.
unlabelled_enh = unlabelled.merge(molecules_df, on='molecule_name', validate='many_to_one')
# The merge is an inner join, so a row without a matching molecule would vanish
# from the predictions; fail loudly instead of writing an incomplete result.
assert len(unlabelled_enh) == len(unlabelled), 'merge dropped unlabelled rows with no molecules_df match'
#unlabelled_enh.head(32)

In [None]:
# Row counts before and after the molecule merge; they are equal when the merge
# neither dropped nor duplicated any labelled row.
len(labelled), len(labelled_enh)

In [None]:
from models import SKModel, partition_data

# Smoke-test the SKModel data setup on a 16-row sample of the merged frame.
#data_df = labelled[labelled.type == '1JHC'].head(10)
data_df = labelled_enh.head(16)
train_df, valid_df, test_df = partition_data(data_df, train_frac=1)

model = SKModel(molecules=molecules, structures=structures)

# The same frame is passed for both arguments of setup_data; the three returned
# frames are inspected in the cells that follow.
input_df, numeric_df, output_df = model.setup_data(train_df, train_df)

In [None]:
# Column labels of the `structures` attribute held by the current model.
model.structures.columns

In [None]:
# Column names of input_df (first frame returned by model.setup_data).
list(input_df.columns)

In [None]:
# Column names of numeric_df (second frame returned by model.setup_data).
list(numeric_df.columns)

In [None]:
# Per-column dtypes of the `input_df` attribute stored on the model.
model.input_df.dtypes

In [None]:
# Peek at up to the first 60 rows of numeric_df.
numeric_df.head(60)

In [None]:
# Pair each input_df column name with its values in the rows labelled 14 and
# 15 (label-based lookup via .loc) for a side-by-side comparison.
list(zip(input_df.columns, input_df.loc[14], input_df.loc[15]))

In [None]:
import itertools

In [7]:
# Keyword arguments handed to LGBModel for every coupling type.
lgb_args = dict(n_jobs=8,
                max_depth=16,
                boosting_type='gbdt',
                num_leaves=65536, #128,
                # NOTE(review): min_child_samples and min_data_in_leaf look like
                # two names for the same LightGBM setting with different values
                # (20 vs 100) -- confirm which one is intended and drop the other.
                min_child_samples=20,
                min_data_in_leaf=100,
                learning_rate=0.1,
                # Number of boosting rounds. It was previously given twice under
                # two alias names (n_estimators=1000 and num_iterations=300);
                # LightGBM warned "Found `num_iterations` in params. Will use it
                # instead of argument", so 300 was the value actually used. It is
                # kept here under a single name to remove the conflict.
                n_estimators=300,
                reg_alpha=0.1,
                reg_lambda=0.3,
                bagging_fraction = 0.9,
                bagging_freq = 5)

In [None]:
from models import LGBModel
from models import partition_data

# Sorted list of the distinct coupling types present in labelled_enh; test()
# iterates over it.
coupling_types = sorted(labelled_enh.type.unique())
# Registry of fitted models keyed by coupling type; test() fills it in.
models = {}
def test(data, count=5000):
    """Fit and evaluate one LGBModel per coupling type on a sample of `data`.

    For each type in the module-level `coupling_types`, the first `count` rows
    of that type are split with `partition_data`; a model is fitted on the
    train split (with the valid split passed alongside) and evaluated on the
    test split. Types whose train or test split has fewer than 10 rows are
    skipped. Each type gets two panels in a 4x4 subplot grid: actual vs.
    predicted values with a diagonal reference line, and the model's metric
    plot. The grid therefore has room for at most 8 coupling types.

    Side effects: stores every fitted model in the module-level `models` dict
    and leaves the most recently fitted one in the global `model`.

    Args:
        data: DataFrame with a `type` column; the test split must expose a
            `scalar_coupling_constant` column.
        count: Maximum number of rows taken per coupling type.
    """
    global model

    plt.figure(figsize=(25, 25))
    for i, t in enumerate(coupling_types, 1):
        data_df = data[data.type == t].head(count)

        train_df, valid_df, test_df = partition_data(data_df)

        if len(train_df) < 10 or len(test_df) < 10:
            continue

        print(f'Training {len(train_df)} samples for {t}')

        model = LGBModel(dict(molecules=molecules,
                              structures=structures),
                         lgb_args)
        models[t] = model

        model.fit(train_df, train_df, valid_df, valid_df)
        # Bound to `test_score` so the `score` helper imported from util is
        # not shadowed inside this function.
        output, test_score = model.evaluate(test_df, test_df)

        print(f'{t} score: {test_score} (trained on {len(train_df)} elements)')

        # Left panel: actual vs. predicted values.
        plt.subplot(4, 4, 2 * i - 1)
        actual = test_df.scalar_coupling_constant
        plt.plot(actual, output, '*')
        # Diagonal reference line spanning the full range of both series:
        # smallest of the minima up to the largest of the maxima.
        mn = min(actual.min(), output.min())
        mx = max(actual.max(), output.max())
        plt.plot([mn, mx], [mn, mx])
        plt.title(t)

        # Right panel: the model's own metric plot.
        ax = plt.subplot(4, 4, 2 * i)
        model.plot_metric(ax=ax)
    plt.show()
        
# Run the per-type evaluation on up to 50000 rows per coupling type.
test(labelled_enh, 50000)


In [None]:
from models import LGBModel
# Importance plot for the global `model`, i.e. whichever model an earlier cell
# assigned last (this cell does not create one itself).
fig = plt.figure(figsize=(15, 10))
# The axes rectangle [0, 0, 1, 1] spans the whole figure.
model.plot_importance(ax=fig.add_axes([0, 0, 1, 1]), height=0.5)
plt.show()

In [None]:
# Inspect the `last_numeric_input_df` attribute of the current global model.
model.last_numeric_input_df

In [11]:
from models import LGBModel

def train(data):
    """Fit one LGBModel per coupling type, using every row of `data`.

    Args:
        data: DataFrame with a `type` column; rows are selected per distinct
            value of that column, visited in sorted order.

    Returns:
        dict mapping each coupling type to the LGBModel fitted on its rows.
    """
    fitted = {}
    for coupling_type in sorted(data.type.unique()):
        subset = data[data.type == coupling_type]
        print(f'Training {len(subset)} samples for {coupling_type}')

        # A fresh argument dict is built for every model.
        model_kwargs = dict(molecules=molecules, structures=structures)
        estimator = LGBModel(model_kwargs, lgb_args)
        fitted[coupling_type] = estimator

        # The same frame is passed for both arguments of fit.
        estimator.fit(subset, subset)

    return fitted

# Fit the per-type models on all rows of labelled_enh; this rebinds the global
# `models` that is later passed to predict().
models = train(labelled_enh)

Training 709416 samples for 1JHC
  Setting up data
  Fitting model



Found `num_iterations` in params. Will use it instead of argument



Training 43363 samples for 1JHN
  Setting up data
  Fitting model
Training 1140674 samples for 2JHC
  Setting up data
  Fitting model
Training 378036 samples for 2JHH
  Setting up data
  Fitting model
Training 119253 samples for 2JHN
  Setting up data
  Fitting model
Training 1510379 samples for 3JHC
  Setting up data
  Fitting model
Training 590611 samples for 3JHH
  Setting up data
  Fitting model
Training 166415 samples for 3JHN
  Setting up data
  Fitting model


In [12]:
def predict(data, models):
    """Predict scalar coupling constants for `data` with the per-type models.

    Args:
        data: DataFrame with at least `id` and `type` columns, plus whatever
            columns the models' `predict` needs.
        models: Mapping from coupling type to an object whose `predict(df)`
            returns one value per row of `df`.

    Returns:
        DataFrame with columns `id` and `scalar_coupling_constant`, carrying
        the index labels of `data` and sorted by that index. An empty frame
        with those two columns is returned when `data` has no rows.

    Raises:
        KeyError: if `data` contains a type that has no entry in `models`.
    """
    frames = []

    for t in sorted(data.type.unique()):
        predict_df = data[data.type == t]
        print(f'Predicting {len(predict_df)} samples for {t}')
        output = models[t].predict(predict_df)

        # Keep the original row labels so the per-type pieces can be put back
        # into the order of `data` afterwards.
        frames.append(pd.DataFrame(data={'id': predict_df['id'],
                                         'scalar_coupling_constant': output},
                                   index=predict_df.index))

    if not frames:
        # No rows at all: return an empty result instead of failing.
        return pd.DataFrame(columns=['id', 'scalar_coupling_constant'])

    # Concatenate once and sort once, rather than appending and re-sorting
    # inside the loop.
    return pd.concat(frames).sort_index()
    
# Predict every row of unlabelled_enh with the per-type models in `models`.
#%prun -s cumulative f(unlabelled.head(10000))
prediction = predict(unlabelled_enh, models)

Predicting 380609 samples for 1JHC
  Setting up data
  Predicting
Predicting 24195 samples for 1JHN
  Setting up data
  Predicting
Predicting 613138 samples for 2JHC
  Setting up data
  Predicting
Predicting 203126 samples for 2JHH
  Setting up data
  Predicting
Predicting 64424 samples for 2JHN
  Setting up data
  Predicting
Predicting 811999 samples for 3JHC
  Setting up data
  Predicting
Predicting 317435 samples for 3JHH
  Setting up data
  Predicting
Predicting 90616 samples for 3JHN
  Setting up data
  Predicting


In [13]:
# Preview the first rows of the prediction frame.
prediction.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,11.203662
1,4658148,187.777599
2,4658149,4.631113
3,4658150,187.108198
4,4658151,10.673393


In [14]:
# Write the predictions to CSV; index=False leaves the index column out, so
# the file holds only the frame's own columns.
prediction.to_csv('../data/pred.csv',index=False)