In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [2]:
"""
Read in the training CSV as a Pandas DataFrame and split it into training and validation sets
"""
# Fixed seed so the train/validation split is identical on every
# Restart & Run All; change it to draw a different split.
SPLIT_SEED = 0
# Probability that a row lands in the training split (the rest go to validation).
TRAIN_FRACTION = 0.8

df_non_test = pd.read_csv("train.csv")

# Downcast the fingerprint columns to int8. The columns are selected by name
# (prefix 'feat_') for both the read and the write-back, so the assignment does
# not depend on where those columns sit in the file.
feature_cols = df_non_test.columns[df_non_test.columns.str.startswith('feat_')]
non_test_features = df_non_test.loc[:, feature_cols].astype(np.int8)
df_non_test[feature_cols] = non_test_features

#split training/validation ~80/20: each row is assigned independently, so the
#proportions are approximate rather than exact
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(df_non_test)) < TRAIN_FRACTION
df_train, df_validation = df_non_test[msk].copy(deep = True), df_non_test[~msk].copy(deep = True)

In [3]:
# Preview the first rows of the training split (index gaps are rows that went to validation).
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1.6
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.98
5,C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.81


In [4]:
# Preview the first rows of the validation split.
df_validation.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.49
14,c1sc(-c2sc(-c3sc(-c4ncncn4)c4nccnc34)c3cc[nH]c...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.14
17,[nH]1c2-c3ncc(cc3Cc2c2[se]ccc12)-c1scc2ccoc12,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,2.4
18,c1cnc(s1)-c1ccc(cc1)-c1sc(c2Cccc12)-c1scc2occc12,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.08
23,[nH]1cccc1-c1cnc2c(c1)c1cocc1c1c3sccc3ccc21,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,2.68


In [5]:
# Pull the regression target ('gap') out of each split as a NumPy array.
Y_train, Y_validation = df_train['gap'].values, df_validation['gap'].values

# Number of training rows -- presumably the offset at which validation rows
# would begin if the two splits were stacked; not used in the cells shown here.
validation_idx = len(df_train)

# Strip the target column so the frames no longer carry the label.
df_train = df_train.drop(labels=['gap'], axis='columns')
df_validation = df_validation.drop(labels=['gap'], axis='columns')

In [6]:
# Persist the target vectors, one CSV file per split (train first, then validation).
for targets, csv_path in ((Y_train, 'Y_TRAIN___.csv'),
                          (Y_validation, 'Y_VALIDATION___.csv')):
    pd.DataFrame(targets).to_csv(csv_path)

In [7]:
#extract 1024 Morgan-fingerprint bits plus the atom count and add to molecules data
smiles_train_list = df_train['smiles']
smiles_validation_list = df_validation['smiles']

def mol_to_objects(x):
    """Parse SMILES string x into an RDKit molecule object.

    NOTE(review): Chem.MolFromSmiles returns None for SMILES it cannot parse;
    such entries would make the helpers below fail -- confirm the data is clean.
    """
    return Chem.MolFromSmiles(x)

def features_extract(x):
    """Return the radius-2, 1024-bit Morgan fingerprint bit vector of molecule x."""
    return AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024)

def bit_value(x):
    """Expand bit vector x into a list holding GetBit(i) for every bit position."""
    return [x.GetBit(i) for i in range(x.GetNumBits())]

def get_num_atoms(x):
    """Return the atom count reported by molecule x."""
    return x.GetNumAtoms()

# Every map() result is materialised as a list before it reaches NumPy.
# On Python 3 map() is a lazy iterator: np.array(map(...)) produces a 0-d
# object array (so the reshape fails) and np.vstack requires a real sequence.
m_train = [mol_to_objects(s) for s in smiles_train_list] # list of molecule objects
m_validation = [mol_to_objects(s) for s in smiles_validation_list]

fp_train = [features_extract(m) for m in m_train] #list of bit vectors
fp_validation = [features_extract(m) for m in m_validation]

# One row per molecule, one column per fingerprint bit.
fp_values_train = np.vstack([bit_value(fp) for fp in fp_train]) # extract values
fp_values_validation = np.vstack([bit_value(fp) for fp in fp_validation])

# Atom counts as column vectors so they can be appended as one extra feature.
atoms_train = np.array([get_num_atoms(m) for m in m_train]).reshape((len(m_train),1))
atoms_validation = np.array([get_num_atoms(m) for m in m_validation]).reshape((len(m_validation),1))

# Append the new features column-wise. From here on df_train / df_validation
# are plain NumPy arrays (column labels are gone), so this cell cannot be
# re-run without first re-running the cells that build the DataFrames.
df_train = np.concatenate((df_train, fp_values_train, atoms_train),axis=1)
df_validation = np.concatenate((df_validation, fp_values_validation, atoms_validation),axis=1)

In [8]:
# Wrap the augmented training array in a DataFrame and write it to disk.
train_data = pd.DataFrame(data=df_train)
train_data.to_csv(path_or_buf='TRAIN_DATA___.csv')

# Same for the augmented validation array.
validation_data = pd.DataFrame(data=df_validation)
validation_data.to_csv(path_or_buf='VALIDATION_DATA___.csv')