In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the test set into a pandas DataFrame.
df_test = pd.read_csv("test.csv")

# Downcast the 'feat_*' columns to int8 to decrease memory used.
# The columns are picked by name (not by position), and astype builds a new
# frame, so the narrower dtype does not depend on how many other columns
# precede the features or on how column assignment treats existing dtypes.
feature_cols = df_test.columns[df_test.columns.str.startswith('feat_')]
df_test = df_test.astype({col: np.int8 for col in feature_cols})

# Kept for any later cell that still refers to the int8 feature frame.
test_features = df_test.loc[:, feature_cols]

In [3]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Remove the 'Id' column; the remaining columns are kept unchanged.
df_test = df_test.drop(columns=['Id'])

In [5]:
# Extract a 1024-bit Morgan fingerprint and the atom count for every molecule
# and append both to the test data.
smiles_test_list = df_test['smiles']


def mol_to_objects(x):
    """Parse the SMILES string x into an RDKit molecule object."""
    return Chem.MolFromSmiles(x)


def features_extract(x):
    """Return the radius-2, 1024-bit Morgan fingerprint bit vector of molecule x."""
    return AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)


def bit_value(x):
    """Return the bits of bit vector x as a list, one entry per bit position."""
    return [x.GetBit(i) for i in range(x.GetNumBits())]


def get_num_atoms(x):
    """Return the atom count reported by molecule x."""
    return x.GetNumAtoms()


# Everything below is built as a real list: under Python 3 `map` returns a
# lazy iterator, which np.vstack does not accept and which np.array would wrap
# as a single 0-d object instead of an array of values.
m_test = [mol_to_objects(smiles) for smiles in smiles_test_list]

# Chem.MolFromSmiles signals a parse failure by returning None; stop here with
# a readable message rather than failing inside the fingerprint call.
unparsed_rows = [pos for pos, mol in enumerate(m_test) if mol is None]
if unparsed_rows:
    raise ValueError(
        "Could not parse %d SMILES string(s); first row positions: %s"
        % (len(unparsed_rows), unparsed_rows[:10])
    )

fp_test = [features_extract(mol) for mol in m_test]

# One row per molecule, one column per fingerprint bit.
fp_values_test = np.vstack([bit_value(fp) for fp in fp_test])

# Column vector (one row per molecule) so it can be concatenated along axis 1.
atoms_test = np.array([get_num_atoms(mol) for mol in m_test]).reshape((len(m_test), 1))

# NOTE: df_test is rebound from a DataFrame to the array returned by
# np.concatenate (original columns, then fingerprint bits, then atom count).
df_test = np.concatenate((df_test, fp_values_test, atoms_test), axis=1)

In [6]:
# Wrap the combined feature array in a DataFrame and write it out as CSV.
test_data = pd.DataFrame(data=df_test)
test_data.to_csv(path_or_buf='TEST_DATA___.csv')