In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_non_test = pd.read_csv("train.csv")

msk = np.random.rand(len(df_non_test)) < 0.8
df_train, df_validation = df_non_test[msk].copy(deep = True), df_non_test[~msk].copy(deep = True)

df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.49
6,c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.91
7,c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.17


In [4]:
df_validation.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.98
5,C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81
11,c1cc2oc3c(sc4cc([se]c34)-c3cncc4nsnc34)c2o1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.42
22,c1ccc(cn1)C1=Cc2c(C1)c1ncc3ccc4c[nH]cc4c3c1c1c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.17


In [5]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#store gap values
Y_train = df_train.gap.values
Y_validation = df_validation.gap.values
#row where validation examples start
validation_idx = df_train.shape[0]
#row where testing examples start
test_idx = df_train.shape[0] + df_validation.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
df_validation = df_validation.drop(['gap'], axis=1)

In [7]:
# #extract 1024 features and add to molecules data
# smiles_train_list = df_train['smiles']
# smiles_validation_list = df_validation['smiles']
# smiles_test_list = df_test['smiles']

# def mol_to_objects(x): return Chem.MolFromSmiles('Cc1ccccc1')
# m_train = list(map(mol_to_objects, smiles_train_list)) # list of molecule objects
# m_validation = list(map(mol_to_objects, smiles_validation_list))
# m_test = list(map(mol_to_objects, smiles_test_list))

# def features_extract(x): return AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=200)
# fp_train = list(map(features_extract, m_train)) #list of bit vectors
# fp_validation = list(map(features_extract, m_validation))
# fp_test = list(map(features_extract, m_test))

# def bit_value(x): return [x.GetBit(i) for i in range(x.GetNumBits())]
# fp_values_train = np.vstack(map(bit_value, fp_train)) # extract values
# fp_values_validation = np.vstack(map(bit_value, fp_validation))
# fp_values_test = np.vstack(map(bit_value, fp_test))

# df_train = np.concatenate((df_train, fp_values_train),axis=1)
# df_validation = np.concatenate((df_validation, fp_values_validation),axis=1)
# df_test = np.concatenate((df_test, fp_values_test),axis=1)

In [8]:
# df_train = pd.DataFrame(df_train).rename(index=str, columns={0: "smiles"})
# df_validation = pd.DataFrame(df_validation).rename(index=str, columns={0: "smiles"})
# df_test = pd.DataFrame(df_test).rename(index=str, columns={0: "smiles"})

In [9]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_validation, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)

'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [11]:
#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:validation_idx]
X_validation = vals[validation_idx:test_idx]
X_test = vals[test_idx:]
print("Train features:", X_train.shape)
print("Train gap:", Y_train.shape)
print("Validation features:", X_validation.shape)
print("Validation gap:", Y_validation.shape)
print("Test features:", X_test.shape)

('Train features:', (800006, 256))
('Train gap:', (800006,))
('Validation features:', (199994, 256))
('Validation gap:', (199994,))
('Test features:', (824230, 256))


In [12]:
#Linear Regression
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

print 'LR Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, LR.predict(X_validation)))

LR Validation RMSE: 0.2987970150202188


In [13]:
#Random Forest 
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

print 'RF Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, RF.predict(X_validation)))

RF Validation RMSE: 0.27234308678527036


In [14]:
#lambda values to test for Ridge and Lasso Regression
lambdas = [.001,.005,1,5,10,50,100,500,1000]

In [15]:
#Ridge Regression
Ridge = RidgeCV(alphas=lambdas, fit_intercept=True, cv=5)
Ridge.fit(X_train, Y_train)
Ridge_pred = Ridge.predict(X_test)

print 'Ridge Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, Ridge.predict(X_validation)))
print 'lambda:', Ridge.alpha_

Ridge Validation RMSE: 0.29879576257458357
lambda: 5


In [16]:
#Lasso Regression
Lasso = LassoCV(alphas=lambdas, fit_intercept=True, cv=5)
Lasso.fit(X_train, Y_train)
Lasso_pred = Lasso.predict(X_test)

print 'Lasso Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, Lasso.predict(X_validation)))
print 'lambda:', Lasso.alpha_

Lasso Validation RMSE: 0.3003996019307757
lambda: 0.001


In [17]:
#Elastic Net Regression
Elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True)
Elastic.fit(X_train, Y_train)
Elastic_pred = Elastic.predict(X_test)

print 'Elastic Net Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, Elastic.predict(X_validation)))

Elastic Net Validation RMSE: 0.4070404547692697


In [18]:
#MLP Regression
MLP = MLPRegressor()
MLP.fit(X_train, Y_train)
MLP_pred = MLP.predict(X_test)

print 'MLP Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, MLP.predict(X_validation)))

MLP Validation RMSE: 0.27626773866372983


In [19]:
#Bayesian Ridge Regression
BRidge = BayesianRidge()
BRidge.fit(X_train, Y_train)
BRidge_pred = BRidge.predict(X_test)

print 'BRidge Validation RMSE:', np.sqrt(mean_squared_error(Y_validation, BRidge.predict(X_validation)))

BRidge Validation RMSE: 0.29879573686661914


In [20]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [21]:
folder = os.path.dirname(os.getcwd()) + "/P1-regression/predictions/"

write_to_file(folder + "LR.csv", LR_pred)
write_to_file(folder + "RF.csv", RF_pred)
write_to_file(folder + "Ridge.csv", Ridge_pred)
write_to_file(folder + "BRidge.csv", BRidge_pred)
write_to_file(folder + "Lasso.csv", Lasso_pred)
write_to_file(folder + "ElasticNet.csv", Elastic_pred)
write_to_file(folder + "MLP.csv", MLP_pred)