In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train_full = pd.read_csv("train.csv")
df_train = df_train_full.sample(n=6000)
df_test = pd.read_csv("test.csv")
#df_test = df_test_full.sample(n=1000)

#df_train = pd.read_csv("train.csv")
#df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
837268,c1cnc2cc3cc4C=C(Cc4cc3cc2c1)c1cccc2=CCC=c12,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.72
716523,c1sc(-c2cc3oc4c(c5nsnc5c5ccc6nsnc6c45)c3s2)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.52
510116,[nH]1cccc1-c1sc(-c2ccc(cc2)-c2cccnc2)c2ccoc12,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1
170751,[nH]1cccc1-c1ccc(-c2ccc(o2)-c2scc3occc23)c2c[S...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.28
252920,c1cc2c3nsnc3c3c(ccc4cc(-c5cncc6nsnc56)c5=CCC=c...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.48


In [4]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [6]:
#extract 1024 features and add to molecules data
smiles_train_list = df_train['smiles']
smiles_test_list = df_test['smiles']

def mol_to_objects(x): return Chem.MolFromSmiles('Cc1ccccc1')
m = list(map(mol_to_objects, smiles_train_list)) # list of molecule objects
mt = list(map(mol_to_objects, smiles_test_list))

def features_extract(x): return AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024)
fp = list(map(features_extract, m)) #list of bit vectors
fpt = list(map(features_extract, mt))

def bit_value(x): return [x.GetBit(i) for i in range(x.GetNumBits())]
fp_values = np.vstack(map(bit_value, fp)) # extract values
fp_values_t = np.vstack(map(bit_value, fpt)) # extract values

df_train = np.concatenate((df_train, fp_values),axis=1)
df_test = np.concatenate((df_test, fp_values_t),axis=1)

In [7]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = np.concatenate((df_train, df_test), axis=0)
#df_all.head()

In [8]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [9]:
#Drop the 'smiles' column
#df_all = df_all.drop(['smiles'], axis=1)
#vals = df_all.values
#X_train = vals[:test_idx]
#X_test = vals[test_idx:]
df_all = np.delete(df_all,0,1)
X_train = df_all[:test_idx]
X_test = df_all[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (6000, 1280)
Train gap: (6000,)
Test features: (824230, 1280)


In [10]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

print('LR Train R^2:', LR.score(X_train, Y_train))
print('LR Train RMSE:', np.sqrt(mean_squared_error(Y_train, LR.predict(X_train))))

('LR Train R^2:', 0.43764724314237646)
('LR Train RMSE:', 0.30449093745870148)


In [11]:
#Random Forest 
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

print('RF Train R^2:', RF.score(X_train, Y_train))
print('RF Train RMSE:', np.sqrt(mean_squared_error(Y_train, RF.predict(X_train))))

('RF Train R^2:', 0.59309471845663042)
('RF Train RMSE:', 0.25901005283802103)


In [12]:
#lambda values to test for Ridge and Lasso Regression
lambdas = [.001,.005,1,5,10,50,100,500,1000]
# lambdas = [.001,.005,.01,.05,.1,.5,1,5,10,50,100,250,500,750,1000]

In [13]:
#Ridge Regression
Ridge = RidgeCV(alphas=lambdas, fit_intercept=True, cv=5)
Ridge.fit(X_train, Y_train)
Ridge_pred = Ridge.predict(X_test)

print('Ridge Train R^2:', Ridge.score(X_train, Y_train))
print('Ridge Train RMSE:', np.sqrt(mean_squared_error(Y_train, Ridge.predict(X_train))))

('Ridge Train R^2:', 0.46282924835333938)
('Ridge Train RMSE:', 0.29759534652875674)


In [14]:
#Lasso Regression
Lasso = LassoCV(alphas=lambdas, fit_intercept=True, cv=5)
Lasso.fit(X_train, Y_train)
Lasso_pred = Lasso.predict(X_test)

print('Lasso Train R^2:', Lasso.score(X_train, Y_train))
print('Lasso Train RMSE:', np.sqrt(mean_squared_error(Y_train, Lasso.predict(X_train))))

('Lasso Train R^2:', 0.45596188703611507)
('Lasso Train RMSE:', 0.29949158192115372)


In [15]:
#Elastic Net Regression
Elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True)
Elastic.fit(X_train, Y_train)
Elastic_pred = Elastic.predict(X_test)

print('Elastic Net Train R^2:', Elastic.score(X_train, Y_train))
print('Elastic Net Train RMSE:', np.sqrt(mean_squared_error(Y_train, Elastic.predict(X_train))))

('Elastic Net Train R^2:', 0.0)
('Elastic Net Train RMSE:', 0.40604106387846484)


In [16]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [17]:
folder = os.path.dirname(os.getcwd()) + "/P1-regression/predictions/"

write_to_file("OLS.csv", LR_pred)
write_to_file("RF.csv", RF_pred)
write_to_file("Ridge.csv", Ridge_pred)
write_to_file("Lasso.csv", Lasso_pred)
write_to_file("ElasticNet.csv", Elastic_pred)