In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing as Pre
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
#from rdkit import DataStructs
#from rdkit.Chem.Fingerprints import FingerprintMols

# 1. Reading and processing data

In [3]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv.gz", compression='gzip', nrows=100000)
# df_test = pd.read_csv("test.csv.gz", compression='gzip', nrows=100000)

In [4]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.98


In [5]:
# df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#row where testing examples start
test_idx = df_train.shape[0]

# extract gaps
y_train_pd = df_train['gap']

In [7]:
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)

#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [8]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

#Store and drop the 'smiles' column
smiles = df_all['smiles']
df_all = df_all.drop(['smiles'], axis=1)

In [9]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [10]:
# Morgan fingerprint features
morgan_train = pd.read_csv('morgan.csv')
morgan_train = morgan_train.drop(['Unnamed: 0'], axis=1)
morgan_train = pd.concat((morgan_train, y_train_pd[:morgan_train.shape[0]]), axis=1)

In [12]:
# MACCS features
maccs_train = pd.read_csv('maccs.csv')
maccs_train= maccs_train.drop(['Unnamed: 0'], axis=1)
maccs_train = pd.concat((maccs_train, y_train_pd[:maccs_train.shape[0]]), axis=1)

In [13]:
# Top fingerprint
top_train = pd.read_csv('top.csv')
top_train = top_train.drop(['Unnamed: 0'], axis=1)
top_train = pd.concat((top_train, y_train_pd[:top_train.shape[0]]), axis=1)

In [14]:
combo_train = pd.concat((morgan_train.iloc[:, :-1], df_train.iloc[:morgan_train.shape[0], :]), axis=1).drop(['smiles'], axis=1)
combo_train = pd.concat((combo_train, y_train_pd[:morgan_train.shape[0]]), axis=1)

In [15]:
train_vals = combo_train.values # only thing that needs to be changed to affect features

In [16]:
# PCA 
# pca = PCA(n_components=3)
# pca.fit(X_train)
# X_train_reduced = pca.transform(X_train)
# train_data_reduced = np.concatenate((X_train_reduced, Y_train.reshape(-1, 1)), axis=1)

In [17]:
# Break into x and y, standardize features
X_train = train_vals[:, :-1]
Y_train = train_vals[:, -1]
X_train_std = Pre.scale(X_train)
train_vals_std = np.concatenate((X_train_std, Y_train.reshape(-1, 1)), axis=1)

# print "Train features:", X_train.shape
# print "Train gap:", Y_train.shape

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# 2. Model selection, tuning

In [None]:
# Function for k-fold cross validation
def kfold(k, model, data):
    kf = KFold(n_splits=k)
    rmses = []
    for train_fold_index, validate_fold_index in kf.split(data):
        train_fold = data[train_fold_index]
        test_fold = data[validate_fold_index]
        X_train_fold = train_fold[:, :-1]
        Y_train_fold = train_fold[:, -1]
        X_test_fold = test_fold[:, :-1]
        Y_test_fold = test_fold[:, -1]
        model.fit(X_train_fold, Y_train_fold)
        Y_hat = model.predict(X_test_fold)
        rmse = np.sqrt(np.mean([(Y_hat[i] - Y_test_fold[i]) ** 2 for i in range(len(Y_test_fold))]))
        rmses.append(rmse)   
    return np.mean(rmses)

In [57]:
# Linear regression
LR = LinearRegression()
kfold(5, LR, train_vals)

0.18517745926640408

In [58]:
# Random forest
RF = RandomForestRegressor()
kfold(5, RF, train_vals)

0.17365224801369011

In [59]:
# Ridge regression
RidgeReg = Ridge()
kfold(5, RidgeReg, train_vals)

0.18512660398969502

In [60]:
# Lasso regression
LassoReg = Lasso()
kfold(5, LassoReg, train_vals)

0.4076321587320873

In [61]:
# Random forest bagging
Bagging = BaggingRegressor()
print kfold(5, Bagging, train_vals)

0.172710067101


In [62]:
# Tune random forest
n_estimators = [10, 20, 30, 40, 50, 100]
scores = []
for n in n_estimators:
    RF = RandomForestRegressor(n_estimators=n)
    scores.append((n, kfold(5, RF, train_vals)))
print scores

[(10, 0.17288329200658401), (20, 0.16788261269210436), (30, 0.16614482171694153), (40, 0.16440302184232353), (50, 0.16381810197967331), (100, 0.16216091866106774)]


In [63]:
# Tune bagging
n_estimators = [5, 10, 20, 30, 40, 50]
scores = []
for n in n_estimators:
    Bagging = BaggingRegressor(n_estimators=n)
    scores.append((n, kfold(5, Bagging, train_vals)))
print scores

[(5, 0.18662846505991179), (10, 0.17386886952347935), (20, 0.16839919981168017), (30, 0.16521958700742329), (40, 0.16400702471676193), (50, 0.16376902574945454)]


In [64]:
# Tune ridge regression
scores = []
for a in range(-4,5):
    RidgeReg = Ridge(alpha=10.**a)
    scores.append((a, kfold(5, RidgeReg, train_vals)))
print scores

[(-4, 0.18516319942104664), (-3, 0.18516315945044109), (-2, 0.18516276059624712), (-1, 0.18515885537425456), (0, 0.18512660398969502), (1, 0.18502105803828384), (2, 0.18654384262647433), (3, 0.21049977091077537), (4, 0.30888601534264654)]


# 3. Final model

In [14]:
morgan_test = pd.read_csv("morgan_test.csv.zip", compression='zip').drop('Unnamed: 0', axis=1)

In [16]:
X_test = morgan_test.as_matrix()

In [17]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

NameError: name 'X_train' is not defined

In [10]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [11]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [12]:
#write_to_file("sample1.csv", LR_pred)
#write_to_file("sample2.csv", RF_pred)
write_to_file("test.csv", RF_pred)