In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def _read_features(csv_path, nrows=None):
    """Read a feature CSV and drop its 'Unnamed: 0' and '0' columns.

    nrows caps the number of rows read; None reads the whole file.
    """
    frame = pd.read_csv(csv_path, nrows=nrows)
    return frame.drop(columns=['Unnamed: 0', '0'])


# Train and validation are capped at 100000 rows; the test file is read in full.
X_train = _read_features("TRAIN_DATA___.csv", nrows=100000)
X_validation = _read_features("VALIDATION_DATA___.csv", nrows=100000)
X_test = _read_features("TEST_DATA___.csv")

In [3]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281
0,0,0,0,0,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,27
1,1,0,0,0,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,25
2,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,27
3,0,0,0,0,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,29
4,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,29


In [4]:
# Preview the first rows of the validation feature matrix.
X_validation.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281
0,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,27
1,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,31
2,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,23
3,1,0,0,0,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,27
4,0,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,25


In [5]:
# Preview the first rows of the test feature matrix.
X_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281
0,0,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,29
1,0,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,23
2,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,31
3,1,0,0,0,1,1,1,0,0,0,...,False,False,False,False,False,False,False,False,False,28
4,0,0,0,0,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,32


In [6]:
def _read_gap(csv_path, nrows):
    """Read a target CSV, drop 'Unnamed: 0', and rename column '0' to 'gap'."""
    raw = pd.read_csv(csv_path, nrows=nrows)
    without_index = raw.drop(columns=['Unnamed: 0'])
    return without_index.rename(columns={'0': 'gap'})


# Same 100000-row cap as the train/validation feature frames.
Y_train = _read_gap('Y_TRAIN___.csv', 100000)
Y_validation = _read_gap('Y_VALIDATION___.csv', 100000)

In [7]:
# Replace each target DataFrame with the raw array of its 'gap' column.
# The isinstance guard keeps this cell idempotent: after the first run the
# names already hold ndarrays, and an unguarded `.gap` lookup would raise
# AttributeError when the cell is executed again.
if isinstance(Y_train, pd.DataFrame):
    Y_train = Y_train["gap"].values
if isinstance(Y_validation, pd.DataFrame):
    Y_validation = Y_validation["gap"].values

In [8]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
# NOTE(review): the two lines below are a disabled example. `df_all` is not
# defined in any visible cell, so they would fail if uncommented as-is.
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)

'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [9]:
# Sanity-check the shapes of every split.
# Each line is built as ONE string before printing: under Python 2,
# print("label", value) prints the tuple repr ('label', value), whereas a
# single pre-formatted argument prints the same text on Python 2 and 3.
_split_shapes = [
    ("Train features", X_train.shape),
    ("Train gap", Y_train.shape),
    ("Validation features", X_validation.shape),
    ("Validation gap", Y_validation.shape),
    ("Test features", X_test.shape),
]
for _label, _shape in _split_shapes:
    print("%s: %s" % (_label, _shape))

('Train features:', (100000, 1281))
('Train gap:', (100000,))
('Validation features:', (100000, 1281))
('Validation gap:', (100000,))
('Test features:', (824230, 1281))


In [None]:
# Linear Regression: fit on the training split, predict the test set, and
# report RMSE on the validation split.
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3. %s goes
# through str(), so the printed digits match the old statement's output.
LR_rmse = np.sqrt(mean_squared_error(Y_validation, LR.predict(X_validation)))
print('LR Validation RMSE: %s' % (LR_rmse,))

In [11]:
# Random Forest: fit on the training split, predict the test set, and report
# RMSE on the validation split.
# random_state pins the forest's bootstrap/feature sampling so the reported
# RMSE and the exported predictions are reproducible across kernel restarts.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3.
RF_rmse = np.sqrt(mean_squared_error(Y_validation, RF.predict(X_validation)))
print('RF Validation RMSE: %s' % (RF_rmse,))

RF Validation RMSE: 0.10265736080145046


In [12]:
# Candidate regularization strengths passed as `alphas` to the RidgeCV and
# LassoCV searches.
# NOTE(review): in the recorded run LassoCV selected 0.001, the lowest value in
# this grid, so the grid may need to extend lower -- confirm before relying on it.
lambdas = [
    1e-3, 5e-3,
    1, 5,
    10, 50,
    100, 500,
    1000,
]

In [13]:
# Ridge Regression: 5-fold cross-validated search over `lambdas`, then predict
# the test set and report RMSE on the validation split.
# NOTE(review): the variable `Ridge` rebinds the `Ridge` class imported from
# sklearn.linear_model. The name is kept because other cells read
# `Ridge.alpha_` and `Ridge_pred`.
Ridge = RidgeCV(alphas=lambdas, fit_intercept=True, cv=5)
Ridge.fit(X_train, Y_train)
Ridge_pred = Ridge.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3.
Ridge_rmse = np.sqrt(mean_squared_error(Y_validation, Ridge.predict(X_validation)))
print('Ridge Validation RMSE: %s' % (Ridge_rmse,))

Ridge Validation RMSE: 0.1382990670754662


In [14]:
# Regularization strength selected by the RidgeCV search.
# Single-argument print() runs on both Python 2 and 3; the bare print
# statement it replaces is a SyntaxError on Python 3.
print('lambda: %s' % (Ridge.alpha_,))

lambda: 5


In [15]:
# Lasso Regression: 5-fold cross-validated search over `lambdas`, then predict
# the test set and report validation RMSE plus the selected strength.
# NOTE(review): the variable `Lasso` rebinds the `Lasso` class imported from
# sklearn.linear_model. The name is kept so existing references keep working.
Lasso = LassoCV(alphas=lambdas, fit_intercept=True, cv=5)
Lasso.fit(X_train, Y_train)
Lasso_pred = Lasso.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statements are a SyntaxError on Python 3.
Lasso_rmse = np.sqrt(mean_squared_error(Y_validation, Lasso.predict(X_validation)))
print('Lasso Validation RMSE: %s' % (Lasso_rmse,))
print('lambda: %s' % (Lasso.alpha_,))

Lasso Validation RMSE: 0.16646635988952993
lambda: 0.001


In [16]:
# Elastic Net Regression with a fixed penalty (alpha=1.0, equal L1/L2 mix):
# fit on the training split, predict the test set, report validation RMSE.
Elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True)
Elastic.fit(X_train, Y_train)
Elastic_pred = Elastic.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3.
Elastic_rmse = np.sqrt(mean_squared_error(Y_validation, Elastic.predict(X_validation)))
print('Elastic Net Validation RMSE: %s' % (Elastic_rmse,))

Elastic Net Validation RMSE: 0.39825906232071795


In [17]:
# MLP Regression: fit on the training split, predict the test set, and report
# RMSE on the validation split.
# random_state pins weight initialization and batch shuffling so the reported
# RMSE and the exported predictions are reproducible across kernel restarts.
MLP = MLPRegressor(random_state=0)
MLP.fit(X_train, Y_train)
MLP_pred = MLP.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3.
MLP_rmse = np.sqrt(mean_squared_error(Y_validation, MLP.predict(X_validation)))
print('MLP Validation RMSE: %s' % (MLP_rmse,))

MLP Validation RMSE: 0.06556489300650357


In [18]:
# Bayesian Ridge Regression: fit on the training split, predict the test set,
# and report RMSE on the validation split.
BRidge = BayesianRidge()
BRidge.fit(X_train, Y_train)
BRidge_pred = BRidge.predict(X_test)

# print() with a single pre-formatted string works on Python 2 and 3; the
# former `print 'x', y` statement is a SyntaxError on Python 3.
BRidge_rmse = np.sqrt(mean_squared_error(Y_validation, BRidge.predict(X_validation)))
print('BRidge Validation RMSE: %s' % (BRidge_rmse,))

BRidge Validation RMSE: 0.138309327741764


In [19]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one row per
    prediction, with ids counting up from 1 in iteration order.

    Args:
        filename: Destination path. Missing parent directories are created.
        predictions: Any iterable of values; each is written via ``str()``.
    """
    # Create the output directory up front so a missing folder does not abort
    # the export after the models have already been fitted. The isdir check
    # (rather than exist_ok=True) keeps this usable on Python 2 as well.
    out_dir = os.path.dirname(filename)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        # enumerate(..., 1) yields the 1-based ids directly.
        for row_id, value in enumerate(predictions, 1):
            f.write(str(row_id) + "," + str(value) + "\n")

In [20]:
# Output directory: a sibling of the current working directory's parent.
parent_dir = os.path.dirname(os.getcwd())
folder = parent_dir + "/P1-regression/predictions-final/"

# (file name, predictions) pairs, written in this order. The LR and RF
# entries are left disabled, as before.
prediction_files = [
    # ("LR.csv", LR_pred),
    # ("RF.csv", RF_pred),
    ("Ridge.csv", Ridge_pred),
    ("BRidge.csv", BRidge_pred),
    ("Lasso.csv", Lasso_pred),
    ("ElasticNet.csv", Elastic_pred),
    ("MLP.csv", MLP_pred),
]
for csv_name, model_pred in prediction_files:
    write_to_file(folder + csv_name, model_pred)