In [6]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, RationalQuadratic, ExpSineSquared, ConstantKernel
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from datetime import datetime
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier

In [7]:
# Load the ACT2 competition training set. (The earlier comment described this as a
# concrete-strength dataset, which does not match the file read here.)
# Per the preview below, the columns are MOLECULE (an id string), Act (used later
# as the regression target) and a wide set of D_* descriptor columns.
df_merck1 = pd.read_csv('ACT2_competition_training.csv')
df_merck1.head()

Unnamed: 0,MOLECULE,Act,D_3,D_4,D_5,D_7,D_10,D_11,D_12,D_16,...,D_11030,D_11031,D_11034,D_11045,D_11046,D_11048,D_11049,D_11051,D_11056,D_11072
0,ACT2_M_19,6.7153,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACT2_M_20,6.4912,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACT2_M_21,5.8528,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACT2_M_22,6.3854,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACT2_M_23,5.8941,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Load the ACT4 competition training set. It has the same MOLECULE and Act columns
# as ACT2, but its preview lists a different set of D_* descriptor columns.
df_merck2 = pd.read_csv('ACT4_competition_training.csv')
df_merck2.head()

Unnamed: 0,MOLECULE,Act,D_5,D_39,D_40,D_41,D_42,D_43,D_44,D_45,...,D_10700,D_10701,D_10702,D_10703,D_10706,D_10735,D_10736,D_10737,D_10742,D_10772
0,ACT4_M_5065,5.3001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACT4_M_7241,4.6012,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACT4_M_11511,5.3001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACT4_M_13147,5.3001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACT4_M_15001,5.3001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Stack the ACT2 and ACT4 rows into a single frame with a fresh 0..n-1 index.
# The two files do not carry the same D_* columns, so the result holds the union of
# both column sets and is NaN wherever a descriptor is absent from a row's source file.
#
# sort=True is passed explicitly. Leaving it out relied on the deprecated implicit
# sort (hence the FutureWarning) and makes the column order depend on the installed
# pandas version; sort=True pins the alphabetical order this notebook was recorded with.
df_merck = pd.concat(
    [df_merck1, df_merck2],
    axis=0,
    ignore_index=True,
    sort=True,
)
df_merck.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,Act,D_10,D_1000,D_10000,D_10001,D_10002,D_10003,D_10004,D_10005,D_10006,...,D_999,D_9992,D_9993,D_9994,D_9995,D_9996,D_9997,D_9998,D_9999,MOLECULE
0,6.7153,0.0,0,0,0,0,0.0,0,0,0,...,0,,,0.0,0,0,0,0,0,ACT2_M_19
1,6.4912,0.0,0,0,0,0,0.0,0,0,0,...,0,,,0.0,0,0,0,0,0,ACT2_M_20
2,5.8528,0.0,0,0,0,0,0.0,0,0,0,...,0,,,0.0,0,0,0,0,0,ACT2_M_21
3,6.3854,0.0,0,0,0,0,0.0,0,0,0,...,0,,,0.0,0,0,0,0,0,ACT2_M_22
4,5.8941,0.0,0,0,0,0,0.0,0,0,0,...,0,,,0.0,0,0,0,0,0,ACT2_M_23


In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line to the output.
df_merck.info()

# Treat blank / whitespace-only cells and '?' placeholders as 0.
df_merck.replace(r'^\s*$', 0, regex=True, inplace = True)
df_merck.replace('?', 0, inplace = True)

# A third call, replace('[A-Z]+[0-9]*', 0), used to follow here. Without regex=True
# the pattern is compared as a literal string, so it never matched any cell and has
# been dropped; the resulting frame is unchanged. The non-numeric MOLECULE ids are
# handled when the feature matrix is coerced with pd.to_numeric(errors='coerce').

# Per-column count of missing values (the concat leaves NaN where a descriptor
# column exists in only one of the two source files).
print(df_merck.isnull().sum())
df_merck.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10531 entries, 0 to 10530
Columns: 6315 entries, Act to MOLECULE
dtypes: float64(2444), int64(3870), object(1)
memory usage: 507.4+ MB
None


In [None]:
# Feature matrix: every column except the target, forced to numeric.
# Cells that cannot be parsed as numbers (e.g. the MOLECULE id strings) become NaN
# under errors='coerce', and every NaN is then replaced by 0.
X = (
    df_merck
    .loc[:, df_merck.columns != 'Act']
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Regression target.
y = df_merck['Act']

X.head()

In [None]:
X.shape

In [None]:
# Hold out a test split; the split size is left at scikit-learn's default and the
# shuffle is seeded (random_state=0) so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Standardize the features: the scaler is fitted on the training split only and the
# same transform is then applied to both splits.
# The split cell names its outputs X_train / X_test (capital X); the lowercase
# x_train / x_test used here before were never defined and raised NameError.
# NOTE(review): the cells visible below keep working with X_train / X_test, not
# with X_train_scaled / X_test_scaled — confirm whether the scaled arrays are
# meant to feed the models.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Variance-based feature selection, fitted on the training split only.
# The threshold expression evaluates to 0.25; it is written in p * (1 - p) form,
# presumably after the Bernoulli-variance rule of thumb — TODO confirm that this
# cut-off is appropriate for the D_* columns.
selector = VarianceThreshold(threshold=.50 * (1 - .5)).fit(X_train)

# Apply the same column mask to both splits (X_train / X_test are overwritten).
X_train, X_test = (selector.transform(part) for part in (X_train, X_test))

In [None]:
def svr_param_selection(X, y, X_test, y_test, nfolds):
    """Fit a default ``SVR`` and print its train/test regression metrics.

    Despite the name, no parameter search is run. The function was written
    around a ``GridSearchCV`` over kernel ('poly', 'rbf'), C (0.001, 0.01) and
    gamma (0.001, 0.1), but that search is disabled and a default-configured
    ``SVR()`` is fitted instead.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused. It was the ``cv`` argument of the disabled grid search and is
        kept so existing call sites keep working.

    Returns
    -------
    The fitted ``SVR`` estimator (previously the function returned ``None``).
    """
    model = SVR()
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('SVR MSE Score for training data: '+str(train_mse))
    print('SVR With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('SVR coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for SVR on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def random_forest_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit a ``RandomForestRegressor(random_state=0)`` and print its metrics.

    Despite the name, no parameter search is performed; a single estimator with
    otherwise default settings is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = RandomForestRegressor(random_state=0)
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('RandomForestRegressor MSE Score for training data: '+str(train_mse))
    print('RandomForestRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('RandomForestRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for RandomForestRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def decision_tree_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit a ``DecisionTreeRegressor(random_state=0)`` and print its metrics.

    Despite the name, no parameter search is performed; a single estimator with
    otherwise default settings is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = DecisionTreeRegressor(random_state=0)
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('DecisionTreeRegressor MSE Score for training data: '+str(train_mse))
    print('DecisionTreeRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('DecisionTreeRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for DecisionTreeRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def ada_boost_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit an ``AdaBoostRegressor(random_state=0)`` and print its metrics.

    Despite the name, no parameter search is performed; a single estimator with
    otherwise default settings is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = AdaBoostRegressor(random_state=0)
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('AdaBoostRegressor MSE Score for training data: '+str(train_mse))
    print('AdaBoostRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('AdaBoostRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for AdaBoostRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def gaussian_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit a ``GaussianProcessRegressor(random_state=0)`` and print its metrics.

    Despite the name, no parameter search is performed; a single estimator with
    otherwise default settings (including the default kernel) is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = GaussianProcessRegressor(random_state=0)
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('GaussianProcessRegressor MSE Score for training data: '+str(train_mse))
    print('GaussianProcessRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('GaussianProcessRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for GaussianProcessRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def linear_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit a default ``LinearRegression`` and print its train/test metrics.

    Despite the name, no parameter search is performed; a single estimator with
    default settings is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = LinearRegression()
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('LinearRegressor MSE Score for training data: '+str(train_mse))
    print('LinearRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('LinearRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for LinearRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
def neural_network_regressor_param_selection(X, y, X_test, y_test, nfolds):
    """Fit an ``MLPRegressor(random_state=0)`` and print its train/test metrics.

    Despite the name, no parameter search is performed; a single estimator with
    otherwise default settings is fitted.

    Parameters
    ----------
    X, y
        Training features and targets used to fit the model.
    X_test, y_test
        Held-out features and targets used for evaluation.
    nfolds
        Unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted estimator (previously the function returned ``None``).
    """
    model = MLPRegressor(random_state=0)
    model.fit(X, y)

    # This line used to print model.score(X_test, y_test) — the test-set R^2 —
    # under the "MSE ... training data" label. It now reports what the label says.
    train_mse = mean_squared_error(y, model.predict(X))
    print('NeuralNetworkRegressor MSE Score for training data: '+str(train_mse))
    print('NeuralNetworkRegressor With Parameters: '+str(model))

    test_r2 = model.score(X_test, y_test)
    print('NeuralNetworkRegressor coefficient of determination R^2 on test data: '+str(test_r2))

    y_pred = model.predict(X_test)
    print('MSE for NeuralNetworkRegressor on test set: '+str(mean_squared_error(y_test, y_pred)))
    return model

In [None]:
# Fit and report every regressor in turn on the same train/test split, printing a
# wall-clock timestamp before each model and once at the end so the time spent per
# model can be read off the output. Whatever each helper returns is stored under the
# matching *_best_param name. The third argument (nfolds) is passed as 3 throughout.
print('Due to the strict 3 minute rule, we have skipped the k-fold validation for large datasets like these and skipped SVR')
print("now ="+str(datetime.now()))
linear_best_param         = linear_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
print()
print("now ="+str(datetime.now()))
random_forest_best_param  = random_forest_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
print()
print("now ="+str(datetime.now()))
decision_tree_best_param  = decision_tree_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
print()
print("now ="+str(datetime.now()))
ada_boost_best_param      = ada_boost_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
print()
print("now ="+str(datetime.now()))
neural_network_best_param = neural_network_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
print()
print("now ="+str(datetime.now()))
# Fixed: this call used the undefined lowercase name x_train (NameError). It must be
# the same X_train the other models use, since X_test has the matching columns.
gaussian_best_param       = gaussian_regressor_param_selection(X_train, y_train, X_test, y_test, 3)
# svr_param_selection is deliberately not run (see the message printed at the top of
# this cell). The commented-out call and the duplicate timestamp it left behind
# have been removed.
print("now ="+str(datetime.now()))