# Import libraries

In [41]:
import csv
import os
import re
import sys

import keras.layers as layers
import nltk
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import tensorflow_hub as hub
import xgboost as xgb
from keras import backend as K
from keras.engine import Layer
from keras.layers import (Dense, Dropout, Embedding, Flatten, Input,
                          MaxPooling1D)
from keras.models import Model, Sequential, load_model
from keras.optimizers import SGD, Adam
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasRegressor
from scipy.stats import pearsonr
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                  MultiTaskElasticNet, Ridge)
from sklearn.model_selection import (GridSearchCV, LeaveOneOut,
                                     cross_val_score, train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor

from GridSearch import GridSearch

In [44]:
os.getcwd()
print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The tensorflow version is {}.'.format(tf.__version__))

The nltk version is 3.4.5.
The scikit-learn version is 0.22.1.
The tensorflow version is 2.1.0.


# GridSearch Object

In [4]:
class GridSearch:

    def __init__(self, models_dict, params_dict):
        # if not set(models_dict.keys()).issubset(set(params_dict.keys())):
        #     missing_params = list(set(models.keys()) - set(params_dict.keys()))
        #     raise ValueError(
        #         "Some estimators are missing parameters: %s" % missing_params)
        self.models = models_dict
        self.params = params_dict
        self.keys = models_dict.keys()
        #print(self.keys)
        self.best_ = {
            'estimator': [None],
            'params': {},
            'y_pred': [],
            'r': [],
        }

    def predict(self):
        return self.best_['r']

    def tune(self, X_train, y_train, X_test, y_test, **grid_kwargs):
        max_r = 0
        for key in self.keys:
            print("\tRunning GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]

            #Pipeline the estimators
            pipeline = Pipeline([
                ('clf', model),
            ])

            gs = GridSearchCV(pipeline, params, **grid_kwargs)
            gs.fit(X_train, y_train)

            print("\tPredicting for %s." % key)
            y_pred = gs.predict(X_test)
            r = np.corrcoef(y_pred, y_test)[0, 1]
            print(params)
            print(r)

            if (abs(r) > abs(max_r)):
                self.best_['estimator'] = model
                self.best_['params'] = gs.best_params_
                self.best_['r'] = r
                self.best_['y_pred'] = y_pred

            print("Current Best")
            #print(self.best_['params'])
            print(self.best_['r'])

            print('\tTuning for %s Done.' % key)

# Elmo Regression Model for Keras Neural Network
Create a custom layer that allows us to update weights (lambda layers do not have trainable parameters!)

In [5]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable, name="{}_module".format(self.name))
        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

def ElmoRegressionModel(
    dense_dropout_rate=0.5,
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mse'],
    print_summary=False,
    include_hidden_layer=False,
    hidden_layer_size=64
):
    inputs, embeddings = [], []
    
    for idx in range(1, 6):
        _input = layers.Input(shape=(1,), dtype="string")
        inputs.append(_input)
        embedding = ElmoEmbeddingLayer()(_input)
        embeddings.append(embedding)
        
    concat = layers.concatenate(embeddings)
    dense = Dropout(dense_dropout_rate)(concat)
    if include_hidden_layer:
        dense = layers.Dense(hidden_layer_size, activation='relu')(dense)
        dense = Dropout(dense_dropout_rate)(dense)
    dense = layers.Dense(1, activation='relu')(dense)# (drop2)
    
    # If we want to do 5-way prediction within a single network
    # dense = layers.Dense(5, activation='relu')(dense)
    
    model = Model(inputs=inputs, outputs=dense)

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    
    if print_summary:
        model.summary()

    return model

In [19]:
###############################################################################################################
# Import the process data from R Studio
###############################################################################################################
data = pd.read_csv("mega_dataset.csv")

train_data = data.loc[data.Dataset == "Train"]
train_data_X = train_data.iloc[:, 13:]
train_data_y = train_data.loc[:, train_data.columns.str.contains('_Scale_score')]

test_data = data.loc[(data.Dataset == "Dev") | (data.Dataset == "Test")]
test_data_X = test_data.iloc[:, 13:]
test_data_y = test_data.loc[:, test_data.columns.str.contains('_Scale_score')]
ATTRIBUTE_LIST = ["E_Scale_score", "A_Scale_score", "O_Scale_score", "C_Scale_score", "N_Scale_score"]
#Y = np.array(train_raw_df[[att + "_Scale_score" for att in ATTRIBUTE_LIST]].values)



In [24]:
train_data_y["E_Scale_score"]

0       2.250000
1       4.666667
2       2.250000
3       2.916667
4       3.750000
          ...   
1083    2.500000
1084    3.000000
1085    3.833333
1086    4.500000
1087    2.750000
Name: E_Scale_score, Length: 1088, dtype: float64

In [33]:
# best_hyperparameter_list = tune(train_data_X, train_data_y, dev_data_X, dev_data_y, selected_feature)
best_hyperparameters = []
clf_dict = {
    'neural': KerasRegressor(build_fn=lambda: ElmoRegressionModel(**model_params)),
    'forest': RandomForestRegressor(),
    'ridge': Ridge(),
    'elastic': ElasticNet(),
}

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
param_list = {
    'neural': {
        # 'clf__build_fn': [lambda: ElmoRegressionModel(**model_params)],
        'clf__epochs': [10,20],
        'clf__batch_size': [10, 20, 32, 40, 60, 80, 100, 128],
        # 'clf__optimizer' : ['Adam', 'Nadam'],
        # 'clf__dropout_rate' : [0.2, 0.3],
        # 'clf__activation' : ['relu', 'elu']
    },
    'forest': {
        'clf__n_estimators': [1000, 1500, 2000, 2500, 3000, 3500, 4000, 5000, 8000, 10000],
        'clf__criterion': ['mse'],
        'clf__max_depth': max_depth,
        'clf__min_samples_split': [2, 3, 4, 5, 6, 10, 12],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__min_weight_fraction_leaf': [0.0],
        'clf__max_features': ['auto', 'sqrt'],
        'clf__max_leaf_nodes': [None],
        'clf__min_impurity_decrease': [0.0],
        'clf__min_impurity_split': [None],
        'clf__bootstrap': [True, False],
        'clf__oob_score': [False],
        'clf__n_jobs': [None],
        'clf__random_state': [None],
        'clf__verbose': [0],
        'clf__warm_start': [False],
        'clf__ccp_alpha': [0.0],
        'clf__max_samples': [None]
    },
    'ridge': {
        'clf__alpha': [0.0, 0.05, 0.1, 0.3, 0.5, 0.8, 0.9, 0.95, 1.0, 1.2, 1.8, 10, 100, 10000, 1000000], 
        'clf__fit_intercept': [True], 
        'clf__normalize': [False], 
        'clf__copy_X': [True],
        'clf__max_iter': [None],
        'clf__tol': [0.001],
        'clf__solver': ['auto'],
        'clf__random_state': [None],
    },
    'elastic': {
        'clf__alpha': [0.0, 0.05, 0.1, 0.3, 0.5, 0.8, 0.9, 0.95, 1.0, 1.2, 1.8, 10, 100, 10000, 1000000],
        'clf__l1_ratio': [0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1.0],
        'clf__fit_intercept': [True],
        'clf__normalize': [False],
        'clf__precompute': [False],
        'clf__max_iter': [1000],
        'clf__copy_X': [True],
        'clf__tol': [0.0001],
        'clf__warm_start': [False],
        'clf__positive': [False],
        'clf__random_state': [None],
        'clf__selection': ['cyclic']
    },
}

In [26]:
Corr = {
    'neural_r2': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'neural_mse': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'forest_r2': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'forest_mse': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'ridge_r2': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'ridge_mse': {"O_Scale_score": None,
                "C_Scale_score": None,
                "E_Scale_score": None,
                "A_Scale_score": None,
                "N_Scale_score": None},
    'elastic_r2': None,
    'elastic_mse': None,
}

# Run Neural Network

In [45]:
print('Running Parameter TUning for Neural Network with r2')
# Initialize session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
import logging
logging.getLogger("tensorflow").setLevel(logging.WARNING)
# Make sure we have a GPU - else this'll take a lifetime or two
sess.list_devices()
ATTRIBUTE_MODEL_PARAMS = [
    dict(dense_dropout_rate=0.7),
    dict(dense_dropout_rate=0.7),
    dict(dense_dropout_rate=0.7),
    dict(dense_dropout_rate=0.7),
    dict(include_hidden_layer=True, dense_dropout_rate=0.2),
]

for idx, att in enumerate(ATTRIBUTE_LIST):
    print("Training for attribute {}".format(att))
    model_params = ATTRIBUTE_MODEL_PARAMS[idx]

    clf = KerasRegressor(
        build_fn=lambda: ElmoRegressionModel(**model_params),
        epochs=10,
        batch_size=32,
        verbose=1
    )
    pipe = Pipeline([('clf', clf)])
    gs = GridSearchCV(pipe, param_list.get("neural"), cv=10, n_jobs=10, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
    gs.fit(train_data_X, train_data_y[att])

    neural_y = gs.predict(test_data_X)
    Corr['neural_r2'][att] = neural_y.corr(test_data_y[att])


Running Parameter TUning for Neural Network with r2


AttributeError: module 'tensorflow' has no attribute 'ConfigProto'

In [None]:
print('Running Parameter TUning for Neural Network with mse')

pipe = Pipeline([('clf', clf_dict.get("neural"))])
gs = GridSearchCV(pipe, param_list.get(key), cv=10, n_jobs=10, verbose=1, scoring='mean_squared_error', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
neural_y = gs.predict(test_data_X)
Corr['neural_mse'] = forest_y.corr(test_data_y)

# Run Random Forest

In [None]:
print('Running Parameter TUning for Random Forest with r2')
pipe = Pipeline([('clf', clf_dict.get("forest"))])
gs = GridSearchCV(pipe, param_list.get("forest"), cv=10, n_jobs=30, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['forest_r2'] = forest_y.corr(test_data_y)
Corr

In [None]:
print('Running Parameter TUning for Random Forest with mse')
pipe = Pipeline([('clf', clf_dict.get("neural"))])
gs = GridSearchCV(pipe, param_list.get(key), cv=10, n_jobs=30, verbose=5, scoring='mean_squared_error', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['forest_mse'] = forest_y.corr(test_data_y)
Corr

# Run Ridge Regression

In [None]:
print('Running Parameter TUning for Ridge with r2')
pipe = Pipeline([('clf', clf_dict.get("ridge"))])
gs = GridSearchCV(pipe, param_list.get("ridge"), cv=10, n_jobs=30, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['ridge_r2'] = forest_y.corr(test_data_y)
Corr

In [None]:
print('Running Parameter TUning for Ridge with mse')
pipe = Pipeline([('clf', clf_dict.get("ridge"))])
gs = GridSearchCV(pipe, param_list.get("ridge"), cv=10, n_jobs=30, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['ridge_mse'] = forest_y.corr(test_data_y)
Corr

# Run Elastic Net

In [None]:
print('Running Parameter TUning for Elastic Net with r2')
pipe = Pipeline([('clf', clf_dict.get("elastic"))])
gs = GridSearchCV(pipe, param_list.get("elastic"), cv=10, n_jobs=30, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['elastic_r2'] = forest_y.corr(test_data_y)
Corr

In [None]:
print('Running Parameter TUning for Elastic Net with mse')
pipe = Pipeline([('clf', clf_dict.get("elastic"))])
gs = GridSearchCV(pipe, param_list.get("elastic"), cv=10, n_jobs=30, verbose=5, scoring='r2', return_train_score=False, error_score='raise', iid=True)
gs.fit(train_data_X, train_data_y)
forest_y = gs.predict(test_data_X)
Corr['elastic_mse'] = forest_y.corr(test_data_y)
Corr