In [11]:
import math
import numpy as np
import pandas as pd
from numpy import arange
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

# Create one random seed number for reproducible results
seedNum = 888

# Libraries for neural network
import tensorflow
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

### Reading data

In [2]:
# [kannan] The raw data is read directly for now; later this cell should be
# switched to the preprocessed / feature-engineered data pipeline.
# The CSV export ('../data/input/OnlineNewsPopularity.csv') is not used because
# pd.read_csv failed on it with:
#   "Error tokenizing data. C error: Expected 1 fields in line 6, saw 3"
inputFile = '../data/input/OnlineNewsPopularity.xlsx'
df = pd.read_excel(inputFile)
# Strip embedded spaces out of every column label
df.columns = [label.replace(' ', '') for label in df.columns]
print(df.shape)
df.head()

(7795, 20)


Unnamed: 0,Id,url,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,shares
0,1,http://mashable.com/2014/09/01/americans-held-...,10,261,0.661355,1.0,0.7875,7,3,1,1,4.873563,7,0,0,0,0,0,1,1100
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,7,1791,0.370242,1.0,0.535038,74,3,50,0,4.554439,8,0,0,0,0,0,0,1100
2,3,http://mashable.com/2014/09/01/aussie-football...,7,503,0.524291,1.0,0.704918,3,3,1,0,5.003976,5,0,0,0,0,0,1,1000
3,4,http://mashable.com/2014/09/01/australia-gover...,10,526,0.536204,1.0,0.654867,17,1,1,0,4.998099,9,0,0,0,0,0,1,822
4,5,http://mashable.com/2014/09/01/australia-jane-...,13,237,0.619048,1.0,0.807143,5,3,1,0,5.046414,9,0,0,0,0,0,1,841


In [3]:
# Dropping useless attributes (the article URL and the row identifier).
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
df.drop(['url', 'Id'], axis=1, inplace=True, errors='ignore')

In [4]:
# Rename the label column, then record the column bookkeeping used by the
# following cells: total columns, number of input attributes, target position.
df.rename(columns={'shares': 'targetVar'}, inplace=True)
totCol = df.shape[1]
totAttr = totCol - 1
targetCol = totCol  # the target is the last column

In [5]:
# 70:30 train/test split.
# The target is taken from the last column when targetCol == totCol,
# otherwise from the first column.
target_is_last = (targetCol == totCol)
x_df = df.iloc[:, 0:totAttr] if target_is_last else df.iloc[:, 1:totCol]
y_df = df.iloc[:, totAttr] if target_is_last else df.iloc[:, 0]

validation_size = 0.30
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=validation_size,
    random_state=seedNum,
)
print("x_df.shape: {} y_df.shape: {}".format(x_df.shape, y_df.shape))
print("x_train.shape: {} y_train.shape: {}".format(x_train.shape, y_train.shape))
print("x_test.shape: {} y_test.shape: {}".format(x_test.shape, y_test.shape))

x_df.shape: (7795, 17) y_df.shape: (7795,)
x_train.shape: (5456, 17) y_train.shape: (5456,)
x_test.shape: (2339, 17) y_test.shape: (2339,)


In [6]:
# visualization configs
dispCol = 3
# Number of rows needed to show totAttr plots with dispCol plots per row
# (integer ceiling division).
dispRow = -(-totAttr // dispCol)

# Default figure size: width 16, height 12 (4:3 aspect ratio)
fig_size = pyplot.rcParams["figure.figsize"]
fig_size[:] = [16, 12]
pyplot.rcParams["figure.figsize"] = fig_size

In [7]:
# kannan: attributes to exclude; the list is decided by feature engineering /
# feature selection.
lowAttributes = []

In [8]:
# Remove the unselected attributes, then rebuild the 70:30 train/test split
# from the reduced frame as plain NumPy arrays.
# R equivalent, kept for reference:
#   xy_train <- xy_train[, !(names(xy_train) %in% lowAttributes)]
#   xy_test <- xy_test[, !(names(xy_test) %in% lowAttributes)]
xy_newdf = df.drop(lowAttributes, axis=1)

totCol = xy_newdf.shape[1]
totAttr = totCol - 1
targetCol = totCol
array = xy_newdf.values

if targetCol != totCol:
    # target stored in the first column
    x_newdf, y_newdf = array[:, 1:totCol], array[:, 0]
else:
    # target stored in the last column
    x_newdf, y_newdf = array[:, 0:totAttr], array[:, totAttr]

validation_size = 0.30
x_train, x_test, y_train, y_test = train_test_split(
    x_newdf,
    y_newdf,
    test_size=validation_size,
    random_state=seedNum,
)
print("x_newdf.shape: {} y_newdf.shape: {}".format(x_newdf.shape, y_newdf.shape))
print("x_train.shape: {} y_train.shape: {}".format(x_train.shape, y_train.shape))
print("x_test.shape: {} y_test.shape: {}".format(x_test.shape, y_test.shape))

x_newdf.shape: (7795, 17) y_newdf.shape: (7795,)
x_train.shape: (5456, 17) y_train.shape: (5456,)
x_test.shape: (2339, 17) y_test.shape: (2339,)


## Tuning DL Models

In [28]:
# Tune optimizer

def baseline_model(optimizer='adam', act_function='relu', input_dim=17):
    """Build and compile the baseline regression network.

    Layers: BatchNormalization -> Dense(10) -> Dense(6) -> Dense(1), compiled
    with a mean-squared-error loss.

    Args:
        optimizer: optimizer name (or instance) handed to ``model.compile``.
        act_function: activation used by the two hidden Dense layers.
        input_dim: number of input features. Defaults to 17, the value that
            was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The input shape is declared on the FIRST layer: a Sequential model takes
    # its input shape from the first layer added, so the `input_dim` that used
    # to sit on the second layer had no effect.
    model.add(BatchNormalization(input_shape=(input_dim,)))
    model.add(Dense(10, kernel_initializer='normal', activation=act_function))
    model.add(Dense(6, kernel_initializer='normal', activation=act_function))
    # Single linear output unit for the regression target
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=10, verbose=0)

# Candidate optimizers for the grid search
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax']
param_grid = {'optimizer': optimizer}

# 3-fold grid search over the candidates, using all available cores
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(x_train, y_train)

# Summary: best raw score, then its square root
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
print("Best: root %f using %s" % (math.sqrt(abs(best_score)), best_params))

# One line per candidate
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for candidate_mean, candidate in zip(means, params):
    candidate_rmse = math.sqrt(abs(candidate_mean))
    print("rmse %f with: %r" % (candidate_rmse, candidate))

Best: -49104322.374014 using {'optimizer': 'Adam'}
Best: root 7007.447636 using {'optimizer': 'Adam'}
rmse 7071.159484 with: {'optimizer': 'SGD'}
rmse 7009.151713 with: {'optimizer': 'RMSprop'}
rmse 7659.313580 with: {'optimizer': 'Adagrad'}
rmse 7661.626916 with: {'optimizer': 'Adadelta'}
rmse 7007.447636 with: {'optimizer': 'Adam'}
rmse 7067.114582 with: {'optimizer': 'Adamax'}


In [29]:
# Tune learning rate and momentum for RMSprop optimizer
def baseline_model(learn_rate=0.0001, momentum=0, act_function='relu', input_dim=17):
    """Build the baseline network compiled with an RMSprop optimizer.

    Layers: BatchNormalization -> Dense(10) -> Dense(6) -> Dense(1), compiled
    with a mean-squared-error loss.

    Args:
        learn_rate: learning rate passed to RMSprop.
        momentum: momentum passed to RMSprop.
        act_function: activation used by the two hidden Dense layers.
        input_dim: number of input features. Defaults to 17, the value that
            was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The input shape is declared on the FIRST layer: a Sequential model takes
    # its input shape from the first layer added, so the `input_dim` that used
    # to sit on the second layer had no effect.
    model.add(BatchNormalization(input_shape=(input_dim,)))
    model.add(Dense(10, kernel_initializer='normal', activation=act_function))
    model.add(Dense(6, kernel_initializer='normal', activation=act_function))
    # Single linear output unit for the regression target
    model.add(Dense(1, kernel_initializer='normal'))

    # NOTE(review): `lr` looks like the legacy spelling of `learning_rate` in
    # tf.keras optimizers - confirm against the installed TensorFlow version
    # before renaming.
    optimizer = tensorflow.keras.optimizers.RMSprop(lr=learn_rate, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=10, verbose=0)

# Candidate learning rates and momentum values for the grid search
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

# 3-fold grid search over every (learn_rate, momentum) pair, using all cores
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(x_train, y_train)

# Summary: best raw score, then its square root
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
print("Best: rmse %f using %s" % (math.sqrt(abs(best_score)), best_params))

# One line per candidate
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for candidate_mean, candidate in zip(means, params):
    candidate_rmse = math.sqrt(abs(candidate_mean))
    print("rmse %f with: %r" % (candidate_rmse, candidate))


Best: -49042593.877202 using {'learn_rate': 0.001, 'momentum': 0.2}
Best: rmse 7003.041759 using {'learn_rate': 0.001, 'momentum': 0.2}
rmse 7009.691044 with: {'learn_rate': 0.001, 'momentum': 0.0}
rmse 7003.041759 with: {'learn_rate': 0.001, 'momentum': 0.2}
rmse 7010.343204 with: {'learn_rate': 0.001, 'momentum': 0.4}
rmse 7013.898362 with: {'learn_rate': 0.001, 'momentum': 0.6}
rmse 7012.396081 with: {'learn_rate': 0.001, 'momentum': 0.8}
rmse 7016.509321 with: {'learn_rate': 0.001, 'momentum': 0.9}
rmse 7024.241766 with: {'learn_rate': 0.01, 'momentum': 0.0}
rmse 7025.214369 with: {'learn_rate': 0.01, 'momentum': 0.2}
rmse 7041.254277 with: {'learn_rate': 0.01, 'momentum': 0.4}
rmse 7175.208887 with: {'learn_rate': 0.01, 'momentum': 0.6}
rmse 7052.716495 with: {'learn_rate': 0.01, 'momentum': 0.8}
rmse 7065.203714 with: {'learn_rate': 0.01, 'momentum': 0.9}
rmse 7177.463414 with: {'learn_rate': 0.1, 'momentum': 0.0}
rmse 7085.940891 with: {'learn_rate': 0.1, 'momentum': 0.2}
rmse 7

In [31]:
# Tune number of neurons in first layer
def baseline_model(neurons=10, learn_rate=0.001, momentum=0.2, act_function='relu', input_dim=17):
    """Build the baseline network with a configurable first hidden layer width.

    Layers: BatchNormalization -> Dense(neurons) -> Dense(6) -> Dense(1),
    compiled with a mean-squared-error loss and an RMSprop optimizer.

    Args:
        neurons: number of units in the first hidden Dense layer.
        learn_rate: learning rate passed to RMSprop.
        momentum: momentum passed to RMSprop.
        act_function: activation used by the two hidden Dense layers.
        input_dim: number of input features. Defaults to 17, the value that
            was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The input shape is declared on the FIRST layer: a Sequential model takes
    # its input shape from the first layer added, so the `input_dim` that used
    # to sit on the second layer had no effect.
    model.add(BatchNormalization(input_shape=(input_dim,)))
    model.add(Dense(neurons, kernel_initializer='normal', activation=act_function))
    model.add(Dense(6, kernel_initializer='normal', activation=act_function))
    # Single linear output unit for the regression target
    model.add(Dense(1, kernel_initializer='normal'))

    # NOTE(review): `lr` looks like the legacy spelling of `learning_rate` in
    # tf.keras optimizers - confirm against the installed TensorFlow version
    # before renaming.
    optimizer = tensorflow.keras.optimizers.RMSprop(lr=learn_rate, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=10, verbose=0)

# Candidate widths for the first hidden layer
neurons = [5, 10, 15, 20, 25]
param_grid = {'neurons': neurons}

# 3-fold grid search over the candidates, using all available cores
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(x_train, y_train)

# Summary: best raw score, then its square root
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
print("Best: rmse %f using %s" % (math.sqrt(abs(best_score)), best_params))

# One line per candidate
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for candidate_mean, candidate in zip(means, params):
    candidate_rmse = math.sqrt(abs(candidate_mean))
    print("rmse %f with: %r" % (candidate_rmse, candidate))


Best: -49064040.914169 using {'neurons': 20}
Best: rmse 7004.572857 using {'neurons': 20}
rmse 7004.924551 with: {'neurons': 5}
rmse 7005.581685 with: {'neurons': 10}
rmse 7005.850694 with: {'neurons': 15}
rmse 7004.572857 with: {'neurons': 20}
rmse 7011.723906 with: {'neurons': 25}


In [35]:
# Tune initialization
def baseline_model(init_mode='uniform', learn_rate=0.001, momentum=0.2, act_function='relu', input_dim=17):
    """Build the baseline network with a configurable hidden-layer initializer.

    Layers: BatchNormalization -> Dense(5) -> Dense(6) -> Dense(1), compiled
    with a mean-squared-error loss and an RMSprop optimizer.

    Args:
        init_mode: kernel initializer applied to the two hidden Dense layers
            (the output layer always uses 'normal').
        learn_rate: learning rate passed to RMSprop.
        momentum: momentum passed to RMSprop.
        act_function: activation used by the two hidden Dense layers.
        input_dim: number of input features. Defaults to 17, the value that
            was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The input shape is declared on the FIRST layer: a Sequential model takes
    # its input shape from the first layer added, so the `input_dim` that used
    # to sit on the second layer had no effect.
    model.add(BatchNormalization(input_shape=(input_dim,)))
    model.add(Dense(5, kernel_initializer=init_mode, activation=act_function))
    model.add(Dense(6, kernel_initializer=init_mode, activation=act_function))
    # Single linear output unit for the regression target
    model.add(Dense(1, kernel_initializer='normal'))

    # NOTE(review): `lr` looks like the legacy spelling of `learning_rate` in
    # tf.keras optimizers - confirm against the installed TensorFlow version
    # before renaming.
    optimizer = tensorflow.keras.optimizers.RMSprop(lr=learn_rate, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=10, verbose=0)

# Candidate kernel initializers for the hidden layers
init_mode = [
    'uniform',
    'lecun_uniform',
    'normal',
    'zero',
    'glorot_normal',
    'glorot_uniform',
    'he_normal',
    'he_uniform',
]
param_grid = {'init_mode': init_mode}

# 3-fold grid search over the candidates, using all available cores
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(x_train, y_train)

# Summary: best raw score, then its square root
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
print("Best: rmse %f using %s" % (math.sqrt(abs(best_score)), best_params))

# One line per candidate
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for candidate_mean, candidate in zip(means, params):
    candidate_rmse = math.sqrt(abs(candidate_mean))
    print("rmse %f with: %r" % (candidate_rmse, candidate))


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Best: -49023710.184524 using {'init_mode': 'uniform'}
Best: rmse 7001.693380 using {'init_mode': 'uniform'}
rmse 7001.693380 with: {'init_mode': 'uniform'}
rmse 7004.185162 with: {'init_mode': 'lecun_uniform'}
rmse 7008.535429 with: {'init_mode': 'normal'}
rmse 7654.526369 with: {'init_mode': 'zero'}
rmse 7004.503407 with: {'init_mode': 'glorot_normal'}
rmse 7008.724359 with: {'init_mode': 'glorot_uniform'}
rmse 7013.760493 with: {'init_mode': 'he_normal'}
rmse 7007.972445 with: {'init_mode': 'he_uniform'}


## Now train with tuned hyperparameters

In [36]:
def baseline_model(init_mode='uniform', learn_rate=0.001, momentum=0.2, act_function='relu', input_dim=17):
    """Build the final network using the tuned hyperparameters as defaults.

    Layers: BatchNormalization -> Dense(5) -> Dense(6) -> Dense(1), compiled
    with a mean-squared-error loss and an RMSprop optimizer.

    Args:
        init_mode: kernel initializer applied to the two hidden Dense layers
            (the output layer always uses 'normal').
        learn_rate: learning rate passed to RMSprop.
        momentum: momentum passed to RMSprop.
        act_function: activation used by the two hidden Dense layers.
        input_dim: number of input features. Defaults to 17, the value that
            was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The input shape is declared on the FIRST layer: a Sequential model takes
    # its input shape from the first layer added, so the `input_dim` that used
    # to sit on the second layer had no effect.
    model.add(BatchNormalization(input_shape=(input_dim,)))
    model.add(Dense(5, kernel_initializer=init_mode, activation=act_function))
    model.add(Dense(6, kernel_initializer=init_mode, activation=act_function))
    # Single linear output unit for the regression target
    model.add(Dense(1, kernel_initializer='normal'))

    # NOTE(review): `lr` looks like the legacy spelling of `learning_rate` in
    # tf.keras optimizers - confirm against the installed TensorFlow version
    # before renaming.
    optimizer = tensorflow.keras.optimizers.RMSprop(lr=learn_rate, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

# scikit-learn compatible wrapper around the network builder
estimator = KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=10, verbose=0)

In [37]:
# 10-fold cross-validation of the estimator on the training split, timed.
kfold = KFold(n_splits=10)
startTimeModule = datetime.now()
dl_results = cross_val_score(estimator, x_train, y_train, cv=kfold)
# Apply the format with `%`; passing the value as a second print() argument
# left the literal "(%.2f)" in the output. abs() is taken before the square
# root, matching how the grid-search cells report their scores.
print("RMSE is (%.2f) " % math.sqrt(abs(dl_results.mean())))
print('Model training time:', (datetime.now() - startTimeModule))

RMSE is (%.2f)  7006.977069005679
Model training time: 0:09:38.823799


In [39]:
# Fit on the full training split, then score the held-out test split
estimator.fit(x_train, y_train)
predictions = estimator.predict(x_test)
test_mse = mean_squared_error(y_test, predictions)
rmse = math.sqrt(test_mse)
print("\n%s: rmse:%f" % ("DL", rmse))


DL: rmse:10282.293521
