In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn import neighbors
from sklearn import linear_model
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn import tree
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_curve, auc, confusion_matrix,plot_confusion_matrix, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("HW3.xlsx", header=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sequence_number       2000 non-null   int64  
 1   US                    2000 non-null   int64  
 2   source_a              2000 non-null   int64  
 3   source_c              2000 non-null   int64  
 4   source_b              2000 non-null   int64  
 5   source_d              2000 non-null   int64  
 6   source_e              2000 non-null   int64  
 7   source_m              2000 non-null   int64  
 8   source_o              2000 non-null   int64  
 9   source_h              2000 non-null   int64  
 10  source_r              2000 non-null   int64  
 11  source_s              2000 non-null   int64  
 12  source_t              2000 non-null   int64  
 13  source_u              2000 non-null   int64  
 14  source_p              2000 non-null   int64  
 15  source_x             

## Import data

In [3]:
df = data.drop(["sequence_number","Purchase"], axis=1)
df.head(3)

Unnamed: 0,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,source_r,...,source_p,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Spending
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,2,3662,3662,1,0,1,127.87
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,2900,2900,1,1,0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,2,3883,3914,0,0,0,127.48


## Formate data

In [4]:
c = df.loc[:, ~df.columns.isin(['Freq', 'last_update_days_ago', '1st_update_days_ago','Spending'])].columns
for col in c:
    df[col] = df[col].astype('str')

In [5]:
df.dtypes

US                       object
source_a                 object
source_c                 object
source_b                 object
source_d                 object
source_e                 object
source_m                 object
source_o                 object
source_h                 object
source_r                 object
source_s                 object
source_t                 object
source_u                 object
source_p                 object
source_x                 object
source_w                 object
Freq                      int64
last_update_days_ago      int64
1st_update_days_ago       int64
Web order                object
Gender=male              object
Address_is_res           object
Spending                float64
dtype: object

In [6]:
# Check missing value
df.isnull().sum()

US                      0
source_a                0
source_c                0
source_b                0
source_d                0
source_e                0
source_m                0
source_o                0
source_h                0
source_r                0
source_s                0
source_t                0
source_u                0
source_p                0
source_x                0
source_w                0
Freq                    0
last_update_days_ago    0
1st_update_days_ago     0
Web order               0
Gender=male             0
Address_is_res          0
Spending                0
dtype: int64

In [8]:
#Splitting the data
feature = df.loc[:, df.columns!="Spending"]
target = df.loc[:, df.columns=='Spending']
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=0)

In [9]:
# Normalization
scaler = StandardScaler()
scaler.fit(X_train[['Freq', 'last_update_days_ago', '1st_update_days_ago']])
X_train[['Freq', 'last_update_days_ago', '1st_update_days_ago']] =scaler.transform(X_train[['Freq', 'last_update_days_ago', '1st_update_days_ago']])
X_test[['Freq', 'last_update_days_ago', '1st_update_days_ago']] = scaler.transform(X_test[['Freq', 'last_update_days_ago', '1st_update_days_ago']])

In [11]:
# define cross_validation settings
inner_cv = KFold(n_splits=5,shuffle = True,random_state = 1)
outer_cv = KFold(n_splits=5,shuffle = True,random_state = 1)

## KNN Regression

In [17]:
knn = neighbors.KNeighborsRegressor()
p_grid ={
    "weights" : ["uniform", "distance"], 
    "n_neighbors": range(2,20)
}

# Search for best hyperparameters
clf = GridSearchCV(estimator=knn, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}") 

Best hyperparameter is {'n_neighbors': 12, 'weights': 'uniform'}
MSE: 10448.243826730903


## ElasticNet Regression

In [24]:
en = ElasticNet()
p_grid ={
    "alpha" : [0.001,0.01,1,10],
    "l1_ratio": [0.1,0.3,0.5,0.7]
}
# Search for best hyperparameters
clf = GridSearchCV(estimator=en, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}")

Best hyperparameter is {'alpha': 0.01, 'l1_ratio': 0.1}
MSE: 10357.691345309508


## Regression Tree

In [25]:
dt = tree.DecisionTreeRegressor()

p_grid = {"min_samples_split": [10, 20, 40],
          "max_depth": [2,3,4,5,6,7,10,20]}

# Search for best hyperparameters
clf = GridSearchCV(estimator=dt, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}")

Best hyperparameter is {'max_depth': 5, 'min_samples_split': 10}
MSE: 11567.19858786886


## SVM regression

In [26]:
svm = SVR()
p_grid = {'C': [0.1,1, 10, 100], 
          'gamma': [1,0.1,0.01,0.001],
          'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

# Search for best hyperparameters
clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}")

Best hyperparameter is {'C': 100, 'gamma': 1, 'kernel': 'linear'}
MSE: 12491.80850128714


## Random Forest Regression

In [27]:
rf = RandomForestRegressor()

# Method of selecting samples for training each tree
bootstrap = [True, False]

p_grid = { 'n_estimators': [200,300,400],
            'max_features': ['sqrt','log2'],
            'max_depth' : [3,4,5,6,7],
            'bootstrap': bootstrap}

# Search for best hyperparameters
clf = GridSearchCV(estimator=rf, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}")

Best hyperparameter is {'bootstrap': False, 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 200}
MSE: 10908.55621454224


## Neural Network

In [29]:
import tensorflow.keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error

In [30]:
# Transform object data into float data as Keras do not accept object data 
X_train_float=np.asarray(X_train).astype(np.float32)
X_test_float=np.asarray(X_test).astype(np.float32)
y_train_float=np.asarray(y_train).astype(np.float32)
y_test_float=np.asarray(y_test).astype(np.float32)

In [34]:
def create_model(activation, optimizer, hidden):
    model = Sequential()
    model.add(Dense(hidden, input_dim=X_train_float.shape[1], activation=activation))
    model.add(Dense(hidden, activation=activation))
    model.add(Dense(hidden, activation=activation))
    model.add(Dense(1, activation = 'linear'))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])    
    return model

# Neural Network hyperparameter tuning
activations = ['relu', 'tanh']
optimizers = ['adam']
hiddens = [32, 64, 128, 256]
epochs = list(range(3, 10))
p_grid = dict(activation=activations, 
             optimizer=optimizers, 
             hidden=hiddens,
             epochs = epochs)

classifierNN = KerasRegressor(build_fn = create_model, verbose = 0)

# Search for best hyperparameters
clf = GridSearchCV(estimator=classifierNN, param_grid=p_grid, cv=inner_cv, scoring='neg_mean_squared_error')
clf.fit(X_train_float, y_train_float)

y_pred = clf.predict(X_test_float)
mse = mean_squared_error(y_test_float, y_pred)

print(f'Best hyperparameter is {clf.best_params_}')
print(f"MSE: {mse}")

Best hyperparameter is {'activation': 'relu', 'epochs': 9, 'hidden': 128, 'optimizer': 'adam'}
MSE: 8905.474609375


Regression models including KNN, Elastic Net, Regression Tree, SVM regression, Random Forest and Neural Network are built and mean squared error is set as metric. Based on the results, the 3 layers deep neural network with 128 neurons outperformed other models with the lowest MSE.