In [1]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib as mpl
import matplotlib.pyplot as plt
# import validasiVariable.py
import validasiVariable as valVar
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [48]:
# Adjusted R^2
def adj_r2_score(model, X_test, y_test):
    """Return the adjusted R^2 of ``model`` on the given evaluation data.

    Adjusted R^2 = 1 - (n - 1) / (n - p - 1) * (1 - R^2), where ``n`` is the
    number of samples and ``p`` the number of features.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_test : 2-D array-like of shape (n_samples, n_features).
    y_test : 1-D array-like of length n_samples with the true targets.

    Returns
    -------
    float
        The adjusted coefficient of determination.

    Raises
    ------
    ValueError
        If ``n_samples - n_features - 1 <= 0``; the statistic is undefined
        there (the denominator is zero or negative).
    """
    y_pred = model.predict(X_test)
    n_samples = len(y_test)
    # Take the feature count from the data instead of ``model.n_features_``:
    # that attribute only exists on some estimators, whereas the width of
    # X_test is always available and is what ``predict`` was just given.
    n_features = np.shape(X_test)[1]
    dof = n_samples - n_features - 1
    if dof <= 0:
        raise ValueError(
            'adjusted R^2 is undefined for {0} samples and {1} features '
            '(need n_samples > n_features + 1)'.format(n_samples, n_features))
    adj = 1 - float(n_samples - 1) / dof * \
            (1 - r2_score(y_test, y_pred))
    return adj

# Evaluate a fitted model on held-out data
def evaluate(model, X_test, y_test):
    """Print validation metrics for a fitted regressor.

    Predicts on ``X_test`` and prints a 'Model Validation' header followed by
    the mean squared error and the R^2 score against ``y_test``.
    Returns nothing.
    """
    predictions = model.predict(X_test)
    # Build the whole report first so every metric is computed before
    # anything is printed.
    report_lines = [
        'Model Validation',
        'MSE: {0}'.format(mean_squared_error(y_test, predictions)),
        'R^2: {0}'.format(r2_score(y_test, predictions)),
    ]
    # NOTE: reporting of the adjusted R^2 (adj_r2_score) is disabled in this
    # notebook, so it is not part of the report.
    for line in report_lines:
        print(line)

In [3]:
# Open the SQLite database file 'validasi.db' (path is relative to the
# working directory). `conn` and `c` are passed to the valVar helpers in the
# cells below.
conn = sqlite3.connect('validasi.db')
c = conn.cursor()

# Turn on foreign-key enforcement for this connection (SQLite does not
# enforce foreign keys unless this pragma is set).
c.execute("PRAGMA foreign_keys = ON")
conn.commit()

In [4]:
# Rank the products: valVar.meanRank receives the open connection, its cursor
# and a CSV path. The helper lives in validasiVariable.py (not shown here);
# the recorded output reports "Updating 'ranking' column", so it presumably
# writes a ranking back to the database and exports it to `csvoutput` --
# TODO confirm against the helper.
csvoutput = './csvfiles/validasi_ranking.csv'
valVar.meanRank(conn, c, csvoutput)

  2%|▏         | 24/1582 [00:00<00:06, 235.30it/s]

Updating 'ranking' column


100%|██████████| 1582/1582 [00:06<00:00, 230.67it/s]


In [31]:
# Load the product-page data from the database via valVar.prodpageTrain.
# The result is used as a pandas DataFrame by the cells below.
# NOTE(review): the helper and the variable are named "train", but this frame
# is only used to score already-saved models further down -- confirm that it
# really holds the validation set.
dftrain = valVar.prodpageTrain(conn, c)

In [32]:
# preprocessing: tag the categorical features, then drop non-feature columns.
# NOTE: this cell rebinds `dftrain`, so it can only be run once per load
# (a second run fails because the dropped columns are gone).
dftrain['merchanttype'] = dftrain['merchanttype'].astype('category')
# 'topads' is relabelled from 1/0 to 'yes'/'no' before being made categorical
dftrain['topads'] = dftrain['topads'].replace((1,0), ('yes', 'no')).astype('category')
# drop 'id', 'prodname', 'merchantname' and 'actualrevcount'
# ('merchantname' is no longer cast to category first: the cast was wasted
# work because the column is removed here)
dftrain = dftrain.drop(['id', 'prodname', 'merchantname', 'actualrevcount'], axis=1)

In [33]:
# Preview the first rows to check which columns remain after preprocessing.
dftrain.head()

Unnamed: 0,merchanttype,topads,cashback,cashbackval,price,prodrating,reviewcount,negreview,posreview,answerscnt,otheragreemean,ratingmosthelpful,possentiment,negsentiment,sentipolarity,reviewersrep,revpictotal,prodpicstotal,ranking
0,biasa,no,0.0,0.0,32977,4.8,4100,0.005804,0.994196,1763,0.0,5,3,-1,1,1,545,23,1580.0
1,gold,no,0.0,0.0,103000,4.6,3300,0.008502,0.991498,190,0.0,5,3,-1,1,1,653,48,1407.5
2,official,no,0.0,0.0,399000,4.6,3100,0.038106,0.961894,1053,0.0,5,3,-1,1,1,187,22,1349.5
3,gold,no,0.0,0.0,599000,4.8,2500,0.00534,0.99466,1782,0.0,5,3,-1,1,1,341,26,1294.0
4,biasa,no,0.0,0.0,27987,4.8,2500,0.006684,0.993316,915,0.0,5,3,-1,1,1,316,14,1572.0


In [34]:
# Switch from the DataFrame to its array of values; from here on columns are
# addressed by position (0 = 'merchanttype', 1 = 'topads').
dftrain = dftrain.values
# Encoding categorical data ('merchanttype' and 'topads')
# NOTE(review): the encoders are fitted on this data set rather than reused
# from training, so the integer codes / dummy columns only match what the
# saved models saw if both sets contain the same categories -- confirm.
labelencoder1 = LabelEncoder()
labelencoder2 = LabelEncoder()
# 'merchanttype' -> integer codes
dftrain[:, 0] = labelencoder1.fit_transform(dftrain[:, 0])
# 'topads' -> integer codes
dftrain[:, 1] = labelencoder2.fit_transform(dftrain[:, 1])
# One-hot encode column 0 ('merchanttype') only; 'topads' keeps its label code.
# NOTE(review): `categorical_features` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so this line needs an older scikit-learn (or a port to
# ColumnTransformer).
onehotencoder = OneHotEncoder(categorical_features=[0])
dftrain = onehotencoder.fit_transform(dftrain).toarray()

In [35]:
# Drop the first column of the array, i.e. one of the 'merchanttype' dummy
# columns, keeping (3-1) dummies to avoid the dummy variable trap.
# NOTE(review): this cell is not idempotent -- every re-run removes one more
# column from `dftrain`.
dftrain = dftrain[:, 1:]

In [36]:
# Standardise every column of the array, with the scaler fitted on this same
# data.
# NOTE(review): this also rescales the last column, which the next cell uses
# as the target `y`, and the scaler is fitted here rather than loaded from
# training -- confirm this matches how the saved models were trained, since it
# directly affects the MSE / R^2 reported below.
sc = StandardScaler()
dftrain = sc.fit_transform(dftrain)

In [37]:
# Split the array into features (all columns but the last) and target (the
# last column). The target is addressed as -1 instead of a hard-coded 19 so
# that it always agrees with the `:-1` feature slice, even if the number of
# one-hot columns changes.
X = dftrain[:, :-1].copy()
y = dftrain[:, -1].copy()

In [38]:
# DECISION TREE
# Load the previously saved regressor from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
model = joblib.load('./training/regressor_destree_new.pkl')

# Score the loaded model on X / y; evaluate() prints MSE and R^2.
# NOTE(review): the recorded output of this cell also shows an
# 'R^2 Adjusted' line, which the current evaluate() does not print -- the
# output is from an earlier version of the function; re-run to refresh it.
evaluate(model, X, y)

Model Validation
MSE: 1.9740488691126206
R^2: -0.9740488691126203
R^2 Adjusted: -0.9980609872388304


In [44]:
# Random Forest
# Load the previously saved regressor from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
model = joblib.load('./training/regressor_randforest.pkl')

# Score the loaded model on X / y; evaluate() prints MSE and R^2.
# NOTE(review): the recorded output of this cell also shows an
# 'R^2 Adjusted' line, which the current evaluate() does not print -- the
# output is from an earlier version of the function; re-run to refresh it.
evaluate(model, X, y)

Model Validation
MSE: 1.8490725682241387
R^2: -0.8490725682241382
R^2 Adjusted: -0.871564488068094


In [49]:
# MLP
# Load the previously saved regressor from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
model = joblib.load('./training/regressor_mlp.pkl')

# Score the loaded model on X / y; evaluate() prints MSE and R^2.
evaluate(model, X, y)

Model Validation
MSE: 53.55342972545439
R^2: -52.553429725454386


In [51]:
# SVR
# Load the previously saved regressor from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
model = joblib.load('./training/regressor_svr_new.pkl')

# Score the loaded model on X / y; evaluate() prints MSE and R^2.
evaluate(model, X, y)

Model Validation
MSE: 1.4076950686106993
R^2: -0.40769506861069904
