In [1]:
import json
import os
import sys
import numpy as np
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from keras.layers import merge, Dense, Input,Dropout, Embedding, LSTM, Bidirectional, Activation
from keras.layers import Conv2D,Conv1D
from keras.layers.merge import dot, multiply, add, concatenate
from keras.layers import Merge
from keras.layers.core import Lambda,Reshape, Flatten
from keras.layers.pooling import GlobalMaxPooling2D,GlobalMaxPooling1D
from keras.models import Model
from keras.backend import transpose,batch_dot,expand_dims
from keras import optimizers
from HomeDepotCSVReader import HomeDepotReader
import Utilities
from DataPreprocessing import DataPreprocessing
from Feature_Word2Vec import Feature_Word2Vec
from AutomaticQueryExpansion import Word2VecQueryExpansion
import re
from nltk.corpus import stopwords
from keras.utils.np_utils import to_categorical
import pandas as pd
from FeatureEngineering import HomeDepotFeature
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import ModelCheckpoint,EarlyStopping
from sklearn.metrics import mean_squared_error
from Evaluation import NDCG_Eval

Using TensorFlow backend.


In [2]:
# train_filename = '../data/train_play.csv'
# test_filename = '../data/test_play.csv'
# attribute_filename = '../data/attributes_play.csv'
# description_filename = '../data/product_descriptions_play.csv'
# word2vec_model_path='model/word2vec_play.model'
# vocab_path='model/word2vec_play_vocab.json'
# embeddings_path='model/embeddings_play.npz'

# --- Raw competition data -------------------------------------------------
train_filename = '../data/train.csv'
test_filename = '../data/test.csv'
soln_filename = '../data/solution.csv'
attribute_filename = '../data/attributes.csv'
description_filename = '../data/product_descriptions.csv'

# --- Word2Vec artefacts ---------------------------------------------------
word2vec_model_path = 'model/word2vec.model'
vocab_path = 'model/word2vec_vocab.json'
embeddings_path = 'model/embeddings.npz'

# --- Engineered features --------------------------------------------------
# Earlier feature file: '../data/features_full_plusnouns_pluspuidthreshpluss2vsimscore.csv'
full_features_filename = '../data/features_final_20170419.csv'

# --- Per-model predictions, one file per leaderboard split ----------------
dnn_public_pred_filename = '../data/dnn_public.csv'
dnn_private_pred_filename = '../data/dnn_private.csv'
xgboost_public_pred_filename = '../data/xgboost_public.csv'
xgboost_private_pred_filename = '../data/xgboost_private.csv'
or_public_pred_filename = '../data/ordinal_public.csv'
or_private_pred_filename = '../data/ordinal_private.csv'


In [3]:
# train_query_df, product_df, attribute_df, test_query_df = reader.getQueryProductAttributeDataFrame(train_filename,
#                                               test_filename,
#                                               attribute_filename,
#                                               description_filename)
# print("train_query_df:",list(train_query_df))
# print("product_df:", list(product_df))
# print("attribute_df:", list(attribute_df))
# print("test_query_df:", list(test_query_df))

In [4]:
# Load the solution file (columns: id, relevance, Usage) that carries the gold labels.
soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=False, encoding="ISO-8859-1")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" line to the output).
soln_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166693 entries, 0 to 166692
Data columns (total 3 columns):
id           166693 non-null int64
relevance    166693 non-null float64
Usage        166693 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 3.8+ MB
None


In [5]:
# Create the project's CSV reader and load the feature table from full_features_filename.
# NOTE(review): getBasicDataFrame lives in HomeDepotCSVReader (not visible here);
# presumably a plain CSV-to-DataFrame load — confirm.
reader = HomeDepotReader()
feature_df = reader.getBasicDataFrame(full_features_filename)

In [6]:
dp = DataPreprocessing()

# Feature table without its 'relevance' column; feature_df itself stays untouched.
# NOTE(review): presumably getGoldTestSet attaches the gold relevance from soln_df — confirm.
feature_test_df = feature_df.drop('relevance', axis=1)

# One gold test frame per leaderboard split.
test_private_df = dp.getGoldTestSet(
    feature_test_df,
    soln_df,
    testsetoption='Private',
)
test_public_df = dp.getGoldTestSet(
    feature_test_df,
    soln_df,
    testsetoption='Public',
)

In [7]:
# Load each model's saved predictions (DNN, XGBoost, ordinal regression),
# public split first and then private split, in the same order as before.
dnn_public_df, xgboost_public_df, or_public_df = map(
    reader.getBasicDataFrame,
    (dnn_public_pred_filename, xgboost_public_pred_filename, or_public_pred_filename),
)
dnn_private_df, xgboost_private_df, or_private_df = map(
    reader.getBasicDataFrame,
    (dnn_private_pred_filename, xgboost_private_pred_filename, or_private_pred_filename),
)

In [8]:
#test_private_df

In [9]:
# RMSE of the XGBoost predictions against the public gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_public_df['relevance'].values, xgboost_public_df['relevance']) ** 0.5
print(RMSE)

0.465053891326


In [10]:
# RMSE of the DNN predictions against the public gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_public_df['relevance'].values, dnn_public_df['pred_relevance']) ** 0.5
print(RMSE)

0.472106575935


In [11]:
# RMSE of the ordinal-regression predictions against the public gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_public_df['relevance'].values, or_public_df['pred_relevance']) ** 0.5
print(RMSE)

0.484292203552


In [12]:
# Weighted blend for the public split: 75% XGBoost + 25% DNN.
# The ordinal-regression predictions are not part of the blend (a former 0.1-weighted term is disabled).
public_combined_avg = 0.75 * xgboost_public_df['relevance'] + 0.25 * dnn_public_df['pred_relevance']

In [13]:
# RMSE of the blended (XGBoost + DNN) predictions against the public gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_public_df['relevance'].values, public_combined_avg) ** 0.5
print(RMSE)

0.46485322051


# private

In [14]:
# RMSE of the XGBoost predictions against the private gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_private_df['relevance'].values, xgboost_private_df['relevance']) ** 0.5
print(RMSE)

0.462250021607


In [15]:
# RMSE of the DNN predictions against the private gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_private_df['relevance'].values, dnn_private_df['pred_relevance']) ** 0.5
print(RMSE)

0.469481831722


In [16]:
# RMSE of the ordinal-regression predictions against the private gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_private_df['relevance'].values, or_private_df['pred_relevance']) ** 0.5
print(RMSE)

0.481759145651


In [17]:
# Weighted blend for the private split: 75% XGBoost + 25% DNN.
# The ordinal-regression predictions are not part of the blend (a former 0.1-weighted term is disabled).
private_combined_avg = 0.75 * xgboost_private_df['relevance'] + 0.25 * dnn_private_df['pred_relevance']

In [18]:
# RMSE of the blended (XGBoost + DNN) predictions against the private gold relevance.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in 1.0; both return the underlying NumPy array, so rows are compared by position.
RMSE = mean_squared_error(test_private_df['relevance'].values, private_combined_avg) ** 0.5
print(RMSE)

0.462008016567


# save results

In [19]:
# DNN-only prediction frame for the public split, shaped for the NDCG evaluator:
# key columns (id, search_term, product_uid) plus the score under 'relevance_int'.
#
# Fix: this frame used to be filled from public_combined_avg (the XGBoost/DNN blend),
# so the "dnn" NDCG merely duplicated the ensemble NDCG. It now uses the DNN predictions.
dnn_test_public_pred_df = test_public_df[['id', 'search_term', 'product_uid']].reset_index(drop=True)

# to_frame(name=...) is required here: pd.DataFrame(named_series, columns=[other_name])
# would select a non-existent column and produce NaNs. The index is reset so the join
# below pairs rows by position, matching how the RMSE cells compare gold and predictions.
dnn_test_public_prediction_ori_scale_df = (
    dnn_public_df['pred_relevance']
    .reset_index(drop=True)
    .to_frame(name='relevance_int')  # column name expected by the NDCG interface
)
dnn_test_public_pred_df = dnn_test_public_pred_df.join(dnn_test_public_prediction_ori_scale_df)

In [20]:
# DNN-only prediction frame for the private split, shaped for the NDCG evaluator:
# key columns (id, search_term, product_uid) plus the score under 'relevance_int'.
#
# Fix: this frame used to be filled from private_combined_avg (the XGBoost/DNN blend),
# so the "dnn" NDCG merely duplicated the ensemble NDCG. It now uses the DNN predictions.
dnn_test_private_pred_df = test_private_df[['id', 'search_term', 'product_uid']].reset_index(drop=True)

# to_frame(name=...) is required here: pd.DataFrame(named_series, columns=[other_name])
# would select a non-existent column and produce NaNs. The index is reset so the join
# below pairs rows by position, matching how the RMSE cells compare gold and predictions.
dnn_test_private_prediction_ori_scale_df = (
    dnn_private_df['pred_relevance']
    .reset_index(drop=True)
    .to_frame(name='relevance_int')  # column name expected by the NDCG interface
)
dnn_test_private_pred_df = dnn_test_private_pred_df.join(dnn_test_private_prediction_ori_scale_df)

In [21]:
# Ensemble prediction frame for the public split: key columns re-indexed from 0,
# joined (on index) with the blended score stored under 'relevance_int'.
ensemble_test_public_pred_df = (
    test_public_df[['id', 'search_term', 'product_uid']]
    .reset_index(drop=True)
)
# 'relevance_int' is the column name the NDCG interface expects.
ensemble_test_public_prediction_ori_scale_df = pd.DataFrame(
    public_combined_avg,
    columns=['relevance_int'],
)
ensemble_test_public_pred_df = ensemble_test_public_pred_df.join(
    ensemble_test_public_prediction_ori_scale_df
)

In [22]:
# Ensemble prediction frame for the private split: key columns re-indexed from 0,
# joined (on index) with the blended score stored under 'relevance_int'.
ensemble_test_private_pred_df = (
    test_private_df[['id', 'search_term', 'product_uid']]
    .reset_index(drop=True)
)
# 'relevance_int' is the column name the NDCG interface expects.
ensemble_test_private_prediction_ori_scale_df = pd.DataFrame(
    private_combined_avg,
    columns=['relevance_int'],
)
ensemble_test_private_pred_df = ensemble_test_private_pred_df.join(
    ensemble_test_private_prediction_ori_scale_df
)

In [23]:
# Preview only the first rows instead of rendering the whole prediction frame.
ensemble_test_private_pred_df.head(10)

Unnamed: 0,id,search_term,product_uid,relevance_int
0,5,simpson sku abl,100001,2.152155
1,6,simpson strong tie,100001,2.740542
2,10,bath shower kit,100003,2.607591
3,15,delta ashland shower faucet,100005,2.331733
4,24,self tap screw,100008,2.555223
5,26,door trim,100009,2.078613
6,28,anchor stake,100010,2.519902
7,30,lawn edg,100010,2.172465
8,33,steel landscap edg,100010,2.136848
9,39,honda push mower,100011,2.355558


In [24]:
# pd.DataFrame(ensemble_test_private_pred_df).to_csv('../data/ensemble_private_xgb_0_75_dnn_new_0_25.csv', \
#                                     index=False, header=True)

In [25]:
# Persist the private-split ensemble predictions (header row kept, index column omitted).
ensemble_test_private_pred_df.to_csv(
    '../data/ensemble_private.csv',
    index=False,
    header=True,
)

In [26]:
# Persist the public-split ensemble predictions (header row kept, index column omitted).
ensemble_test_public_pred_df.to_csv(
    '../data/ensemble_public.csv',
    index=False,
    header=True,
)

# NDCG

In [27]:
# Evaluator object whose computeAvgNDCG(gold_df, pred_df) is called in the cells below.
ndcgEval=NDCG_Eval()

In [28]:
# Expose the gold label under 'relevance_int', the column name the NDCG evaluator
# reads from both the gold and the prediction frames.
#
# The previous in-place drop + rename was not idempotent: re-running the cell dropped
# the freshly renamed gold column. Guarding on 'relevance' makes a second run a no-op,
# and rebinding instead of mutating in place avoids modifying a frame other code may hold.
if 'relevance' in test_private_df.columns:
    test_private_df = (
        test_private_df
        .drop('relevance_int', axis=1, errors='ignore')  # discard the feature-file column of the same name
        .rename(columns={'relevance': 'relevance_int'})
    )

In [29]:
# Preview only the first rows of the (wide) private gold frame.
test_private_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,product_idx,product_uid,search_term,color_exist,color1hot_almond,color1hot_aluminum,color1hot_beige,color1hot_biscuit,...,noun_uniq_overlap_counts,noun_overlap_ratios,pmi,common_w_title,common_w_description,common_words,search_ratio,title_ratio,desc_ratio,relevance_int
74069,74069,5,"Int64Index([0], dtype='int64')",100001,simpson sku abl,0,0,0,0,0,...,1.0,0.054054,31.452746,1,1,2,0.666667,0.333333,0.333333,2.33
74070,74070,6,"Int64Index([0], dtype='int64')",100001,simpson strong tie,0,0,0,0,0,...,2.0,0.162162,32.706596,2,2,4,1.333333,0.666667,0.666667,2.67
74073,74073,10,"Int64Index([54667], dtype='int64')",100003,bath shower kit,0,0,0,0,0,...,3.0,0.153846,39.760484,4,2,6,1.500000,1.000000,0.500000,2.67
74078,74078,15,"Int64Index([2], dtype='int64')",100005,delta ashland shower faucet,0,0,0,0,0,...,3.0,0.280000,37.193529,3,3,6,1.500000,0.750000,0.750000,2.67
74081,74081,24,"Int64Index([54669], dtype='int64')",100008,self tap screw,0,0,0,0,0,...,2.0,0.054054,34.045164,2,3,5,1.666667,0.666667,1.000000,3.00
74083,74083,26,"Int64Index([5], dtype='int64')",100009,door trim,0,0,0,0,0,...,0.0,0.000000,25.664433,0,0,0,0.000000,0.000000,0.000000,2.00
74084,74084,28,"Int64Index([6], dtype='int64')",100010,anchor stake,0,0,0,0,0,...,1.0,0.025641,24.654201,1,2,3,1.500000,0.500000,1.000000,2.00
74086,74086,30,"Int64Index([6], dtype='int64')",100010,lawn edg,0,0,0,0,0,...,2.0,0.076923,24.589559,0,2,2,1.000000,0.000000,1.000000,1.00
74089,74089,33,"Int64Index([6], dtype='int64')",100010,steel landscap edg,1,0,0,0,0,...,0.0,0.000000,32.055197,0,2,2,0.666667,0.000000,0.666667,2.00
74091,74091,39,"Int64Index([7], dtype='int64')",100011,honda push mower,0,0,0,0,0,...,5.0,0.032407,31.583518,1,1,2,0.666667,0.333333,0.333333,2.67


In [30]:
# Expose the gold label under 'relevance_int', the column name the NDCG evaluator
# reads from both the gold and the prediction frames.
#
# The previous in-place drop + rename was not idempotent: re-running the cell dropped
# the freshly renamed gold column. Guarding on 'relevance' makes a second run a no-op,
# and rebinding instead of mutating in place avoids modifying a frame other code may hold.
if 'relevance' in test_public_df.columns:
    test_public_df = (
        test_public_df
        .drop('relevance_int', axis=1, errors='ignore')  # discard the feature-file column of the same name
        .rename(columns={'relevance': 'relevance_int'})
    )

In [31]:
# Preview only the first rows of the (wide) public gold frame.
test_public_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,product_idx,product_uid,search_term,color_exist,color1hot_almond,color1hot_aluminum,color1hot_beige,color1hot_biscuit,...,noun_uniq_overlap_counts,noun_overlap_ratios,pmi,common_w_title,common_w_description,common_words,search_ratio,title_ratio,desc_ratio,relevance_int
74068,74068,4,"Int64Index([0], dtype='int64')",100001,metal l bracket,0,0,0,0,0,...,0.0,0.000000,32.119412,1,1,2,0.666667,0.333333,0.333333,2.33
74071,74071,7,"Int64Index([0], dtype='int64')",100001,simpson strong tie hcc668,0,0,0,0,0,...,2.0,0.108108,36.936054,2,2,4,1.000000,0.500000,0.500000,2.00
74075,74075,12,"Int64Index([54667], dtype='int64')",100003,one piec tub shower,0,0,0,0,0,...,0.0,0.000000,38.310176,1,0,1,0.250000,0.250000,0.000000,2.00
74076,74076,13,"Int64Index([54668], dtype='int64')",100004,solar panel,0,0,0,0,0,...,3.0,0.200000,26.485141,2,2,4,2.000000,1.000000,1.000000,3.00
74077,74077,14,"Int64Index([2], dtype='int64')",100005,1 handl shower delta trim kit,0,0,0,0,0,...,3.0,0.170000,41.844126,6,6,12,2.000000,1.000000,1.000000,2.00
74082,74082,25,"Int64Index([5], dtype='int64')",100009,3 1 20door case,0,0,0,0,0,...,1.0,0.117647,32.119412,2,2,4,1.333333,0.666667,0.666667,2.33
74092,74092,40,"Int64Index([7], dtype='int64')",100011,lawn mower,0,0,0,0,0,...,6.0,0.120370,25.819275,2,2,4,2.000000,1.000000,1.000000,3.00
74097,74097,45,"Int64Index([7], dtype='int64')",100011,toro lawn mower bag,0,0,0,0,0,...,7.0,0.203704,37.568215,3,4,7,1.750000,0.750000,1.000000,2.67
74098,74098,46,"Int64Index([7], dtype='int64')",100011,toro mower washout,0,0,0,0,0,...,6.0,0.148148,31.990982,2,3,5,1.666667,0.666667,1.000000,2.67
74102,74102,52,"Int64Index([9], dtype='int64')",100013,garbag dispos,0,0,0,0,0,...,0.0,0.000000,24.351340,1,1,2,1.000000,0.500000,0.500000,2.00


In [32]:
# Average NDCG of dnn_test_public_pred_df scored against the public gold frame.
# NOTE(review): the result is only kept in avgNDCG, which the following evaluation cells reassign.
avgNDCG=ndcgEval.computeAvgNDCG(test_public_df,dnn_test_public_pred_df)

Filtering essential columns
goldDF columns: ['relevance_int', 'search_term', 'product_uid']
predictDF columns: ['relevance_int', 'search_term', 'product_uid']
Completed: Filtering essential columns
STARTING AVG_NDCG computation...this operation can take a while..
Sorting by query small to big, relevance big to small
Completed: Sorting by query small to big, relevance big to small
Sorting predictdf according to prediction relevance
Completed: Sorting predictdf according to prediction relevance
Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Completed: Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Computing DCGp of predicted sets
Compeleted: Computing DCGp of predicted sets
Ensuring data correctness before final computation
No of search terms removed from NDCG:  0 []
Completed: Ensuring data correctness before final computation
Computing NDCG = DCGp / DCGmax
Completed: Computing NDCG = DCGp / DCGmax
C

In [33]:
# Average NDCG of dnn_test_private_pred_df scored against the private gold frame.
# NOTE(review): the result is only kept in avgNDCG, which the following evaluation cells reassign.
avgNDCG=ndcgEval.computeAvgNDCG(test_private_df,dnn_test_private_pred_df)

Filtering essential columns
goldDF columns: ['relevance_int', 'search_term', 'product_uid']
predictDF columns: ['relevance_int', 'search_term', 'product_uid']
Completed: Filtering essential columns
STARTING AVG_NDCG computation...this operation can take a while..
Sorting by query small to big, relevance big to small
Completed: Sorting by query small to big, relevance big to small
Sorting predictdf according to prediction relevance
Completed: Sorting predictdf according to prediction relevance
Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Completed: Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Computing DCGp of predicted sets
Compeleted: Computing DCGp of predicted sets
Ensuring data correctness before final computation
No of search terms removed from NDCG:  0 []
Completed: Ensuring data correctness before final computation
Computing NDCG = DCGp / DCGmax
Completed: Computing NDCG = DCGp / DCGmax
C

In [34]:
# Average NDCG of ensemble_test_private_pred_df scored against the private gold frame.
# NOTE(review): the result is only kept in avgNDCG, which the following evaluation cell reassigns.
avgNDCG=ndcgEval.computeAvgNDCG(test_private_df,ensemble_test_private_pred_df)

Filtering essential columns
goldDF columns: ['relevance_int', 'search_term', 'product_uid']
predictDF columns: ['relevance_int', 'search_term', 'product_uid']
Completed: Filtering essential columns
STARTING AVG_NDCG computation...this operation can take a while..
Sorting by query small to big, relevance big to small
Completed: Sorting by query small to big, relevance big to small
Sorting predictdf according to prediction relevance
Completed: Sorting predictdf according to prediction relevance
Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Completed: Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Computing DCGp of predicted sets
Compeleted: Computing DCGp of predicted sets
Ensuring data correctness before final computation
No of search terms removed from NDCG:  0 []
Completed: Ensuring data correctness before final computation
Computing NDCG = DCGp / DCGmax
Completed: Computing NDCG = DCGp / DCGmax
C

In [35]:
# Average NDCG of ensemble_test_public_pred_df scored against the public gold frame.
# This assignment replaces the avgNDCG value left by the previous evaluation cell.
avgNDCG=ndcgEval.computeAvgNDCG(test_public_df,ensemble_test_public_pred_df)

Filtering essential columns
goldDF columns: ['relevance_int', 'search_term', 'product_uid']
predictDF columns: ['relevance_int', 'search_term', 'product_uid']
Completed: Filtering essential columns
STARTING AVG_NDCG computation...this operation can take a while..
Sorting by query small to big, relevance big to small
Completed: Sorting by query small to big, relevance big to small
Sorting predictdf according to prediction relevance
Completed: Sorting predictdf according to prediction relevance
Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Completed: Attaching the relevance_score computed in PreTrain to each query-doc in the query-docs set
Computing DCGp of predicted sets
Compeleted: Computing DCGp of predicted sets
Ensuring data correctness before final computation
No of search terms removed from NDCG:  0 []
Completed: Ensuring data correctness before final computation
Computing NDCG = DCGp / DCGmax
Completed: Computing NDCG = DCGp / DCGmax
C

# correlation

In [None]:
# Pearson correlation between the private gold relevance and each model's predictions.
#
# Fixes over the previous version of this cell:
#  * xgboost_df / dnn_df / or_df were never defined; the private-split frames are used.
#  * the gold column is 'relevance' only until the NDCG preparation cell renames it to
#    'relevance_int', so whichever of the two is present is picked up.
#  * Series.corr aligns its operands on index labels, while the gold frame keeps the
#    original feature-table index; indexes are reset so rows are paired by position,
#    as the RMSE cells do.
gold_col = 'relevance' if 'relevance' in test_private_df.columns else 'relevance_int'
gold_private = test_private_df[gold_col].reset_index(drop=True)
xgboost_private_pred = xgboost_private_df['relevance'].reset_index(drop=True)
dnn_private_pred = dnn_private_df['pred_relevance'].reset_index(drop=True)
or_private_pred = or_private_df['pred_relevance'].reset_index(drop=True)

print(gold_private.corr(gold_private, method='pearson'))  # sanity check: correlation with itself
print(gold_private.corr(xgboost_private_pred, method='pearson'))
print(gold_private.corr(dnn_private_pred, method='pearson'))
print(gold_private.corr(or_private_pred, method='pearson'))

In [None]:
# Pearson correlation of the XGBoost predictions with the DNN and ordinal-regression
# predictions on the private split.
# Fix: xgboost_df / dnn_df / or_df were never defined; the private-split frames are used.
# Indexes are reset because Series.corr pairs values by index label, not by position.
xgboost_private_scores = xgboost_private_df['relevance'].reset_index(drop=True)
print(xgboost_private_scores.corr(dnn_private_df['pred_relevance'].reset_index(drop=True), method='pearson'))
print(xgboost_private_scores.corr(or_private_df['pred_relevance'].reset_index(drop=True), method='pearson'))

In [None]:
# Pearson correlation between the DNN and ordinal-regression predictions on the private split.
# Fix: dnn_df / or_df were never defined; the private-split frames are used.
# Indexes are reset because Series.corr pairs values by index label, not by position.
dnn_private_scores = dnn_private_df['pred_relevance'].reset_index(drop=True)
or_private_scores = or_private_df['pred_relevance'].reset_index(drop=True)
print(dnn_private_scores.corr(or_private_scores, method='pearson'))