# Introduction

This is the final version of our implementation. The features that we have implemented including TFIDF based cosine similary between product title and product description, product description and search terms and product title and search terms. Besides, we also include query length as a feature. Common word count is also calculated between product title and search terms as well as between product description and search terms. To futher improve the performance, Jaccard similarity between product title and search terms and between product description and search terms is also included as our feature. For model selection, we have implemented linear regression, random forest and SVR as our model.

# Preprocess

This part we basically preprocess the data, including removing stop words, conversion to lower case and use Lemmatizer.
The reason why we leave out spell check is that somehow it makes the performance worse.
Moreover, we specify a data structure here to store the complicated input data for easier use. Since information about
one product-search pair is scattered in many files and we also want some id2item and item2id dictionary to help us get
information in a handy way

In [46]:
import time
import csv
import sklearn.feature_extraction.text as text;
import numpy as np
import scipy.sparse as sparse
from sklearn import linear_model
from scipy.spatial.distance import cosine
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity
import threading  
from nltk import word_tokenize  
from nltk import WordNetLemmatizer          
#from nltk.stem import WordNet 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
  


    
class Loader:
    #initialization and specify the data structure needed to store info
    def __init__(self,):
        #file path to train and test set
        train_file = 'train.csv';
        test_file = 'test.csv';
        #make item2id and id2item dictionary
        self.item2id = {}
        self.id2item = {}
        #item to title dictionary
        self.item_title = {}
        #Item to description dictionary
        self.item_text = {}
        #record id in test set for final output
        self.test_id = []
        #call load function
        self.train = self.load(train_file);
        self.test = self.load(test_file);
        self.load_description();
        #storing title string..
        self.title = []
        
        print('begin proprocessing')
        self.preprocess();
        #compute cosine similarity based on tfidf of product title and product description,
        #product description and search terms and product title and search terms for training set
        print('compute cosine')
        self.train_cosine = self.compute_cosine([self.train_s, self.item_title_tfidf, self.item_text_tfidf], self.train);
        self.train_cosine = np.asarray(self.train_cosine);
        #compute cosine similarity based on tfidf of product title and product description,
        #product description and search terms and product title and search terms for test set
        self.test_cosine = self.compute_cosine([self.test_s, self.item_title_tfidf, self.item_text_tfidf], self.test);
        self.test_cosine = np.asarray(self.test_cosine)
        print('finish')
        
    #Load the data
    def load(self, file_name):
        reader = csv.reader(open(file_name, encoding='latin-1'));
        cnt = 0;
        data = []
        for row in reader:
            cnt += 1;
            if (cnt == 1):
                continue;
            if(file_name == "test.csv"): self.test_id.append(row[0])
            #self.item
            item = int(row[1]);
            if (item not in self.item2id):
                idx = len(self.item2id)
                self.item2id[item] = idx;
                self.id2item[idx] = item;
                self.item_title[idx] = row[2].lower();    
            sample = {'id': int(row[0]), 'x': self.item2id[item], 'y': row[3].lower()}
            if (len(row) == 5):
                rate = float(row[-1])
                sample['r'] = rate;
            data.append(sample);
        return data;
    #load product description
    def load_description(self,):
        reader = csv.reader(open('product_descriptions.csv', encoding='latin-1'));
        cnt = 0;
        for row in reader:
            cnt += 1;
            if (cnt == 1):
                continue;
            
            item = int(row[0]);
            idx = self.item2id[item]
            self.item_text[idx] = row[1].lower()
        return

    #proprocess including removing stop words, conversion to lower case and use of Lemmatizer.
    #and transform the data into right form
    def preprocess(self,):
        
        transformer = text.TfidfVectorizer(max_features = 100000, stop_words ='english',tokenizer=LemmaTokenizer());
        n = len(self.item2id);
        corpus = []
        item_title = []
        item_text = []
        for i in range(n):
            corpus.append(self.item_text[i] + ' ' + self.item_title[i]);
            item_title.append(self.item_title[i])
            item_text.append(self.item_text[i])
            #self.title.append(wordcount(self.item_title[i],self.item_text[i]))
        self.title = np.asarray(self.title)
        transformer.fit(corpus);
        self.item_title_tfidf = transformer.transform(item_title);
        self.item_text_tfidf = transformer.transform(item_text);
        train_s = []
        for item in self.train:
            train_s.append(item['y']);
        test_s = []
        for item in self.test:
            test_s.append(item['y']);
        self.train_s = transformer.transform(train_s);
        self.test_s = transformer.transform(test_s);
        
    def compute_cosine(self, inputs, data):
        cosine = []
        print(inputs[0].shape[0])
        for i in range(inputs[0].shape[0]):
            x = data[i]['x'];
            a = inputs[0][i];b = inputs[1][x];c = inputs[2][x];
            ab = cosine_similarity(a,b)[0][0];
            bc = cosine_similarity(b,c)[0][0];
            ac = cosine_similarity(a,c)[0][0];
            cosine.append([ab,bc,ac]);
        return cosine

In [47]:
print("start loading")
import time
print(time.time())
Data = Loader()
print(time.time())
#913

start loading
1527214902.0816398
begin proprocessing
compute cosine
74067
166693
finish
1527215905.435162


# Making Features

In [48]:
#count the number of common word give two list of strings.
def wordcount(a,b):
    count = 0
    for ch in a:
        if ch in b:
            count += b.count(ch)
    return count
#self.item_title[item['x']]
#print(Data.title.shape,Data.title[:10])

In [52]:
#Calculate fuzz similarity between product title and search terms as well as between product description and 
#search terms

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def fuzzy(s1, s2):
    return fuzz.token_set_ratio(s1, s2) / 100.0

train_r = []
train_t = []
len_train_y = []
#title:query  number of common words
common = []
#description:query  number of common words
common1 = []
##fuzz_title = []
##fuzz_des = []
for i in range(len(Data.train)):
    item = Data.train[i];
    train_r.append(item['r']);
    train_t.append(item['x'])
    len_train_y.append(len(item['y'].split()))
    common.append(wordcount(Data.item_title[item['x']].split(),item['y'].split()))
    common1.append(wordcount(Data.item_text[item['x']].split(),item['y'].split()))
    ##fuzz_title.append(fuzzy(Data.item_title[item['x']].split(),item['y'].split()))
    ##fuzz_des.append(fuzzy(Data.item_title[item['x']].split(),item['y'].split()))
    
    #wordcount(Data.item_title[item['id']],item['y'])
#train_t = Data.item_tfidf[train_t]
#train_s = sparse.hstack([train_t, Data.train_s])
print(common[:100])


#########
r,c = Data.train_cosine.shape
train_s = np.zeros((r,c+3))
train_s[:,:-3] = Data.train_cosine
# train_s[:,-5] = fuzz_des
# train_s[:,-4] = fuzz_title
train_s[:,-3] = common1
train_s[:,-2] = common
#train_s = Data.train_cosine

for i in range(len(train_s)):
    train_s[i][-1] = len_train_y[i]
    
    
    
#train_s = Data.train_cosine.reshape(-1,4)
#b.reshape(-1,3)

# print(b.shape)
print(train_s.shape)
# print(b[:3])
# print(train_s[:3])
train_r = np.asarray(train_r)
#avg_r = np.mean(train_r);
#train_r = train_r - avg_r;

[1, 0, 0, 1, 3, 1, 2, 0, 1, 2, 0, 4, 1, 1, 3, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 4, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 3, 0, 1, 0, 0, 0, 1, 0, 0, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 2, 4, 4, 1, 1, 0, 2, 0, 1, 3, 2, 2, 2, 1, 1, 3, 0, 0, 2, 3, 2, 3, 2, 2, 2, 3, 0, 3, 1, 2, 1, 1, 0]
(74067, 6)


In [50]:


# train_r = [];
# train_t = []
# len_train_y = []
# #title:query  number of common words
# common = []
# #description:query  number of common words
# common1 = []

# for i in range(len(Data.train)):
#     item = Data.train[i];
#     train_r.append(item['r']);
#     train_t.append(item['x'])
#     len_train_y.append(len(item['y'].split()))
#     common.append(wordcount(Data.item_title[item['x']].split(),item['y'].split()))
#     common1.append(wordcount(Data.item_text[item['x']].split(),item['y'].split()))
    
#     #wordcount(Data.item_title[item['id']],item['y'])
# #train_t = Data.item_tfidf[train_t]
# #train_s = sparse.hstack([train_t, Data.train_s])
# print(common[:100])


# #########
# r,c = Data.train_cosine.shape
# #train_s = np.zeros((r,c+5))
# #train_s = np.zeros((r,c+5))
# train_s = np.zeros((r,5))
# train_s[:,:-5] = Data.train_cosine
# train_s[:,-5] = fuzz_title_test
# train_s[:,-4] = fuzz_des_test
# train_s[:,-3] = common1
# train_s[:,-2] = common
# #train_s = Data.train_cosine

# for i in range(len(train_s)):
#     train_s[i][-1] = len_train_y[i]
    
    
    
# #train_s = Data.train_cosine.reshape(-1,4)
# #b.reshape(-1,3)

# # print(b.shape)
# print(train_s.shape)
# # print(b[:3])
# # print(train_s[:3])
# train_r = np.asarray(train_r)
# #avg_r = np.mean(train_r);
# #train_r = train_r - avg_r;

# linear regression

In [55]:
#lr section

model = linear_model.Ridge(alpha = 50.)
print(train_s.shape)
model.fit(train_s, train_r)
print(model.get_params(deep=True))
print(model.coef_ )

(74067, 6)
{'alpha': 50.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001}
[ 0.54507024 -0.27813751  0.44060243  0.00396836  0.04111296 -0.06298762]


# Random Forest

In [11]:
#random forest setion
from sklearn.ensemble import RandomForestRegressor
model2 =RandomForestRegressor(max_depth=2, random_state=0)
model2.fit(train_s, train_r)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

# linearSVR

In [30]:
from sklearn.svm import LinearSVR
model4 = LinearSVR(random_state=0)
model4.fit(train_s, train_r) 
print(model4.coef_ )

[ 0.5220361  -0.24111234  0.36205871 -0.00298779  0.05372537 -0.09750834]


# SVR

In [38]:
#svr
from sklearn.svm import SVR
model3 = SVR(C=1.0, epsilon=0.2)
model3.fit(train_s, train_r) 
#print(model3.coef_ )

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
print("ha")

In [None]:
print(model.score(train_s,train_r))

# Making Prediction and output

In [58]:

test_t = []
len_test_y = []
common_test = []
common_test1 = []
#fuzz_title_test = []
#fuzz_des_test = []
for i in range(len(Data.test)):
    #if(i%1000==0):print(i)
    item = Data.test[i];
    test_t.append(item['x'])
    len_test_y.append(len(item['y'].split()))
    common_test.append(wordcount(Data.item_title[item['x']].split(),item['y'].split()))
    common_test1.append(wordcount(Data.item_text[item['x']].split(),item['y'].split()))
    #fuzz_title_test.append(fuzzy(Data.item_text[item['x']].split(),item['y'].split()))
    #fuzz_des_test.append(fuzzy(Data.item_text[item['x']].split(),item['y'].split()))
    
#test_t = Data.item_tfidf[test_t]
#test_t = Data.item_tfidf[test_t]
#test_s = sparse.hstack([test_t, Data.test_s])

#
r,c = Data.test_cosine.shape
print(r,c)
#test_s = np.zeros((r,c+5))
test_s = np.zeros((r,c+3))
test_s[:,:-3] = Data.test_cosine
#test_s[:,-5] = fuzz_des_test
#test_s[:,-4] = fuzz_title_test
test_s[:,-3] = common_test1
test_s[:,-2] = common_test

print(test_s.shape)


for i in range(len(len_test_y)):
    test_s[i][-1] = len_test_y[i]


#test_s = Data.test_cosine.reshape(-1,3)
print("start predicting")
test_r = model.predict(test_s)
print("finished")

print(test_s.shape)
print(test_r)
len(test_r)
for i in range(len(test_r)):
    if (test_r[i]<1): test_r[i] = 1
    if (test_r[i]>3): test_r[i] = 3
        
#make output
op = np.asarray(test_r)
np.savetxt("submission.csv", op, delimiter=",")
f = open("submission.csv", "w")
f.write("{},{}\n".format("id","relevance"))

# true_ids = []
# for i in ids:
#     true_ids.append(Data.id2item[i])
for i in range(len(test_r)):
    f.write("{},{}\n".format(Data.test_id[i], test_r[i]))

f.close()



166693 3
(166693, 6)
start predicting
finished
(166693, 6)
[2.12642291 2.1007307  2.32706934 ... 2.60175378 2.50972434 2.4940787 ]
