In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

encoding = "ISO-8859-1"
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eugene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eugene/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))


def replaces(s):
    if isinstance(s, str):
        s = s.lower()
        s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) 

        s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    
        s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    
        s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    
        s = s.replace(" x "," xby ")
        s = s.replace("*"," xby ")
        s = s.replace(" by "," xby")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")
        
        s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
        s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
        s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
        s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
        s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
        s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
        s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
        s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
        s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
        s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
        s = s.replace("°"," degrees ")
        s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
        s = s.replace(" v "," volts ")
        s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
        s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
        s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
        
        s = s.replace("toliet","toilet")
        s = s.replace("airconditioner","air conditioner")
        s = s.replace("vinal","vinyl")
        s = s.replace("vynal","vinyl")
        s = s.replace("skill","skil")
        s = s.replace("snowbl","snow bl")
        s = s.replace("plexigla","plexi gla")
        s = s.replace("rustoleum","rust-oleum")
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool ga")
        s = s.replace("whirlpoolstainless","whirlpool stainless")

        s = s.replace("  "," ")
        
        s = s.replace("/", " ")
        s = s.replace("-", " ")
        return " ".join([re.sub('[^A-Za-z0-9-./]', ' ', word)
                         for word in s.split()])
    else:
        return "null"

def stem_sentence(s): 
    s = replaces(s)   
    return str(" ".join([stemmer.stem(word)
                         for word in s.split()
                         if word not in stop_words]))


In [3]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

In [4]:
def word_match_count(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        count += 1 if text.count(word) > 0 else 0
    return count / len(search_term)

In [39]:
def match_word_n_pos(text, search_term):
    text = nltk.pos_tag(replaces(text).split())
    search_pos = nltk.pos_tag(replaces(search_term).split())
    text = ["%s/%s" % (word[0].lower(), word[1]) for word in text]
    search_pos = ["%s/%s" % (word[0].lower(), word[1]) for word in search_pos]
    
    if len(search_pos) == 0:
        return 0
    matches = 0
    for word in set(text):
        if word in search_pos:
            matches += 1
    return matches / len(search_pos)

In [7]:
products = pd.read_csv('product_descriptions.csv', encoding=encoding, index_col=['product_uid'])
products['orig_descr'] = products['product_description']
products['descr'] = products['product_description'].apply(stem_sentence)
attrs = pd.read_csv('attributes.csv')
brand = attrs[attrs['name'] == 'MFG Brand Name']
brand.index = brand['product_uid']
brand = brand['value'].astype(str).apply(stem_sentence)

In [1]:
train = pd.read_csv('train.csv', encoding=encoding, index_col=['id'])
train['orig_search_term'] = train['search_term']
train['orig_product_title'] = train['product_title']
train['descr'] = train.product_uid.map(products['descr'])
train['orig_descr'] = train.product_uid.map(products['orig_descr'])
for col in ['product_title', 'search_term']:
    train[col] = train[col].apply(stem_sentence)

for col in ['product_title', 'descr']:
    train['match_' + col] = train.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

train['len_search_term_words'] = train['search_term'].apply(lambda x: len(x.split()))
train['len_search_term_letters'] = train['search_term'].str.len()
train['complete_match_title'] = \
    train.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   
train['complete_match_descr'] = \
    train.apply(lambda row:1 if row['descr'].find(row['search_term']) >= 0 else 0, axis=1)
train['match_pos_title'] = train.apply(
    lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)
train['match_pos_descr'] = train.apply(
    lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)

train['brand'] = train['product_uid'].map(brand).astype(str)
train['search_term'] = train['search_term'].astype(str)
train['match_brand'] = train.apply(lambda row: word_match_count(row['brand'], row['search_term']), axis=1)
train.to_csv('train2.csv')

NameError: name 'pd' is not defined

In [6]:
train = pd.read_csv('train_processed.csv', index_col=['id'])
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angle bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.5,0.5,2,12,0,0,0.5,0.0,simpson strong-ti,0.0
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,l bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,2,9,0,0,0.0,0.0,simpson strong-ti,0.0
9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck,3.0,deck over,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,behr premium textur deckov innov solid color c...,BEHR Premium Textured DECKOVER is an innovativ...,0.0,0.0,1,4,1,1,0.0,0.5,behr premium textur deckov,0.0
16,100005,delta vero 1-handl shower faucet trim kit chro...,rain shower head,2.33,rain shower head,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,0.333333,0.333333,3,16,0,0,0.333333,0.0,delta,0.0
17,100005,delta vero 1-handl shower faucet trim kit chro...,shower faucet,2.67,shower only faucet,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,1.0,1.0,2,13,1,1,0.666667,0.333333,delta,0.0


In [None]:
test = pd.read_csv('test.csv', encoding=encoding, index_col=['id'])
test['orig_search_term'] = test['search_term']
test['orig_product_title'] = test['product_title']
test['descr'] = test.product_uid.map(products['descr']).astype(str)
test['orig_descr'] = test.product_uid.map(products['orig_descr'])
test['search_term'] = test['search_term'].astype(str)
test['product_title'] = test['product_title'].astype(str)
for col in ['product_title', 'search_term']:
    test[col] = test[col].apply(stem_sentence)

for col in ['product_title', 'descr']:
    test['match_' + col] = test.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

test['len_search_term_words'] = test['search_term'].apply(lambda x: len(x.split()))
test['len_search_term_letters'] = test['search_term'].str.len()
test['complete_match_title'] = \
    test.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   
test['complete_match_descr'] = \
    test.apply(lambda row:1 if row['descr'].find(row['search_term']) >= 0 else 0, axis=1)
test['match_pos_title'] = test.apply(
    lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)
test['match_pos_descr'] = test.apply(
    lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)

In [16]:
test['brand'] = test['product_uid'].map(brand).astype(str)
test['match_brand'] = test.apply(lambda row: word_match_count(row['brand'], row['search_term']), axis=1)
test.to_csv('test_processed.csv')

In [7]:
test = pd.read_csv('test_processed.csv', index_col=['id'])
test.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,90 degree bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.333333,3,16,0,0,0.0,0.0,simpson strong-ti,0.0
4,100001,simpson strong-ti 12-gaug angl,metal l bracket,metal l brackets,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,3,15,0,0,0.0,0.0,simpson strong-ti,0.0
5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,simpson sku able,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,15,0,0,0.333333,0.333333,simpson strong-ti,0.058824
6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,simpson strong ties,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,18,0,0,0.666667,1.0,simpson strong-ti,0.058824
7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,simpson strong tie hcc668,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.25,0.25,4,25,0,0,0.5,1.0,simpson strong-ti,0.058824


In [8]:
cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr']
X = train[cols]
y = train['relevance']

In [20]:
br = BaggingRegressor(lgb.LGBMRegressor(), max_samples=0.1, n_estimators=40)
br.fit(X, y)

X_test = test[cols]
y_pred = br.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48929

In [9]:
# Обвязка для FeatureUnion
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']



class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, train):
        return train[tree_cols].values


class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
    def fit(self, x, y=None):
        return self
    def transform(self, dataset):
        return dataset[self.col].apply(str)



tfidf = TfidfVectorizer(stop_words='english')
tsvd = TruncatedSVD(n_components=25)
gbm = lgb.LGBMRegressor()

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                    ]
            )), 
    ('gbm', gbm)])

In [51]:
clf.fit(train[use_cols], y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(memory=None,
     steps=[('s1', cust_txt_col(col='search_term')), ('tfidf1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float6....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])

In [52]:
y_pred = clf.predict(test[use_cols][:])
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [10]:
clf.get_params()

{'memory': None, 'steps': [('union', FeatureUnion(n_jobs=None,
          transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(memory=None,
        steps=[('s1', cust_txt_col(col='search_term')), ('tfidf1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lo...ncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
          random_state=None, tol=0.0))]))],
          transformer_weights=None)),
  ('gbm',
   LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
          importance_type='split', learning_rate=0.1, max_depth=-1,
          min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
          n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
          random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
          subsample=1.0, subsample_for_bin=200000, subsample_freq=0))], 'union': FeatureUnion(n

In [13]:
params = {
    'gbm__learning_rate': [0.05, 0.1],
    'gbm__n_estimators': [200, 300, 400]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(memory=None,
     steps=[('s1', cust_txt_col(col='search_term')), ('tfidf1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float6....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'gbm__learning_rate': [0.05, 0.1], 'gbm__n_estimators': [200, 300, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

In [14]:
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.039619,0.1,400,"{'gbm__learning_rate': 0.1, 'gbm__n_estimators...",-0.224061,-0.213777,-0.241131,-0.226323,0.011281,1
4,0.062405,0.1,300,"{'gbm__learning_rate': 0.1, 'gbm__n_estimators...",-0.223873,-0.214883,-0.242731,-0.227162,0.011604,2
2,0.011238,0.05,400,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.224448,-0.215748,-0.241955,-0.227384,0.010899,3
3,0.009048,0.1,200,"{'gbm__learning_rate': 0.1, 'gbm__n_estimators...",-0.224385,-0.216939,-0.243309,-0.228211,0.0111,4
1,0.06463,0.05,300,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.225343,-0.217233,-0.24296,-0.228512,0.010739,5


In [15]:
y_pred = model.predict(test[use_cols][:])
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48111

In [23]:
params = {
    'gbm__learning_rate': [0.05, 0.07, 0.1],
    'gbm__n_estimators': [300, 400, 500],
    'union__txt1__tsvd1__n_components': [15],
    'union__txt2__tsvd2__n_components': [15],
    'union__txt3__tsvd3__n_components': [15],
    'union__txt4__tsvd4__n_components': [15]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols][:], y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(memory=None,
     steps=[('s1', cust_txt_col(col='search_term')), ('tfidf1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float6....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'gbm__learning_rate': [0.05, 0.07, 0.1], 'gbm__n_estimators': [300, 400, 500], 'union__txt1__tsvd1__n_components': [15], 'union__txt2__tsvd2__n_components': [15], 'union__txt3__tsvd3__n_components': [15], 'union__txt4__tsvd4__n_components': [15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

In [24]:
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,param_union__txt1__tsvd1__n_components,param_union__txt2__tsvd2__n_components,param_union__txt3__tsvd3__n_components,param_union__txt4__tsvd4__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.011484,0.07,400,15,15,15,15,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.224289,-0.215079,-0.240609,-0.226659,0.010556,1
5,0.030416,0.07,500,15,15,15,15,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.22512,-0.214967,-0.24022,-0.226769,0.010375,2
8,0.024669,0.1,500,15,15,15,15,"{'gbm__learning_rate': 0.1, 'gbm__n_estimators...",-0.224752,-0.215703,-0.241877,-0.227444,0.010854,3
7,0.020821,0.1,400,15,15,15,15,"{'gbm__learning_rate': 0.1, 'gbm__n_estimators...",-0.225535,-0.217081,-0.240346,-0.227654,0.009615,4
2,0.017676,0.05,500,15,15,15,15,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.225012,-0.216101,-0.242173,-0.227762,0.01082,5


In [25]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48232

In [26]:
params = {
    'gbm__learning_rate': [0.07],
    'gbm__n_estimators': [450, 500, 550],
    'union__txt1__tsvd1__n_components': [18],
    'union__txt2__tsvd2__n_components': [18],
    'union__txt3__tsvd3__n_components': [18],
    'union__txt4__tsvd4__n_components': [18]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols][:], y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(memory=None,
     steps=[('s1', cust_txt_col(col='search_term')), ('tfidf1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float6....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'gbm__learning_rate': [0.07], 'gbm__n_estimators': [450, 500, 550], 'union__txt1__tsvd1__n_components': [18], 'union__txt2__tsvd2__n_components': [18], 'union__txt3__tsvd3__n_components': [18], 'union__txt4__tsvd4__n_components': [18]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

In [27]:
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,param_union__txt1__tsvd1__n_components,param_union__txt2__tsvd2__n_components,param_union__txt3__tsvd3__n_components,param_union__txt4__tsvd4__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,0.040481,0.07,500,18,18,18,18,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.222623,-0.214811,-0.239279,-0.225571,0.010204,1
2,0.044157,0.07,550,18,18,18,18,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.223949,-0.213966,-0.240192,-0.226035,0.010808,2
0,0.049255,0.07,450,18,18,18,18,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.223858,-0.214956,-0.240773,-0.226529,0.010708,3


In [28]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48015

In [11]:
def word_match_index(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    min_ind = len(text)
    for word in set(search_term):
        if word in text:
            min_ind = min(min_ind, text.index(word))
    return min_ind / len(text)

In [12]:
train['search_term'] = train['search_term'].apply(str)
train['prod_title_ind'] = train.apply(lambda x: word_match_index(x['search_term'], x['product_title']), axis=1)
train['descr_ind'] = train.apply(lambda x: word_match_index(x['search_term'], x['descr']), axis=1)
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angle bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.5,0.5,2,12,0,0,0.5,0.0,simpson strong-ti,0.0,0.75,0.0
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,l bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,2,9,0,0,0.0,0.0,simpson strong-ti,0.0,1.0,1.0
9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck,3.0,deck over,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,behr premium textur deckov innov solid color c...,BEHR Premium Textured DECKOVER is an innovativ...,0.0,0.0,1,4,1,1,0.0,0.5,behr premium textur deckov,0.0,1.0,1.0
16,100005,delta vero 1-handl shower faucet trim kit chro...,rain shower head,2.33,rain shower head,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,0.333333,0.333333,3,16,0,0,0.333333,0.0,delta,0.0,0.3,0.075758
17,100005,delta vero 1-handl shower faucet trim kit chro...,shower faucet,2.67,shower only faucet,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,1.0,1.0,2,13,1,1,0.666667,0.333333,delta,0.0,0.3,0.075758


In [13]:
# Обвязка для FeatureUnion
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr',
             'prod_title_ind', 'descr_ind']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']



class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, train):
        return train[tree_cols].values


class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
    def fit(self, x, y=None):
        return self
    def transform(self, dataset):
        return dataset[self.col].apply(str)



tfidf = TfidfVectorizer(stop_words='english')
tsvd = TruncatedSVD(n_components=18)
gbm = lgb.LGBMRegressor()

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                    ]
            )), 
    ('gbm', gbm)])

In [34]:
params = {
    'gbm__learning_rate': [0.05, 0.07, 0.1],
    'gbm__n_estimators': [450, 500, 550]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.053544,0.05,550,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.220945,-0.213883,-0.237682,-0.22417,0.00998,1
5,0.017776,0.07,550,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.22206,-0.214188,-0.238463,-0.224904,0.010112,2
1,0.011547,0.05,500,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.22265,-0.213925,-0.238743,-0.225106,0.01028,3
3,0.030457,0.07,450,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.222487,-0.21343,-0.239771,-0.225229,0.010927,4
4,0.01831,0.07,500,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.222807,-0.213934,-0.238969,-0.225237,0.010364,5


In [14]:
test['search_term'] = test['search_term'].apply(str)
test['prod_title_ind'] = test.apply(lambda x: word_match_index(x['search_term'], x['product_title']), axis=1)
test['descr_ind'] = test.apply(lambda x: word_match_index(x['search_term'], x['descr']), axis=1)
test.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,90 degree bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.333333,3,16,0,0,0.0,0.0,simpson strong-ti,0.0,1.0,0.55814
4,100001,simpson strong-ti 12-gaug angl,metal l bracket,metal l brackets,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,3,15,0,0,0.0,0.0,simpson strong-ti,0.0,1.0,1.0
5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,simpson sku able,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,15,0,0,0.333333,0.333333,simpson strong-ti,0.058824,0.0,0.104651
6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,simpson strong ties,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,18,0,0,0.666667,1.0,simpson strong-ti,0.058824,0.0,0.104651
7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,simpson strong tie hcc668,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.25,0.25,4,25,0,0,0.5,1.0,simpson strong-ti,0.058824,0.0,0.104651


In [36]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47894

In [38]:
train['brand'] = train['brand'].apply(str)
train['match_brand'] = train.apply(lambda row: word_match_count(row['brand'], row['search_term']), axis=1)
params = {
    'gbm__learning_rate': [0.05, 0.07],
    'gbm__n_estimators': [500, 550]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,0.083328,0.07,550,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.221026,-0.213931,-0.238266,-0.224408,0.010219,1
2,0.033788,0.07,500,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.222234,-0.213949,-0.237767,-0.22465,0.009873,2
1,0.016607,0.05,550,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.222216,-0.213646,-0.238101,-0.224654,0.010131,3
0,0.05462,0.05,500,"{'gbm__learning_rate': 0.05, 'gbm__n_estimator...",-0.222154,-0.214268,-0.240048,-0.22549,0.010786,4


In [40]:
params = {
    'gbm__learning_rate': [0.06, 0.07, 0.08],
    'gbm__n_estimators': [500, 550, 600]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.062972,0.06,600,"{'gbm__learning_rate': 0.06, 'gbm__n_estimator...",-0.222682,-0.21275,-0.237397,-0.224277,0.010125,1
7,0.031326,0.08,550,"{'gbm__learning_rate': 0.08, 'gbm__n_estimator...",-0.221876,-0.21389,-0.237226,-0.224331,0.009683,2
6,0.046469,0.08,500,"{'gbm__learning_rate': 0.08, 'gbm__n_estimator...",-0.222018,-0.213549,-0.237538,-0.224368,0.009934,3
1,0.064603,0.06,550,"{'gbm__learning_rate': 0.06, 'gbm__n_estimator...",-0.221873,-0.213015,-0.238575,-0.224488,0.010597,4
4,0.022276,0.07,550,"{'gbm__learning_rate': 0.07, 'gbm__n_estimator...",-0.22123,-0.213778,-0.238482,-0.224497,0.010346,5


In [41]:
test['brand'] = test['brand'].apply(str)
test['match_brand'] = test.apply(lambda row: word_match_count(row['brand'], row['search_term']), axis=1)
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47864

In [10]:
params = {
    'gbm__learning_rate': [0.06],
    'gbm__n_estimators': [600],
    'gbm__num_leaves': [31, 40, 50],
    'gbm__colsample_bytree': [0.8, 1.]
    
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.036648,0.8,0.06,600,50,"{'gbm__colsample_bytree': 0.8, 'gbm__learning_...",-0.220665,-0.211912,-0.238434,-0.22367,0.011034,1
4,0.06211,1.0,0.06,600,40,"{'gbm__colsample_bytree': 1.0, 'gbm__learning_...",-0.222417,-0.212668,-0.238081,-0.224389,0.010468,2
5,0.040513,1.0,0.06,600,50,"{'gbm__colsample_bytree': 1.0, 'gbm__learning_...",-0.221894,-0.212823,-0.23866,-0.224459,0.010703,3
1,0.040047,0.8,0.06,600,40,"{'gbm__colsample_bytree': 0.8, 'gbm__learning_...",-0.221908,-0.213617,-0.23829,-0.224605,0.010252,4
3,0.022388,1.0,0.06,600,31,"{'gbm__colsample_bytree': 1.0, 'gbm__learning_...",-0.222465,-0.213042,-0.239621,-0.225042,0.011003,5


In [50]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47843

In [22]:
params = {
    'gbm__learning_rate': [0.06],
    'gbm__n_estimators': [600],
    'gbm__boosting_type': ['rf'],
    'gbm__bagging_freq': [1],
    'gbm__bagging_fraction': [0.8],
    'gbm__feature_fraction' : [0.7]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__bagging_fraction,param_gbm__bagging_freq,param_gbm__boosting_type,param_gbm__feature_fraction,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.054577,0.8,1,rf,0.7,0.06,600,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.237328,-0.229481,-0.255629,-0.240813,0.010956,1


In [23]:
params = {
    'gbm__learning_rate': [0.05, 0.07],
    'gbm__n_estimators': [500, 600],
    'gbm__boosting_type': ['rf'],
    'gbm__bagging_freq': [1, 2],
    'gbm__bagging_fraction': [0.8],
    'gbm__feature_fraction' : [0.8]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__bagging_fraction,param_gbm__bagging_freq,param_gbm__boosting_type,param_gbm__feature_fraction,param_gbm__learning_rate,param_gbm__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.082362,0.8,2,rf,0.8,0.05,500,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.237202,-0.229403,-0.255029,-0.240545,0.010725,1
5,0.032558,0.8,2,rf,0.8,0.05,600,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.237652,-0.229307,-0.254713,-0.240557,0.010574,2
3,0.017711,0.8,1,rf,0.8,0.07,600,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.23726,-0.229452,-0.255271,-0.240661,0.010811,3
1,0.045032,0.8,1,rf,0.8,0.05,600,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.237244,-0.229453,-0.255331,-0.240676,0.01084,4
7,0.050543,0.8,2,rf,0.8,0.07,600,"{'gbm__bagging_fraction': 0.8, 'gbm__bagging_f...",-0.237451,-0.229266,-0.255386,-0.240701,0.010908,5


In [24]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48978

In [15]:
params = {
    'gbm__learning_rate': [0.04, 0.06],
    'gbm__n_estimators': [600],
    'gbm__num_leaves': [50, 60],
    'gbm__colsample_bytree': [0.7, 0.9]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.031582,0.7,0.06,600,50,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.219796,-0.212528,-0.236325,-0.222883,0.009957,1
3,0.023708,0.7,0.06,600,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.220758,-0.211899,-0.236235,-0.222964,0.010057,2
1,0.019798,0.7,0.04,600,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.22007,-0.211971,-0.237386,-0.223142,0.010601,3
6,0.022488,0.9,0.06,600,50,"{'gbm__colsample_bytree': 0.9, 'gbm__learning_...",-0.220494,-0.212264,-0.237102,-0.223287,0.010331,4
5,0.02922,0.9,0.04,600,60,"{'gbm__colsample_bytree': 0.9, 'gbm__learning_...",-0.221223,-0.211994,-0.236923,-0.22338,0.010291,5


In [16]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47891

In [20]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_)
print(use_cols)

[371 226 182 585  47  18 315  65 297 555 497 583 556 486 450 438 531 457
 409 482 490 462 430 528 523 541 513 539 601 413 355 386 388 310 367 322
 402 320 338 419 336 410 394 392 384 412 426 483 389 378 359 359 379 405
 396 362 375 356 402 404 415 436 381 465 425 240 187 177 158 245 165 179
 168 182 173 162 202 173 176 156 198 206 133]
['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters', 'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind', 'search_term', 'product_title', 'descr', 'brand']


In [21]:
params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [600],
    'gbm__num_leaves': [50],
    'gbm__colsample_bytree': [0.7],
    'union__txt1__tsvd1__n_components': [30],
    'union__txt2__tsvd2__n_components': [30],
    'union__txt3__tsvd3__n_components': [30],
    'union__txt4__tsvd4__n_components': [30]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,param_union__txt1__tsvd1__n_components,param_union__txt2__tsvd2__n_components,param_union__txt3__tsvd3__n_components,param_union__txt4__tsvd4__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.03503,0.7,0.05,600,50,30,30,30,30,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.21917,-0.210504,-0.236751,-0.222142,0.010919,1


In [22]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47741

In [27]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_[:12],
      model.best_estimator_.named_steps['gbm'].feature_importances_[12:42],
      model.best_estimator_.named_steps['gbm'].feature_importances_[42:72],
      model.best_estimator_.named_steps['gbm'].feature_importances_[72:102],
      model.best_estimator_.named_steps['gbm'].feature_importances_[102:],
      sep='\n'
     )
print(use_cols)

[324 214 152 546  39  13 286  65 162 414 365 419]
[410 305 277 309 273 306 279 299 314 291 319 288 332 366 277 304 289 260
 336 284 296 260 361 358 298 356 349 269 358 277]
[223 235 217 225 195 231 256 163 215 242 215 231 206 240 218 271 221 211
 211 248 227 244 243 237 254 246 263 293 237 342]
[260 269 236 278 208 222 245 228 212 200 209 209 221 223 207 244 249 273
 269 275 298 250 249 260 249 233 289 255 254 163]
[113 127  94 133  85 108  80 123  87 111  82  97  74  89  68  96 101  88
  90 114 112  90  87  88  91  80 108  75 113]
['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters', 'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind', 'search_term', 'product_title', 'descr', 'brand']


In [28]:
params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [600],
    'gbm__num_leaves': [50],
    'gbm__colsample_bytree': [0.7],
    'union__txt1__tsvd1__n_components': [50],
    'union__txt2__tsvd2__n_components': [50],
    'union__txt3__tsvd3__n_components': [40],
    'union__txt4__tsvd4__n_components': [35]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,param_union__txt1__tsvd1__n_components,param_union__txt2__tsvd2__n_components,param_union__txt3__tsvd3__n_components,param_union__txt4__tsvd4__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.047816,0.7,0.05,600,50,50,50,40,35,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.217159,-0.2092,-0.236125,-0.220828,0.011294,1


In [29]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_[:12],
      model.best_estimator_.named_steps['gbm'].feature_importances_[12:62],
      model.best_estimator_.named_steps['gbm'].feature_importances_[62:102],
      model.best_estimator_.named_steps['gbm'].feature_importances_[102:142],
      model.best_estimator_.named_steps['gbm'].feature_importances_[142:],
      sep='\n'
     )
print(use_cols)

[289 185 140 407  42  11 208  40 145 352 250 291]
[270 236 202 189 219 202 181 175 216 188 207 154 218 206 165 223 165 156
 180 198 195 177 163 204 207 179 185 191 165 201 170 243 224 214 221 233
 188 207 223 235 209 237 222 245 250 236 238 246 250 238]
[174 143 154 148 152 137 152 126 138 156 139 157 154 170 134 135 167 172
 139 146 151 159 142 155 133 119 165 142 154 180 159 140 194 183 151 158
 154 157 179 166]
[200 143 159 134 138 187 187 181 164 253 140 179 152 199 129 158 182 141
 148 162 187 198 160 182 158 177 166 169 170 179 152 165 163 162 171 144
 159 177 142 190]
[142 145 176 169 192 162 161 170 164 121  82  52  64 105  62  54  76  69
  52  56  69  68  50  58  54  61  68  53  68  90  57  68  61  76  68  54
  77  63  58  78  97  50  69  64]
['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters', 'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind', 'search_term', 

In [30]:
params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [600],
    'gbm__num_leaves': [50],
    'gbm__colsample_bytree': [0.7],
    'union__txt1__tsvd1__n_components': [80],
    'union__txt2__tsvd2__n_components': [80],
    'union__txt3__tsvd3__n_components': [60],
    'union__txt4__tsvd4__n_components': [35]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,param_union__txt1__tsvd1__n_components,param_union__txt2__tsvd2__n_components,param_union__txt3__tsvd3__n_components,param_union__txt4__tsvd4__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.040601,0.7,0.05,600,50,80,80,60,35,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.217119,-0.208772,-0.234422,-0.220104,0.010682,1


In [32]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_[:12],
      model.best_estimator_.named_steps['gbm'].feature_importances_[12:92],
      model.best_estimator_.named_steps['gbm'].feature_importances_[92:172],
      model.best_estimator_.named_steps['gbm'].feature_importances_[172:232],
      model.best_estimator_.named_steps['gbm'].feature_importances_[232:],
      sep='\n'
     )
print(use_cols)

[238 138 125 323  35   5 186  33 113 294 222 241]
[159 151 125 108 134 127 133 125 154 130 124  95 124 137 115 103 131 131
 150 116 144 114  97  98 167 132 102 108 105 118 122 166 149 144 114 131
 112 151 141 149 101 115 126 120 107 124 176 117 110 139 103 109 116 130
 176 147 127 117 147 168 119 138 137 128 160 152 145 157 148 106 112 121
 180 127 137 132 152 169 139 157]
[ 94 118 108  85 133  78 126  89  87 141 106  79  87 120 109 108 107 108
  83  85 103 101  88  86  79  71 103 117 113  85 106 108  99 111 115 118
 110  96 101  95  94 104  98  86  85 112  90  92 103  91 116 103  90  84
 127 127 155 110 138 115 106 117 112  93 113 115 117 133 142  89 122 124
 115 110 128 112 114 123 111 188]
[132 115  75 119 112 106 131 100  99 102 102 106 122 115 105 123 104 114
 118  86 107 120 119  96 128  90 122 112 104  90  95 109 103  95 105 103
  99 101 107  90 124 117 120 120 100 133 108 132 109 152  92 107 106 146
 127 137 134 106 133 119]
[44 45 66 84 36 58 45 44 41 46 73 41 54 41 47 48 33 4

In [61]:
# Обвязка для FeatureUnion
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']


tfidf = TfidfVectorizer(stop_words='english')
tsvd80 = TruncatedSVD(n_components=80)
tsvd80_2 = TruncatedSVD(n_components=80)
tsvd60 = TruncatedSVD(n_components=60)
tsvd35 = TruncatedSVD(n_components=30)
gbm = lgb.LGBMRegressor()

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd80)])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd80_2)])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')), ('tfidf3', tfidf), ('tsvd3', tsvd60)])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd35)]))
                    ]
            )), 
    ('gbm', gbm)])

In [34]:
params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [600, 700],
    'gbm__num_leaves': [50, 60],
    'gbm__colsample_bytree': [0.7]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,0.022706,0.7,0.05,700,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.217018,-0.206555,-0.234525,-0.219366,0.011539,1
1,0.042269,0.7,0.05,600,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.216747,-0.20751,-0.234539,-0.219599,0.011217,2
2,0.030646,0.7,0.05,700,50,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.21671,-0.208067,-0.235516,-0.220097,0.011459,3
0,0.036436,0.7,0.05,600,50,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.21684,-0.207697,-0.235915,-0.220151,0.011755,4


In [37]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_[:9],
      model.best_estimator_.named_steps['gbm'].feature_importances_[9:89],
      model.best_estimator_.named_steps['gbm'].feature_importances_[89:169],
      model.best_estimator_.named_steps['gbm'].feature_importances_[169:229],
      model.best_estimator_.named_steps['gbm'].feature_importances_[229:],
      sep='\n'
     )
print(use_cols)

[270 164 135 378 241  44 145 376 310]
[280 246 220 164 169 166 155 155 158 191 145 164 147 183 208 125 144 155
 167 162 171 141 135 166 147 200 159 156 146 207 175 167 199 160 165 164
 183 184 166 168 164 149 168 165 177 172 164 181 169 170 148 160 197 171
 205 210 193 176 166 220 184 204 169 180 210 196 182 244 178 195 210 182
 192 191 178 206 217 221 202 231]
[227 185 156 140 147 158 145 154 141 162 160 167 154 132 116 175 145 171
 153 149 147 148 125 132 166 166 125 158 176 149 154 165 137 146 153 174
 155 193 146 151 156 154 158 145 127 157 156 157 150 167 133 154 148 116
 174 148 168 179 215 202 172 153 161 168 146 180 174 179 155 131 181 163
 178 178 189 172 216 172 161 183]
[263 173 171 154 158 148 135 175 142 149 146 137 182 143 165 168 163 146
 191 139 147 167 178 186 152 172 142 145 183 141 167 144 145 150 143 167
 146 155 183 153 154 143 225 143 199 188 189 202 186 137 159 174 194 163
 178 180 154 167 174 214]
[164  87  70  81 114  70  62  72  66  58  80  56  68  58  63  72 

In [36]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47719

In [38]:
train['pred_diff'] = (train['relevance'] - model.predict(train[use_cols])).abs()
train.sort_values(['pred_diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind,pred_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.00,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.000000,1.000000,...,12,1,1,1.000000,3.000000,,0.000000,0.714286,0.092593,1.625023
156918,160006,werner 14 ft. fiberglass round rung straight l...,14 ft ladder,1.00,14 ft ladder,Werner 14 ft. Fiberglass Round Rung Straight L...,7114-1 one-sect round rung 14 ft. straight lad...,The 7114-1 one-section round rung 14 ft. Strai...,0.666667,0.666667,...,12,0,0,1.000000,1.000000,,0.000000,0.062500,0.064516,1.462272
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.00,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.000000,0.500000,...,11,1,0,1.000000,0.500000,,0.000000,0.000000,0.440367,1.448728
69064,119252,big ass fan 3600 144 in. yellow silver aluminu...,big ass,1.00,big ass,Big Ass Fans 3600 144 in. Yellow and Silver Al...,want air movement without noise. got it. big a...,Want air movement without noise. You've got it...,1.000000,1.000000,...,7,1,1,1.000000,1.000000,,0.000000,0.000000,0.052632,1.422245
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.00,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,bird stop blue ceram wild bird feeder featur b...,The Bird Stop Blue Ceramic Wild Bird Feeder fe...,1.000000,1.000000,...,9,1,1,1.000000,1.000000,heath,0.000000,0.125000,0.000000,1.411231
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.00,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.000000,0.000000,...,11,1,0,1.000000,0.000000,,0.000000,0.750000,1.000000,1.409618
104722,133520,danco low lead 1a-3c stem crane,101-1h crane,1.00,101-1h for crane,DANCO Low Lead 1A-3C Stem for Crane,repair leaki faucet easi inexpens altern repla...,Repairing a leaky faucet is an easy and inexpe...,0.500000,0.500000,...,12,0,0,0.500000,0.250000,,0.000000,0.833333,0.514286,1.395762
176724,172318,green matter 3-light mahogani bronz vaniti fixtur,bronz green,1.00,bronze green,Green Matters 3-Light Mahogany Bronze Vanity F...,"cornerston collect quality, incandesc vaniti e...","The cornerstone of this collection is quality,...",1.000000,0.500000,...,11,0,0,0.500000,1.000000,green matter,0.083333,0.000000,0.109756,1.382297
109498,135608,gilbert &amp; bennett 42 in. galvan steel toma...,tomato plant,1.00,tomato plants,Gilbert &amp; Bennett 42 in. Galvanized Steel ...,help tomato plant grow lush tall 42 in. tomato...,Help your tomato plant grow lush and tall with...,0.500000,1.000000,...,12,0,1,0.500000,1.000000,gilbert & bennett,0.000000,0.777778,0.023810,1.363714
17236,103041,ortho home defens max 1.33 gal. perimet indoor...,ant killer,1.00,ant killer,Ortho Home Defense Max 1.33 Gal. Perimeter and...,ortho home defens max 1.33 gal. ready-to-us pe...,The Ortho Home Defense Max 1.33 Gal. Ready-to-...,0.500000,0.500000,...,10,0,0,0.500000,0.500000,ortho,0.000000,0.818182,0.121951,1.348652


In [40]:
train['match_pos_title'] = train.apply(
    lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)
train['match_pos_descr'] = train.apply(
    lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)
train['brand'].fillna('', inplace=True)

In [43]:
params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [700],
    'gbm__num_leaves': [60],
    'gbm__colsample_bytree': [0.7]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031314,0.7,0.05,700,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.216036,-0.207662,-0.235604,-0.219767,0.011709,1


In [None]:
test['match_pos_title'] = test.apply(
    lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)
test['match_pos_descr'] = test.apply(
    lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)

In [45]:
test['brand'].fillna('', inplace=True)
train.to_csv('train_processed.csv')
test.to_csv('test_processed.csv')

In [47]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47696

In [46]:
def word_match_std(search_term, text):
    indexes = []
    search_term = search_term.split()
    text = text.split()    
    n = len(text)
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        if word in text:
            indexes.append(text.index(word))
    if len(indexes) > 0:
        return np.var(indexes)
    else:
        return 0

In [48]:
train['words_std_title'] = train.apply(
    lambda x: match_word_n_pos(x['search_term'], x['product_title']), axis=1)
train['words_std_descr'] = train.apply(
    lambda x: match_word_n_pos(x['search_term'], x['descr']), axis=1)

In [51]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd80)])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd80)])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')), ('tfidf3', tfidf), ('tsvd3', tsvd60)])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd35)]))
                    ]
            )), 
    ('gbm', gbm)])

params = {
    'gbm__learning_rate': [0.05],
    'gbm__n_estimators': [700],
    'gbm__num_leaves': [60],
    'gbm__colsample_bytree': [0.7]
}
model = GridSearchCV(clf, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False)
model.fit(train[use_cols], y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_gbm__colsample_bytree,param_gbm__learning_rate,param_gbm__n_estimators,param_gbm__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022354,0.7,0.05,700,60,"{'gbm__colsample_bytree': 0.7, 'gbm__learning_...",-0.215824,-0.206791,-0.236066,-0.21956,0.01224,1


In [52]:
print(model.best_estimator_.named_steps['gbm'].feature_importances_[:11],
      model.best_estimator_.named_steps['gbm'].feature_importances_[11:91],
      model.best_estimator_.named_steps['gbm'].feature_importances_[91:171],
      model.best_estimator_.named_steps['gbm'].feature_importances_[171:231],
      model.best_estimator_.named_steps['gbm'].feature_importances_[231:],
      sep='\n'
     )
print(use_cols)

[246 170 129 367 197  34 148 364 277 293 203]
[268 214 182 145 143 150 160 177 147 204 184 176 156 175 206 168 137 151
 173 164 177 159 176 161 139 181 165 157 154 179 154 176 158 204 172 152
 160 182 194 154 161 158 187 157 190 174 189 169 190 152 206 178 191 174
 189 227 190 179 187 197 180 232 217 202 188 214 160 180 163 198 208 187
 178 196 223 206 172 215 212 215]
[215 168 155 140 135 144 121 163 130 138 153 132 128 138 165 188 125 160
 145 136 156 152 121 148 138 132 147 167 146 131 171 147 165 131 174 152
 161 158 138 152 140 134 135 149 141 155 157 158 137 166 132 134 142 141
 131 137 138 165 197 161 214 167 179 175 136 155 169 175 166 151 160 164
 190 185 167 196 206 141 171 155]
[276 151 175 122 139 140 139 185 137 154 147 154 163 155 165 136 145 157
 159 179 139 158 172 155 190 162 152 149 152 181 160 170 149 168 138 182
 147 158 133 150 164 117 157 155 178 156 214 198 156 177 186 160 176 175
 193 135 187 198 204 188]
[ 81  63  78 127  86  80  74  78  76  68  83  66  77  66 

In [53]:
test['words_std_title'] = test.apply(
    lambda x: match_word_n_pos(x['search_term'], x['product_title']), axis=1)
test['words_std_descr'] = test.apply(
    lambda x: match_word_n_pos(x['search_term'], x['descr']), axis=1)
train.to_csv('train_processed.csv')
test.to_csv('test_processed.csv')

In [54]:
y_pred = model.predict(test[use_cols])
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47588

It was a mistake to fit the TfIDf and SVD every time

In [77]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=30))]))
                    ]
            ))])

clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])

In [81]:
gbm = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=600, num_leaves=60, colsample_bytree=0.7)
gbm.fit(X_train, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=600, n_jobs=-1, num_leaves=60, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [82]:
X_test = clf.transform(test[use_cols])

In [85]:
y_pred = gbm.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47543

In [88]:
params = {
    'learning_rate': [0.04, 0.05],
    'n_estimators': [600, 700],
    'num_leaves': [60, 70]
}
model = GridSearchCV(gbm, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] learning_rate=0.04, n_estimators=600, num_leaves=60 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.04, n_estimators=600, num_leaves=60, total=  13.4s
[CV] learning_rate=0.04, n_estimators=600, num_leaves=60 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.4s remaining:    0.0s


[CV]  learning_rate=0.04, n_estimators=600, num_leaves=60, total=  13.4s
[CV] learning_rate=0.04, n_estimators=600, num_leaves=60 .............
[CV]  learning_rate=0.04, n_estimators=600, num_leaves=60, total=  13.2s
[CV] learning_rate=0.04, n_estimators=600, num_leaves=70 .............
[CV]  learning_rate=0.04, n_estimators=600, num_leaves=70, total=  15.3s
[CV] learning_rate=0.04, n_estimators=600, num_leaves=70 .............
[CV]  learning_rate=0.04, n_estimators=600, num_leaves=70, total=  15.0s
[CV] learning_rate=0.04, n_estimators=600, num_leaves=70 .............
[CV]  learning_rate=0.04, n_estimators=600, num_leaves=70, total=  14.9s
[CV] learning_rate=0.04, n_estimators=700, num_leaves=60 .............
[CV]  learning_rate=0.04, n_estimators=700, num_leaves=60, total=  15.3s
[CV] learning_rate=0.04, n_estimators=700, num_leaves=60 .............
[CV]  learning_rate=0.04, n_estimators=700, num_leaves=60, total=  15.4s
[CV] learning_rate=0.04, n_estimators=700, num_leaves=60 ......

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  6.0min finished


Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.001156,0.04,700,60,"{'learning_rate': 0.04, 'n_estimators': 700, '...",-0.213699,-0.204396,-0.232376,-0.216823,0.011634,1
3,0.000475,0.04,700,70,"{'learning_rate': 0.04, 'n_estimators': 700, '...",-0.213289,-0.204236,-0.233006,-0.216844,0.012011,2
7,0.002203,0.05,700,70,"{'learning_rate': 0.05, 'n_estimators': 700, '...",-0.214312,-0.205031,-0.232094,-0.217146,0.011228,3
6,0.001235,0.05,700,60,"{'learning_rate': 0.05, 'n_estimators': 700, '...",-0.214166,-0.204745,-0.232549,-0.217154,0.011546,4
0,0.001697,0.04,600,60,"{'learning_rate': 0.04, 'n_estimators': 600, '...",-0.213928,-0.204712,-0.232919,-0.217186,0.011744,5


In [89]:
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47476

In [90]:
train['pred_diff'] = (train['relevance'] - model.predict(X_train)).abs()
train.sort_values(['pred_diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,complete_match_descr,match_pos_title,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind,pred_diff,words_std_title,words_std_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.00,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.000000,0.500000,...,0,1.000000,0.500000,,0.000000,0.000000,0.440367,1.491619,0.333333,0.000000
156918,160006,werner 14 ft. fiberglass round rung straight l...,14 ft ladder,1.00,14 ft ladder,Werner 14 ft. Fiberglass Round Rung Straight L...,7114-1 one-sect round rung 14 ft. straight lad...,The 7114-1 one-section round rung 14 ft. Strai...,0.666667,0.666667,...,0,1.000000,1.000000,,0.000000,0.062500,0.064516,1.456519,0.071429,0.030303
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.00,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.000000,1.000000,...,1,1.000000,1.000000,,0.000000,0.714286,0.092593,1.451396,0.000000,0.037037
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.00,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.000000,0.000000,...,0,1.000000,0.000000,,0.000000,0.750000,1.000000,1.434772,0.250000,0.000000
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.00,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.500000,1.000000,...,0,0.500000,1.000000,sure comfort,0.000000,0.769231,0.027778,1.400278,0.153846,0.021978
85851,125582,oakland live 26 in. metal grape tabl plant stand,grape plant,1.00,grape plant,Oakland Living 26 in. Metal Grape Table Plant ...,oakland live 26 in. metal grape tabl plant sta...,The Oakland Living 26 in. Metal Grape Table Pl...,1.000000,1.000000,...,0,1.000000,1.000000,,0.000000,0.555556,0.084746,1.395766,0.250000,0.034483
104722,133520,danco low lead 1a-3c stem crane,101-1h crane,1.00,101-1h for crane,DANCO Low Lead 1A-3C Stem for Crane,repair leaki faucet easi inexpens altern repla...,Repairing a leaky faucet is an easy and inexpe...,0.500000,0.500000,...,0,0.500000,0.250000,,0.000000,0.833333,0.514286,1.355375,0.142857,0.027027
176724,172318,green matter 3-light mahogani bronz vaniti fixtur,bronz green,1.00,bronze green,Green Matters 3-Light Mahogany Bronze Vanity F...,"cornerston collect quality, incandesc vaniti e...","The cornerstone of this collection is quality,...",1.000000,0.500000,...,0,0.500000,0.500000,green matter,0.083333,0.000000,0.109756,1.338270,0.000000,0.009259
17236,103041,ortho home defens max 1.33 gal. perimet indoor...,ant killer,1.00,ant killer,Ortho Home Defense Max 1.33 Gal. Perimeter and...,ortho home defens max 1.33 gal. ready-to-us pe...,The Ortho Home Defense Max 1.33 Gal. Ready-to-...,0.500000,0.500000,...,0,0.500000,0.500000,ortho,0.000000,0.818182,0.121951,1.320465,0.100000,0.011628
198022,187480,alpin 15 in. larg red bowl plastic planter,alpin larg,1.00,alpine large,Alpine 15 in. Large Red Bowl Plastic Planter,bowl planter perfect patio decks. avail variet...,These bowl planters are perfect for patios and...,1.000000,0.000000,...,0,1.000000,0.000000,alpin,0.200000,0.000000,1.000000,1.297985,0.000000,0.000000


In [93]:
model.best_estimator_.feature_importances_

array([317, 176, 158, 429, 248,  42, 178, 488, 316, 320, 221, 294, 242,
       225, 224, 199, 215, 198, 214, 179, 235, 207, 189, 184, 233, 230,
       206, 173, 169, 190, 206, 172, 198, 185, 199, 177, 212, 187, 167,
       170, 203, 181, 178, 242, 219, 200, 167, 196, 200, 229, 205, 190,
       197, 190, 212, 214, 194, 209, 179, 196, 226, 210, 188, 202, 235,
       230, 254, 217, 181, 230, 233, 233, 229, 186, 212, 238, 207, 219,
       194, 208, 252, 223, 257, 220, 246, 258, 235, 254, 205, 229, 232,
       231, 185, 202, 187, 161, 171, 171, 201, 141, 152, 193, 145, 183,
       130, 178, 174, 166, 163, 179, 148, 171, 179, 160, 178, 179, 169,
       139, 184, 191, 158, 180, 163, 184, 156, 195, 172, 178, 192, 162,
       190, 194, 174, 193, 174, 163, 162, 171, 205, 172, 185, 177, 165,
       162, 172, 165, 170, 203, 192, 261, 187, 177, 224, 188, 168, 203,
       174, 193, 249, 195, 192, 209, 186, 215, 212, 206, 195, 203, 205,
       208, 221, 293, 166, 211, 167, 187, 182, 173, 214, 175, 15

In [102]:
@np.vectorize
def word_match_diff(search_term, text):
    indexes = []
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        if word in text:
            indexes.append(text.index(word))
    if len(indexes) >= 2:
        return max(indexes) - min(indexes)
    elif len(indexes) == 1:
        return 1
    else:
        return 0

In [116]:
train = pd.read_csv('train_processed.csv', index_col=['id'])
train['search_term'] = train['search_term'].apply(str)
train['match_diff_title'] = word_match_diff(train['search_term'], train['product_title'])
train['match_diff_descr'] = word_match_diff(train['search_term'], train['descr'])

In [117]:
test = pd.read_csv('test_processed.csv', index_col=['id'])
test['search_term'] = test['search_term'].apply(str)
test['match_diff_title'] = word_match_diff(test['search_term'], test['product_title'])
test['match_diff_descr'] = word_match_diff(test['search_term'], test['descr'])

In [124]:
@np.vectorize
def word_match_count(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        count += 1 if text.count(word) > 0 else 0
    return count / len(search_term)

In [134]:
train['match_product_title_orig'] = word_match_count(train['orig_search_term'].str.lower(),
                                                     train['orig_product_title'].str.lower())
train['match_descr_orig'] = word_match_count(train['orig_search_term'].str.lower(),
                                             train['orig_descr'].str.lower())

In [135]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'match_diff_title', 'match_diff_descr', 'match_product_title_orig', 'match_descr_orig']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=30))]))
                    ]
            ))])

clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])

In [136]:
gbm = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=700, num_leaves=60, colsample_bytree=0.7)
params = {
    'n_estimators': [700]
}
model = GridSearchCV(gbm, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=700 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................. n_estimators=700, total=  15.2s
[CV] n_estimators=700 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.2s remaining:    0.0s


[CV] ................................. n_estimators=700, total=  14.9s
[CV] n_estimators=700 ................................................
[CV] ................................. n_estimators=700, total=  14.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   44.8s finished


Unnamed: 0,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000891,700,{'n_estimators': 700},-0.214023,-0.204974,-0.233179,-0.217392,0.011759,1


In [137]:
model.best_estimator_.feature_importances_

array([267, 126, 127, 385, 213,  46, 142, 349, 288,  63, 140, 151, 128,
       293, 202, 193, 152, 151, 167, 159, 131, 156, 195, 186, 176, 164,
       182, 179, 126, 160, 146, 179, 140, 155, 159, 188, 155, 149, 202,
       191, 157, 155, 166, 144, 153, 196, 178, 176, 160, 183, 188, 201,
       149, 154, 169, 149, 192, 154, 152, 189, 164, 162, 164, 172, 165,
       127, 186, 171, 175, 176, 208, 181, 199, 162, 189, 214, 178, 180,
       153, 168, 200, 188, 200, 190, 194, 202, 241, 208, 192, 207, 182,
       177, 187, 222, 189, 168, 123, 166, 153, 146, 179, 133, 133, 190,
       128, 139, 108, 126, 153, 126, 154, 158, 154, 157, 156, 147, 170,
       159, 163, 127, 186, 166, 134, 183, 151, 153, 141, 157, 164, 143,
       155, 159, 133, 172, 138, 148, 169, 165, 180, 164, 122, 147, 170,
       136, 157, 174, 177, 155, 152, 172, 147, 159, 158, 168, 201, 214,
       149, 182, 221, 154, 190, 156, 165, 198, 192, 158, 184, 156, 151,
       178, 146, 183, 172, 256, 128, 133, 159, 162, 157, 162, 14

In [138]:
test['match_product_title_orig'] = word_match_count(test['orig_search_term'].str.lower(),
                                                    test['orig_product_title'].str.lower())
test['match_descr_orig'] = word_match_count(test['orig_search_term'].str.lower(),
                                            test['orig_descr'].str.lower())
X_test = clf.transform(test[use_cols])

In [139]:
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
y_pred[y_pred<1.] = 1.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [140]:
train['pred_diff'] = (train['relevance'] - model.predict(X_train)).abs()
train.sort_values(['pred_diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind,pred_diff,match_diff_title,match_diff_descr,match_product_title_orig,match_descr_orig
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
156918,160006,werner 14 ft. fiberglass round rung straight l...,14 ft ladder,1.00,14 ft ladder,Werner 14 ft. Fiberglass Round Rung Straight L...,7114-1 one-sect round rung 14 ft. straight lad...,The 7114-1 one-section round rung 14 ft. Strai...,0.666667,0.666667,...,1.000000,,0.000000,0.062500,0.064516,1.469848,6,3,0.666667,0.666667
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.00,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.000000,1.000000,...,1.000000,,0.000000,0.714286,0.092593,1.374321,1,8,1.000000,1.000000
83392,124633,rubbermaid fasttrack garag cooler hook,rubbermaid cooler,1.33,rubbermaid cooler,Rubbermaid FastTrack Garage Cooler Hook,"part fasttrack system, cooler hook perfect sol...","Part of the FastTrack system, the Cooler Hook ...",1.000000,0.500000,...,0.500000,,0.000000,0.000000,0.051724,1.348056,3,1,1.000000,0.500000
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.00,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.000000,0.000000,...,0.000000,,0.000000,0.750000,1.000000,1.343553,1,0,1.000000,0.000000
176724,172318,green matter 3-light mahogani bronz vaniti fixtur,bronz green,1.00,bronze green,Green Matters 3-Light Mahogany Bronze Vanity F...,"cornerston collect quality, incandesc vaniti e...","The cornerstone of this collection is quality,...",1.000000,0.500000,...,0.500000,green matter,0.083333,0.000000,0.109756,1.318144,4,1,1.000000,0.500000
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.00,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.000000,0.500000,...,0.500000,,0.000000,0.000000,0.440367,1.297241,1,1,1.000000,0.000000
98819,130929,"prime-lin slide door tandem roller assembly, 1...",roller bear,1.00,roller bearings,Prime-Line Sliding Door Tandem Roller Assembly...,patio door roller construct steel tandem rolle...,This patio door roller is constructed from ste...,0.500000,1.000000,...,0.500000,,0.000000,0.307692,0.057143,1.294766,1,24,0.500000,0.500000
17236,103041,ortho home defens max 1.33 gal. perimet indoor...,ant killer,1.00,ant killer,Ortho Home Defense Max 1.33 Gal. Perimeter and...,ortho home defens max 1.33 gal. ready-to-us pe...,The Ortho Home Defense Max 1.33 Gal. Ready-to-...,0.500000,0.500000,...,0.500000,ortho,0.000000,0.818182,0.121951,1.287250,1,1,0.500000,0.500000
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.00,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.500000,1.000000,...,1.000000,sure comfort,0.000000,0.769231,0.027778,1.282277,1,42,0.500000,1.000000
188479,180470,laura ashley freya 3-light weather cognac vani...,laura ashley light,1.33,laura ashley lighting,Laura Ashley Freya 3-Light Weathered Cognac Va...,"found 1953, laura ashley becom quintessenti en...","Founded in 1953, Laura Ashley has become a qui...",1.000000,1.000000,...,0.666667,laura ashley,0.166667,0.000000,0.037037,1.276491,7,23,0.666667,0.666667
