In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

encoding = "ISO-8859-1"
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eugene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eugene/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

In [3]:
train = pd.read_csv('train_processed.csv', index_col=['id'])
train['search_term'] = train['search_term'].apply(str)
test = pd.read_csv('test_processed.csv', index_col=['id'])
test['search_term'] = test['search_term'].apply(str)

In [4]:
@np.vectorize
def word_match_std(search_term, text):
    indexes = []
    search_term = search_term.split()
    text = text.split()    
    n = len(text)
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        if word in text:
            indexes.append(text.index(word))
    if len(indexes) > 0:
        return np.var(indexes)
    else:
        return 0

In [5]:
train['words_std_title'] = word_match_std(train['search_term'], train['product_title'])
train['words_std_descr'] = word_match_std(train['search_term'], train['descr'])
test['words_std_title'] = word_match_std(test['search_term'], test['product_title'])
test['words_std_descr'] = word_match_std(test['search_term'], test['descr'])

In [6]:
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, train):
        return train[tree_cols].values


class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
    def fit(self, x, y=None):
        return self
    def transform(self, dataset):
        return dataset[self.col].apply(str)

tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=30))]))
                    ]
            ))])

clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])

In [8]:
y = train['relevance'].values

In [9]:
from lightgbm import LGBMRegressor
params = {
    'n_estimators': [600, 700],
    'num_leaves': [60, 70]
}
gbm = lgb.LGBMRegressor(learning_rate=0.04, colsample_bytree=0.7)
model = GridSearchCV(gbm, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=600, num_leaves=60 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. n_estimators=600, num_leaves=60, total=  16.8s
[CV] n_estimators=600, num_leaves=60 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.8s remaining:    0.0s


[CV] .................. n_estimators=600, num_leaves=60, total=  15.7s
[CV] n_estimators=600, num_leaves=60 .................................
[CV] .................. n_estimators=600, num_leaves=60, total=  16.3s
[CV] n_estimators=600, num_leaves=70 .................................
[CV] .................. n_estimators=600, num_leaves=70, total=  21.1s
[CV] n_estimators=600, num_leaves=70 .................................
[CV] .................. n_estimators=600, num_leaves=70, total=  21.7s
[CV] n_estimators=600, num_leaves=70 .................................
[CV] .................. n_estimators=600, num_leaves=70, total=  18.0s
[CV] n_estimators=700, num_leaves=60 .................................
[CV] .................. n_estimators=700, num_leaves=60, total=  23.6s
[CV] n_estimators=700, num_leaves=60 .................................
[CV] .................. n_estimators=700, num_leaves=60, total=  18.0s
[CV] n_estimators=700, num_leaves=60 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  3.7min finished


Unnamed: 0,std_score_time,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,0.003763,700,70,"{'n_estimators': 700, 'num_leaves': 70}",-0.213938,-0.204591,-0.233849,-0.217459,0.012201,1
2,0.005239,700,60,"{'n_estimators': 700, 'num_leaves': 60}",-0.214339,-0.205009,-0.233182,-0.21751,0.011718,2
0,0.005139,600,60,"{'n_estimators': 600, 'num_leaves': 60}",-0.21461,-0.205289,-0.233485,-0.217795,0.011729,3
1,0.009295,600,70,"{'n_estimators': 600, 'num_leaves': 70}",-0.214286,-0.20519,-0.234034,-0.217837,0.01204,4


In [10]:
X_test = clf.transform(test[use_cols])
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47560

In [32]:
train['len_title'] = train['product_title'].apply(lambda x: len(x.split()))
train['len_descr'] = train['descr'].apply(lambda x: len(x.split()))
test['len_title'] = test['product_title'].apply(lambda x: len(x.split()))
test['len_descr'] = test['descr'].apply(lambda x: len(x.split()))

In [33]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']

clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=30))]))
                    ]
            ))])

clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])

In [34]:
gbm = lgb.LGBMRegressor(n_estimators=700, num_leaves=70, colsample_bytree=0.7)
params = {
    'learning_rate': [0.03, 0.04]
}
model = GridSearchCV(gbm, params,
                     cv=3,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] learning_rate=0.03 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... learning_rate=0.03, total=  20.7s
[CV] learning_rate=0.03 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.7s remaining:    0.0s


[CV] ............................... learning_rate=0.03, total=  29.6s
[CV] learning_rate=0.03 ..............................................
[CV] ............................... learning_rate=0.03, total=  29.3s
[CV] learning_rate=0.04 ..............................................
[CV] ............................... learning_rate=0.04, total=  27.8s
[CV] learning_rate=0.04 ..............................................
[CV] ............................... learning_rate=0.04, total=  26.8s
[CV] learning_rate=0.04 ..............................................
[CV] ............................... learning_rate=0.04, total=  30.4s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.7min finished


Unnamed: 0,std_score_time,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.052053,0.03,{'learning_rate': 0.03},-0.213347,-0.204404,-0.232452,-0.216734,0.011698,1
1,0.016509,0.04,{'learning_rate': 0.04},-0.213428,-0.204081,-0.232858,-0.216789,0.011986,2


In [35]:
X_test = clf.transform(test[use_cols])
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47469

In [36]:
model.best_estimator_.feature_importances_

array([367, 207, 171, 539, 301,  47, 175, 530, 354, 164, 181, 300, 209,
       291, 303, 264, 232, 208, 212, 201, 221, 164, 262, 205, 242, 172,
       193, 227, 143, 177, 180, 193, 214, 204, 201, 184, 161, 195, 248,
       209, 167, 169, 183, 174, 218, 223, 223, 209, 199, 217, 244, 245,
       191, 209, 211, 199, 209, 177, 223, 200, 212, 211, 199, 244, 223,
       228, 237, 207, 203, 217, 207, 190, 192, 237, 233, 200, 267, 284,
       215, 215, 245, 197, 241, 206, 238, 230, 264, 282, 249, 249, 192,
       279, 255, 226, 188, 188, 170, 164, 176, 154, 187, 129, 157, 185,
       160, 170, 138, 163, 153, 137, 156, 184, 148, 176, 179, 125, 147,
       141, 175, 121, 190, 193, 151, 151, 158, 166, 152, 179, 187, 174,
       197, 156, 193, 155, 148, 173, 208, 150, 150, 191, 185, 170, 184,
       194, 139, 182, 186, 172, 154, 169, 198, 286, 233, 180, 162, 185,
       189, 198, 182, 200, 181, 175, 172, 192, 206, 184, 180, 228, 184,
       252, 188, 188, 214, 291, 158, 196, 138, 177, 142, 177, 19

In [26]:
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ]
            ))])

In [27]:
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])

In [28]:
gbm = lgb.LGBMRegressor(learning_rate=0.03, n_estimators=700, num_leaves=70)
params = {
    'colsample_bytree': [0.5, 0.7]
}
model = GridSearchCV(gbm, params,
                     cv=4,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV] colsample_bytree=0.5 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. colsample_bytree=0.5, total=  15.3s
[CV] colsample_bytree=0.5 ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.3s remaining:    0.0s


[CV] ............................. colsample_bytree=0.5, total=  14.1s
[CV] colsample_bytree=0.5 ............................................
[CV] ............................. colsample_bytree=0.5, total=  13.3s
[CV] colsample_bytree=0.5 ............................................
[CV] ............................. colsample_bytree=0.5, total=  13.8s
[CV] colsample_bytree=0.7 ............................................
[CV] ............................. colsample_bytree=0.7, total=  19.0s
[CV] colsample_bytree=0.7 ............................................
[CV] ............................. colsample_bytree=0.7, total=  26.9s
[CV] colsample_bytree=0.7 ............................................
[CV] ............................. colsample_bytree=0.7, total=  28.5s
[CV] colsample_bytree=0.7 ............................................
[CV] ............................. colsample_bytree=0.7, total=  32.9s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.7min finished


Unnamed: 0,std_score_time,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.019705,0.5,{'colsample_bytree': 0.5},-0.213362,-0.205561,-0.202458,-0.235457,-0.214209,0.012894,1
1,0.016014,0.7,{'colsample_bytree': 0.7},-0.213185,-0.205152,-0.202239,-0.236585,-0.21429,0.013482,2


In [30]:
X_test = clf.transform(test[use_cols])
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [31]:
train['pred_diff'] = (train['relevance'] - model.predict(X_train)).abs()
train.sort_values(['pred_diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,match_pos_descr,brand,match_brand,prod_title_ind,descr_ind,pred_diff,words_std_title,words_std_descr,len_title,len_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.00,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.000000,1.000000,...,1.000000,,0.000000,0.714286,0.092593,1.661656,0.250000,16.000000,7,7
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.00,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.000000,0.000000,...,0.000000,,0.000000,0.750000,1.000000,1.507517,0.250000,0.000000,8,8
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.00,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.000000,0.500000,...,0.500000,,0.000000,0.000000,0.440367,1.501380,0.250000,0.000000,6,6
104722,133520,danco low lead 1a-3c stem crane,101-1h crane,1.00,101-1h for crane,DANCO Low Lead 1A-3C Stem for Crane,repair leaki faucet easi inexpens altern repla...,Repairing a leaky faucet is an easy and inexpe...,0.500000,0.500000,...,0.250000,,0.000000,0.833333,0.514286,1.452257,0.000000,0.000000,6,6
92432,128292,mont blanc northbrook drop-in composit granit ...,granit sand,1.00,granite sand,Mont Blanc Northbrook Drop-in Composite Granit...,durabl granit composit materi provid mont blan...,Durable granite composite material provides th...,1.000000,1.000000,...,0.500000,mont blanc,0.000000,0.333333,0.008130,1.441053,20.250000,256.000000,15,15
85851,125582,oakland live 26 in. metal grape tabl plant stand,grape plant,1.00,grape plant,Oakland Living 26 in. Metal Grape Table Plant ...,oakland live 26 in. metal grape tabl plant sta...,The Oakland Living 26 in. Metal Grape Table Pl...,1.000000,1.000000,...,1.000000,,0.000000,0.555556,0.084746,1.433580,1.000000,1.000000,9,9
134028,147467,master flow 24 vdc replac motor solar dual-pow...,solar vent,1.00,solar vents,Master Flow 24 VDC Replacement Motor for Solar...,slrm105 replac motor master flow green machin ...,The SLRM105 is the replacement motor for all M...,1.000000,1.000000,...,1.000000,,0.000000,0.600000,0.155556,1.405220,2.250000,81.000000,10,10
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.00,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.500000,1.000000,...,1.000000,sure comfort,0.000000,0.769231,0.027778,1.389258,0.250000,281.187500,13,13
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.00,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,bird stop blue ceram wild bird feeder featur b...,The Bird Stop Blue Ceramic Wild Bird Feeder fe...,1.000000,1.000000,...,0.500000,heath,0.000000,0.125000,0.000000,1.381526,0.250000,0.250000,8,8
83392,124633,rubbermaid fasttrack garag cooler hook,rubbermaid cooler,1.33,rubbermaid cooler,Rubbermaid FastTrack Garage Cooler Hook,"part fasttrack system, cooler hook perfect sol...","Part of the FastTrack system, the Cooler Hook ...",1.000000,0.500000,...,0.500000,,0.000000,0.000000,0.051724,1.362543,2.250000,0.000000,5,5


In [37]:
@np.vectorize
def match_last_word(search_term, text):
    last_word = search_term.split()[-1]
    if text.find(last_word) > -1:
        return 1
    else:
        return 0

train['brand'].fillna('', inplace=True)
train['last_word_title'] = match_last_word(train['search_term'], train['product_title'])  
train['last_word_descr'] = match_last_word(train['search_term'], train['descr'])  

test['brand'].fillna('', inplace=True)
test['last_word_title'] = match_last_word(test['search_term'], test['product_title'])  
test['last_word_descr'] = match_last_word(test['search_term'], test['descr'])  

In [38]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr', 'last_word_title',
             'last_word_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ]
            ))])
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])
X_test = clf.transform(test[use_cols])

In [39]:
gbm = lgb.LGBMRegressor(learning_rate=0.03, n_estimators=700, num_leaves=70)
params = {
    'colsample_bytree': [0.5]
}
model = GridSearchCV(gbm, params,
                     cv=5,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=0.5 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. colsample_bytree=0.5, total=  19.2s
[CV] colsample_bytree=0.5 ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.2s remaining:    0.0s


[CV] ............................. colsample_bytree=0.5, total=  22.7s
[CV] colsample_bytree=0.5 ............................................
[CV] ............................. colsample_bytree=0.5, total=  21.2s
[CV] colsample_bytree=0.5 ............................................
[CV] ............................. colsample_bytree=0.5, total=  17.5s
[CV] colsample_bytree=0.5 ............................................
[CV] ............................. colsample_bytree=0.5, total=  21.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished


Unnamed: 0,std_score_time,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020028,0.5,{'colsample_bytree': 0.5},-0.21276,-0.203688,-0.199611,-0.204764,-0.229389,-0.210042,0.010571,1


In [40]:
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47395

In [41]:
model.best_estimator_.feature_importances_

array([318, 192, 221, 510, 286,  48, 175, 474, 312, 148, 189, 275, 243,
        75,  48, 352, 271, 223, 207, 236, 213, 233, 183, 185, 210, 221,
       223, 203, 229, 215, 210, 189, 180, 212, 212, 202, 213, 230, 183,
       189, 284, 209, 194, 231, 196, 204, 234, 218, 233, 208, 222, 221,
       212, 246, 209, 200, 187, 228, 199, 201, 211, 203, 201, 198, 232,
       233, 193, 185, 195, 189, 220, 249, 199, 213, 200, 240, 222, 212,
       194, 189, 201, 237, 230, 209, 212, 231, 224, 255, 266, 278, 311,
       227, 274, 279, 264, 221, 192, 185, 195, 157, 164, 159, 248, 155,
       153, 184, 167, 191, 172, 192, 160, 140, 152, 154, 179, 171, 163,
       132, 135, 132, 200, 144, 166, 168, 154, 161, 183, 157, 144, 181,
       176, 189, 193, 177, 185, 153, 169, 153, 167, 148, 154, 180, 180,
       178, 199, 166, 159, 169, 184, 129, 208, 203, 208, 203, 191, 191,
       256, 211, 197, 200, 149, 200, 175, 189, 172, 137, 182, 202, 193,
       221, 190, 187, 201, 182, 145, 312, 150, 213, 151, 209, 16

In [43]:
train['pred_diff'] = (train['relevance'] - model.predict(X_train)).abs()
train.sort_values(['pred_diff'], ascending=False)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,match_brand,prod_title_ind,descr_ind,pred_diff,words_std_title,words_std_descr,len_title,len_descr,last_word_title,last_word_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.0,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.0,0.0,...,0.0,0.75,1.0,1.583508,0.25,0.0,8,47,1,0
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.0,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.0,1.0,...,0.0,0.714286,0.092593,1.581459,0.25,16.0,7,54,1,1
156918,160006,werner 14 ft. fiberglass round rung straight l...,14 ft ladder,1.0,14 ft ladder,Werner 14 ft. Fiberglass Round Rung Straight L...,7114-1 one-sect round rung 14 ft. straight lad...,The 7114-1 one-section round rung 14 ft. Strai...,0.666667,0.666667,...,0.0,0.0625,0.064516,1.493323,9.0,2.25,16,62,1,1
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.0,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.0,0.5,...,0.0,0.0,0.440367,1.484087,0.25,0.0,6,109,1,0
104722,133520,danco low lead 1a-3c stem crane,101-1h crane,1.0,101-1h for crane,DANCO Low Lead 1A-3C Stem for Crane,repair leaki faucet easi inexpens altern repla...,Repairing a leaky faucet is an easy and inexpe...,0.5,0.5,...,0.0,0.833333,0.514286,1.469233,0.0,0.0,6,35,1,1
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.0,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,bird stop blue ceram wild bird feeder featur b...,The Bird Stop Blue Ceramic Wild Bird Feeder fe...,1.0,1.0,...,0.0,0.125,0.0,1.414796,0.25,0.25,8,50,1,1
146582,154163,tuscani coast 2-light weather charcoal outdoor...,tuscani classic,1.0,tuscany classic,Tuscany Coast 2-Light Weathered Charcoal Outdo...,tuscani coast 2-light exterior post lamp weath...,Tuscany Coast 2-Light Exterior Post Lamp In We...,0.5,1.0,...,0.0,0.0,0.0,1.389136,0.0,20.25,8,29,0,1
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.0,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.5,1.0,...,0.0,0.769231,0.027778,1.385987,0.25,281.1875,13,180,1,1
85851,125582,oakland live 26 in. metal grape tabl plant stand,grape plant,1.0,grape plant,Oakland Living 26 in. Metal Grape Table Plant ...,oakland live 26 in. metal grape tabl plant sta...,The Oakland Living 26 in. Metal Grape Table Pl...,1.0,1.0,...,0.0,0.555556,0.084746,1.37343,1.0,1.0,9,59,1,1
93622,128763,daylili grape magic bare root dormant plant (8...,grape plant,1.0,grape plant,Daylily Grape Magic Bare Root Dormant Plants (...,daylili vigor perenni requir littl care garden...,Daylilies are vigorous perennial that require ...,1.0,1.0,...,0.0,0.125,0.515625,1.371076,6.25,6.25,8,64,1,1


In [49]:
train[train['pred_diff']>0.7].shape

(3806, 27)

In [50]:
train['bad'] = train['pred_diff']>0.7

clf_bad = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))]))
                    ]
            ))])
X_train_bad = clf_bad.fit(train[use_cols]).transform(train[use_cols])

In [51]:
y_bad = train['bad'].values
from lightgbm import LGBMClassifier
gbm_bad = LGBMClassifier()
gbm_bad.fit(X_train_bad, y_bad)
gbm_bad.score(X_train_bad, y_bad)

0.949343162271997

In [56]:
train['bad_pred'] = gbm_bad.predict_proba(X_train_bad)[:, 1]
X_test_bad = clf_bad.transform(test[use_cols])
test['bad_pred'] = gbm_bad.predict_proba(X_test_bad)[:, 1]

In [57]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr', 'last_word_title',
             'last_word_descr', 'bad_pred']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ]
            ))])
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])
X_test = clf.transform(test[use_cols])

In [58]:
gbm = lgb.LGBMRegressor(learning_rate=0.03, n_estimators=700, colsample_bytree=0.5)
params = {
    'num_leaves': [70, 80]
}
model = GridSearchCV(gbm, params,
                     cv=5,
                     n_jobs=1,
                     scoring='neg_mean_squared_error',
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] num_leaves=70 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................... num_leaves=70, total=  17.9s
[CV] num_leaves=70 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.9s remaining:    0.0s


[CV] .................................... num_leaves=70, total=  27.0s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  23.5s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  23.1s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  23.9s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  24.6s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  24.6s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  27.2s
[CV] num_leaves=80 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.9min finished


Unnamed: 0,std_score_time,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.006844,80,{'num_leaves': 80},-0.196847,-0.19023,-0.185463,-0.188364,-0.213868,-0.194954,0.01017,1
0,0.007621,70,{'num_leaves': 70},-0.196586,-0.191186,-0.185393,-0.189037,-0.2143,-0.195301,0.010168,2


In [61]:
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
y_pred[y_pred<1.] = 1.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.48290

In [62]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr', 'last_word_title',
             'last_word_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ]
            ))])
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])
X_test = clf.transform(test[use_cols])

In [63]:
gbm = lgb.LGBMRegressor(learning_rate=0.03, n_estimators=700, colsample_bytree=0.5)
params = {
    'num_leaves': [70, 80]
}
model = GridSearchCV(gbm, params,
                     cv=5,
                     n_jobs=1,
                     return_train_score=False,
                     verbose=2)
model.fit(X_train, y)
grid_df = pd.DataFrame(model.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] num_leaves=70 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................... num_leaves=70, total=  17.1s
[CV] num_leaves=70 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.1s remaining:    0.0s


[CV] .................................... num_leaves=70, total=  15.6s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  14.0s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  14.0s
[CV] num_leaves=70 ...................................................
[CV] .................................... num_leaves=70, total=  13.9s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  15.5s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  15.4s
[CV] num_leaves=80 ...................................................
[CV] .................................... num_leaves=80, total=  15.5s
[CV] num_leaves=80 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.6min finished


Unnamed: 0,std_score_time,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.001303,80,{'num_leaves': 80},0.259654,0.275215,0.273233,0.282373,0.152286,0.248553,0.048691,1
0,0.004459,70,{'num_leaves': 70},0.254528,0.272677,0.272693,0.277876,0.150542,0.245664,0.048217,2


In [64]:
y_pred = model.predict(X_test)
y_pred[y_pred>3.] = 3.
y_pred[y_pred<1.] = 1.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47372

In [None]:
gbm_best = model.best_estimator_
gbm_best.fit(X_train, y)

In [67]:
y_pred = gbm_best.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [72]:
train['pred_diff'] = (train['relevance'] - model.predict(X_train)).abs()
train.sort_values(['pred_diff'], ascending=False)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,descr_ind,pred_diff,words_std_title,words_std_descr,len_title,len_descr,last_word_title,last_word_descr,bad,bad_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69679,119478,romano 4 ft. boxwood spiral topiari tree,topiari tree,1.0,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,enhanc home romano boxwood spiral topiari tree...,Enhance your home with a Romano Boxwood Spiral...,1.0,1.0,...,0.092593,1.627411,0.25,16.0,7,54,1,1,True,0.064587
156918,160006,werner 14 ft. fiberglass round rung straight l...,14 ft ladder,1.0,14 ft ladder,Werner 14 ft. Fiberglass Round Rung Straight L...,7114-1 one-sect round rung 14 ft. straight lad...,The 7114-1 one-section round rung 14 ft. Strai...,0.666667,0.666667,...,0.064516,1.482803,9.0,2.25,16,62,1,1,True,0.189142
136808,148897,earthquak 212cc tiller rear tine crt side shield,side shield,1.0,side shields,Earthquake 212cc Tiller Rear Tine CRT with Sid...,earthquak 6015v rear tine rototil deliv ultim ...,The Earthquake 6015V rear tine rototiller deli...,1.0,0.0,...,1.0,1.477143,0.25,0.0,8,47,1,0,True,0.103526
120624,140844,rachael ray 10 qt. cover stockpot,rachael ray,1.0,rachael ray,Rachael Ray 10 qt. Covered Stockpot,"whether boil pasta, make batch chili cook grai...","Whether you're boiling pasta, making a batch o...",1.0,0.5,...,0.440367,1.448947,0.25,0.0,6,109,1,0,True,0.088333
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.0,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,bird stop blue ceram wild bird feeder featur b...,The Bird Stop Blue Ceramic Wild Bird Feeder fe...,1.0,1.0,...,0.0,1.393136,0.25,0.25,8,50,1,1,True,0.183157
104722,133520,danco low lead 1a-3c stem crane,101-1h crane,1.0,101-1h for crane,DANCO Low Lead 1A-3C Stem for Crane,repair leaki faucet easi inexpens altern repla...,Repairing a leaky faucet is an easy and inexpe...,0.5,0.5,...,0.514286,1.37536,0.0,0.0,6,35,1,1,True,0.063852
176724,172318,green matter 3-light mahogani bronz vaniti fixtur,bronz green,1.0,bronze green,Green Matters 3-Light Mahogany Bronze Vanity F...,"cornerston collect quality, incandesc vaniti e...","The cornerstone of this collection is quality,...",1.0,0.5,...,0.109756,1.37085,4.0,0.0,7,82,1,0,True,0.16583
83392,124633,rubbermaid fasttrack garag cooler hook,rubbermaid cooler,1.33,rubbermaid cooler,Rubbermaid FastTrack Garage Cooler Hook,"part fasttrack system, cooler hook perfect sol...","Part of the FastTrack system, the Cooler Hook ...",1.0,0.5,...,0.051724,1.369197,2.25,0.0,5,58,1,1,True,0.184175
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.0,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.5,1.0,...,0.027778,1.366897,0.25,281.1875,13,180,1,1,True,0.171904
92432,128292,mont blanc northbrook drop-in composit granit ...,granit sand,1.0,granite sand,Mont Blanc Northbrook Drop-in Composite Granit...,durabl granit composit materi provid mont blan...,Durable granite composite material provides th...,1.0,1.0,...,0.00813,1.365866,20.25,256.0,15,123,1,1,True,0.123621


In [73]:
@np.vectorize
def word_match_count(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        count += 1 if text.count(word) > 0 else 0
    return count / len(search_term)

In [76]:
train['match_orig_title'] = word_match_count(train['search_term'], train['orig_product_title'])
train['match_orig_descr'] = word_match_count(train['search_term'], train['orig_descr'])
test['match_orig_title'] = word_match_count(test['search_term'], test['orig_product_title'])
test['match_orig_descr'] = word_match_count(test['search_term'], test['orig_descr'])

In [77]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr', 'last_word_title',
             'last_word_descr', 'match_orig_title', 'match_orig_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ]
            ))])
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])
X_test = clf.transform(test[use_cols])

In [78]:
gbm_best.fit(X_train, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
       importance_type='split', learning_rate=0.03, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=700, n_jobs=-1, num_leaves=80, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [80]:
y_pred = gbm_best.predict(X_test)
y_pred[y_pred>3.] = 3.
y_pred[y_pred<1.] = 1.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.47370

In [82]:
tree_cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
             'match_pos_title', 'match_brand', 'match_pos_descr', 'prod_title_ind', 'descr_ind',
             'words_std_title', 'words_std_descr', 'len_title', 'len_descr', 'last_word_title',
             'last_word_descr', 'match_orig_title', 'match_orig_descr']

use_cols = tree_cols + ['search_term', 'product_title', 'descr', 'brand']
clf = Pipeline([
    ('union', FeatureUnion(
                transformer_list = [
                    ('cst',  cust_regression_vals()),  
                    ('txt1', Pipeline([('s1', cust_txt_col(col='search_term')),
                                       ('tfidf1', TfidfVectorizer(stop_words='english')),
                                       ('tsvd1', TruncatedSVD(n_components=80))])),
                    ('txt2', Pipeline([('s2', cust_txt_col(col='product_title')),
                                       ('tfidf2', TfidfVectorizer(stop_words='english')),
                                       ('tsvd2', TruncatedSVD(n_components=80))])),
                    ('txt3', Pipeline([('s3', cust_txt_col(col='descr')),
                                       ('tfidf3', TfidfVectorizer(stop_words='english')),
                                       ('tsvd3', TruncatedSVD(n_components=60))])),
                    ('txt4', Pipeline([('s4', cust_txt_col(col='brand')),
                                       ('tfidf4', TfidfVectorizer(stop_words='english')),
                                       ('tsvd4', TruncatedSVD(n_components=20))]))
                    ],
                transformer_weights = {
                    'cst': 1.0,
                    'txt1': 0.5,
                    'txt2': 0.25,
                    'txt3': 0.01,
                    'txt4': 0.5
                    }
            ))])
clf.fit(train[use_cols])
X_train = clf.transform(train[use_cols])
X_test = clf.transform(test[use_cols])

In [83]:
gbm_best.fit(X_train, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
       importance_type='split', learning_rate=0.03, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=700, n_jobs=-1, num_leaves=80, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [84]:
y_pred = gbm_best.predict(X_test)
y_pred[y_pred>3.] = 3.
y_pred[y_pred<1.] = 1.
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [85]:
gbm_best.feature_importances_

array([317, 205, 191, 608, 280,  58, 196, 556, 409, 139, 220, 273, 281,
        86,  67,  99,  93, 292, 268, 269, 220, 217, 229, 250, 246, 200,
       229, 223, 217, 198, 218, 265, 221, 221, 212, 255, 239, 232, 216,
       258, 224, 228, 281, 254, 220, 216, 212, 189, 255, 222, 245, 265,
       203, 263, 245, 276, 267, 222, 222, 269, 266, 187, 209, 234, 253,
       200, 208, 234, 185, 222, 282, 234, 275, 244, 235, 231, 244, 308,
       231, 221, 351, 231, 278, 242, 271, 241, 285, 245, 243, 335, 277,
       227, 320, 255, 274, 263, 303, 253, 202, 245, 232, 203, 194, 199,
       254, 191, 181, 206, 206, 185, 168, 199, 160, 197, 201, 188, 222,
       178, 177, 161, 166, 205, 224, 171, 218, 174, 196, 177, 207, 190,
       161, 234, 229, 229, 239, 187, 188, 189, 181, 226, 211, 156, 203,
       210, 193, 196, 209, 169, 199, 185, 191, 238, 202, 220, 316, 161,
       218, 209, 234, 228, 227, 173, 211, 209, 239, 205, 198, 213, 261,
       186, 235, 243, 221, 220, 214, 215, 198, 325, 204, 204, 21

In [87]:
train.drop(columns=['pred_diff'], inplace=True)
train['pred_diff'] = (train['relevance'] - model.predict(X_train))
train.sort_values(['pred_diff'], ascending=False)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,...,words_std_descr,len_title,len_descr,last_word_title,last_word_descr,bad,bad_pred,match_orig_title,match_orig_descr,pred_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
207064,194599,home legend distress lennox hickori 3/8 in. th...,ennox,3.0,ennox,Home Legend Distressed Lennox Hickory 3/8 in. ...,"78 in. length, hardwood veneer carpet reduc ov...","78 in. length, hardwood veneer carpet reducer ...",0.0,0.0,...,0.0,20,89,1,1,True,0.318959,0.0,0.0,1.242307
203914,192088,behr premium plus #600e-2 harbor mist zero voc...,locit 2 plus 1,3.0,locite 2 plus 1,BEHR Premium Plus #600E-2 Harbor Mist Zero VOC...,"behr premium plus zero voc, self-prim interior...","BEHR Premium Plus Zero VOC, Self-Priming Inter...",0.25,0.5,...,529.0,10,93,0,1,True,0.051446,0.0,0.25,1.192145
194147,184559,lenmar nickel-met hydrid 1200mah/3.6-volt cord...,polish batteri metal,3.0,polisher battery metal,Lenmar Nickel-Metal Hydride 1200mAh/3.6-Volt C...,"power panasonic, american telecom, tele-phone,...","Power your Panasonic, American Telecom, Tele-P...",0.333333,0.333333,...,0.0,8,81,0,0,True,0.106693,0.0,0.0,1.144559
116307,138721,behr premium plus ultra #ecc-38-3 sea fern paint,fen,3.0,fen,BEHR Premium Plus Ultra #ECC-38-3 Sea Fern Paint,behr premium plus ultra semi-gloss enamel inte...,BEHR Premium Plus Ultra Semi-Gloss Enamel Inte...,0.0,0.0,...,0.0,8,111,0,0,True,0.232118,0.0,0.0,1.097668
78785,122860,american standard green tea ecosil 6 ft. x 42 ...,arctic air ast28r,3.0,arctic air ast28r,American Standard Green Tea EcoSilent 6 ft. x ...,"luxury, pure simple. modern acryl tub featur e...","Luxury, pure and simple. This modern acrylic t...",0.666667,0.333333,...,0.0,16,95,0,0,True,0.242641,0.0,0.333333,1.091517
114403,137867,extech instrument carbon monoxid co meter,carboy,3.0,carboy,Extech Instruments Carbon Monoxide CO Meter,"check co level easi push 1 button model c010, ...",Checking co levels is as easy as pushing just ...,0.0,0.0,...,0.0,6,76,0,0,True,0.245047,0.0,0.0,1.072386
181669,175688,cap tread cross wood 94 in. long x 12-1/8 in. ...,wood chip best cover,3.0,wood chips best to cover,Cap A Tread Cross Wood 94 in. Long x 12-1/8 in...,cap tread durabl vinyl pre-attach stair nosing...,Cap A Tread is a durable vinyl with pre-attach...,0.5,0.0,...,0.0,23,45,1,0,True,0.135103,0.0,0.0,1.072193
201248,189965,bdk warner brother batman carpet floor mat (4-...,vdk,3.0,vdk,BDK Warner Brothers Batman Carpet Floor Mats (...,warner brother batman carpet floor mat enhanc ...,Warner Brothers Batman carpet floor mats will ...,0.0,0.0,...,0.0,8,81,0,0,True,0.080473,0.0,0.0,1.0676
193001,183692,lichtenberg white no. 918 millenni ryan heathe...,w g 918,3.0,w g 918,LICHTENBERG White No. 918 Millennial Ryan Heat...,no. 918 millenni ryan heather textur semi-sh c...,No. 918 Millennial Ryan heathered texture semi...,0.666667,0.333333,...,0.0,18,56,1,1,True,0.308694,0.333333,0.333333,1.063746
72273,120449,"scott handi green ii 1,000 sq. ft. handheld sp...",grqss,3.0,grqss,"Scotts Handy Green II 1,000 sq. ft. Handheld S...","scott handi green ii 1,000 sq. ft. handheld sp...","The Scotts Handy Green II 1,000 sq. ft. Handhe...",0.0,0.0,...,0.0,9,64,0,0,True,0.085003,0.0,0.0,1.053112
