In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

encoding = "ISO-8859-1"
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eugene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eugene/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
train = pd.read_csv('train.csv', encoding=encoding, index_col=['id'])
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [2]:
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))


def replaces(s):
    if isinstance(s, str):
        s = s.lower()
        s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) 

        s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    
        s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    
        s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    
        s = s.replace(" x "," xby ")
        s = s.replace("*"," xby ")
        s = s.replace(" by "," xby")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")
        
        s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    
        s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
        
        s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    
        s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    
        s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
        
        s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    
        s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
        
        s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    
        s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
        
        
        
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")

        s = s.replace("  "," ")
        
        s = s.replace("/", " ")
        s = s.replace("-", " ")
        return " ".join([re.sub('[^A-Za-z0-9-./]', ' ', word)
                         for word in s.lower().split()]) 
    else:
        return "null"

def stem_sentence(s): 
    replaces(s)   
    return " ".join([stemmer.stem(word)
                     for word in s.lower().split()
                     if word not in stop_words])


In [4]:
products = pd.read_csv('product_descriptions.csv', encoding=encoding, index_col=['product_uid'])
products.head()

Unnamed: 0_level_0,product_description
product_uid,Unnamed: 1_level_1
100001,"Not only do angles make joints stronger, they ..."
100002,BEHR Premium Textured DECKOVER is an innovativ...
100003,Classic architecture meets contemporary design...
100004,The Grape Solar 265-Watt Polycrystalline PV So...
100005,Update your bathroom with the Delta Vero Singl...


In [5]:
train['description'] = train.product_uid.map(products.product_description)
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [6]:
for col in ['product_title', 'description', 'search_term']:
    train[col] = train[col].apply(stem_sentence)
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angl make joint stronger also provid consiste...
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,angl make joint stronger also provid consiste...
9,100002,behr premium textur deckov 1-gal. sc-141 tugb...,deck,3.0,behr premium textur deckov innov solid color c...
16,100005,delta vero 1-handl shower faucet trim kitin. c...,rain shower head,2.33,updat bathroom delta vero single-handl shower ...
17,100005,delta vero 1-handl shower faucet trim kitin. c...,shower faucet,2.67,updat bathroom delta vero single-handl shower ...


In [3]:
def word_match_count(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in search_term:
        count += 1 if text.count(word) > 0 else 0
    return count / len(search_term)

In [8]:
%%time
for col in ['product_title', 'description']:
    train['match_' + col] = train.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

CPU times: user 3.13 s, sys: 12 ms, total: 3.14 s
Wall time: 3.14 s


In [9]:
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angl make joint stronger also provid consiste...,0.5,0.5
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,angl make joint stronger also provid consiste...,0.0,0.0
9,100002,behr premium textur deckov 1-gal. sc-141 tugb...,deck,3.0,behr premium textur deckov innov solid color c...,0.0,1.0
16,100005,delta vero 1-handl shower faucet trim kitin. c...,rain shower head,2.33,updat bathroom delta vero single-handl shower ...,0.333333,0.333333
17,100005,delta vero 1-handl shower faucet trim kitin. c...,shower faucet,2.67,updat bathroom delta vero single-handl shower ...,1.0,1.0


In [10]:
X = train[['match_product_title', 'match_description']].values
y = train['relevance'].values

In [5]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
gbm = lgb.LGBMRegressor()
gbm.fit(X_train, y_train, eval_set=(X_test, y_test))

[1]	valid_0's l2: 0.276446
[2]	valid_0's l2: 0.270549
[3]	valid_0's l2: 0.265771
[4]	valid_0's l2: 0.261893
[5]	valid_0's l2: 0.258745
[6]	valid_0's l2: 0.256192
[7]	valid_0's l2: 0.254118
[8]	valid_0's l2: 0.252424
[9]	valid_0's l2: 0.251053
[10]	valid_0's l2: 0.249936
[11]	valid_0's l2: 0.249032
[12]	valid_0's l2: 0.248289
[13]	valid_0's l2: 0.247689
[14]	valid_0's l2: 0.247195
[15]	valid_0's l2: 0.246794
[16]	valid_0's l2: 0.246474
[17]	valid_0's l2: 0.246213
[18]	valid_0's l2: 0.245999
[19]	valid_0's l2: 0.245826
[20]	valid_0's l2: 0.24569
[21]	valid_0's l2: 0.245564
[22]	valid_0's l2: 0.245469
[23]	valid_0's l2: 0.24539
[24]	valid_0's l2: 0.245327
[25]	valid_0's l2: 0.245283
[26]	valid_0's l2: 0.245247
[27]	valid_0's l2: 0.24521
[28]	valid_0's l2: 0.245185
[29]	valid_0's l2: 0.245165
[30]	valid_0's l2: 0.245144
[31]	valid_0's l2: 0.245127
[32]	valid_0's l2: 0.245116
[33]	valid_0's l2: 0.245106
[34]	valid_0's l2: 0.245103
[35]	valid_0's l2: 0.245099
[36]	valid_0's l2: 0.245094
[37]

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [14]:
%%time
test = pd.read_csv('test.csv', encoding=encoding)
test['description'] = test.product_uid.map(products.product_description)
for col in ['product_title', 'description', 'search_term']:
    test[col] = test[col].apply(stem_sentence)
test.head()

CPU times: user 3min 5s, sys: 208 ms, total: 3min 5s
Wall time: 3min 3s


In [15]:
for col in ['product_title', 'description']:
    test['match_' + col] = test.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)


In [16]:
X_test = test[['match_product_title', 'match_description']].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('baseline.csv', header=True, index=False)

0.50074

In [6]:
from sklearn.model_selection import GridSearchCV

In [18]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200]
}
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

In [19]:
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.001108,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.248767,-0.243963,-0.268647,-0.253792,0.010685,1
6,0.003768,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.248775,-0.243971,-0.26864,-0.253795,0.010679,2
5,0.001172,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.248886,-0.244021,-0.268569,-0.253825,0.010613,3
7,0.002487,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.248878,-0.244028,-0.268583,-0.25383,0.010619,4
8,0.001789,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.24894,-0.244042,-0.268622,-0.253868,0.010623,5


In [20]:
gbm = grid.best_estimator_
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm_cv.csv', header=True, index=False)

0.50067

In [21]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
pd.set_option('display.max_colwidth', -1)
train.sort_values(by=['diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
80599,123546,prime-lin storm door closer shock spring heavi duti black,storm door black,1.00,storm screen door closer come finishedin. aluminum. featur adjust close speed includ shock spring. style closer use medium heavi weight doors.heavi duti constructionadjust close speedblack finishinclud shock springus medium heavi weight door,1.000000,0.666667,2.718035,1.718035
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.00,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.000000,0.666667,2.718035,1.718035
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.00,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.000000,0.666667,2.718035,1.718035
158507,160961,behr 1-gal. sc-112 barn red solid color hous fenc wood stain,red wood fenc,1.00,behr solid color hous fenc paint advanc exterior stain combin best featur oil latex superior color retention adhesion penetr durability. provid film high resist cracking peeling blistering weathering chalk erosion. clean easili soap water appli water-bas oil-bas paint primer.california residents see nbsp proposit 65 informationid exterior applic vertic wood surfaces fenc sidingoil-latex formula retain wood natur textur featur self-prim capabilityprotect uv raysresist mildew growth maintain pristin appearanceup 400 sq.ft. coveragedesign easi clean-up soap wateractu paint color may vari on-screen printer representationsonlin price includ paint care feein. follow states ca co ct me mn or ri vt,1.000000,0.666667,2.718035,1.718035
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.00,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.000000,0.666667,2.718035,1.718035
77920,122529,ductlessair 4in. x 14ft. cover kit air condition heat pump line set - ductless mini split central,air line,1.00,complet mini split cover line set wire drainag line easi instal kit polish clean profession finish look. smart look protect cover kit made ultra-dur weather resist rigid materi ad finish newli instal air condit tube ad previous installation. also great electr plumb applications interior exterior. comesin. white color finish paint blend wall color choice. materi easili cut allow configur slit instal need depend way need bend proper fit. bullet below find materi size includedin. kit. note length particular instal may need order 1 kit longer pipe line set installations. cover ideal solut hide line visibl take minut instal complet project.kit includes four 39in. hide-a-lin tubing 3 coupler 1 wall cover cap 1 horizont 90 - 120 degre elbow 1 end cap mount hardwarecan ad exist previous instal easeclean slick designprofession grade long-last productgreat insul air condit tubing 1/4in. - 5/8in. 3/8in. - 5/8in. 3/8in. - 3/4in. 3/8in. - 7/8in.outdoor indoor use,1.000000,1.000000,2.675796,1.675796
10345,101774,new york wire brown 5/16in. screen frame corner 4-pack fsp8571-u,screen frame,1.00,5/16 screen frame corner use build repair screen frame. plastic corner made fit channel 5/16 frame. build repair screen frame easi use frame corners.fit 5/16in. framingdesign join length framingbrown plastic constructionweath resistantno miter requirednote product may vari store,1.000000,1.000000,2.675796,1.675796
92432,128292,mont blanc northbrook drop-in composit granit 25x22x9in. 4-hole singl bowl kitchen sinkin. desert sand,granit sand,1.00,durabl granit composit materi provid mont blanc northbrook drop-in 25in. x 22in. singl bowl kitchen sink function beauty. chip scratch-resist sink featur insul construct reduc nois run water food dispos desert sand finish stain rust oxid resistant. 1 bowl design versatil size applic transit desert-sand color fit varieti kitchen decors. drop-in instal quick easi requir clips.drop-in design easi instal requir secur clip use silicon sealanton 9in. deep bowl allow varieti kitchen functionsstain rust oxidation-resist finish protect sink everyday wear tearcolor shown close represent actual materi colors within limit web graphics call sale support 678-445-2022 request color chip actual materialcomposit granit materi scratch chip stain scorch resist long-last beautycertifi canadian standard associ nation associ home builder research centerclick learn kitchen sink,1.000000,1.000000,2.675796,1.675796
124211,142552,daltil maraca even sun 12in. x 12in. 8 mm frost glass mesh-mount mosaic wall tile,sun glass,1.00,daltil glass maraca even sun frost glass mesh-mount mosaic tile featur smooth finish glossi sheen ad style enhanc setting. impervi qualiti tile make mosaic suitabl exterior applicationsin. fair freez climat less like suffer damag freez thawing. add charact area home kitchen bath bedroom coordin mosaic trim accent piec field tile creat design best suit overal decor space.1 squareft. per piece sold piece 3.12lb. per piecegrad 1 first-qual glass tile walls backsplash pool linings12in. length x 12in. width x 5/16in. thick mesh-mount mosaicunglaz smooth finish medium sheen slight variationin. tonep.e.i. rate 0 suitabl wall residenti use onlyimpervi floor water absorpt less 0.5 indoor outdoor use frost proof when proper instal method followed c.o.f. less .50 margin skid resist indoor use recommend standard residenti applicationsfrost resist indoor outdoor applic when proper instal method followed residenti commerci uselearn get lifetim warranti use custom build product home depot. visit www.homedepot.com/cbpdon t forget coordin trim pieces grout backerboard thinset instal toolsal onlin order item ship via parcel ground may arrivein. multipl boxesit recommend purchas minimum 10 overag account design cut pattern,1.000000,1.000000,2.675796,1.675796
70207,119682,wyndham collect sheffield 59in. vaniti cabinet 58in. mirrorin. white,sheffield,1.00,distinct style eleg line come togeth form complet rang modern classicsin. sheffield bathroom vaniti collection. inspir well establish american standard craft without compromise vaniti design complement decor tradit minimalist modern.construct environment friendly zero emiss solid wood engin prevent warp last lifetime12-stag wood preparation sanding paint hand-finish processhigh water-resist low v.o.c. seal finishbeauti transit style complement bathroompract floor-stand designno assembl requireddeep dowel drawersfully-extend undermount soft-clos drawer slidesconc soft-clos door hingesmet exterior hardwar brush chrome finishplenti storag space2 function doors7 function drawersfaucet s includedcount included58in. match mirror includedsink s includ,1.000000,1.000000,2.675796,1.675796


In [22]:
gbm.feature_importances_

array([1422, 1578])

In [23]:
train['len_search_term'] = train['search_term'].apply(len)
X = train[['match_product_title', 'match_description','len_search_term']].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.001224,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.244044,-0.238491,-0.265293,-0.249276,0.01155,1
6,0.000821,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.244095,-0.238565,-0.265346,-0.249335,0.011544,2
5,0.002801,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.244418,-0.238493,-0.265498,-0.249469,0.011589,3
7,0.000368,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.244428,-0.238562,-0.265527,-0.249506,0.011579,4
3,0.00016,0.05,50,"{'learning_rate': 0.05, 'n_estimators': 50}",-0.244443,-0.238763,-0.266394,-0.249867,0.011915,5


In [24]:
gbm = grid.best_estimator_
test['len_search_term'] = test['search_term'].apply(len)
X_test = test[['match_product_title', 'match_description', 'len_search_term']].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)
gbm.feature_importances_

array([ 714,  684, 1602])

0.49807

In [25]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
pd.set_option('display.max_colwidth', -1)
train.sort_values(by=['diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,len_search_term
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
210852,197684,prime-lin 1in. nylon slide screen door roller steel tension spring 2-pack,slide door screen,1.00,prime line product 1in. nylon slide screen door roller steel tension spring 2-pack featur center-groov roller design 2-1/4in. heat-treat steel tension springs.compat metal industri academi manufactur co. screen doorsfit roll form doorsnylon wheel construction2-1/4in. flat heat-treat steel spring1in. diamet center groov rollerinclud 2 roller,1.0,1.000000,2.742622,1.742622,17
132167,146522,bona stone tile lamin floor care system,lamin floor tile,1.00,use bona stone tile lamin floor care system clean hard-surfac floorsin. home. 4-piec kit includ mop microfib clean dust pad 32 oz. spray bottl bona stone tile lamin floor cleaner.hard-surfac floor clean kitinclud mop microfib clean pad microfib dust pad 32 oz. spray bottl bona stone tile lamin floor cleanerdur 4-piec mop handl 4in. x 15in. mop headclean pad machin washabl reusablefloor cleaner non-tox,1.0,1.000000,2.741351,1.741351,16
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.00,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.0,1.000000,2.740669,1.740669,13
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.00,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.0,0.666667,2.722397,1.722397,17
158507,160961,behr 1-gal. sc-112 barn red solid color hous fenc wood stain,red wood fenc,1.00,behr solid color hous fenc paint advanc exterior stain combin best featur oil latex superior color retention adhesion penetr durability. provid film high resist cracking peeling blistering weathering chalk erosion. clean easili soap water appli water-bas oil-bas paint primer.california residents see nbsp proposit 65 informationid exterior applic vertic wood surfaces fenc sidingoil-latex formula retain wood natur textur featur self-prim capabilityprotect uv raysresist mildew growth maintain pristin appearanceup 400 sq.ft. coveragedesign easi clean-up soap wateractu paint color may vari on-screen printer representationsonlin price includ paint care feein. follow states ca co ct me mn or ri vt,1.0,0.666667,2.718144,1.718144,13
80599,123546,prime-lin storm door closer shock spring heavi duti black,storm door black,1.00,storm screen door closer come finishedin. aluminum. featur adjust close speed includ shock spring. style closer use medium heavi weight doors.heavi duti constructionadjust close speedblack finishinclud shock springus medium heavi weight door,1.0,0.666667,2.717996,1.717996,16
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.00,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.0,0.666667,2.710475,1.710475,18
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.00,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.0,0.666667,2.710179,1.710179,19
156918,160006,werner 14ft. fiberglass round rung straight ladder 375lb. load capac type iaa duti rate,14ft. ladder,1.00,7114-1 one-sect round rung 14ft. straight ladder made non-conduct fiberglass rails. easi transport maneuver ladder bear duti rate 375 lbs. make durabl long lasting. slip-resistant traction-tr rung round durabl rail shield shoe bracket help protect rail damage. plate rung assembl rivet four point side rails.alflo twist-proof performancemar resist end capsslip resist round rungssho pad spur platecannot ship home avail store deliveri,1.0,1.000000,2.692996,1.692996,12
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.00,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.0,1.000000,2.692996,1.692996,12


In [26]:
train['complete_match_title'] = train.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   

In [27]:
X = train[['match_product_title', 'match_description','len_search_term', 'complete_match_title']].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.000369,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.242519,-0.237219,-0.264378,-0.248039,0.011755,1
6,0.000398,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.242542,-0.237211,-0.264635,-0.24813,0.011872,2
5,0.003974,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.242755,-0.237258,-0.264413,-0.248142,0.011722,3
7,0.004597,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.242736,-0.237216,-0.264802,-0.248251,0.011918,4
8,0.000931,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.243307,-0.237512,-0.265332,-0.248717,0.011984,5


In [28]:
gbm = grid.best_estimator_
test['complete_match_title'] = test.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   
X_test = test[['match_product_title', 'match_description', 'len_search_term', 'complete_match_title']].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)
gbm.feature_importances_

array([ 684,  608, 1445,  263])

0.49650

In [29]:
train['complete_match_descr'] = train.apply(lambda row:1 if row['description'].find(row['search_term']) >= 0 else 0, axis=1)
X = train[['match_product_title', 'match_description','len_search_term',
           'complete_match_title', 'complete_match_descr']].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.001376,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.242797,-0.237297,-0.264444,-0.248179,0.011718,1
6,0.00061,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.242796,-0.237277,-0.264644,-0.248239,0.011817,2
7,0.00527,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.243029,-0.237292,-0.264486,-0.248269,0.011704,3
5,0.002686,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.243073,-0.237348,-0.264476,-0.248299,0.011675,4
8,0.006853,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.24363,-0.237519,-0.265071,-0.24874,0.011814,5


In [30]:
gbm = grid.best_estimator_
test['complete_match_descr'] = \
    test.apply(lambda row:1 if row['description'].find(row['search_term']) >= 0 else 0, axis=1)   
X_test = test[['match_product_title', 'match_description','len_search_term',
               'complete_match_title', 'complete_match_descr']].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)
gbm.feature_importances_

array([ 651,  573, 1413,  257,  106])

0.49640

In [31]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
pd.set_option('display.max_colwidth', -1)
train.sort_values(by=['diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,len_search_term,complete_match_title,complete_match_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.00,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.000000,1.000000,2.738383,1.738383,13,1,0
210852,197684,prime-lin 1in. nylon slide screen door roller steel tension spring 2-pack,slide door screen,1.00,prime line product 1in. nylon slide screen door roller steel tension spring 2-pack featur center-groov roller design 2-1/4in. heat-treat steel tension springs.compat metal industri academi manufactur co. screen doorsfit roll form doorsnylon wheel construction2-1/4in. flat heat-treat steel spring1in. diamet center groov rollerinclud 2 roller,1.000000,1.000000,2.720435,1.720435,17,0,0
132167,146522,bona stone tile lamin floor care system,lamin floor tile,1.00,use bona stone tile lamin floor care system clean hard-surfac floorsin. home. 4-piec kit includ mop microfib clean dust pad 32 oz. spray bottl bona stone tile lamin floor cleaner.hard-surfac floor clean kitinclud mop microfib clean pad microfib dust pad 32 oz. spray bottl bona stone tile lamin floor cleanerdur 4-piec mop handl 4in. x 15in. mop headclean pad machin washabl reusablefloor cleaner non-tox,1.000000,1.000000,2.713652,1.713652,16,0,0
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.00,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.000000,1.000000,2.707201,1.707201,12,1,1
10345,101774,new york wire brown 5/16in. screen frame corner 4-pack fsp8571-u,screen frame,1.00,5/16 screen frame corner use build repair screen frame. plastic corner made fit channel 5/16 frame. build repair screen frame easi use frame corners.fit 5/16in. framingdesign join length framingbrown plastic constructionweath resistantno miter requirednote product may vari store,1.000000,1.000000,2.707201,1.707201,12,1,1
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.00,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.000000,0.666667,2.699219,1.699219,17,0,0
158507,160961,behr 1-gal. sc-112 barn red solid color hous fenc wood stain,red wood fenc,1.00,behr solid color hous fenc paint advanc exterior stain combin best featur oil latex superior color retention adhesion penetr durability. provid film high resist cracking peeling blistering weathering chalk erosion. clean easili soap water appli water-bas oil-bas paint primer.california residents see nbsp proposit 65 informationid exterior applic vertic wood surfaces fenc sidingoil-latex formula retain wood natur textur featur self-prim capabilityprotect uv raysresist mildew growth maintain pristin appearanceup 400 sq.ft. coveragedesign easi clean-up soap wateractu paint color may vari on-screen printer representationsonlin price includ paint care feein. follow states ca co ct me mn or ri vt,1.000000,0.666667,2.693513,1.693513,13,0,0
80599,123546,prime-lin storm door closer shock spring heavi duti black,storm door black,1.00,storm screen door closer come finishedin. aluminum. featur adjust close speed includ shock spring. style closer use medium heavi weight doors.heavi duti constructionadjust close speedblack finishinclud shock springus medium heavi weight door,1.000000,0.666667,2.692436,1.692436,16,0,0
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.00,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.000000,0.666667,2.691313,1.691313,18,0,0
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.00,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.000000,0.666667,2.688033,1.688033,19,0,0


In [32]:
train['len_search_term_words'] = train['search_term'].apply(lambda x:len(x.split()))
train['len_search_term_letters'] = train['search_term'].apply(len)
X = train[['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
           'complete_match_title', 'complete_match_descr']].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,0.001278,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.242008,-0.236208,-0.263188,-0.247135,0.011596,1
4,0.00072,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.241961,-0.236267,-0.263306,-0.247178,0.011639,2
7,0.002167,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.24231,-0.236063,-0.263198,-0.24719,0.011603,3
5,0.001511,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.242256,-0.236232,-0.263209,-0.247233,0.011562,4
8,0.004213,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.243057,-0.236424,-0.263936,-0.247806,0.011723,5


In [33]:
gbm = grid.best_estimator_
test['len_search_term_words'] = test['search_term'].apply(lambda x:len(x.split()))
test['len_search_term_letters'] = test['search_term'].apply(len)
X_test = test[['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
               'complete_match_title', 'complete_match_descr']].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)
gbm.feature_importances_

array([251, 237, 244, 610, 114,  44])

0.49587

In [34]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
pd.set_option('display.max_colwidth', -1)
train.drop(columns=['len_search_term'], inplace=True)
train.sort_values(by=['diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.00,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.000000,1.000000,2.750436,1.750436,1,0,2,13
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.00,bird stop blue ceram wild bird feeder featur blend ceram metal construction. circular open ideal feed sunflow seeds shell peanut mealworms. hold 1lb. seed. drainag holesin. tray help keep seed dry. top remov refilling.sleek attract design popsin. gardenversatil design hold mealworms shell peanut sunflow seedsceram metal construct disassembl clean,1.000000,1.000000,2.732568,1.732568,1,1,2,9
210852,197684,prime-lin 1in. nylon slide screen door roller steel tension spring 2-pack,slide door screen,1.00,prime line product 1in. nylon slide screen door roller steel tension spring 2-pack featur center-groov roller design 2-1/4in. heat-treat steel tension springs.compat metal industri academi manufactur co. screen doorsfit roll form doorsnylon wheel construction2-1/4in. flat heat-treat steel spring1in. diamet center groov rollerinclud 2 roller,1.000000,1.000000,2.719288,1.719288,0,0,3,17
132167,146522,bona stone tile lamin floor care system,lamin floor tile,1.00,use bona stone tile lamin floor care system clean hard-surfac floorsin. home. 4-piec kit includ mop microfib clean dust pad 32 oz. spray bottl bona stone tile lamin floor cleaner.hard-surfac floor clean kitinclud mop microfib clean pad microfib dust pad 32 oz. spray bottl bona stone tile lamin floor cleanerdur 4-piec mop handl 4in. x 15in. mop headclean pad machin washabl reusablefloor cleaner non-tox,1.000000,1.000000,2.710181,1.710181,0,0,3,16
10345,101774,new york wire brown 5/16in. screen frame corner 4-pack fsp8571-u,screen frame,1.00,5/16 screen frame corner use build repair screen frame. plastic corner made fit channel 5/16 frame. build repair screen frame easi use frame corners.fit 5/16in. framingdesign join length framingbrown plastic constructionweath resistantno miter requirednote product may vari store,1.000000,1.000000,2.706508,1.706508,1,1,2,12
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.00,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.000000,1.000000,2.706508,1.706508,1,1,2,12
92149,128179,grip-rit 11 x 2in. 6 hot-galvan ring shank patio deck nail 5lb.-pack,patio deck,1.00,comprehens select nail screws look grip-rite popular brand fastenersin. america. matter project size job grip-rit optim fasten solution. mani size style nail available. construct materi techniqu evolve fasten design keep pace chang technology. fasten need vari project project safeti life expect project extend compromis base upon fasten used. fasten design meet special need project roof drywal andin. mani case known applic intended.for patio deck constructionring shank provid greater hold powermad meet astm a-153 requirementsmad conform astm f1667,1.000000,1.000000,2.697748,1.697748,1,1,2,10
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.00,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.000000,0.666667,2.692984,1.692984,0,0,3,17
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.00,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.000000,0.666667,2.690997,1.690997,0,0,3,18
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.00,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.000000,0.666667,2.688502,1.688502,0,0,3,19


In [36]:
train['len_description_words'] = train['description'].apply(lambda x:len(x.split()))
train['len_description_letters'] = train['description'].apply(len)
cols = ['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'len_description_words', 'len_description_letters']
X = train[cols].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.001946,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.241816,-0.236587,-0.262682,-0.247028,0.011273,1
7,0.000735,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.241745,-0.236452,-0.26318,-0.247126,0.011556,2
6,0.010097,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.241575,-0.236512,-0.263403,-0.247163,0.011668,3
4,0.003645,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.241707,-0.236611,-0.2632,-0.247173,0.011523,4
8,0.001147,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.242553,-0.23679,-0.263811,-0.247718,0.01162,5


In [37]:
gbm = grid.best_estimator_
test['len_description_words'] = test['description'].apply(lambda x:len(x.split()))
test['len_description_letters'] = test['description'].apply(len)
X_test = test[cols].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)
gbm.feature_importances_

array([ 643,  582,  507, 1319,  195,   70, 1355, 1329])

0.49598
похоже на переобучение на конкретные продукты

In [39]:
train = train.drop(columns=['len_description_words', 'len_description_letters'])
test = test.drop(columns=['len_description_words', 'len_description_letters'])

In [40]:
attrs = pd.read_csv('attributes.csv')
attrs.head()

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90° connections and home repair projects
1,100001.0,Bullet02,Stronger than angled nailing or screw fastening alone
2,100001.0,Bullet03,Help ensure joints are consistently straight and strong
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel


In [49]:
";".join([str(a) for a in attrs.name.unique()])



In [76]:
attrs[(attrs['name'].str.findall(r'Product.[T|t]ype').str.len() > 0)]['name'].unique()

array(['Paint Product Type', 'Patching & Repair Product Type',
       'Electrical Product Type', 'Decor Product Type',
       'Window Treatment Product Type', 'Kitchen Product Type',
       'Outdoor Living Product Type', 'Storage Product Type',
       'Paint/Stain/Waterproofer Product Type',
       'Tool Storage Product Type', 'Tools Product Type',
       'Power Tool Product Type', 'Fencing Product Type',
       'Builders Hardware Product Type', 'Roofing Product Type',
       'Safety & Security Product Type', 'Flooring Product Type',
       'Pipe or Fitting Product Type',
       'Interior Paint & Stains Product Type',
       'Air Conditioner Product Type', 'Snow Equipment Product Type',
       'Adhesive Product Type', 'Safety & Accessories Product Type',
       'Product Type', 'Heating Product Type',
       'Evaporative Cooler Product Type',
       'Exterior Lighting Product Type', 'Drop Cloths Product Type',
       'Tarps, Drop Cloths & Plastic Sheeting Product Type',
       'Holiday 

In [77]:
attrs[(attrs['name'].str.findall(r'[N|n]ame').str.len() > 0)]['name'].unique()

array(['MFG Brand Name', 'Collection Name', 'Botanical Name',
       'Structural/ornamental', 'Pattern name', 'Ornament or Tree Topper',
       "Artist's name", 'Artwork name', 'Name card holders included'],
      dtype=object)

In [85]:
import_names = attrs[(attrs['name'].str.findall(r'Product.[T|t]ype').str.len() > 0)]['name'].unique().tolist()
print(import_names)

['Paint Product Type', 'Patching & Repair Product Type', 'Electrical Product Type', 'Decor Product Type', 'Window Treatment Product Type', 'Kitchen Product Type', 'Outdoor Living Product Type', 'Storage Product Type', 'Paint/Stain/Waterproofer Product Type', 'Tool Storage Product Type', 'Tools Product Type', 'Power Tool Product Type', 'Fencing Product Type', 'Builders Hardware Product Type', 'Roofing Product Type', 'Safety & Security Product Type', 'Flooring Product Type', 'Pipe or Fitting Product Type', 'Interior Paint & Stains Product Type', 'Air Conditioner Product Type', 'Snow Equipment Product Type', 'Adhesive Product Type', 'Safety & Accessories Product Type', 'Product Type', 'Heating Product Type', 'Evaporative Cooler Product Type', 'Exterior Lighting Product Type', 'Drop Cloths Product Type', 'Tarps, Drop Cloths & Plastic Sheeting Product Type', 'Holiday Decor Product Type', 'Roller Product Type', 'Mobility Product Type', 'Tarps & Accessories Product Type', 'Door Locks & Knobs 

In [88]:
brand = attrs[attrs['name'] == 'MFG Brand Name']
brand.index = brand['product_uid']
brand = brand['value']
train['brands'] = train['product_uid'].map(brand)
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angl make joint stronger also provid consistent straight corners. simpson strong-ti offer wide varieti anglesin. various size thick handl light-duti job project structur connect needed. bent skewed match project. outdoor project moistur present use zmax zinc-coat connectors provid extra resist corros look z end model number .versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions 3in. x 3in. x 1-1/2in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail 9 x 1-1/2in. strong-driv sd screw,0.5,0.5,2.440523,0.559477,0,0,2,12,Simpson Strong-Tie
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,angl make joint stronger also provid consistent straight corners. simpson strong-ti offer wide varieti anglesin. various size thick handl light-duti job project structur connect needed. bent skewed match project. outdoor project moistur present use zmax zinc-coat connectors provid extra resist corros look z end model number .versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions 3in. x 3in. x 1-1/2in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail 9 x 1-1/2in. strong-driv sd screw,0.0,0.0,2.190558,0.309442,0,0,2,9,Simpson Strong-Tie
9,100002,behr premium textur deckov 1-gal. sc-141 tugboat wood concret coat,deck,3.0,behr premium textur deckov innov solid color coating. bring old weather wood concret back life. advanc 100 acryl resin formula creat durabl coat tire worn deck rejuven whole new look. best results sure proper prepar surfac use applic behr product display above.california residents see nbsp proposit 65 informationrev wood composit decks railings porch boat docks also great concret pool decks patio sidewalks100 acryl solid color coatingresist crack peel conceal splinter crack 1/4in.provid durable mildew resist finishcov 75 sq.ft.in. 2 coat per galloncr textured slip-resist finishfor best results prepar appropri behr product wood concret surfaceactu paint color may vari on-screen printer representationscolor avail tintedin. storesonlin price includ paint care feein. follow states ca co ct me mn or ri vt,0.0,1.0,2.596128,0.403872,1,1,1,4,BEHR Premium Textured DeckOver
16,100005,delta vero 1-handl shower faucet trim kitin. chrome valv included,rain shower head,2.33,updat bathroom delta vero single-handl shower faucet trim kitin. chrome. sleek modern minimalist aesthetic. multichoic univers valv keep water temperatur within /-3 degre fahrenheit help prevent scalding.california residents see nbsp proposit 65 informationinclud trim kit only rough-in kit r10000-unbx sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewherein. systemdu watersens regulationsin. state new york pleas confirm ship zip code restrict use item meet watersens qualif,0.333333,0.333333,2.204707,0.125293,0,0,3,16,Delta
17,100005,delta vero 1-handl shower faucet trim kitin. chrome valv included,shower faucet,2.67,updat bathroom delta vero single-handl shower faucet trim kitin. chrome. sleek modern minimalist aesthetic. multichoic univers valv keep water temperatur within /-3 degre fahrenheit help prevent scalding.california residents see nbsp proposit 65 informationinclud trim kit only rough-in kit r10000-unbx sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewherein. systemdu watersens regulationsin. state new york pleas confirm ship zip code restrict use item meet watersens qualif,1.0,1.0,2.766321,0.096321,1,1,2,13,Delta


In [98]:
attrs_type = attrs[attrs['name'].isin(import_names)]
prod_type = attrs_type.groupby(['product_uid'])['value'].apply(lambda x:" ".join(x))
train['product_type'] = train['product_uid'].map(prod_type)
train.head()
#"""

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands,product_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angl make joint stronger also provid consistent straight corners. simpson strong-ti offer wide varieti anglesin. various size thick handl light-duti job project structur connect needed. bent skewed match project. outdoor project moistur present use zmax zinc-coat connectors provid extra resist corros look z end model number .versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions 3in. x 3in. x 1-1/2in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail 9 x 1-1/2in. strong-driv sd screw,0.5,0.5,2.440523,0.559477,0,0,2,12,Simpson Strong-Tie,
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,angl make joint stronger also provid consistent straight corners. simpson strong-ti offer wide varieti anglesin. various size thick handl light-duti job project structur connect needed. bent skewed match project. outdoor project moistur present use zmax zinc-coat connectors provid extra resist corros look z end model number .versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions 3in. x 3in. x 1-1/2in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail 9 x 1-1/2in. strong-driv sd screw,0.0,0.0,2.190558,0.309442,0,0,2,9,Simpson Strong-Tie,
9,100002,behr premium textur deckov 1-gal. sc-141 tugboat wood concret coat,deck,3.0,behr premium textur deckov innov solid color coating. bring old weather wood concret back life. advanc 100 acryl resin formula creat durabl coat tire worn deck rejuven whole new look. best results sure proper prepar surfac use applic behr product display above.california residents see nbsp proposit 65 informationrev wood composit decks railings porch boat docks also great concret pool decks patio sidewalks100 acryl solid color coatingresist crack peel conceal splinter crack 1/4in.provid durable mildew resist finishcov 75 sq.ft.in. 2 coat per galloncr textured slip-resist finishfor best results prepar appropri behr product wood concret surfaceactu paint color may vari on-screen printer representationscolor avail tintedin. storesonlin price includ paint care feein. follow states ca co ct me mn or ri vt,0.0,1.0,2.596128,0.403872,1,1,1,4,BEHR Premium Textured DeckOver,Exterior Paint/Stain Restoration Coating
16,100005,delta vero 1-handl shower faucet trim kitin. chrome valv included,rain shower head,2.33,updat bathroom delta vero single-handl shower faucet trim kitin. chrome. sleek modern minimalist aesthetic. multichoic univers valv keep water temperatur within /-3 degre fahrenheit help prevent scalding.california residents see nbsp proposit 65 informationinclud trim kit only rough-in kit r10000-unbx sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewherein. systemdu watersens regulationsin. state new york pleas confirm ship zip code restrict use item meet watersens qualif,0.333333,0.333333,2.204707,0.125293,0,0,3,16,Delta,
17,100005,delta vero 1-handl shower faucet trim kitin. chrome valv included,shower faucet,2.67,updat bathroom delta vero single-handl shower faucet trim kitin. chrome. sleek modern minimalist aesthetic. multichoic univers valv keep water temperatur within /-3 degre fahrenheit help prevent scalding.california residents see nbsp proposit 65 informationinclud trim kit only rough-in kit r10000-unbx sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewherein. systemdu watersens regulationsin. state new york pleas confirm ship zip code restrict use item meet watersens qualif,1.0,1.0,2.766321,0.096321,1,1,2,13,Delta,


In [100]:
train['product_type'] = train['product_type'].apply(stem_sentence)
train['brands'] = train['brands'].apply(stem_sentence)
train = train.fillna('')

In [101]:
for col in ['product_type', 'brands']:
    train['match_' + col] = train.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

In [103]:
cols = ['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr',
        'match_product_type', 'match_brands']
X = train[cols].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,0.001012,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.241414,-0.235674,-0.262825,-0.246638,0.011684,1
5,0.000494,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.241414,-0.235892,-0.262811,-0.246706,0.011609,2
6,4.1e-05,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.241355,-0.235995,-0.263015,-0.246788,0.011681,3
4,0.00038,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.241407,-0.236158,-0.262904,-0.246823,0.011571,4
8,0.000696,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.242036,-0.235758,-0.263039,-0.246944,0.011666,5


In [104]:
grid.best_estimator_.feature_importances_

array([ 443,  419,  345, 1091,  127,   69,  268,  238])

In [108]:
test['brands'] = test['product_uid'].map(brand)
test['product_type'] = test['product_uid'].map(prod_type)
test.fillna('')
test['product_type'] = test['product_type'].apply(stem_sentence)
test['brands'] = test['brands'].apply(stem_sentence)

for col in ['product_type', 'brands']:
    test['match_' + col] = test.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)
gbm = grid.best_estimator_
X_test = test[cols].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)

0.49579

In [110]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
pd.set_option('display.max_colwidth', -1)
pd.set_option('max_rows', 100)
train.sort_values(by=['diff'], ascending=False)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands,product_type,match_product_type,match_brands
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.00,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.000000,1.000000,2.753319,1.753319,1,0,2,13,,,0.000000,0.000000
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.00,bird stop blue ceram wild bird feeder featur blend ceram metal construction. circular open ideal feed sunflow seeds shell peanut mealworms. hold 1lb. seed. drainag holesin. tray help keep seed dry. top remov refilling.sleek attract design popsin. gardenversatil design hold mealworms shell peanut sunflow seedsceram metal construct disassembl clean,1.000000,1.000000,2.744923,1.744923,1,1,2,9,heath,,0.000000,0.000000
210852,197684,prime-lin 1in. nylon slide screen door roller steel tension spring 2-pack,slide door screen,1.00,prime line product 1in. nylon slide screen door roller steel tension spring 2-pack featur center-groov roller design 2-1/4in. heat-treat steel tension springs.compat metal industri academi manufactur co. screen doorsfit roll form doorsnylon wheel construction2-1/4in. flat heat-treat steel spring1in. diamet center groov rollerinclud 2 roller,1.000000,1.000000,2.721225,1.721225,0,0,3,17,prime-lin,door hardwar,0.333333,0.000000
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.00,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.000000,1.000000,2.711267,1.711267,1,1,2,12,,,0.000000,0.000000
132167,146522,bona stone tile lamin floor care system,lamin floor tile,1.00,use bona stone tile lamin floor care system clean hard-surfac floorsin. home. 4-piec kit includ mop microfib clean dust pad 32 oz. spray bottl bona stone tile lamin floor cleaner.hard-surfac floor clean kitinclud mop microfib clean pad microfib dust pad 32 oz. spray bottl bona stone tile lamin floor cleanerdur 4-piec mop handl 4in. x 15in. mop headclean pad machin washabl reusablefloor cleaner non-tox,1.000000,1.000000,2.694449,1.694449,0,0,3,16,bona,,0.000000,0.000000
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.00,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.000000,0.666667,2.694105,1.694105,0,0,3,18,martha stewart live,patio bench/glid,0.333333,0.000000
92149,128179,grip-rit 11 x 2in. 6 hot-galvan ring shank patio deck nail 5lb.-pack,patio deck,1.00,comprehens select nail screws look grip-rite popular brand fastenersin. america. matter project size job grip-rit optim fasten solution. mani size style nail available. construct materi techniqu evolve fasten design keep pace chang technology. fasten need vari project project safeti life expect project extend compromis base upon fasten used. fasten design meet special need project roof drywal andin. mani case known applic intended.for patio deck constructionring shank provid greater hold powermad meet astm a-153 requirementsmad conform astm f1667,1.000000,1.000000,2.690980,1.690980,1,1,2,10,,,0.000000,0.000000
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.00,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.000000,0.666667,2.687910,1.687910,0,0,3,19,,,0.000000,0.000000
158507,160961,behr 1-gal. sc-112 barn red solid color hous fenc wood stain,red wood fenc,1.00,behr solid color hous fenc paint advanc exterior stain combin best featur oil latex superior color retention adhesion penetr durability. provid film high resist cracking peeling blistering weathering chalk erosion. clean easili soap water appli water-bas oil-bas paint primer.california residents see nbsp proposit 65 informationid exterior applic vertic wood surfaces fenc sidingoil-latex formula retain wood natur textur featur self-prim capabilityprotect uv raysresist mildew growth maintain pristin appearanceup 400 sq.ft. coveragedesign easi clean-up soap wateractu paint color may vari on-screen printer representationsonlin price includ paint care feein. follow states ca co ct me mn or ri vt,1.000000,0.666667,2.686758,1.686758,0,0,3,13,,,0.000000,0.000000
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.00,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.000000,0.666667,2.680964,1.680964,0,0,3,17,ge,,0.000000,0.333333


In [186]:
train_origs = pd.read_csv('train.csv', encoding=encoding, index_col=['id'])
train['orig_search_term'] = train_origs['search_term']
train['orig_product_title'] = train_origs['product_title']
train.sort_values(by=['diff'], ascending=False)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands,product_type,match_product_type,match_brands,orig_search_term,orig_product_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.0,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.0,1.0,2.753319,1.753319,1,0,2,13,,,0.0,0.0,copper faucet,BrassCraft 3/8 in. O.D. x 36 in. Copper Faucet Riser in Chrome
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.0,bird stop blue ceram wild bird feeder featur blend ceram metal construction. circular open ideal feed sunflow seeds shell peanut mealworms. hold 1lb. seed. drainag holesin. tray help keep seed dry. top remov refilling.sleek attract design popsin. gardenversatil design hold mealworms shell peanut sunflow seedsceram metal construct disassembl clean,1.0,1.0,2.744923,1.744923,1,1,2,9,heath,,0.0,0.0,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder
210852,197684,prime-lin 1in. nylon slide screen door roller steel tension spring 2-pack,slide door screen,1.0,prime line product 1in. nylon slide screen door roller steel tension spring 2-pack featur center-groov roller design 2-1/4in. heat-treat steel tension springs.compat metal industri academi manufactur co. screen doorsfit roll form doorsnylon wheel construction2-1/4in. flat heat-treat steel spring1in. diamet center groov rollerinclud 2 roller,1.0,1.0,2.721225,1.721225,0,0,3,17,prime-lin,door hardwar,0.333333,0.0,Sliding Door Screening,Prime-Line 1 in. Nylon Sliding Screen Door Rollers with Steel Tension Springs (2-pack)
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.0,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.0,1.0,2.711267,1.711267,1,1,2,12,,,0.0,0.0,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree
132167,146522,bona stone tile lamin floor care system,lamin floor tile,1.0,use bona stone tile lamin floor care system clean hard-surfac floorsin. home. 4-piec kit includ mop microfib clean dust pad 32 oz. spray bottl bona stone tile lamin floor cleaner.hard-surfac floor clean kitinclud mop microfib clean pad microfib dust pad 32 oz. spray bottl bona stone tile lamin floor cleanerdur 4-piec mop handl 4in. x 15in. mop headclean pad machin washabl reusablefloor cleaner non-tox,1.0,1.0,2.694449,1.694449,0,0,3,16,bona,,0.0,0.0,laminate floor tile,"Bona Stone, Tile and Laminate Floor Care System"
213999,200282,martha stewart live lake carolina picket fenc 2-seat outdoor patio bench periwinkl cushion-discontinu,outdoor patio fenc,1.0,snug yet stylish place lean back enjoy meal sunset look martha stewart live lake carolina 2-seat bench. construct durabl eucalyptus zinc-coated brass-plat steel hardware. sturdi bench featur arm comfort back relaxing. add one outdoor decor today.craft 100 fsc-certifi eucalyptus wood respons manag forestspicket fenc option coat weather resistant environment friend paintcushion made polyest duck fabriceasi assembledimension 33.5in. h x 47in. w x 21.25in. dcomfort seat two,1.0,0.666667,2.694105,1.694105,0,0,3,18,martha stewart live,patio bench/glid,0.333333,0.0,outdoor patio fence,Martha Stewart Living Lake Carolina Picket Fence 2-Seat Outdoor Patio Bench with Periwinkle Cushion-DISCONTINUED
92149,128179,grip-rit 11 x 2in. 6 hot-galvan ring shank patio deck nail 5lb.-pack,patio deck,1.0,comprehens select nail screws look grip-rite popular brand fastenersin. america. matter project size job grip-rit optim fasten solution. mani size style nail available. construct materi techniqu evolve fasten design keep pace chang technology. fasten need vari project project safeti life expect project extend compromis base upon fasten used. fasten design meet special need project roof drywal andin. mani case known applic intended.for patio deck constructionring shank provid greater hold powermad meet astm a-153 requirementsmad conform astm f1667,1.0,1.0,2.69098,1.69098,1,1,2,10,,,0.0,0.0,patio decking,Grip-Rite #11 x 2 in. 6å¡ Hot-Galvanized Ring Shank Patio Deck Nails (5 lb.-Pack)
152112,157270,patio live concept bahama weav 34in. dark mahogani outdoor tabl lamp straw linen shade,outdoor patio shade,1.0,carefre resin durability ad eleg outdoor live area. weather wicker woven base add dramat appeal. lamp featur resin construct heavi weight base. two level dim switch vari bright levels. 16ft. weatherproof cord plug provid ad safeti convenience. durabl polycarbon waterproof light bulb enclosur allow use standard 100-watt bulb compar led fluoresc bulbs. 20in. shade cover durabl sunbrella fabric.2-level dim switchall-weath 2in. resin wicker woven lamp poleheavi weight base16ft. weatherproof cord plugsunbrella fabric coverhom depot protect plan,1.0,0.666667,2.68791,1.68791,0,0,3,19,,,0.0,0.0,outdoor patio shades,Patio Living Concepts Bahama Weave 34 in. Dark Mahogany Outdoor Table Lamp with Straw Linen Shade
158507,160961,behr 1-gal. sc-112 barn red solid color hous fenc wood stain,red wood fenc,1.0,behr solid color hous fenc paint advanc exterior stain combin best featur oil latex superior color retention adhesion penetr durability. provid film high resist cracking peeling blistering weathering chalk erosion. clean easili soap water appli water-bas oil-bas paint primer.california residents see nbsp proposit 65 informationid exterior applic vertic wood surfaces fenc sidingoil-latex formula retain wood natur textur featur self-prim capabilityprotect uv raysresist mildew growth maintain pristin appearanceup 400 sq.ft. coveragedesign easi clean-up soap wateractu paint color may vari on-screen printer representationsonlin price includ paint care feein. follow states ca co ct me mn or ri vt,1.0,0.666667,2.686758,1.686758,0,0,3,13,,,0.0,0.0,red wood fence,BEHR 1-gal. #SC-112 Barn Red Solid Color House and Fence Wood Stain
151606,156981,ge 36in. over-the-rang microwav accessori filler kitin. slate,ge slate microwav,1.0,36in. over-the-rang microwav trim kit provid custom appear built-in conveni ge ge profil microwave. ge applianc provid up-to-d technolog except qualiti simplifi way live. timeless appearance famili applianc ideal family. and come one trust namesin. america know entir select applianc advanc practical.microwav filler kit allow standard 30in. microwav fit 36in. openingpart included 2 trim kit side mounts 2 instal screws 1 connect barthi kit best instal 12in. base cabinet 15in. cabinet instal kit jx15bumpfit select ge over-the-rang microwaves jvm7195/dvm7195/jnm7196/pvm9195/pnm9196/pvm9215,1.0,0.666667,2.680964,1.680964,0,0,3,17,ge,,0.0,0.333333,ge slate microwave,GE 36 in. Over-the-Range Microwave Accessory Filler Kit in Slate


In [215]:
train.sort_values(by=['diff'], ascending=True)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands,product_type,match_product_type,match_brands,orig_search_term,orig_product_title,match_pos_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.0,bird stop blue ceram wild bird feeder featur blend ceram metal construction. circular open ideal feed sunflow seeds shell peanut mealworms. hold 1lb. seed. drainag holesin. tray help keep seed dry. top remov refilling.sleek attract design popsin. gardenversatil design hold mealworms shell peanut sunflow seedsceram metal construct disassembl clean,1.0,1.0,2.848639,1.848639,1,1,2,9,heath,,0.0,0.0,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,1.0
137108,149071,viagrow strawberri planter vertic garden,strawberri,1.0,viagrow strawberri planter allow grow strawberri util vertic plant design maxim number plant grow per squareft.. vertic strawberri growth prefer way grow strawberries. top side plant space grow ever-bear strawberri season long. polyethylen fabric help retain moistur even summer months.grow 12 strawberri plant per planter 8 vertic 4 top plant hold 50 qt. media soilgreat grow herbs basil flowerslast mani season break easili easi storag,1.0,1.0,2.8458,1.8458,1,1,1,10,,,0.0,0.0,strawberries,Viagrow Strawberry Planter Vertical Garden,0.0
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.0,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.0,1.0,2.822381,1.822381,1,1,2,12,,,0.0,0.0,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,1.0
10345,101774,new york wire brown 5/16in. screen frame corner 4-pack fsp8571-u,screen frame,1.0,5/16 screen frame corner use build repair screen frame. plastic corner made fit channel 5/16 frame. build repair screen frame easi use frame corners.fit 5/16in. framingdesign join length framingbrown plastic constructionweath resistantno miter requirednote product may vari store,1.0,1.0,2.766817,1.766817,1,1,2,12,new york wire,screen frame,1.0,0.0,screen frame,New York Wire Brown 5/16 in. Screen Frame Corners (4-Pack) FSP8571-U,1.0
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.0,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.0,1.0,2.7353,1.7353,1,0,2,13,,,0.0,0.0,copper faucet,BrassCraft 3/8 in. O.D. x 36 in. Copper Faucet Riser in Chrome,1.0
70207,119682,wyndham collect sheffield 59in. vaniti cabinet 58in. mirrorin. white,sheffield,1.0,distinct style eleg line come togeth form complet rang modern classicsin. sheffield bathroom vaniti collection. inspir well establish american standard craft without compromise vaniti design complement decor tradit minimalist modern.construct environment friendly zero emiss solid wood engin prevent warp last lifetime12-stag wood preparation sanding paint hand-finish processhigh water-resist low v.o.c. seal finishbeauti transit style complement bathroompract floor-stand designno assembl requireddeep dowel drawersfully-extend undermount soft-clos drawer slidesconc soft-clos door hingesmet exterior hardwar brush chrome finishplenti storag space2 function doors7 function drawersfaucet s includedcount included58in. match mirror includedsink s includ,1.0,1.0,2.733345,1.733345,1,1,1,9,wyndham collect,,0.0,0.0,sheffield,Wyndham Collection Sheffield 59 in. Vanity Cabinet with 58 in. Mirror in White,0.0
69064,119252,big ass fan 3600 144in. yellow silver aluminum shop ceil fan,big ass,1.0,want air movement without noise. got it. big ass fan 3600 money-saving statement-making air-mov machine. 12ft. wingspan creat serious airflow space 3 600 sq.ft. perfect keep shops gyms warehous space type comfortable. 3600 increas comfort decreas energi bill year-round yep evenin. winter air movement feel never hear.gearless direct drive motor silent energi efficient6 patent mini-ellipto airfoil reduc drag motor allow run higher speed provid cool area 51 standard 52in. ceil fansvari speed control offer fully-adjust airflowcoverag 60ft. x 60ft. room 3 600 sq.ft. uncondit spacerevers requiredin. winter simpli turn fan lowest setting feel air gentl push hot air ceil winterin. balanc way vs. ceil walls reduc heat energi consumpt 30 design indoor use only3ft. drop tube univers mount included1 year part warranty79 lbs. hang weightinclud 50ft. cat-5 cabl,1.0,1.0,2.717258,1.717258,1,1,2,7,,,0.0,0.0,big ass,Big Ass Fans 3600 144 in. Yellow and Silver Aluminum Shop Ceiling Fan,1.0
70488,119781,kohler memoir state comfort height 2-piec 1.6 gpf elong toilet aquapiston flush technologyin. biscuit,memoir,1.0,sophist tradit design serv inspir memoir collection. rich detailing suit product echo styliz line histor renown furnitur architectur design two style offer differ dignifi interpret - clean crisp line state design ad round detail classic style resembl crown molding. fixtur faucet available extens memoir collect accommod larg small bath powder rooms.two-piec toiletelong bowl offer ad room comfortcomfort height featur offer chair-height seat make sit stand easier adults1.6 gallon per flush gpf 2-1/8in. glaze trapwaystandard left-hand trip lever includedseat suppli line includedcombin consist k-4380 bowl k-4642 tankcoordin productsin. memoir collect state designsingle-flush graviti use forc graviti precision-engin tank bowl trapway creat strong siphon flushingaquapiston canist allow water flow bowl side 360 degrees increas power effect flushflush valve 3 2 ratio har natur forc graviti optim flush performancedur canist design 90 less expos seal materi 3in. flapper leak-fre performancelight-touch canist flush requir lower actuat forc flapperstandard 12in. rough-inthree-bolt instal,1.0,1.0,2.68985,1.68985,1,1,1,6,kohler,,0.0,0.0,memoirs,KOHLER Memoirs Stately Comfort Height 2-piece 1.6 GPF Elongated Toilet with AquaPiston Flush Technology in Biscuit,1.0
18526,103303,everbilt 8ft. garag door safeti cabl,garag door spring,1.0,everbilt 8ft. garag door safeti cabl made galvan steel 1/8in. thickness. 7 strand 7 wire per strand strength durabl end-to-end tensil strength 1 400lb.design garag doorsmad galvan steel reliabilitywind unwind onto cabl drum via torsion spring rais lower garag doorminimum end-to-end break strength 1 400lb.,0.666667,1.0,2.67968,1.67968,0,0,3,17,everbilt,,0.0,0.0,garage door spring,Everbilt 8 ft. Garage Door Safety Cable,0.666667
34565,107838,lightun 24in. white bluetooth adjust robot speaker desk lamp alarm clock fm radio usb charg port,robot,1.0,bluetooth adjust robot speaker lamp alarm clock fm radio usb charg port uniqu design pack full sound combo speaker lamp. come equip except robot-lik design 2 high qualiti speakers bluetooth compat play music bluetooth enabl device 3.5 mm line-in jack play music phones ipods mp3 players audio devic bluetooth enabled alarm clock usb portin. back charg phone devices fm radio. item dock lamp. charg phone use usb wire charg instead appl phones. find featur qualityin. brand. lamp absolut must gadget complet bedroom office colleg dorm.bluetooth communic phone devic play music3.5 mm aux line-in play music mobil phones notebook computers mp3 players audio devic bluetooth compatiblealarm clock featur gradual sound prepar bodi wake wake-up light turn light time alarm soundsus one 40-watt type g16.5 e12 candelabra base bulb not included l 7.5in. x w 7.5in. x h 24in. fulli extended sleep set up 90 minutes gradual turn music - light if on turn wellclock fm radio usb charg portin. back two high qualiti speaker,1.0,1.0,2.672238,1.672238,1,1,1,5,,,0.0,0.0,i robot,"LighTunes 24 in. White Bluetooth Adjustable Robot Speaker Desk Lamp with Alarm Clock, FM Radio, and USB Charging Port",1.0


In [7]:
def match_word_n_pos(text, search_term):
    text = nltk.pos_tag(replaces(text).split())
    search_pos = nltk.pos_tag(replaces(search_term).split())
    text = ["%s/%s" % (word[0].lower(), word[1]) for word in text]
    search_pos = ["%s/%s" % (word[0].lower(), word[1]) for word in search_pos]
    
    if len(search_pos) == 0:
        return 0
    matches = 0
    for word in text:
        if word in search_pos:
            matches += 1
    return matches / len(search_pos)

In [183]:
print(match_word_n_pos('GE 36 in. Over-the-Range Microwave Accessory Filler Kit in Slate', 'ge slate microwave'))
print(match_word_n_pos('BrassCraft 3/8 in. O.D. x 36 in. Copper Faucet Riser in Chrome', 'copper faucet'))

0.6666666666666666
1.0


In [140]:
print(nltk.pos_tag(replaces('DEWALT 40-Volt Max XR Electric Cordless String Trimmer').split()),
nltk.pos_tag(replaces('dewalt cordless trimmer'.lower()).split()))

[('DEWALT', 'NNP'), ('40-Volt', 'JJ'), ('Max', 'NNP'), ('XR', 'NNP'), ('Electric', 'NNP'), ('Cordless', 'NNP'), ('String', 'NNP'), ('Trimmer', 'NNP')] [('dewalt', 'NN'), ('cordless', 'NN'), ('trimmer', 'NN')]


In [168]:
nltk.pos_tag('bird stops'.split())

[('bird', 'NN'), ('stops', 'NNS')]

In [180]:
nltk.pos_tag(('Bona Stone, Tile and Laminate Floor Care System').split())

[('Bona', 'NNP'),
 ('Stone,', 'NNP'),
 ('Tile', 'NNP'),
 ('and', 'CC'),
 ('Laminate', 'NNP'),
 ('Floor', 'NNP'),
 ('Care', 'NNP'),
 ('System', 'NNP')]

In [189]:
y_pred = y_pred.round(2)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)

0.49577

In [199]:
train['match_pos_title'] = train.apply(lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)

In [200]:
cols = ['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr',
        'match_product_type', 'match_brands', 'match_pos_title']
X = train[cols].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.005403,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.240903,-0.235613,-0.262358,-0.246291,0.011564,1
7,0.00069,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.240986,-0.235648,-0.262445,-0.24636,0.011581,2
8,0.004449,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.24129,-0.235377,-0.262715,-0.246461,0.011744,3
4,0.002193,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.241046,-0.236004,-0.262684,-0.246578,0.011573,4
6,0.004771,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.241089,-0.236097,-0.262792,-0.246659,0.011588,5


In [202]:
gbm = grid.best_estimator_
gbm.feature_importances_

array([ 734,  776,  576, 1883,  236,  109,  432,  413,  841])

In [203]:
test_orig = pd.read_csv('test.csv', encoding=encoding)
test['orig_search_term'] = test_orig['search_term']
test['orig_product_title'] = test_orig['product_title']
test['match_pos_title'] = test.apply(lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)
X_test = test[cols].values
y_pred = gbm.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)

In [212]:
train['preds'] = gbm.predict(X)
train['diff'] = (train['preds'] - train['relevance']).abs()
train.sort_values(by=['diff'], ascending=False)[:50]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,description,match_product_title,match_description,preds,diff,complete_match_title,complete_match_descr,len_search_term_words,len_search_term_letters,brands,product_type,match_product_type,match_brands,orig_search_term,orig_product_title,match_pos_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
204530,192555,heath bird stop blue ceram wild bird feeder,bird stop,1.0,bird stop blue ceram wild bird feeder featur blend ceram metal construction. circular open ideal feed sunflow seeds shell peanut mealworms. hold 1lb. seed. drainag holesin. tray help keep seed dry. top remov refilling.sleek attract design popsin. gardenversatil design hold mealworms shell peanut sunflow seedsceram metal construct disassembl clean,1.0,1.0,2.848639,1.848639,1,1,2,9,heath,,0.0,0.0,bird stops,Heath Bird Stop Blue Ceramic Wild Bird Feeder,1.0
137108,149071,viagrow strawberri planter vertic garden,strawberri,1.0,viagrow strawberri planter allow grow strawberri util vertic plant design maxim number plant grow per squareft.. vertic strawberri growth prefer way grow strawberries. top side plant space grow ever-bear strawberri season long. polyethylen fabric help retain moistur even summer months.grow 12 strawberri plant per planter 8 vertic 4 top plant hold 50 qt. media soilgreat grow herbs basil flowerslast mani season break easili easi storag,1.0,1.0,2.8458,1.8458,1,1,1,10,,,0.0,0.0,strawberries,Viagrow Strawberry Planter Vertical Garden,0.0
69679,119478,romano 4ft. boxwood spiral topiari tree,topiari tree,1.0,enhanc home romano boxwood spiral topiari tree. wonder full bodi boxwood spiral topiari tree keep color rest lose color leaves. alway look forward rich green tone romano boxwood spiral topiari treein. home. look high qualiti artifici tree make bold statementin. home it.light mobil designov 700 leavesindoor/outdooriron frameuv resist rubber leaves7in. potenh roomin. hous,1.0,1.0,2.822381,1.822381,1,1,2,12,,,0.0,0.0,topiary tree,Romano 4 ft. Boxwood Spiral Topiary Tree,1.0
10345,101774,new york wire brown 5/16in. screen frame corner 4-pack fsp8571-u,screen frame,1.0,5/16 screen frame corner use build repair screen frame. plastic corner made fit channel 5/16 frame. build repair screen frame easi use frame corners.fit 5/16in. framingdesign join length framingbrown plastic constructionweath resistantno miter requirednote product may vari store,1.0,1.0,2.766817,1.766817,1,1,2,12,new york wire,screen frame,1.0,0.0,screen frame,New York Wire Brown 5/16 in. Screen Frame Corners (4-Pack) FSP8571-U,1.0
96532,129999,brasscraft 3/8in. o.d. x 36in. copper faucet riserin. chrome,copper faucet,1.0,brasscraft 3/8in. o.d. tube x 36in. length chrome-pl copper riser faucet instal featur one-piec form metal nosepiec insert. cut-to-length flexibl fit clean profession appearance. usein. potabl water distribut systemsin. access locat only.3/8in. o.d. x 36in. length copper riser faucet installations1-piec form metal nosepiec insertchrom platedcut-to-length flexibl fit clean profession appearancefor usein. potabl water distribut systemsin. access locat onlytemperatur rating 40 180 fpressur rating 125 psi maximumno-lead certifiediapmo csa list asm a112.18.1/csa b125.1designed machin assembledin. usa,1.0,1.0,2.7353,1.7353,1,0,2,13,,,0.0,0.0,copper faucet,BrassCraft 3/8 in. O.D. x 36 in. Copper Faucet Riser in Chrome,1.0
70207,119682,wyndham collect sheffield 59in. vaniti cabinet 58in. mirrorin. white,sheffield,1.0,distinct style eleg line come togeth form complet rang modern classicsin. sheffield bathroom vaniti collection. inspir well establish american standard craft without compromise vaniti design complement decor tradit minimalist modern.construct environment friendly zero emiss solid wood engin prevent warp last lifetime12-stag wood preparation sanding paint hand-finish processhigh water-resist low v.o.c. seal finishbeauti transit style complement bathroompract floor-stand designno assembl requireddeep dowel drawersfully-extend undermount soft-clos drawer slidesconc soft-clos door hingesmet exterior hardwar brush chrome finishplenti storag space2 function doors7 function drawersfaucet s includedcount included58in. match mirror includedsink s includ,1.0,1.0,2.733345,1.733345,1,1,1,9,wyndham collect,,0.0,0.0,sheffield,Wyndham Collection Sheffield 59 in. Vanity Cabinet with 58 in. Mirror in White,0.0
69064,119252,big ass fan 3600 144in. yellow silver aluminum shop ceil fan,big ass,1.0,want air movement without noise. got it. big ass fan 3600 money-saving statement-making air-mov machine. 12ft. wingspan creat serious airflow space 3 600 sq.ft. perfect keep shops gyms warehous space type comfortable. 3600 increas comfort decreas energi bill year-round yep evenin. winter air movement feel never hear.gearless direct drive motor silent energi efficient6 patent mini-ellipto airfoil reduc drag motor allow run higher speed provid cool area 51 standard 52in. ceil fansvari speed control offer fully-adjust airflowcoverag 60ft. x 60ft. room 3 600 sq.ft. uncondit spacerevers requiredin. winter simpli turn fan lowest setting feel air gentl push hot air ceil winterin. balanc way vs. ceil walls reduc heat energi consumpt 30 design indoor use only3ft. drop tube univers mount included1 year part warranty79 lbs. hang weightinclud 50ft. cat-5 cabl,1.0,1.0,2.717258,1.717258,1,1,2,7,,,0.0,0.0,big ass,Big Ass Fans 3600 144 in. Yellow and Silver Aluminum Shop Ceiling Fan,1.0
70488,119781,kohler memoir state comfort height 2-piec 1.6 gpf elong toilet aquapiston flush technologyin. biscuit,memoir,1.0,sophist tradit design serv inspir memoir collection. rich detailing suit product echo styliz line histor renown furnitur architectur design two style offer differ dignifi interpret - clean crisp line state design ad round detail classic style resembl crown molding. fixtur faucet available extens memoir collect accommod larg small bath powder rooms.two-piec toiletelong bowl offer ad room comfortcomfort height featur offer chair-height seat make sit stand easier adults1.6 gallon per flush gpf 2-1/8in. glaze trapwaystandard left-hand trip lever includedseat suppli line includedcombin consist k-4380 bowl k-4642 tankcoordin productsin. memoir collect state designsingle-flush graviti use forc graviti precision-engin tank bowl trapway creat strong siphon flushingaquapiston canist allow water flow bowl side 360 degrees increas power effect flushflush valve 3 2 ratio har natur forc graviti optim flush performancedur canist design 90 less expos seal materi 3in. flapper leak-fre performancelight-touch canist flush requir lower actuat forc flapperstandard 12in. rough-inthree-bolt instal,1.0,1.0,2.68985,1.68985,1,1,1,6,kohler,,0.0,0.0,memoirs,KOHLER Memoirs Stately Comfort Height 2-piece 1.6 GPF Elongated Toilet with AquaPiston Flush Technology in Biscuit,1.0
18526,103303,everbilt 8ft. garag door safeti cabl,garag door spring,1.0,everbilt 8ft. garag door safeti cabl made galvan steel 1/8in. thickness. 7 strand 7 wire per strand strength durabl end-to-end tensil strength 1 400lb.design garag doorsmad galvan steel reliabilitywind unwind onto cabl drum via torsion spring rais lower garag doorminimum end-to-end break strength 1 400lb.,0.666667,1.0,2.67968,1.67968,0,0,3,17,everbilt,,0.0,0.0,garage door spring,Everbilt 8 ft. Garage Door Safety Cable,0.666667
34565,107838,lightun 24in. white bluetooth adjust robot speaker desk lamp alarm clock fm radio usb charg port,robot,1.0,bluetooth adjust robot speaker lamp alarm clock fm radio usb charg port uniqu design pack full sound combo speaker lamp. come equip except robot-lik design 2 high qualiti speakers bluetooth compat play music bluetooth enabl device 3.5 mm line-in jack play music phones ipods mp3 players audio devic bluetooth enabled alarm clock usb portin. back charg phone devices fm radio. item dock lamp. charg phone use usb wire charg instead appl phones. find featur qualityin. brand. lamp absolut must gadget complet bedroom office colleg dorm.bluetooth communic phone devic play music3.5 mm aux line-in play music mobil phones notebook computers mp3 players audio devic bluetooth compatiblealarm clock featur gradual sound prepar bodi wake wake-up light turn light time alarm soundsus one 40-watt type g16.5 e12 candelabra base bulb not included l 7.5in. x w 7.5in. x h 24in. fulli extended sleep set up 90 minutes gradual turn music - light if on turn wellclock fm radio usb charg portin. back two high qualiti speaker,1.0,1.0,2.672238,1.672238,1,1,1,5,,,0.0,0.0,i robot,"LighTunes 24 in. White Bluetooth Adjustable Robot Speaker Desk Lamp with Alarm Clock, FM Radio, and USB Charging Port",1.0


In [218]:
cols = ['match_product_title', 'match_description', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr',
        'match_product_type', 'match_brands', 'match_pos_title']
X = train[cols].values
grid = GridSearchCV(lgb.LGBMRegressor(),
                    param_grid,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Unnamed: 0,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.005072,0.05,200,"{'learning_rate': 0.05, 'n_estimators': 200}",-0.240903,-0.235613,-0.262358,-0.246291,0.011564,1
7,0.004358,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",-0.240986,-0.235648,-0.262445,-0.24636,0.011581,2
8,0.003138,0.1,200,"{'learning_rate': 0.1, 'n_estimators': 200}",-0.24129,-0.235377,-0.262715,-0.246461,0.011744,3
4,0.000634,0.05,100,"{'learning_rate': 0.05, 'n_estimators': 100}",-0.241046,-0.236004,-0.262684,-0.246578,0.011573,4
6,0.000413,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",-0.241089,-0.236097,-0.262792,-0.246659,0.011588,5


In [8]:
from sklearn.ensemble import BaggingRegressor

In [None]:
br = BaggingRegressor(lgb.LGBMRegressor())
br.fit(X, y)
y_pred = br.predict(X_test)
results = pd.DataFrame({'id':test.id.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)

0.49497

In [223]:
train = pd.read_csv('train.csv', encoding=encoding, index_col=['id'])
train['orig_search_term'] = train['search_term']
train['orig_product_title'] = train['product_title']
products = pd.read_csv('product_descriptions.csv', encoding=encoding, index_col=['product_uid'])
products['orig_descr'] = products['product_description']
products['descr'] = products['product_description'].apply(stem_sentence)
train['descr'] = train.product_uid.map(products['descr'])
train['orig_descr'] = train.product_uid.map(products['orig_descr'])
for col in ['product_title', 'search_term']:
    train[col] = train[col].apply(stem_sentence)
train.head()



Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angle bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,l bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
9,100002,behr premium textur deckov 1-gal. #sc-141 tugboat wood concret coat,deck,3.0,deck over,BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating,"behr premium textur deckov innov solid color coating. bring old, weather wood concret back life. advanc 100% acryl resin formula creat durabl coat tire worn deck, rejuven whole new look. best results, sure proper prepar surfac use applic behr product display above.california residents: see&nbsp;proposit 65 informationrev wood composit decks, railings, porch boat docks, also great concret pool decks, patio sidewalks100% acryl solid color coatingresist crack peel conceal splinter crack 1/4 in.provid durable, mildew resist finishcov 75 sq. ft. 2 coat per galloncr textured, slip-resist finishfor best results, prepar appropri behr product wood concret surfaceactu paint color may vari on-screen printer representationscolor avail tint storesonlin price includ paint care fee follow states: ca, co, ct, me, mn, or, ri, vt","BEHR Premium Textured DECKOVER is an innovative solid color coating. It will bring your old, weathered wood or concrete back to life. The advanced 100% acrylic resin formula creates a durable coating for your tired and worn out deck, rejuvenating to a whole new look. For the best results, be sure to properly prepare the surface using other applicable BEHR products displayed above.California residents: see&nbsp;Proposition 65 informationRevives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks100% acrylic solid color coatingResists cracking and peeling and conceals splinters and cracks up to 1/4 in.Provides a durable, mildew resistant finishCovers up to 75 sq. ft. in 2 coats per gallonCreates a textured, slip-resistant finishFor best results, prepare with the appropriate BEHR product for your wood or concrete surfaceActual paint colors may vary from on-screen and printer representationsColors available to be tinted in most storesOnline Price includes Paint Care fee in the following states: CA, CO, CT, ME, MN, OR, RI, VT"
16,100005,delta vero 1-handl shower faucet trim kit chrome (valv included),rain shower head,2.33,rain shower head,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),"updat bathroom delta vero single-handl shower faucet trim kit chrome. sleek, modern minimalist aesthetic. multichoic univers valv keep water temperatur within +/-3 degre fahrenheit help prevent scalding.california residents: see&nbsp;proposit 65 informationinclud trim kit only, rough-in kit (r10000-unbx) sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewher systemdu watersens regul state new york, pleas confirm ship zip code restrict use item meet watersens qualif","Update your bathroom with the Delta Vero Single-Handle Shower Faucet Trim Kit in Chrome. It has a sleek, modern and minimalistic aesthetic. The MultiChoice universal valve keeps the water temperature within +/-3 degrees Fahrenheit to help prevent scalding.California residents: see&nbsp;Proposition 65 informationIncludes the trim kit only, the rough-in kit (R10000-UNBX) is sold separatelyIncludes the handleMaintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the systemDue to WaterSense regulations in the state of New York, please confirm your shipping zip code is not restricted from use of items that do not meet WaterSense qualifications"
17,100005,delta vero 1-handl shower faucet trim kit chrome (valv included),shower faucet,2.67,shower only faucet,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),"updat bathroom delta vero single-handl shower faucet trim kit chrome. sleek, modern minimalist aesthetic. multichoic univers valv keep water temperatur within +/-3 degre fahrenheit help prevent scalding.california residents: see&nbsp;proposit 65 informationinclud trim kit only, rough-in kit (r10000-unbx) sold separatelyinclud handlemaintain balanc pressur hot cold water even valv turn elsewher systemdu watersens regul state new york, pleas confirm ship zip code restrict use item meet watersens qualif","Update your bathroom with the Delta Vero Single-Handle Shower Faucet Trim Kit in Chrome. It has a sleek, modern and minimalistic aesthetic. The MultiChoice universal valve keeps the water temperature within +/-3 degrees Fahrenheit to help prevent scalding.California residents: see&nbsp;Proposition 65 informationIncludes the trim kit only, the rough-in kit (R10000-UNBX) is sold separatelyIncludes the handleMaintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the systemDue to WaterSense regulations in the state of New York, please confirm your shipping zip code is not restricted from use of items that do not meet WaterSense qualifications"


In [226]:
for col in ['product_title', 'descr']:
    train['match_' + col] = train.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

train['len_search_term_words'] = train['search_term'].apply(lambda x: len(x.split()))
train['len_search_term_letters'] = train['search_term'].str.len()
train['complete_match_title'] = \
    train.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   
train['complete_match_descr'] = \
    train.apply(lambda row:1 if row['descr'].find(row['search_term']) >= 0 else 0, axis=1)
train['match_pos_title'] = train.apply(lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)



cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title']
X = train[cols].values
y = train.relevance
br = BaggingRegressor(lgb.LGBMRegressor())
br.fit(X, y)

BaggingRegressor(base_estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [228]:
test = pd.read_csv('test.csv', encoding=encoding, index_col=['id'])
test['orig_search_term'] = test['search_term']
test['orig_product_title'] = test['product_title']
test['descr'] = test.product_uid.map(products['descr'])
test['orig_descr'] = test.product_uid.map(products['orig_descr'])
for col in ['product_title', 'search_term']:
    test[col] = test[col].apply(stem_sentence)
test.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,orig_search_term,orig_product_title,descr,orig_descr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,90 degree bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
4,100001,simpson strong-ti 12-gaug angl,metal l bracket,metal l brackets,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,simpson sku able,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,simpson strong ties,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"
7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,simpson strong tie hcc668,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consistent, straight corners. simpson strong-ti offer wide varieti angl various size thick handl light-duti job project structur connect needed. bent (skewed) match project. outdoor project moistur present, use zmax zinc-coat connectors, provid extra resist corros (look ""z"" end model number).versatil connector various 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad 12-gaug steelgalvan extra corros resistanceinstal 10d common nail #9 x 1-1/2 in. strong-driv sd screw","Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"


In [None]:
for col in ['product_title', 'descr']:
    test['match_' + col] = test.apply(lambda row: word_match_count(row['search_term'], row[col]), axis=1)

test['len_search_term_words'] = test['search_term'].apply(lambda x: len(x.split()))
test['len_search_term_letters'] = test['search_term'].str.len()
test['complete_match_title'] = \
    test.apply(lambda row:1 if row['product_title'].find(row['search_term']) >= 0 else 0, axis=1)   
test['complete_match_descr'] = \
    test.apply(lambda row:1 if row['descr'].find(row['search_term']) >= 0 else 0, axis=1)
test['match_pos_title'] = train.apply(lambda x: match_word_n_pos(x['orig_product_title'], x['orig_search_term']), axis=1)

X_test = test[cols]
y_pred = br.predict(X_test)

In [231]:
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm2.csv', header=True, index=False)

0.49354

In [232]:
test.to_csv('test_prepared.csv')
train.to_csv('train_prepared.csv')

In [16]:
test = pd.read_csv('test_prepared.csv', index_col=['id'])
test.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,90 degree bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.333333,3,16,0,0,
4,100001,simpson strong-ti 12-gaug angl,metal l bracket,metal l brackets,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,3,15,0,0,
5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,simpson sku able,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,15,0,0,
6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,simpson strong ties,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.333333,0.333333,3,18,0,0,
7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,simpson strong tie hcc668,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.25,0.25,4,25,0,0,


In [17]:
train = pd.read_csv('train_prepared.csv', index_col=['id'])
train.head()

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,angle bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.5,0.5,2,12,0,0,0.5
3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,l bracket,Simpson Strong-Tie 12-Gauge Angle,"angl make joint stronger, also provid consiste...","Not only do angles make joints stronger, they ...",0.0,0.0,2,9,0,0,0.0
9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck,3.0,deck over,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,behr premium textur deckov innov solid color c...,BEHR Premium Textured DECKOVER is an innovativ...,0.0,0.0,1,4,1,1,0.0
16,100005,delta vero 1-handl shower faucet trim kit chro...,rain shower head,2.33,rain shower head,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,0.333333,0.333333,3,16,0,0,0.0
17,100005,delta vero 1-handl shower faucet trim kit chro...,shower faucet,2.67,shower only faucet,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,updat bathroom delta vero single-handl shower ...,Update your bathroom with the Delta Vero Singl...,1.0,1.0,2,13,1,1,0.333333


In [20]:
cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title']
X = train[cols].values
y = train.relevance
br = BaggingRegressor(lgb.LGBMRegressor())
br.fit(X, y)
train['diff'] = (train['relevance'] - br.predict(X)).abs()
train[train['diff'] > 1.5].count()

product_uid                88
product_title              88
search_term                88
relevance                  88
orig_search_term           88
orig_product_title         88
descr                      88
orig_descr                 88
match_product_title        88
match_descr                88
len_search_term_words      88
len_search_term_letters    88
complete_match_title       88
complete_match_descr       88
match_pos_title            88
diff                       88
dtype: int64

In [21]:
train_clean = train[train['diff'] < 1.5]
X = train_clean[cols].values
y = train_clean['relevance'].values
br.fit(X, y)
X_test = test[cols].values
y_pred = br.predict(X_test)

In [22]:
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [23]:
train[train['diff'] > 1.5]['search_term'].values

array(['face mask', 'chicken wire', 'hot water tank gas', 'screen frame',
       'white lamin white', 'garag door spring', 'air acondicionado',
       'insid door', 'green machin', '36 shower door', 'ridgid tabl',
       'wall fan', '5/8 copper', 'ego batteri', 'paint cup',
       'liquid nitrogen', 'push pin', 'free ship', 'ego batteri',
       'small tree', 'big ass', 'topiari tree', 'sheffield', 'memoir',
       'roof tile', 'plastic case', 'socket ring', 'white glove',
       'round beech', 'air line', 'storm door black', 'memoir',
       'grape plant', 'patio deck', 'granit sand', 'grape plant',
       'sun glass', 'ground clear', 'copper faucet', 'speaker wall mount',
       'roller bear', 'inlin switch', 'heater fan', 'garden door',
       'tomato plant', 'sun glass', 'rachael ray', 'sun glass',
       'batteri charger kit', 'red stone', 'roof tile',
       'small outdoor tabl', 'lamin floor tile', 'solar vent',
       'side shield', 'kerosen heater', 'brick stone saw',
       '

In [24]:
train[train['diff'] > 1.5]

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
742,100129,sundstrom safeti silicon half mask respir,face mask,1.0,face masks,Sundstrom Safety Silicone Half Mask Respirator,sr 100 half mask air purifi respir intend use ...,The SR 100 half mask Air Purifying Respirator ...,0.500000,1.000000,2,9,0,0,0.000000,1.510020
5267,100909,summerhawk ranch vintag red barn chicken coop,chicken wire,1.0,chicken wire,SummerHawk Ranch Vintage Red Barn Chicken Coop,"start living, sunni side space-effici coop-and...","Start Living, Sunny Side Up with this space-ef...",0.500000,1.000000,2,12,0,0,0.500000,1.532246
9508,101618,"sure comfort 40 gal. tall 3 year 34,000 btu na...",hot water tank gas,1.0,hot water tank gas,"Sure Comfort 40 Gal. Tall 3 Year 34,000 BTU Na...",sure comfort 40 gal. natur gas tall water heat...,The Sure Comfort 40 Gal. Natural Gas Tall Wate...,0.500000,1.000000,4,18,0,0,0.500000,1.580053
10345,101774,new york wire brown 5/16 in. screen frame corn...,screen frame,1.0,screen frame,New York Wire Brown 5/16 in. Screen Frame Corn...,"5/16"" screen frame corner use build repair scr...","The 5/16"" Screen Frame Corners are used to bui...",1.000000,1.000000,2,12,1,1,1.000000,1.720466
13289,102312,sauder shoal creek collect white 6-drawer dresser,white lamin white,1.0,white laminated white,SAUDER Shoal Creek Collection White 6-Drawer D...,shoal creek dresser contemporari style offer a...,The Shoal Creek Dresser has contemporary styli...,0.666667,1.000000,3,17,0,0,0.333333,1.595924
18526,103303,everbilt 8 ft. garag door safeti cabl,garag door spring,1.0,garage door spring,Everbilt 8 ft. Garage Door Safety Cable,everbilt 8 ft. garag door safeti cabl made gal...,The Everbilt 8 ft. Garage Door Safety Cable is...,0.666667,1.000000,3,17,0,0,0.666667,1.591040
20139,103676,ac-saf larg air condition exterior cover,air acondicionado,1.0,aire acondicionado,AC-Safe Large Air Conditioner Exterior Cover,a/c safe larg air condition exterior cover mad...,This A/C Safe Large Air Conditioner Exterior C...,0.500000,0.500000,2,17,0,0,0.000000,1.508702
24328,104842,hillman group 0.243 in. wire x 2 in. insid dia...,insid door,1.0,inside doors,The Hillman Group 0.243 in. Wire x 2 in. Insid...,torsion spring balanc garag door appli torqu s...,Torsion springs balance the garage door by app...,0.500000,1.000000,2,10,0,0,0.000000,1.514659
24860,104984,clean machin patio stripe camouflag 18 in. x 3...,green machin,1.0,green machine,Clean Machine Patio Stripe Camouflage 18 in. x...,patio stripe door mat design camouflag two col...,This Patio Stripe door mat design in our camou...,0.500000,1.000000,2,12,0,0,0.500000,1.532246
29109,106225,showerdoordirect 36 in. frameless shower door ...,36 shower door,1.0,36 shower door,Showerdoordirect 36 in. Frameless Shower Door ...,seal use bottom door use seal side door. desig...,The seal can be used for the bottom of the doo...,1.000000,0.333333,3,14,0,0,0.666667,1.678888


In [29]:
br.get_params()

{'base_estimator__boosting_type': 'gbdt',
 'base_estimator__class_weight': None,
 'base_estimator__colsample_bytree': 1.0,
 'base_estimator__importance_type': 'split',
 'base_estimator__learning_rate': 0.1,
 'base_estimator__max_depth': -1,
 'base_estimator__min_child_samples': 20,
 'base_estimator__min_child_weight': 0.001,
 'base_estimator__min_split_gain': 0.0,
 'base_estimator__n_estimators': 100,
 'base_estimator__n_jobs': -1,
 'base_estimator__num_leaves': 31,
 'base_estimator__objective': None,
 'base_estimator__random_state': None,
 'base_estimator__reg_alpha': 0.0,
 'base_estimator__reg_lambda': 0.0,
 'base_estimator__silent': True,
 'base_estimator__subsample': 1.0,
 'base_estimator__subsample_for_bin': 200000,
 'base_estimator__subsample_freq': 0,
 'base_estimator': LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=

In [34]:
params = {
    'base_estimator__learning_rate': [0.05, 0.1],
    'base_estimator__n_estimators': [50, 100],
    'max_samples': [0.1, 0.4, 0.7],
    'n_estimators': [20, 40]
}
grid = GridSearchCV(BaggingRegressor(lgb.LGBMRegressor()),
                    params,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    verbose=1,
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  4.0min finished


Unnamed: 0,std_score_time,param_base_estimator__learning_rate,param_base_estimator__n_estimators,param_max_samples,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
23,0.004545,0.1,100,0.7,40,"{'base_estimator__learning_rate': 0.1, 'base_e...",-0.235134,-0.226643,-0.254811,-0.238863,0.011798,1
20,0.002252,0.1,100,0.4,20,"{'base_estimator__learning_rate': 0.1, 'base_e...",-0.235199,-0.226862,-0.254652,-0.238904,0.011644,2
22,0.001538,0.1,100,0.7,20,"{'base_estimator__learning_rate': 0.1, 'base_e...",-0.235195,-0.22668,-0.254892,-0.238922,0.011815,3
21,0.006369,0.1,100,0.4,40,"{'base_estimator__learning_rate': 0.1, 'base_e...",-0.235216,-0.226844,-0.254804,-0.238954,0.011717,4
18,0.001093,0.1,100,0.1,20,"{'base_estimator__learning_rate': 0.1, 'base_e...",-0.235185,-0.227214,-0.255215,-0.239205,0.01178,5


In [42]:
model = grid.best_estimator_

y_pred = model.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.49336

In [76]:
from matplotlib import colors as mcolors
colors = [color for color in mcolors.cnames.keys()]
print(colors)

['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 

In [87]:
dirty_words = ['ass']
sizes = ['big', 'small']
filter_words = dirty_words + sizes
stop_words = set(stopwords.words('english'))
for word in filter_words:
    stop_words.add(word)

def word_match_count(search_term, text):
    count = 0
    search_term = search_term.split()
    text = text.split()
    if len(search_term) == 0:
        return 0
    for word in set(search_term):
        count += 1 if text.count(word) > 0 else 0
    return count / len(search_term)

In [88]:
train['search_term2'] = train['search_term'].apply(
    lambda x: ' '.join([word for word in str(x) if word not in stop_words]))
for col in ['product_title', 'descr']:
    train['match_' + col + '2'] = train.apply(lambda row: word_match_count(row['search_term2'], row[col]), axis=1)


In [89]:
cols = ['match_product_title2', 'match_descr2', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title']
X = train[cols]
params = {
    'max_samples': [0.2, 0.4],
    'n_estimators': [20, 40]
}
grid = GridSearchCV(BaggingRegressor(lgb.LGBMRegressor()),
                    params,
                    cv=3,
                    scoring='neg_mean_squared_error',
                    verbose=1,
                    return_train_score=False)
grid.fit(X, y)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.drop(columns=grid_df.columns[:3], inplace=True)
grid_df.sort_values(['rank_test_score'], inplace=True)
grid_df.head()

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   43.4s finished


Unnamed: 0,std_score_time,param_max_samples,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,0.002578,0.4,40,"{'max_samples': 0.4, 'n_estimators': 40}",-0.253879,-0.245646,-0.276812,-0.258779,0.013187,1
0,0.0084,0.2,20,"{'max_samples': 0.2, 'n_estimators': 20}",-0.2536,-0.245838,-0.276938,-0.258792,0.013217,2
1,0.005635,0.2,40,"{'max_samples': 0.2, 'n_estimators': 40}",-0.25356,-0.246285,-0.276673,-0.258839,0.012955,3
2,0.002716,0.4,20,"{'max_samples': 0.4, 'n_estimators': 20}",-0.25356,-0.245724,-0.277363,-0.258882,0.013453,4


In [91]:
test['search_term2'] = test['search_term'].apply(
    lambda x: ' '.join([word for word in str(x) if word not in stop_words]))
for col in ['product_title', 'descr']:
    test['match_' + col + '2'] = test.apply(lambda row: word_match_count(row['search_term2'], row[col]), axis=1)
X_test = test[cols]
model = grid.best_estimator_
y_pred = model.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

In [93]:
cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title']
X = train[cols]
br = BaggingRegressor(lgb.LGBMRegressor(), max_samples=0.1, n_estimators=40)
br.fit(X, y)
X_test = test[cols]
y_pred = br.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.49360

In [95]:
train.columns

Index(['product_uid', 'product_title', 'search_term', 'relevance',
       'orig_search_term', 'orig_product_title', 'descr', 'orig_descr',
       'match_product_title', 'match_descr', 'len_search_term_words',
       'len_search_term_letters', 'complete_match_title',
       'complete_match_descr', 'match_pos_title', 'diff', 'descr_cut',
       'match_descr_cut_pos', 'match_descr_cut', 'search_term2',
       'match_product_title2', 'match_descr2'],
      dtype='object')

In [96]:
train = train.drop(columns=['descr_cut', 'match_descr_cut_pos', 'match_descr_cut',
                            'search_term2', 'match_product_title2', 'match_descr2'])

In [99]:
train['diff'] = (train['relevance'] - br.predict(X)).abs()
train[train['diff'].between(0.5, 1.)].count()

product_uid                19139
product_title              19139
search_term                19131
relevance                  19139
orig_search_term           19139
orig_product_title         19139
descr                      19139
orig_descr                 19139
match_product_title        19139
match_descr                19139
len_search_term_words      19139
len_search_term_letters    19139
complete_match_title       19139
complete_match_descr       19139
match_pos_title            19139
diff                       19139
dtype: int64

In [101]:
train[train['diff'].between(0.5, 1.)].sample(50)

Unnamed: 0_level_0,product_uid,product_title,search_term,relevance,orig_search_term,orig_product_title,descr,orig_descr,match_product_title,match_descr,len_search_term_words,len_search_term_letters,complete_match_title,complete_match_descr,match_pos_title,diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
219251,204729,behr premium plus #290e-2 oat cake zero voc in...,locit 2 plus 1,1.33,locite 2 plus 1,BEHR Premium Plus #290E-2 Oat Cake Zero VOC In...,"behr premium plus zero voc, self-prim interior...","BEHR Premium Plus Zero VOC, Self-Priming Inter...",0.25,0.5,4,14,0,0,0.25,0.746869
59972,116087,foremost structur suit 20-5/80 in. pedest sink...,foremost toilet,3.0,Foremost toilet,Foremost Structure Suite 20-5/80 in. Pedestal ...,"20-5/8 in. side side, black foremost structur ...","At 20-5/8 in. side to side, this black Foremos...",0.5,0.5,2,15,0,0,0.5,0.555633
115977,138581,homax 1-qt. premix popcorn patch,ceil textur,3.0,ceiling texture,Homax 1-qt. Premixed Popcorn Patch,"homax popcorn ceil patch premixed, readi use p...","Homax Popcorn Ceiling Patch is a premixed, rea...",0.0,1.0,2,11,0,1,0.0,0.590909
53927,114027,2-1/2 in. stainless steel mesh strainer,drain plug 2',2.67,drain plugs 2',2-1/2 in. Stainless Steel Mesh Strainer,keep drain flow freeli danco 2-1/2 in. stainle...,Keep your drain flowing freely with the DANCO ...,0.0,0.333333,3,13,0,0,0.333333,0.506069
89806,127192,leviton 15 amp tamper-resist duplex outlet - w...,15 amp tamp resist outlet,3.0,15 amp tampe resistant outlets,Leviton 15 Amp Tamper-Resistant Duplex Outlet ...,leviton make easi compli 2008 nec mandat tr re...,Leviton makes it easy for you to comply with t...,0.6,0.0,5,25,0,0,0.25,0.784001
187923,180093,hickori hardwar old mission 1-1/2 in. black mi...,mides cabinet,1.33,midesion cabinet,Hickory Hardware Old Mission 1-1/2 in. Black M...,hickori hardwar old mission 1-1/2 in. black mi...,The Hickory Hardware Old Mission 1-1/2 in. Bla...,0.0,0.5,2,13,0,0,0.0,0.926755
67457,118675,lithonia light dentil 1-1/2 ft. x 4 ft. fluore...,kitchen ceil lighten,3.0,kitchen ceiling lightening,Lithonia Lighting Dentil 1-1/2 ft. x 4 ft. Flu...,lithonia light dentil 1-1/2 ft. x 4 ft. fluore...,The Lithonia Lighting Dentil 1-1/2 ft. x 4 ft....,0.333333,0.333333,3,20,0,0,0.333333,0.797791
191913,182929,schluter jolli red brown color-co aluminum 3/8...,brown color scheem,1.33,brown color scheem,Schluter Jolly Red Brown Color-Coated Aluminum...,jolli profil design finish protect tile edg wa...,The Jolly profile is designed to finish and pr...,0.333333,0.0,3,18,0,0,0.333333,0.81758
149656,155843,dremel 3.5 amp multi-max oscil tool kit,dremel oscil grinder,3.0,dremel oscillating grinder,Dremel 3.5 Amp Multi-Max Oscillating Tool Kit,multi-max mm45-02 oscil tool kit featur mm45 m...,The Multi-Max MM45-02 Oscillating Tool kit fea...,0.666667,0.333333,3,20,0,0,0.333333,0.737501
41325,109915,roxul comfortbatt 3-1/2 in. x 15-1/4 in. x 47 ...,roxul,3.0,roxul,Roxul ComfortBatt 3-1/2 in. x 15-1/4 in. x 47 ...,roxul comfortbatt r-15 fire resist thermal hom...,Roxul ComfortBatt R-15 Fire Resistant Thermal ...,1.0,1.0,1,5,1,1,1.0,0.52441


In [105]:
attrs = pd.read_csv('attributes.csv')
brand = attrs[attrs['name'] == 'MFG Brand Name']
brand.index = brand['product_uid']
brand = brand['value']
train['brand'] = train['product_uid'].map(brand).astype(str).apply(stem_sentence)

In [107]:
def f(str1, str2):
    n = len(set(str1.split()).intersection(set(str2.split())))
    return n / len(str1)
    
train['match_brand'] = train.apply(lambda row: f(row['brand'], str(row['search_term'])), axis=1)

cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand']
X = train[cols]
br = BaggingRegressor(lgb.LGBMRegressor(), max_samples=0.1, n_estimators=40)
br.fit(X, y)


test['brand'] = test['product_uid'].map(brand).astype(str).apply(stem_sentence)
test['match_brand'] = test.apply(lambda row: f(row['brand'], str(row['search_term'])), axis=1)

X_test = test[cols]
y_pred = br.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.49323

In [111]:
from pprint import pprint
pprint(dict(zip(cols, np.mean([tree.feature_importances_ for tree in br.estimators_], axis=0))))

{'complete_match_descr': 55.625,
 'complete_match_title': 58.525,
 'len_search_term_letters': 923.425,
 'len_search_term_words': 258.15,
 'match_brand': 376.35,
 'match_descr': 416.625,
 'match_pos_title': 524.25,
 'match_product_title': 387.05}


In [112]:
train['match_pos_descr'] = train.apply(lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)

cols = ['match_product_title', 'match_descr', 'len_search_term_words', 'len_search_term_letters',
        'complete_match_title', 'complete_match_descr', 'match_pos_title', 'match_brand', 'match_pos_descr']
X = train[cols]
br = BaggingRegressor(lgb.LGBMRegressor(), max_samples=0.1, n_estimators=40)
br.fit(X, y)

test['match_pos_descr'] = test.apply(lambda x: match_word_n_pos(x['orig_descr'], x['orig_search_term']), axis=1)

X_test = test[cols]
y_pred = br.predict(X_test)
results = pd.DataFrame({'id':test.index.values, 'relevance':y_pred})
results.to_csv('lgbm3.csv', header=True, index=False)

0.49306

In [113]:
from pprint import pprint
pprint(dict(zip(cols, np.mean([tree.feature_importances_ for tree in br.estimators_], axis=0))))

{'complete_match_descr': 40.775,
 'complete_match_title': 49.425,
 'len_search_term_letters': 788.05,
 'len_search_term_words': 202.525,
 'match_brand': 266.0,
 'match_descr': 310.425,
 'match_pos_descr': 621.55,
 'match_pos_title': 398.325,
 'match_product_title': 322.925}


In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


x_test_search_term = TruncatedSVD(20).fit_transform(TfidfVectorizer().fit_transform(train['search_term'].str))




AttributeError: 'Series' object has no attribute 'lower'