In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer


  from numpy.core.umath_tests import inner1d


In [2]:
# Load stemmer and data

stemmer = SnowballStemmer('english')

df_train = pd.read_csv('train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('test.csv', encoding="ISO-8859-1")
# df_attr = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv('product_descriptions.csv')

num_train = df_train.shape[0]
num_train

74067

In [3]:
# Define functions

def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

# Function that counts the number of common words
def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())

In [4]:
str_common_word('i want you', 'you want i')

3

In [12]:
str_stemmer('i want you to go have fun with lovely differently people out there in the wr')

'i want you to go have fun with love differ peopl out there in the wr'

In [13]:
# Concat train & test data & product description dataset
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all.head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...
5,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.0,convection otr,Achieving delicious results is almost effortle...
6,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove,Achieving delicious results is almost effortle...
7,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.0,microwaves,Achieving delicious results is almost effortle...
8,23,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light,The Quantum Adjustable 2-Light LED Black Emerg...
9,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.0,mdf 3/4,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...


In [14]:
# Stemming process for the text data

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

df_all.head(10)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"not onli do angl make joint stronger, they als..."
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als..."
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...
5,18,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,convect otr,achiev delici result is almost effortless with...
6,20,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,2.67,microwav over stove,achiev delici result is almost effortless with...
7,21,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,microwav,achiev delici result is almost effortless with...
8,23,lithonia light quantum 2-light black led emerg...,100007,2.67,emerg light,the quantum adjust 2-light led black emerg lig...
9,27,hous of fara 3/4 in. x 3 in. x 8 ft. mdf flute...,100009,3.0,mdf 3/4,get the hous of fara 3/4 in. x 3 in. x 8 ft. m...


In [8]:
# Create a new column for the length of search term
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

In [10]:
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']
df_all['product_info'].head()

0    angl bracket\tsimpson strong-ti 12-gaug angl\t...
1    l bracket\tsimpson strong-ti 12-gaug angl\tnot...
2    deck over\tbehr premium textur deckov 1-gal. #...
3    rain shower head\tdelta vero 1-handl shower on...
4    shower onli faucet\tdelta vero 1-handl shower ...
Name: product_info, dtype: object

In [11]:
# Num. of words in title? [0] => search term    [1] => product_title
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_title'].head()

0    1
1    1
2    1
3    1
4    3
Name: word_in_title, dtype: int64

In [12]:
# Num. of words in product description?
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
df_all['word_in_description'].head()

0    1
1    1
2    1
3    1
4    2
Name: word_in_description, dtype: int64

In [13]:
# ??
df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)
df_all.head()

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,2,1,1
1,3,100001,2.5,2,1,1
2,9,100002,3.0,2,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2


In [14]:
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']

In [15]:
print(df_train.shape)
df_train.head()

(74067, 6)


Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,2,1,1
1,3,100001,2.5,2,1,1
2,9,100002,3.0,2,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2


In [16]:
print(df_test.shape)
df_test.head()

(166693, 6)


Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
74067,1,100001,,3,0,1
74068,4,100001,,3,1,1
74069,5,100001,,3,1,1
74070,6,100001,,3,2,2
74071,7,100001,,4,2,2


In [18]:
id_test

74067          1
74068          4
74069          5
74070          6
74071          7
74072          8
74073         10
74074         11
74075         12
74076         13
74077         14
74078         15
74079         19
74080         22
74081         24
74082         25
74083         26
74084         28
74085         29
74086         30
74087         31
74088         32
74089         33
74090         36
74091         39
74092         40
74093         41
74094         42
74095         43
74096         44
           ...  
240730    240731
240731    240732
240732    240733
240733    240734
240734    240735
240735    240736
240736    240737
240737    240738
240738    240739
240739    240740
240740    240741
240741    240742
240742    240743
240743    240744
240744    240745
240745    240746
240746    240747
240747    240748
240748    240749
240749    240750
240750    240751
240751    240752
240752    240753
240753    240754
240754    240755
240755    240756
240756    240757
240757    2407

In [19]:
# Prepare training & test dataset
y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

In [25]:
df_train.drop(['id','relevance'],axis=1).head()

Unnamed: 0,product_uid,len_of_query,word_in_title,word_in_description
0,100001,2,1,1
1,100001,2,1,1
2,100002,2,1,1
3,100005,3,1,1
4,100005,3,3,2


In [26]:
# Make your train data into array form instead of Dataframe for training your model
df_train.drop(['id','relevance'],axis=1).values

array([[100001,      2,      1,      1],
       [100001,      2,      1,      1],
       [100002,      2,      1,      1],
       ...,
       [206641,      7,      2,      4],
       [206648,      3,      2,      2],
       [206650,      5,      3,      2]])

In [29]:
# Train your model & predict
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [32]:
pred = pd.DataFrame({"id": id_test, "relevance": y_pred})
pred.head()

Unnamed: 0,id,relevance
74067,1,2.072159
74068,4,2.224175
74069,5,2.224175
74070,6,2.34591
74071,7,2.210789


In [34]:
# Export in csv file for the submission

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

## Score: 0.48721