# Load cleaned data

In [1]:
import pandas as pd

train = pd.read_csv('train_cleaned.csv')
train.fillna('',inplace=True)
display(train.head(2))
X_train, y_train = train.loc[:, train.columns != 'relevance'], train['relevance']

Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Certifications and Listings,Bullet09,Assembled Height (in.),Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11,relevance
0,2,100001,simpson strong tie 12 gaug angl,angl bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,3.0
1,3,100001,simpson strong tie 12 gaug angl,bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.5


# Transformation pipeline: Creating training features

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class LenghtOfQuery(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['len_of_query'] = X['search_term'].apply(lambda x:len(x.split())).astype(np.int64)
        return X

class LenghtCommonWords(BaseEstimator, TransformerMixin):
    def str_common_word(self, str1, str2):
        str1, str2 = str1.lower(), str2.lower()
        words, cnt = str1.split(), 0
        for word in words:
            if str2.find(word)>=0:
                cnt+=1
        return cnt
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['word_in_title'] = X.apply(lambda x:self.str_common_word(x['search_term'],x['product_title']), axis=1)
        X['word_in_description'] = X.apply(lambda x:self.str_common_word(x['search_term'],x['product_description']), axis=1)
        return X

class TfIdfBasedFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.tf_transformer = TfidfVectorizer().fit(X)
        return self
    def transform(self, X, y=None):
        # X['search_term_tfidf'] = self.tf_transformer.transform(X['search_term'])
        return X

class SelectFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X[self.features_list]
        return X


prep_pipeline = Pipeline([
    ("lenght_query", LenghtOfQuery()),
    ("lenght_common_words", LenghtCommonWords()),
    ('tf-idf_features', TfIdfBasedFeatures()),
    ("select_features", SelectFeatures(features_list=["len_of_query","word_in_title","word_in_description"]))
])

# X_train_prep = prep_pipeline.fit_transform(X_train,y_train)
# X_train_prep

### Cosine distance between two sentences using CountVectorizer -> BagOfWords (BOW)

In [None]:
# import string
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import CountVectorizer

# def cosine_similarity_between_sentences(sen1,sen2):
    
#     def cosine_sim_vectors(vec1, vec2):
#       vec1 = vec1.reshape(1, -1)
#       vec2 = vec2.reshape(1, -1)
#       # Cosine distance between two TF-IDF vectors
#       return cosine_similarity(vec1,vec2)[0][0]

#     vectorizer = CountVectorizer().fit_transform([sen1,sen2])
#     vectors = vectorizer.toarray()

#     return cosine_sim_vectors(vectors[0], vectors[1])

# Model building and Hyperparameter tunning w/ Grid Search

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
import os
import numpy as np
import pickle

def my_custom_loss_func(y_true,y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)
    return rmse

RMSE = make_scorer(my_custom_loss_func, greater_is_better=False)

def execute_pipeline(features,labels, search_space=[
                    {"estimator": [RandomForestRegressor(random_state=42, verbose=1, n_jobs=-1)],
                    "estimator__n_estimators": [10, 25],
                    "estimator__max_depth": [2, 6]
                    }], 
                    cv=3,
                    verbose=1,
                    n_jobs=os.cpu_count() - 2,
                    scoring=RMSE):
    
    pipe = Pipeline([("estimator", RandomForestRegressor())])
    
    gridsearch = GridSearchCV(pipe, search_space, scoring=scoring, cv=cv, verbose=verbose,n_jobs=n_jobs)
    best_model = gridsearch.fit(features, labels)
    print(best_model.best_estimator_)
    print(best_model.best_score_)
    return best_model

best_estimator = execute_pipeline(X_train_prep,y_train)

pickle.dump(best_estimator,open( "best_estimator.pkl", "wb" ))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.0s finished
Pipeline(steps=[('estimator',
                 RandomForestRegressor(max_depth=6, n_estimators=25, n_jobs=-1,
                                       random_state=42, verbose=1))])
-0.5071780166367612


df_result = pd.DataFrame({'Relevance': y_test, 'Prediction':predicted})
mask = df_result.Relevance < 2
print ("MSE for relevance<2:", mean_squared_error(df_result[mask]['Relevance'], df_result[mask]['Prediction']))

# Generate predictions on test set

In [5]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def my_custom_loss_func(y_true,y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)
    return rmse

RMSE = make_scorer(my_custom_loss_func, greater_is_better=False)

best_estimator = pickle.load(open( "best_estimator.pkl", "rb" ))

In [19]:
import pandas as pd

X_test = pd.read_csv('test_cleaned.csv')
X_test.fillna('',inplace=True)

X_test_prep = prep_pipeline.transform(X_test)
X_test['relevance'] = best_estimator.predict(X_test_prep)
display(X_test)

X_test[['id','relevance']].to_csv('submission.csv',index=False)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  25 out of  25 | elapsed:    0.0s finished


Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11,len_of_query,word_in_title,word_in_description,relevance
0,1,100001,simpson strong tie 12 gaug angl,degre bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,2,0,0,2.117476
1,4,100001,simpson strong tie 12 gaug angl,metal bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,2,0,0,2.117476
2,5,100001,simpson strong tie 12 gaug angl,simpson ski abl,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,3,1,1,2.180334
3,6,100001,simpson strong tie 12 gaug angl,simpson strong tie,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,3,3,3,2.712659
4,7,100001,simpson strong tie 12 gaug angl,simpson strong tie acc,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,4,3,3,2.402828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166688,240756,224424,stufurhom norma 24in w x 16in x 34in h linen s...,white storag cabinet,create a neat yet stylish storage space for or...,stufurhome,dtc soft closing door and drawers,brushed nickel hardware,2 functional drawers,"solid wood construction , only the side and ba...",...,,,,,,,3,3,3,2.712659
166689,240757,224425,home decor collect 49in alessandro spiceberri ...,adirondack fusion,our bullnose adirondack chair cushions fit adi...,home decorators collection,filled with mildew resistant polyester fibers,available in a variety of designs and colors,2 in. h x 20.5 in. w x 49 in. d,"resists fading , stains and mildew",...,20.5 in,49 in,,,,,2,1,1,2.427454
166690,240758,224426,simpson strong tie hb 3 12 x 14in top flang jo...,,joist hangers are designed to provide support ...,,,,,,...,,,,,,,0,0,0,2.046966
166691,240759,224427,14in 20 tpi x 1 12in stainless steel button he...,hex socket,these socket cap screws are ideal for applicat...,,,,,,...,,,,,,,2,2,2,2.613946
