# Kaggle Home Depot

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import LogisticRegression
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_score
from sklearn import preprocessing
from sklearn import utils
stemmer = SnowballStemmer('english')
import matplotlib.pyplot as plt
%matplotlib inline



### Reading the data from the files

In [2]:
df_train = pd.read_csv("train.csv",encoding = "ISO-8859-1")
df_test = pd.read_csv("test.csv",encoding = "ISO-8859-1")
df_pro_desc = pd.read_csv("product_descriptions.csv",encoding = "ISO-8859-1")
df_attributes = pd.read_csv("attributes.csv",encoding = "ISO-8859-1")

### Commonly used functions

In [3]:
def stem_values(s):
    return " ".join([stemmer.stem(w) for w in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(w)>=0) for w in str1.split())

### Create the appropriate Data structures

In [4]:
#concatenate train and test dataset and create a new dataframe
df_full = pd.concat((df_train, df_test), axis=0, ignore_index=True)
#add product description to the dataframe
df_full = pd.merge(df_full, df_pro_desc, how='left', on='product_uid')
#use stemmer to stem serch term and prodct title
df_full['search_term'] = df_full['search_term'].map(lambda x:stem_values(x))
df_full['product_title'] = df_full['product_title'].map(lambda x:stem_values(x))
df_full['product_description'] = df_full['product_description'].map(lambda x:stem_values(x))
#count the terms contained in the query
df_full['query_length'] = df_full['search_term'].map(lambda x:len(x.split())).astype(np.int64)
#create a new column so as to use it for counting the common words between the'search_term','product_title','product_description'
df_full['p_info_merged_t'] = df_full['search_term']+"\t"+df_full['product_title']
df_full['p_info_merged_d'] = df_full['search_term']+"\t"+df_full['product_description']

#create a new column with the count of common words in title and search term
df_full['common_w_title'] = df_full['p_info_merged_t'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
#create a new column with the count of common words in description and search term
df_full['common_w_description'] = df_full['p_info_merged_d'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))


### Creating our Features

In [5]:
df_full['common_words'] = df_full['common_w_title'] + df_full['common_w_description']
df_full['search_ratio'] = (df_full['common_w_title'] + df_full['common_w_description']) / df_full['query_length']
df_full['title_ratio'] = (df_full['common_w_title']) / df_full['query_length']
df_full['desc_ratio'] = (df_full['common_w_description']) / df_full['query_length']

In [6]:
df_full.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,query_length,p_info_merged_t,p_info_merged_d,common_w_title,common_w_description,common_words,search_ratio,title_ratio,desc_ratio
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"not onli do angl make joint stronger, they als...",2,angl bracket\tsimpson strong-ti 12-gaug angl,angl bracket\tnot onli do angl make joint stro...,1,1,2,1.0,0.5,0.5
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",2,l bracket\tsimpson strong-ti 12-gaug angl,l bracket\tnot onli do angl make joint stronge...,1,1,2,1.0,0.5,0.5
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,2,deck over\tbehr premium textur deckov 1-gal. #...,deck over\tbehr premium textur deckov is an in...,1,1,2,1.0,0.5,0.5
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,3,rain shower head\tdelta vero 1-handl shower on...,rain shower head\tupdat your bathroom with the...,1,1,2,0.666667,0.333333,0.333333
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,3,shower onli faucet\tdelta vero 1-handl shower ...,shower onli faucet\tupdat your bathroom with t...,3,2,5,1.666667,1.0,0.666667


In [None]:
#df_full.drop(['common_w_description'],axis=1, inplace=True)

In [7]:
df_full_clean = df_full.drop(['search_term','product_title','product_description','p_info_merged_t','p_info_merged_d'],axis=1)
df_full_clean.to_csv('features_rf_bag_lg.csv',index=False)

In [8]:
df_full_clean.head()

Unnamed: 0,id,product_uid,relevance,query_length,common_w_title,common_w_description,common_words,search_ratio,title_ratio,desc_ratio
0,2,100001,3.0,2,1,1,2,1.0,0.5,0.5
1,3,100001,2.5,2,1,1,2,1.0,0.5,0.5
2,9,100002,3.0,2,1,1,2,1.0,0.5,0.5
3,16,100005,2.33,3,1,1,2,0.666667,0.333333,0.333333
4,17,100005,2.67,3,3,2,5,1.666667,1.0,0.666667


In [9]:
train_samples = df_train.shape[0]
#split the dataset into training and testing
df_train = df_full_clean.iloc[:train_samples]
df_test = df_full_clean.iloc[train_samples:]
id_test = df_test['id']

In [10]:
df_train.head()

Unnamed: 0,id,product_uid,relevance,query_length,common_w_title,common_w_description,common_words,search_ratio,title_ratio,desc_ratio
0,2,100001,3.0,2,1,1,2,1.0,0.5,0.5
1,3,100001,2.5,2,1,1,2,1.0,0.5,0.5
2,9,100002,3.0,2,1,1,2,1.0,0.5,0.5
3,16,100005,2.33,3,1,1,2,0.666667,0.333333,0.333333
4,17,100005,2.67,3,3,2,5,1.666667,1.0,0.666667


### Setting ML models

In [11]:
#create the X and y values to be used for training and testing
y = df_train['relevance'].values
X = df_train.drop(['id','relevance','product_uid'],axis=1).values
X_df = df_train.drop(['id','relevance','product_uid'],axis=1)
X_test = df_test.drop(['id','relevance','product_uid'],axis=1).values

In [12]:
X[:2]

array([[ 2. ,  1. ,  1. ,  2. ,  1. ,  0.5,  0.5],
       [ 2. ,  1. ,  1. ,  2. ,  1. ,  0.5,  0.5]])

In [37]:
rf = RandomForestRegressor(n_estimators=40, n_jobs=-1)
#Best parameters set found on development set:
#{'bootstrap': True, 'max_depth': 2000, 'max_features': 'log2', 'min_samples_leaf': 40, 'min_samples_split': 18, 'n_estimators': 40}
bag = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
#Best parameters set found on development set:
#{'C': 10, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear'}
lg = LogisticRegression(C=10, solver='liblinear', random_state=None,penalty='l1')

In [42]:
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [43]:
bag.fit(X, y)

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.1, n_estimators=45, n_jobs=1, oob_score=False,
         random_state=25, verbose=0, warm_start=False)

In [44]:
y = df_train['relevance']
lab_enc = preprocessing.LabelEncoder()
encoded_Y = lab_enc.fit_transform(y)
lg.fit(X,encoded_Y)
#lg.fit(X, y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
print ('Random Forest RMSE :', mean_squared_error(y, rf.predict(X))**0.5)

Random Forest RMSE : 0.484344002046


In [46]:
print ('Bagging RMSE :', mean_squared_error(y, bag.predict(X))**0.5)

Bagging RMSE : 0.484641674062


In [47]:
print ('Logistic RMSE :', mean_squared_error(encoded_Y, lg.predict(X))**0.5)

Logistic RMSE : 3.80003467684


In [48]:
feat_rank = np.argsort(rf.feature_importances_)[::-1]
feat_rank

array([5, 4, 3, 0, 6, 1, 2], dtype=int64)

In [49]:
X_df.columns[feat_rank][:25]

Index(['title_ratio', 'search_ratio', 'common_words', 'query_length',
       'desc_ratio', 'common_w_title', 'common_w_description'],
      dtype='object')

In [50]:
df_features_rf = pd.DataFrame(rf.feature_importances_,X_df.columns, columns = ['feature_value'])
df_features_rf.sort_values('feature_value', ascending=False)

Unnamed: 0,feature_value
title_ratio,0.588277
search_ratio,0.279886
common_words,0.05643
query_length,0.028894
desc_ratio,0.025278
common_w_title,0.01547
common_w_description,0.005765


In [51]:
scores = np.zeros((feat_rank.shape[0],2))
for i in range(1,feat_rank.shape[0]+1):
    features = [X_df.columns[feat_rank][x] for x in range(i)]
    scores[i-1:] = (i,(cross_val_score(rf, X_df[features], df_train['relevance'], cv=3)).mean())
scores    

array([[ 1.        ,  0.09510632],
       [ 2.        ,  0.11869194],
       [ 3.        ,  0.13630678],
       [ 4.        ,  0.13664011],
       [ 5.        ,  0.13710584],
       [ 6.        ,  0.13687832],
       [ 7.        ,  0.13652303]])

### Hyper parameters tuning

In [None]:
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [52]:
# Huper parameter Tuning for Random Forest
rf = RandomForestRegressor()
# specify parameters to be used in GridSearch CV
param_grid = {'max_depth': [1000, 2000, None],
              'max_features': ['sqrt', 'log2', None],
              'min_samples_split': [12, 14, 18, 24],
              'min_samples_leaf': [30, 40, 60, 100],
              'bootstrap': [True, False],
              'n_estimators': [40, 60, 80, 100]}

# run grid search
print ('Running Grid Search...')
grid_search = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1, scoring='mean_squared_error')
grid_search.fit(X, y)
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)

#print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
#      % (time() - start, len(grid_search.grid_scores_)))
#report(grid_search.grid_scores_)

Running Grid Search...
Best parameters set found on development set:

{'bootstrap': True, 'max_depth': 2000, 'max_features': 'log2', 'min_samples_leaf': 40, 'min_samples_split': 18, 'n_estimators': 40}


In [None]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [34]:
tuned_parameters = [{'penalty': ['l1','l2'], 'random_state': [None],'C': [1, 10, 100, 1000], 'solver': ['liblinear']},
                    {'penalty': ['l2'], 'random_state': [None],'C': [1, 10, 100, 1000], 'solver': ['newton-cg']}]

clf = GridSearchCV(lg, tuned_parameters, cv=5,
                       scoring='mean_squared_error')
clf.fit(X, encoded_Y)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

Best parameters set found on development set:

{'C': 10, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear'}


In [None]:
y_pred

In [None]:
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('results_rf_bag_lg.csv',index=False)