In [None]:
# Switch to the project workspace so the relative paths used in later cells
# (CSV inputs, Final/cleaning/corpus.txt, database.csv, BM25_model.pkl) resolve.
# NOTE(review): absolute Google Drive path - presumably requires Drive to be mounted first; confirm.
%cd /content/drive/MyDrive/Home_Depot_Case_Study/Workspace4

/content/drive/MyDrive/Home_Depot_Case_Study/Workspace4


In [47]:
import pandas as pd
import numpy as np
import regex as re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Load the four input tables; every file is decoded as ISO-8859-1.
attr_df, desc_df, train_df, test_df = (
    pd.read_csv(csv_name, encoding='iso-8859-1')
    for csv_name in (
        "attributes.csv",
        'product_descriptions.csv',
        "train.csv",
        'test.csv',
    )
)

Merging

In [49]:
def merge_attributes(df, attributes=None):
  """Attach a `combined_attr` column holding all attributes of each product as one string.

  For every product_uid present in `df`, the `name` and `value` of each of its
  rows in the attributes table are concatenated as "name value" and the pieces
  are joined with single spaces, in table order. Missing names/values are
  treated as empty strings. Products without any attribute row get NaN
  (left merge).

  Args:
    df: frame with a `product_uid` column.
    attributes: table with `product_uid`, `name` and `value` columns.
      Defaults to the notebook-level `attr_df`.

  Returns:
    A new frame with the rows, order and index of `df` plus `combined_attr`.
  """
  if attributes is None:
    attributes = attr_df

  # Only the attribute rows of products that occur in df are needed.
  relevant = attributes.loc[attributes['product_uid'].isin(df['product_uid'])].fillna('')
  # astype(str) guards against non-string name/value cells breaking the concatenation.
  name_value = relevant['name'].astype(str) + ' ' + relevant['value'].astype(str)

  # One row per product: aggregate directly instead of transform + drop_duplicates.
  combined = (
      name_value.groupby(relevant['product_uid'], sort=False)
      .agg(' '.join)
      .rename('combined_attr')
      .reset_index()
  )

  # `combined` is unique per product_uid, so the left merge keeps df's row count
  # and the original index can be restored.
  return pd.merge(df, combined, on='product_uid', how='left').set_index(df.index)

def merge_brand(df, attributes=None):
  """Attach a `brand` column taken from the 'MFG Brand Name' attribute.

  Args:
    df: frame with a `product_uid` column.
    attributes: table with `product_uid`, `name` and `value` columns.
      Defaults to the notebook-level `attr_df`.

  Returns:
    A new frame with the rows, order and index of `df` plus `brand`
    (NaN where the product has no 'MFG Brand Name' attribute).
  """
  if attributes is None:
    attributes = attr_df

  is_brand_row = attributes['name'] == 'MFG Brand Name'
  in_scope = attributes['product_uid'].isin(df['product_uid'])
  brands = (
      attributes.loc[is_brand_row & in_scope, ['product_uid', 'value']]
      # Keep the first brand per product: a product listed with several brand rows
      # would otherwise multiply df's rows and make set_index(df.index) fail.
      .drop_duplicates('product_uid')
      .rename(columns={'value': 'brand'})
  )
  return pd.merge(df, brands, on='product_uid', how='left').set_index(df.index)

def merge_description(df, descriptions=None):
  """Attach `product_description`, with run-together words separated.

  Args:
    df: frame with a `product_uid` column.
    descriptions: table with `product_uid` and `product_description` columns.
      Defaults to the notebook-level `desc_df`.

  Returns:
    A new frame with the rows, order and index of `df` plus a cleaned
    `product_description` (empty string where the product has no description).
  """
  if descriptions is None:
    descriptions = desc_df

  # One description per product keeps the left merge from adding rows,
  # which would make set_index(df.index) fail.
  unique_descriptions = descriptions.drop_duplicates('product_uid')
  merged = pd.merge(df, unique_descriptions, on='product_uid', how='left').set_index(df.index)

  # Extra preprocessing step: separate words that were concatenated in the
  # description (e.g. "connectorBEHR" / "DeckOver") by splitting before an
  # upper-case letter; runs of capitals stay together.
  # fillna('') keeps products without a description from raising inside re.findall.
  merged['product_description'] = merged['product_description'].fillna('').apply(
      lambda text: ' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', text))
  )
  return merged

In [50]:
# Enrich train first, then test, each with attributes -> brand -> description.
train_df, test_df = (
    merge_description(merge_brand(merge_attributes(split_df)))
    for split_df in (train_df, test_df)
)

# The relevance label is not part of the product database.
train_df = train_df.drop(columns='relevance')

# Stack both splits and keep the first row seen for every product.
combined_df = (
    pd.concat([train_df, test_df], axis=0)
    .reset_index()
    .drop_duplicates('product_uid')
)
print(combined_df.shape)
combined_df.isna().sum()

(124428, 8)


index                      0
id                         0
product_uid                0
product_title              0
search_term                0
combined_attr          38165
brand                  38243
product_description        0
dtype: int64

Filling null values

In [51]:
def first_n(n, sent):
  """Return the first `n` whitespace-separated words of `sent` joined by spaces.

  If `sent` has fewer than `n` words the sentinel string 'error101' is returned.
  """
  tokens = sent.split()
  if len(tokens) < n:
    return 'error101'
  return ' '.join(tokens[:n])

def fillna_brand(data, unique_brnds):
  """Fill missing `brand` values with a guess taken from the start of `product_title`.

  For each row whose brand is null, the first 4, then 3, then 2 words of the
  title are tried against the known brands; the first match wins. If none
  matches, the first word of the title is used (empty string for an empty or
  non-string title).

  Args:
    data: frame with `brand` and `product_title` columns. Modified in place.
    unique_brnds: iterable of known brand names; non-string entries such as
      NaN are ignored.

  Returns:
    The same `data` object, with no nulls left in `brand`.
  """
  # A set gives O(1) lookups; the original list was scanned up to three times per row.
  known_brands = {brand for brand in unique_brnds if isinstance(brand, str)}

  def _guess_brand(title):
    tokens = title.split() if isinstance(title, str) else []
    for n_words in (4, 3, 2):
      if n_words <= len(tokens):
        candidate = ' '.join(tokens[:n_words])
        if candidate in known_brands:
          return candidate
    return tokens[0] if tokens else ''

  missing = data['brand'].isnull()
  if missing.any():
    # Single .loc write on the frame itself: chained indexing
    # (data['brand'].loc[...] = ...) may write to a temporary copy and be lost.
    # to_numpy() assigns positionally, so no index alignment is involved.
    data.loc[missing, 'brand'] = data.loc[missing, 'product_title'].map(_guess_brand).to_numpy()
  return data

def fillna_attributes(data):
  """Fill missing `combined_attr` values with the row's `product_description`.

  Args:
    data: frame with `combined_attr` and `product_description` columns.
      Modified in place.

  Returns:
    The same `data` object.
  """
  missing = data['combined_attr'].isnull()
  if missing.any():
    # One .loc write on the frame itself; the chained form
    # data['combined_attr'].loc[...] = ... may update a temporary copy and be lost.
    data.loc[missing, 'combined_attr'] = data.loc[missing, 'product_description'].to_numpy()
  return data

# Distinct brand values seen so far (NaN is among them while nulls remain).
unique_brands = combined_df['brand'].unique().tolist()
print(len(unique_brands))

# Brands are filled first, then the missing attribute strings.
combined_df = fillna_attributes(fillna_brand(combined_df, unique_brands))

combined_df.isna().sum()

4289


index                  0
id                     0
product_uid            0
product_title          0
search_term            0
combined_attr          0
brand                  0
product_description    0
dtype: int64

Creating the text field (title + brand + description)

In [52]:
# Searchable text per product: title, brand and description separated by single spaces.
title_col, brand_col, description_col = (
    combined_df[column]
    for column in ('product_title', 'brand', 'product_description')
)
combined_df['text'] = title_col + ' ' + brand_col + ' ' + description_col

# Product-level view without the query-specific columns.
temp = combined_df.drop(columns=['index', 'id', 'search_term'])
print(temp.shape)
temp.head()

(124428, 6)


Unnamed: 0,product_uid,product_title,combined_attr,brand,product_description,text
0,100001,Simpson Strong-Tie 12-Gauge Angle,Bullet01 Versatile connector for various 90Â° ...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle Simpson Stro...
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"Application Method Brush,Roller,Spray Assemble...",BEHR Premium Textured DeckOver,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Bath Faucet Type Combo Tub and Shower Built-in...,Delta,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Appliance Type Over the Range Microwave Assemb...,Whirlpool,Achieving delicious results is almost effortle...,Whirlpool 1.9 cu. ft. Over the Range Convectio...
8,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,Battery Power Type Ni-Cad Battery Size .Built-...,Lithonia Lighting,The Quantum Adjustable 2- Light LED Black Emer...,Lithonia Lighting Quantum 2-Light Black LED Em...


In [54]:
def standardize_units(text):
  """Rewrite space-delimited unit abbreviations and plurals to one canonical word.

  The text is padded with one space on each side so that units at the very
  start or end are also delimited by spaces; the padding is kept in the result.

  Args:
    text: string whose tokens are separated by single spaces.

  Returns:
    The padded text with gal/gals/galon -> gallon, ft/fts/feets/foot/foots -> feet,
    squares/sq -> square, lb/lbs/pounds -> pound, oz/ozs/ounces/ounc -> ounce and
    yds/yd/yards -> yard.
  """
  # Lookarounds test for the surrounding spaces without consuming them, so two
  # adjacent units of the same group (" ft ft ") are both rewritten. Matching
  # " ft " literally eats the space that the next token needs as its left delimiter.
  unit_patterns = (
      (r'(?<= )(?:gal|gals|galon)(?= )', 'gallon'),
      (r'(?<= )(?:ft|fts|feets|foot|foots)(?= )', 'feet'),
      (r'(?<= )(?:squares|sq)(?= )', 'square'),
      (r'(?<= )(?:lb|lbs|pounds)(?= )', 'pound'),
      (r'(?<= )(?:oz|ozs|ounces|ounc)(?= )', 'ounce'),
      (r'(?<= )(?:yds|yd|yards)(?= )', 'yard'),
  )
  text = " " + text + " "
  for pattern, canonical in unit_patterns:
    text = re.sub(pattern, canonical, text)
  return text

def preprocessing(sent):
  """Normalise free text for indexing and matching.

  Steps: expand the abbreviation "in." to "inch", split on non-word characters,
  lower-case, put spaces between digits and letters ("5lbs" -> "5 lbs"),
  standardise unit names and collapse whitespace.

  Args:
    sent: raw text (title, brand, description, ...).

  Returns:
    The cleaned, lower-case, single-space-separated string.
  """
  # Expand "in." before punctuation is stripped; otherwise it would become the word "in".
  # Only a stand-alone "in." is expanded: when a letter precedes it, it is the
  # end of a word ("maintain.", "cabin.") and must not be turned into "... inch".
  sent = re.sub(r'(?<![A-Za-z])in\.', ' inch ', sent)
  words = [word.lower() for word in re.split(r'\W+', sent)]
  # Surround every run of letters with spaces: separates numbers from letters.
  res = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", ' '.join(words))
  cleaned = standardize_units(res)
  return ' '.join(cleaned.split())  # collapse the extra whitespace

# Clean the combined text of every product.
temp['text'] = temp['text'].apply(preprocessing)
temp.head()

Unnamed: 0,product_uid,product_title,combined_attr,brand,product_description,text
0,100001,Simpson Strong-Tie 12-Gauge Angle,Bullet01 Versatile connector for various 90Â° ...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",simpson strong tie 12 gauge angle simpson stro...
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"Application Method Brush,Roller,Spray Assemble...",BEHR Premium Textured DeckOver,BEHR Premium Textured DECKOVER is an innovativ...,behr premium textured deckover 1 gallon sc 141...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Bath Faucet Type Combo Tub and Shower Built-in...,Delta,Update your bathroom with the Delta Vero Singl...,delta vero 1 handle shower only faucet trim ki...
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Appliance Type Over the Range Microwave Assemb...,Whirlpool,Achieving delicious results is almost effortle...,whirlpool 1 9 cu feet over the range convectio...
8,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,Battery Power Type Ni-Cad Battery Size .Built-...,Lithonia Lighting,The Quantum Adjustable 2- Light LED Black Emer...,lithonia lighting quantum 2 light black led em...


Creating the word corpus (cleaned titles and brands) used to correct search-term spelling

In [55]:
# Cleaned title and brand are the vocabulary source for the spelling corrector.
temp['cleaned_title'] = temp['product_title'].apply(preprocessing)
temp['cleaned_brand'] = temp['brand'].apply(preprocessing)
corpus = temp['cleaned_title'] + " " + temp['cleaned_brand']

# English stop words from NLTK, held in a set for fast membership tests.
stp_wrds = set(stopwords.words('english'))
def stop_word_removal(sent, stop_words=None):
  """Drop stop words from a whitespace-separated sentence.

  Args:
    sent: text whose words are separated by whitespace.
    stop_words: container of words to remove. Defaults to the notebook-level
      `stp_wrds` set.

  Returns:
    The remaining words joined by single spaces.
  """
  if stop_words is None:
    stop_words = stp_wrds
  return ' '.join(w for w in sent.split() if w not in stop_words)

# Strip stop words, then write one corpus entry per line.
corpus = corpus.apply(stop_word_removal)
np.savetxt(r'Final/cleaning/corpus.txt', corpus.values, fmt='%s')

Testing the spelling corrector

In [56]:
#http://norvig.com/spell-correct.html
def words(text):
    """Lower-case `text` and return the list of its word-character runs."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
# Word -> occurrence count over the saved corpus file.
# The context manager closes the file handle once it has been read.
with open('Final/cleaning/corpus.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())): 
    """Probability of `word`: its count in WORDS divided by N.

    The default N is the total of all counts in WORDS, computed once when this
    definition is executed.
    """
    occurrences = WORDS[word]
    return occurrences / N
def correction(word): 
    """Return the candidate spelling of `word` with the highest probability P."""
    options = candidates(word)
    return max(options, key=P)
def candidates(word): 
    """Return the possible spelling corrections for `word`.

    Preference order: the word itself if known, else known words one edit
    away, else known words two edits away, else the word unchanged. Later
    stages are only computed when the earlier ones are empty.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away
    return {word}
def known(words): 
    """Return, as a set, the members of `words` that are keys of WORDS."""
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word):
    """Return the set of strings one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement by a lower-case ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # delete the character at the cut
            results.add(head + tail[1:])
            if len(tail) > 1:
                # transpose the two characters at the cut
                results.add(head + tail[1] + tail[0] + tail[2:])
        for letter in alphabet:
            if tail:
                # replace the character at the cut
                results.add(head + letter + tail[1:])
            # insert a letter at the cut
            results.add(head + letter + tail)
    return results
def edits2(word): 
    """Lazily yield every string two edits away from `word` (duplicates possible)."""
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
def corrected_term(term):
  """Lower-case `term`, spell-correct each whitespace-separated word and re-join with spaces."""
  tokens = term.lower().split()
  return ' '.join(correction(token) for token in tokens)

In [59]:
# Spot-check the corrector on a few misspelled search terms.
sample_typos = ('air conditionar', 'toiled', 'lawn mowe', 'water heatwr')
for typo in sample_typos:
  print(corrected_term(typo))


air conditioner
toilet
lawn mower
water heater


Saving the product database

In [61]:
# Write the product table without the helper columns that only fed the spelling corpus.
(
    temp
    .drop(columns=['cleaned_title', 'cleaned_brand'])
    .to_csv('database.csv', index=False)
)

### Rank BM25

In [64]:
!pip install rank_bm25



In [65]:
from rank_bm25 import BM25Okapi
# NOTE: `corpus` is rebound here - earlier it held the title+brand Series used for
# the spelling corpus; from this point it is the array of cleaned product texts.
corpus = temp['text'].values
# Tokenise on single spaces (the cleaned text is already single-space separated).
tokenized_corpus = [doc.split(" ") for doc in corpus]

# Build the BM25 index over the tokenised product texts.
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Ad-hoc retrieval check for one query.
# NOTE(review): the query is used as typed - it is not passed through
# preprocessing() or corrected_term() before tokenising; confirm that is intended.
query = "lawn mower"
tokenized_query = query.split(" ")

# Top 10 entries of `corpus` as ranked by the BM25 index for these tokens.
bm25.get_top_n(tokenized_query, corpus, n=10)

['lawn boy 4 5 ounce green paint spray can lawn boy touch up scratches on your lawn boy walk power mower with this lawn boy green touch up spray paint this spray paint can contains the official lawn boy pantone green color and can be used to touch up scuffs or scratches on your lawn boy mower the can contains 12 ounce and can be used time and time again to keep your lawn boy mower looking new 4 5 ounce can official lawn boy pantone green color use to touch up your lawn boy walk power mower keep your lawn boy looking new for many years',
 'ego 20 inch mower blade ego use the ego 20 inch mower blade when you need a fresh sharp blade for your ego power plus lawn mower this replacement blade is designed for specifically the ego 20 inch mower give your lawn the best sharp cut for a crisp and attractive look 3 in 1 mower blade for mulching bagging and side discharging designed specifically for ego power plus mower model lm 2001 factory sharpened steel blade for excellent cutting performance 

In [None]:
# Second ad-hoc retrieval check, same steps with a different query.
# NOTE(review): the query is tokenised as typed, without preprocessing() or
# corrected_term(); confirm that is intended.
query = "air conditioner"
tokenized_query = query.split(" ")

# Top 10 entries of `corpus` as ranked by the BM25 index for these tokens.
bm25.get_top_n(tokenized_query, corpus, n=10)

['duck covers elite 34 inch round air conditioner cover duck covers duck covers air conditioner covers provide breakthrough protection that keep air conditioners protected when not in use our innovative multi layered material creates superior airflow between your air conditioner cover and air conditioner eliminating condensation that can damage your outdoor condenser duck covers do not crack or fade over time migrate to the best air conditioner cover today air conditioner cover is 34 inch dia x 30 inch h 100 waterproof air conditioner cover like water off a duck s back breathable uv treated material is used in all duck covers patio heater covers this material won t crack in cold weather easy to use lightweight material thats easy to fold and store 2 inch wide velcro straps to secure air conditioner cover in place',
 'duck covers elite 34 inch square air conditioner cover duck covers duck covers air conditioner covers provide breakthrough protection that keep air conditioners protected 

In [66]:
import pickle

# Serialise the fitted BM25 index so it can be reloaded without rebuilding it.
# Only unpickle this file from a trusted location: loading a pickle can run arbitrary code.
with open('BM25_model.pkl', 'wb') as model_file:
  pickle.dump(bm25, model_file)