**Reference**

---

Mainly adapted from the following notebook: https://github.com/kriz17/Home-Depot-Product-Search-Relevance/blob/master/Extension/bm25.ipynb

In [None]:
import pandas as pd
import numpy as np
import regex as re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /gdrive; every dataset and output path in this notebook lives under that mount point.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Query/product pairs (train and test splits).
train_data_path = '/gdrive/MyDrive/ColabNotebooks/thesis/dataset/train.csv'
test_data_path = '/gdrive/MyDrive/ColabNotebooks/thesis/dataset/test.csv'
# Per-product side tables, joined on product_uid later in the notebook.
prod_desc_path = '/gdrive/MyDrive/ColabNotebooks/thesis/dataset/product_descriptions.csv'
attr_path = '/gdrive/MyDrive/ColabNotebooks/thesis/dataset/attributes.csv'

In [None]:
# The train/test splits are decoded as ISO-8859-1; the two side tables use
# the pandas default encoding.
df_train = pd.read_csv(train_data_path, encoding='ISO-8859-1')
df_test = pd.read_csv(test_data_path, encoding='ISO-8859-1')
df_pd_desc = pd.read_csv(prod_desc_path)
df_attributes = pd.read_csv(attr_path)

***Merge all the datasets***

---
This step creates the corpus from all of the available datasets.


In [None]:
def merge_attri(df):
  """Add a ``combined_attr`` column holding every attribute of each product.

  Reads the module-level ``df_attributes`` table. For each product_uid
  present in ``df``, all "<name> <value>" pairs are joined with single
  spaces into one string. Products without any attribute row get NaN.

  Returns a new frame that keeps the index of ``df``.
  """
  wanted = df_attributes['product_uid'].isin(df['product_uid'].values)
  # Missing names/values become empty strings so concatenation never yields NaN.
  attrs = df_attributes.loc[wanted].fillna('')
  attrs['name_value'] = attrs['name'] + ' ' + attrs['value']
  # One row per product: all of its "name value" strings joined together.
  combined = (
      attrs.groupby('product_uid')['name_value']
      .agg(' '.join)
      .reset_index(name='combined_attr')
  )
  merged = pd.merge(df, combined, on='product_uid', how='left')
  return merged.set_index(df.index)

def merge_brand(df):
  """Add a ``brand`` column taken from the 'MFG Brand Name' attribute.

  Reads the module-level ``df_attributes`` table. Products without a brand
  attribute get NaN. Returns a new frame that keeps the index of ``df``.
  """
  product_uids = df['product_uid'].values
  is_brand_row = (
      df_attributes['product_uid'].isin(product_uids)
      & (df_attributes['name'] == 'MFG Brand Name')
  )
  brands = (
      df_attributes.loc[is_brand_row, ['product_uid', 'value']]
      # Keep one brand per product: a duplicated brand row would fan out the
      # left merge and make the set_index below fail with a length mismatch.
      .drop_duplicates('product_uid')
      .rename(columns={'value': 'brand'})
  )
  merged = pd.merge(df, brands, on='product_uid', how='left')
  return merged.set_index(df.index)

def merge_desc(df):
  """Add the ``product_description`` column and split run-together words.

  Reads the module-level ``df_pd_desc`` table and left-joins it on
  product_uid. Each description is then re-tokenised so that a capital letter
  in the middle of a token starts a new word: a token is either an optional
  capital followed by non-capital, non-space characters, or a run of capitals.
  Tokens are re-joined with single spaces.

  Returns a new frame that keeps the index of ``df``.
  """
  merged = pd.merge(df, df_pd_desc, on='product_uid', how='left').set_index(df.index)
  word_pattern = re.compile(r'[A-Z]?[^A-Z\s]+|[A-Z]+')

  def split_joined_words(text):
    # A product missing from df_pd_desc yields NaN after the left join;
    # leave it untouched instead of crashing inside findall.
    if not isinstance(text, str):
      return text
    return ' '.join(word_pattern.findall(text))

  merged['product_description'] = merged['product_description'].apply(split_joined_words)
  return merged

In [None]:
# Enrich both splits in the same order: attributes, then brand, then description.
df_train = merge_desc(merge_brand(merge_attri(df_train)))
df_test = merge_desc(merge_brand(merge_attri(df_test)))

# Drop the label column from the training split before stacking the two splits.
df_train = df_train.drop(columns='relevance')

# Stack both splits and keep a single row per product.
df_combined = pd.concat([df_train, df_test], axis=0).reset_index()
df_combined = df_combined.drop_duplicates('product_uid')
print(df_combined.shape)
df_combined.isna().sum()

(124428, 8)


index                      0
id                         0
product_uid                0
product_title              0
search_term                0
combined_attr          38165
brand                  38243
product_description        0
dtype: int64

***Basic pre-processing: fill null values***

In [None]:
def first_n(n, sent):
  """Return the first ``n`` whitespace-separated words of ``sent``.

  The sentinel string 'error101' is returned when ``sent`` has fewer than
  ``n`` words.
  """
  tokens = sent.split()
  if len(tokens) < n:
    return 'error101'
  return ' '.join(tokens[:n])

def fillna_brand(data, unique_brnds):
  """Fill missing ``brand`` values with a guess taken from the product title.

  For every row whose brand is null, the first 4, 3 and then 2 words of
  ``product_title`` are tried in that order; the first prefix found in
  ``unique_brnds`` becomes the brand. If none matches, the first word of the
  title is used (``first_n`` returns 'error101' for a title with no words).

  ``data`` is modified in place and also returned.
  """
  known_brands = set(unique_brnds)  # constant-time membership inside the per-row lookup
  missing = data['brand'].isnull()

  def guess_brand(title):
    for size in (4, 3, 2):
      prefix = first_n(size, title)
      if prefix in known_brands:
        return prefix
    return first_n(1, title)

  guesses = data.loc[missing, 'product_title'].map(guess_brand)
  # A single .loc write on the frame itself: chained assignment
  # (data['brand'].loc[...] = ...) is not guaranteed to reach `data`.
  data.loc[missing, 'brand'] = guesses.to_numpy()
  return data

def fillna_attributes(data):
  """Fill missing ``combined_attr`` values with the row's ``product_description``.

  ``data`` is modified in place and also returned.
  """
  missing = data['combined_attr'].isnull()
  # A single .loc write on the frame itself: chained assignment
  # (data['combined_attr'].loc[...] = ...) is not guaranteed to reach `data`.
  # to_numpy() makes the assignment positional, so duplicate index labels are safe.
  data.loc[missing, 'combined_attr'] = data.loc[missing, 'product_description'].to_numpy()
  return data

# Distinct values of the brand column, used as the lookup list for title prefixes.
unique_brands = list(df_combined['brand'].unique())
print(len(unique_brands))

# Fill brands first, then attributes.
df_combined = fillna_attributes(fillna_brand(df_combined, unique_brands))

df_combined.isna().sum()

4289


index                  0
id                     0
product_uid            0
product_title          0
search_term            0
combined_attr          0
brand                  0
product_description    0
dtype: int64

***The text field is created by concatenating the product title, brand and product description***

In [None]:
# Searchable text per product: title, brand and description separated by single spaces.
df_combined['text'] = (
    df_combined['product_title']
    + ' ' + df_combined['brand']
    + ' ' + df_combined['product_description']
)
# Drop the query-level columns; the remaining columns describe the product only.
temp = df_combined.drop(columns=['index', 'id', 'search_term'])
print(temp.shape)
temp.head()

(124428, 6)


Unnamed: 0,product_uid,product_title,combined_attr,brand,product_description,text
0,100001,Simpson Strong-Tie 12-Gauge Angle,Bullet01 Versatile connector for various 90° c...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle Simpson Stro...
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"Application Method Brush,Roller,Spray Assemble...",BEHR Premium Textured DeckOver,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Bath Faucet Type Combo Tub and Shower Built-in...,Delta,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Appliance Type Over the Range Microwave Assemb...,Whirlpool,Achieving delicious results is almost effortle...,Whirlpool 1.9 cu. ft. Over the Range Convectio...
8,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,Battery Power Type Ni-Cad Battery Size .Built-...,Lithonia Lighting,The Quantum Adjustable 2- Light LED Black Emer...,Lithonia Lighting Quantum 2-Light Black LED Em...


In [None]:
# Space-delimited unit spellings mapped to the single form used in the corpus.
_UNIT_SYNONYMS = {
    'gal': 'gallon', 'gals': 'gallon', 'galon': 'gallon',
    'ft': 'feet', 'fts': 'feet', 'feets': 'feet', 'foot': 'feet', 'foots': 'feet',
    'squares': 'square', 'sq': 'square',
    'lb': 'pound', 'lbs': 'pound', 'pounds': 'pound',
    'oz': 'ounce', 'ozs': 'ounce', 'ounces': 'ounce', 'ounc': 'ounce',
    'yds': 'yard', 'yd': 'yard', 'yards': 'yard',
}

def standardize_units(text):
  """Replace unit abbreviations and variants with one canonical spelling.

  Only tokens delimited by single space characters are replaced, so 'ft'
  inside 'soft' is untouched. The result keeps one extra leading and one
  extra trailing space, as callers collapse whitespace afterwards.

  Replacement is done token by token, so two adjacent units from the same
  group are both converted ('3 lbs lbs' gives ' 3 pound pound '). A regex
  that consumes the surrounding spaces would skip the second one.
  """
  padded = " " + text + " "
  # split(' ') keeps empty strings, so the original spacing survives the join.
  return ' '.join(_UNIT_SYNONYMS.get(token, token) for token in padded.split(' '))

def preprocessing(sent):
  """Normalise raw product text into lower-case, space-separated tokens.

  Steps, in order:
    1. expand the abbreviation 'in.' to 'inch';
    2. split on runs of non-word characters and lower-case every piece;
    3. pad each run of letters with spaces, separating digits from letters;
    4. standardise unit spellings via ``standardize_units``;
    5. collapse repeated whitespace.
  """
  # 'in.' must be expanded before punctuation is stripped, otherwise it becomes
  # the plain word 'in'. The lookbehind skips 'in.' directly after a letter, so
  # words that merely end in "in." (e.g. "drain.") are no longer turned into
  # "dra inch"; "12in.", "12 in." and "(in.)" still match.
  sent = re.sub(r'(?<![A-Za-z])in\.', ' inch ', sent)
  tokens = [piece.lower() for piece in re.split(r'\W+', sent)]
  # Add spaces around every alphabetic run so '12gauge' becomes '12 gauge'.
  spaced = re.sub("[A-Za-z]+", lambda match: " " + match[0] + " ", ' '.join(tokens))
  unit_normalised = standardize_units(spaced)
  return ' '.join(unit_normalised.split())  # collapse extra whitespace

# Normalise the combined text of every product.
temp['text'] = temp['text'].apply(preprocessing)
temp.head()

Unnamed: 0,product_uid,product_title,combined_attr,brand,product_description,text
0,100001,Simpson Strong-Tie 12-Gauge Angle,Bullet01 Versatile connector for various 90° c...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",simpson strong tie 12 gauge angle simpson stro...
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"Application Method Brush,Roller,Spray Assemble...",BEHR Premium Textured DeckOver,BEHR Premium Textured DECKOVER is an innovativ...,behr premium textured deckover 1 gallon sc 141...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Bath Faucet Type Combo Tub and Shower Built-in...,Delta,Update your bathroom with the Delta Vero Singl...,delta vero 1 handle shower only faucet trim ki...
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Appliance Type Over the Range Microwave Assemb...,Whirlpool,Achieving delicious results is almost effortle...,whirlpool 1 9 cu feet over the range convectio...
8,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,Battery Power Type Ni-Cad Battery Size .Built-...,Lithonia Lighting,The Quantum Adjustable 2- Light LED Black Emer...,lithonia lighting quantum 2 light black led em...


***Database for search-term spelling correction***

In [None]:
# Cleaned titles and brands form the raw corpus text, one line per product.
temp['cleaned_title'] = temp['product_title'].apply(preprocessing)
temp['cleaned_brand'] = temp['brand'].apply(preprocessing)
corpus = temp['cleaned_title'] + " " + temp['cleaned_brand']

# English stop words from NLTK, held in a set for fast membership tests.
stp_wrds = set(stopwords.words('english'))
def stop_word_removal(sent):
  """Return ``sent`` with every token found in ``stp_wrds`` removed.

  Tokens are split on whitespace and re-joined with single spaces.
  """
  return ' '.join(token for token in sent.split() if token not in stp_wrds)

# Strip stop words, then write one product per line to corpus.txt.
corpus = corpus.apply(stop_word_removal)
np.savetxt(
    r'/gdrive/MyDrive/ColabNotebooks/thesis/final_model/corpus.txt',
    corpus.values,
    fmt='%s',
)

***Test for spell corrector***

In [None]:
#http://norvig.com/spell-correct.html
def words(text):
    """Lower-case ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
# Word-frequency table for the spell corrector, built from corpus.txt.
# The file is read inside a context manager so its handle is closed promptly
# instead of being left open.
with open('/gdrive/MyDrive/ColabNotebooks/thesis/final_model/corpus.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of ``word`` in WORDS.

    ``N`` defaults to the total word count, evaluated once when the function
    is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N
def correction(word):
    """Return the candidate spelling of ``word`` with the highest probability ``P``."""
    options = candidates(word)
    return max(options, key=P)
def candidates(word):
    """Return the first non-empty set of possible spellings for ``word``.

    Tried in order: the word itself if known, known words one edit away,
    known words two edits away, and finally the unchanged word.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return set([word])
def known(words):
    """Return the set of entries in ``words`` that occur in the WORDS table."""
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is a deletion, a transposition of adjacent characters, a
    replacement by a lower-case ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    size = len(word)
    # Positions 0..size-1 have a character to act on; insertion also uses position `size`.
    removed = [word[:i] + word[i + 1:] for i in range(size)]
    swapped = [word[:i] + word[i + 1] + word[i] + word[i + 2:] for i in range(size - 1)]
    replaced = [word[:i] + ch + word[i + 1:] for i in range(size) for ch in alphabet]
    inserted = [word[:i] + ch + word[i:] for i in range(size + 1) for ch in alphabet]
    return set(removed + swapped + replaced + inserted)
def edits2(word):
    """Lazily yield every string two edits away from ``word`` (duplicates included)."""
    first_hop = edits1(word)
    return (second for first in first_hop for second in edits1(first))
def corrected_term(term):
  """Spell-correct each whitespace-separated word of ``term``.

  The term is lower-cased first; corrected words are joined with single spaces.
  """
  fixed_tokens = [correction(token) for token in term.lower().split()]
  return ' '.join(fixed_tokens)

In [None]:
# Smoke test: each misspelled query is printed in its corrected form.
for typo in ('air conditionar', 'toiled', 'lawn mowe', 'water heatwr'):
  print(corrected_term(typo))

air conditioner
toilet
lawn mower
water heater


In [None]:
# Drop the helper columns and export the product database.
# to_csv returns None when given a path, so its result must not be assigned
# back to `temp`: the BM25 cell further down still reads temp['text'].
# errors='ignore' keeps this cell re-runnable after the columns are gone.
temp = temp.drop(columns=['cleaned_title', 'cleaned_brand'], errors='ignore')
temp.to_csv('/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/database.csv', index=False)

In [None]:
# db=pd.read_csv('/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/database.csv',encoding = "ISO-8859-1")

In [None]:
#https://stackoverflow.com/questions/57983431/whats-the-most-space-efficient-way-to-compress-serialized-python-data

In [None]:
# import pickle
# import lzma
# with lzma.open("/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/database.xz", "wb") as f:
#     pickle.dump(db, f)

In [None]:
# with lzma.open("/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/database.xz") as f:
#     file_content = f.read()

In [None]:
# print(type(file_content))

In [None]:
# import gzip
# # content = b"Lots of content here"
# # with gzip.open('/home/joe/file.txt.gz', 'wb') as f:
# #     f.write(content)

# # with gzip.open('/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/database.csv.gz', 'wb') as f:
# #     f.write(temp)
# def save_zipped_pickle(obj, filename, protocol=-1):
#     with gzip.open('/gdrive/MyDrive/ColabNotebooks/thesis/corrected/model/db.pgz', 'wb') as f:
#         cPickle.dump(obj, f, protocol)

In [None]:
# import pickle
# with open('/gdrive/MyDrive/ColabNotebooks/thesis/final_model/database.pkl', 'wb') as f:
#   pickle.dump(temp, f)
# # temp.to_pickle()  # where to save it, usually as a .pkl

In [None]:
# df = pd.read_pickle('/gdrive/MyDrive/ColabNotebooks/thesis/final_model/database.pkl')

In [None]:
# temp.to_hdf('data.h5', key='df', mode='w')

In [None]:
# store = pd.HDFStore('store.h5')

# store['temp'] = temp # save it


In [None]:
# store['temp']  # load it

**Creating Rank BM25 model**

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi

# One document per product; each document is its pre-processed text split on single spaces.
corpus = temp['text'].values
token_corpus = [document.split(" ") for document in corpus]
# Build the BM25 index over the tokenised documents.
bm25 = BM25Okapi(token_corpus)

In [None]:
import pickle

# Persist the BM25 index so it can be loaded without rebuilding it.
with open('/gdrive/MyDrive/ColabNotebooks/thesis/final_model/BM25_model.pkl', 'wb') as model_file:
  pickle.dump(bm25, model_file)