In [1]:
%cd /content/drive/MyDrive/Home_Depot_Case_Study/Workspace3
!pwd

/content/drive/MyDrive/Home_Depot_Case_Study/Workspace3
/content/drive/MyDrive/Home_Depot_Case_Study/Workspace3


In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib_venn import venn2
import regex as re
from collections import Counter
!pip install nltk 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from collections import Counter
from nltk.stem import PorterStemmer 
from wordcloud import WordCloud, STOPWORDS 
from prettytable import PrettyTable
from sklearn.feature_extraction.text import CountVectorizer 
from textblob import TextBlob
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Reading the data


In [None]:
data = pd.read_csv("train.csv", encoding='iso-8859-1')
print(data.shape)

(74067, 5)


**SPLITTING INTO TRAIN AND TEST**

* we want to make sure that we split the data with equal dist of relevance scores hence we are first converting relevance scores into calsses and then splitting them into train and test.

In [None]:
valid = [3.00, 2.33, 2.67, 2.00, 1.67, 1.33, 1.00]
data['relevance_class'] = data['relevance'].apply(lambda score: str(min(valid, key=lambda x:abs(x-score))))
print(data['relevance_class'].unique())

['3.0' '2.33' '2.67' '2.0' '1.0' '1.67' '1.33']


* Note from here on in, the test data refers to the split 20% data from train.csv and not the test.cv data

In [None]:
#index should be [59217, 49176, 26412, 3850, 48569] and [70761, 65893, 18905, 32031, 67726]
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, stratify=data['relevance_class'], train_size=0.8, shuffle=True, random_state=122)
print(train_df.shape)
print(test_df.shape)

(59253, 6)
(14814, 6)


# Cleaning

### Merging the attributes, description and brands

#### Adding Attributes

In [None]:
attr_df = pd.read_csv("attributes.csv", encoding='iso-8859-1')

def merge_attributes(df):
  product_uids = df['product_uid'].values
  temp = attr_df.loc[attr_df['product_uid'].isin(product_uids)].fillna('')  
  temp['name_value'] = temp['name'] + ' ' + temp['value']
  temp['combined_attr'] = temp.groupby(['product_uid'])['name_value'].transform(lambda x: ' '.join(x))
  temp = temp.drop_duplicates('product_uid')[['product_uid', 'combined_attr']]
  df = pd.merge(df, temp, on='product_uid', how='left').set_index(df.index)
  return df

In [None]:
train_df = merge_attributes(train_df)
test_df = merge_attributes(test_df)

#### Adding 'brand' attribute as a seperate feature

In [None]:
def merge_brand(df):
  product_uids = df['product_uid'].values
  temp = attr_df.loc[attr_df['product_uid'].isin(product_uids)]  
  brands = temp[temp['name']=='MFG Brand Name']
  brands_temp = brands[['product_uid','value']]
  df = pd.merge(df, brands_temp, on='product_uid', how='left').set_index(df.index)
  df.rename(columns = {'value':'brand'}, inplace = True) 
  return df

In [None]:
train_df = merge_brand(train_df)
test_df = merge_brand(test_df)

#### Adding description to the training data 

In [None]:
desc_df = pd.read_csv('product_descriptions.csv', encoding='iso-8859-1')

def merge_description(df):
  df = pd.merge(df, desc_df, on='product_uid', how='left').set_index(df.index)
  #an extra preprocessing step is performed to seperate the concatenated words in the description. 
  df['product_description'] = df['product_description'].apply(lambda x: ' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', x)))
  return df

In [None]:
train_df = merge_description(train_df)
test_df = merge_description(test_df)

In [None]:
test_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,relevance_class,combined_attr,brand,product_description
70761,211945,198583,WEN 32 in. Bench Grinder Pedestal Stand with W...,crock pot water spigot,1.67,1.67,Bullet01 31.5 in. pedestal for your grinder to...,WEN,"As all woodworkers know, a bench grinder on a ..."
65893,198123,187563,Libman Wood Floor Sponge Mop,can you use sponge mop,2.67,2.67,"Bullet01 Tear resistant, cellulose coated spon...",Libman,This roller style mop is specifically designed...
18905,58391,115538,Defiant 110å¡ White Motion Sensing Outdoor Sec...,honeywell motion sensor outdoor lights,2.0,2.0,Adjustable Detection Sensitivity Yes Adjustabl...,Defiant,The Defiant 110 Outdoor Motion Sensing Securit...
32031,97935,130548,BLACK+DECKER 36-Volt Lithium-Ion Battery,black and decker 36v,2.67,2.67,Bullet01 Lithium-Ion battery always ready Bull...,BLACK+DECKER,BLACK & DECKER LBXR 36 40- Volt Extended Run T...
67726,203399,191672,Klein Tools 6-Piece Trim-Out Set,veneer trim tool,2.33,2.33,"Bullet01 A quick, portable solution for electr...",Klein Tools,Designed to allow electricians to efficiently ...


### Filling Null Values

**Brand**

In [None]:
train_df.drop('id',inplace=True, axis=1)
test_df.drop('id',inplace=True, axis=1)

In [None]:
unique_brands = np.unique(train_df['brand'].dropna().values)
len(unique_brands)

2952

In [None]:
with open('Final/cleaning/unique_brands.pkl','wb') as f:
  pickle.dump(unique_brands, f)

In [None]:
def first_n(n, sent):
  if n > len(sent.split()):
    return 'error101'
  return ' '.join(sent.split()[:n])

def fillna_brand(data, unique_brnds):
  null_df = data[data['brand'].isnull()]
  notnull_df = data.dropna()

  for i, row in null_df.iterrows():
    title = row['product_title']
    if first_n(4, title) in unique_brnds:
      null_df['brand'].loc[i] = first_n(4, title)
    elif first_n(3, title) in unique_brnds:
      null_df['brand'].loc[i] = first_n(3, title)
    elif first_n(2, title) in unique_brnds:
      null_df['brand'].loc[i] = first_n(2, title)
    else:
      null_df['brand'].loc[i] = first_n(1, title)

  data['brand'].loc[null_df.index] = null_df['brand'].values
  return data

In [None]:
train_df = fillna_brand(train_df, unique_brands)
test_df = fillna_brand(test_df, unique_brands)

**Attributes**

* description data has no null values so we can always fill the attributes null values with the description ones.

In [None]:
def fillna_attributes(data):
  null_df = data[data['combined_attr'].isnull()]
  null_df['combined_attr'] = null_df['product_description'].copy()
  data['combined_attr'].loc[null_df.index] = null_df['combined_attr'].values
  return data

In [None]:
train_df = fillna_attributes(train_df)
test_df = fillna_attributes(test_df)

**Filling any other null values**

In [None]:
train_df = train_df.fillna('')
test_df = test_df.fillna('')

### Basic Preprocessing 

In [None]:
def standardize_units(text):
  text = " " + text + " "
  text = re.sub('( gal | gals | galon )',' gallon ',text)
  text = re.sub('( ft | fts | feets | foot | foots )',' feet ',text)
  text = re.sub('( squares | sq )',' square ',text)
  text = re.sub('( lb | lbs | pounds )',' pound ',text)
  text = re.sub('( oz | ozs | ounces | ounc )',' ounce ',text)
  text = re.sub('( yds | yd | yards )',' yard ',text)
  return text

def preprocessing(sent):
  sent = sent.replace('in.', ' inch ') #If we dont to this then 'in.' will be turned to 'in' in the next step
  words = re.split(r'\W+', sent)
  words = [word.lower() for word in words]
  res = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", ' '.join(words)) #add space between number and alphabets in a string
  cleaned = standardize_units(res) 
  cleaned = ' '.join(cleaned.split()) #removing extra whitespaces
  return cleaned

def preprocessing_search(sent):
  sent = sent.replace('in.', ' inch ')
  words = re.split(r'\W+', sent)
  words = [word.lower() for word in words]
  res = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", ' '.join(words)) #add space between number and alphabets in a string
  res = standardize_units(res) 
  res = res.replace(' in ', ' inch ') #in search_terms 'in' is used more for 'inch' than as a preposition hence this step shouldn't hurt
  cleaned = ' '.join(res.split()) #removing extra whitespaces
  return cleaned

In [None]:
train_df['cleaned_title'] = train_df['product_title'].apply(lambda x : preprocessing(x))
train_df['cleaned_brand'] = train_df['brand'].apply(lambda x : preprocessing(x))
train_df['cleaned_description'] = train_df['product_description'].apply(lambda x : preprocessing(x))
train_df['cleaned_attributes'] = train_df['combined_attr'].apply(lambda x : preprocessing(x))
train_df['cleaned_search'] = train_df['search_term'].apply(lambda x : preprocessing_search(x))

In [None]:
test_df['cleaned_title'] = test_df['product_title'].apply(lambda x : preprocessing(x))
test_df['cleaned_brand'] = test_df['brand'].apply(lambda x : preprocessing(x))
test_df['cleaned_description'] = test_df['product_description'].apply(lambda x : preprocessing(x))
test_df['cleaned_attributes'] = test_df['combined_attr'].apply(lambda x : preprocessing(x))
test_df['cleaned_search'] = test_df['search_term'].apply(lambda x : preprocessing_search(x))

### Correcting Search Term

In [None]:
temp1 = train_df['cleaned_title'] + " "  + train_df['cleaned_brand'] + " " + train_df['cleaned_description'] + " " + train_df['cleaned_attributes'] 
temp2 = test_df['cleaned_title'] + " "  + test_df['cleaned_brand'] + " " + test_df['cleaned_description'] + " " + test_df['cleaned_attributes'] 
corpus = pd.concat([temp1, temp2], axis=0)

#removing stopwords
stp_wrds = set(stopwords.words('english'))
def stop_word_removal(sent):
  words = sent.split()
  words = [w for w in words if not w in stp_wrds]
  return ' '.join(words)

corpus = corpus.apply(lambda x : stop_word_removal(x)) 
np.savetxt(r'preprocessing/corpus.txt', corpus.values, fmt='%s')
np.savetxt(r'Final/cleaning/corpus.txt', corpus.values, fmt='%s')

In [None]:
#http://norvig.com/spell-correct.html
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('preprocessing/corpus.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N
def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)
def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or set([word]))
def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def corrected_term(term):
  temp = term.lower().split()
  temp = [correction(word) for word in temp]
  return ' '.join(temp)

In [None]:
train_df['corrected_search'] = train_df['cleaned_search'].apply(lambda x: corrected_term(x))
test_df['corrected_search'] = test_df['cleaned_search'].apply(lambda x: corrected_term(x))

### Further Cleaning

In [None]:
#stop word removal and stemming
#We didn't do this before because we wanted to fix the typos in the searh term first 
porter = PorterStemmer()
stp_wrds = set(stopwords.words('english'))

def futher_preprocessing(sent):
  sent = sent.replace('_', ' _ ')
  words = sent.split()
  words = [w for w in words if not w in stp_wrds]
  words = [porter.stem(word) for word in words]
  return ' '.join(words)

In [None]:
#storing the final data in a new dataframe 'cleaned_df_train' 
cleaned_df_train = pd.DataFrame() 
cleaned_df_train['title'] = train_df['cleaned_title'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['brand'] = train_df['cleaned_brand'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['description'] = train_df['cleaned_description'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['attributes'] = train_df['cleaned_attributes'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['search'] = train_df['cleaned_search'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['corrected_search'] = train_df['corrected_search'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_train['relevance'] = train_df['relevance']
cleaned_df_train.head()

Unnamed: 0,title,brand,description,attributes,search,corrected_search,relevance
59217,winter instrument p 9 u 90 seri 2 inch panel m...,winter instrument,winter instrument p 9 u 90 seri panel mount pr...,accessori type gaug bullet 01 gener purpos pan...,tekton pressur gaug,tekton pressur gaug,2.33
49176,american craftsman 24 inch x 36 inch 50 seri r...,american craftsman,seri 50 slider fin slide vinyl window combin c...,seri 50 slider fin slide vinyl window combin c...,48 x 35 slider window,48 x 35 slider window,1.67
26412,kohler windward 6 feet right hand drain tile f...,kohler,look forward privat getaway windward bath feat...,look forward privat getaway windward bath feat...,6 feet bathtub,6 feet bathtub,3.0
3850,35 000 btu hr monterey top vent graviti wall f...,william,instal monterey 35 000 btu natur ga top vent w...,accessori includ automat shutoff ye bullet 01 ...,ventenatur ga heater,ventenatur ga heater,2.33
48569,sylvania 60 watt halogen 19 doubl life soft wh...,sylvania,sylvania doubl life 19 light bulb classic shap...,actual color temperatur k 2750 averag life hou...,cheapest 60 watt light bulb,cleanest 60 watt light bulb,2.0


In [None]:
#storing the final data in a new dataframe 'cleaned_df_test' 
cleaned_df_test = pd.DataFrame() 
cleaned_df_test['title'] = test_df['cleaned_title'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['brand'] = test_df['cleaned_brand'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['description'] = test_df['cleaned_description'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['attributes'] = test_df['cleaned_attributes'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['search'] = test_df['cleaned_search'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['corrected_search'] = test_df['corrected_search'].apply(lambda x : futher_preprocessing(x)) 
cleaned_df_test['relevance'] = test_df['relevance']
cleaned_df_test.head()

Unnamed: 0,title,brand,description,attributes,search,corrected_search,relevance
70761,wen 32 inch bench grinder pedest stand water pot,wen,woodwork know bench grinder normal work surfac...,bullet 01 31 5 inch pedest grinder rest bullet...,crock pot water spigot,crock pot water spigot,1.67
65893,libman wood floor spong mop,libman,roller style mop specif design safe effect cle...,bullet 01 tear resist cellulos coat spong bull...,use spong mop,fan yu use spong mop,2.67
18905,defiant 110å white motion sens outdoor secur l...,defiant,defiant 110 outdoor motion sens secur light pe...,adjust detect sensit ye adjust lamp head ye bu...,honeywel motion sensor outdoor light,honeywel motion sensor outdoor light,2.0
32031,black decker 36 volt lithium ion batteri,black decker,black decker lbxr 36 40 volt extend run time l...,bullet 01 lithium ion batteri alway readi bull...,black decker 36 v,black hand decker 36 v,2.67
67726,klein tool 6 piec trim set,klein tool,design allow electrician effici handl major da...,bullet 01 quick portabl solut electrician go b...,veneer trim tool,veneer trim tool,2.33


**cleaned_df2 with no stemming**

In [None]:
#stop word removal only - no stemming
def futher_preprocessing_without_stem(sent):
  sent = sent.replace('_', ' _ ')
  words = sent.split()
  words = [w for w in words if not w in stp_wrds]
  return ' '.join(words)

In [None]:
cleaned_df2_train = pd.DataFrame()
cleaned_df2_train['title'] = train_df['cleaned_title'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_train['brand'] = train_df['cleaned_brand'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_train['description'] = train_df['cleaned_description'].apply(lambda x : futher_preprocessing_without_stem(x))
cleaned_df2_train['attributes'] = train_df['cleaned_attributes'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_train['search'] = train_df['cleaned_search'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_train['corrected_search'] = train_df['corrected_search'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_train['relevance'] = train_df['relevance']
cleaned_df2_train.head()

Unnamed: 0,title,brand,description,attributes,search,corrected_search,relevance
59217,winters instruments p 9 u 90 series 2 inch pan...,winters instruments,winters instruments p 9 u 90 series panel moun...,accessory type gauge bullet 01 general purpose...,tekton pressure gauge,tekton pressure gauge,2.33
49176,american craftsman 24 inch x 36 inch 50 series...,american craftsman,series 50 slider fin sliding vinyl window comb...,series 50 slider fin sliding vinyl window comb...,48 x 35 slider window,48 x 35 slider window,1.67
26412,kohler windward 6 feet right hand drain tile f...,kohler,look forward private getaway windward bath fea...,look forward private getaway windward bath fea...,6 feet bathtub,6 feet bathtub,3.0
3850,35 000 btu hr monterey top vent gravity wall f...,williams,install monterey 35 000 btu natural gas top ve...,accessories included automatic shutoff yes bul...,ventenatural gas heater,ventenatural gas heater,2.33
48569,sylvania 60 watt halogen 19 double life soft w...,sylvania,sylvania double life 19 light bulb classic sha...,actual color temperature k 2750 average life h...,cheapest 60 watt light bulb,cleanest 60 watt light bulb,2.0


In [None]:
#stop word removal only - no stemming
cleaned_df2_test = pd.DataFrame()
cleaned_df2_test['title'] = test_df['cleaned_title'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_test['brand'] = test_df['cleaned_brand'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_test['description'] = test_df['cleaned_description'].apply(lambda x : futher_preprocessing_without_stem(x))
cleaned_df2_test['attributes'] = test_df['cleaned_attributes'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_test['search'] = test_df['cleaned_search'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_test['corrected_search'] = test_df['corrected_search'].apply(lambda x : futher_preprocessing_without_stem(x)) 
cleaned_df2_test['relevance'] = test_df['relevance']
cleaned_df2_test.head()

Unnamed: 0,title,brand,description,attributes,search,corrected_search,relevance
70761,wen 32 inch bench grinder pedestal stand water...,wen,woodworkers know bench grinder normal work sur...,bullet 01 31 5 inch pedestal grinder rest bull...,crock pot water spigot,crock pot water spigot,1.67
65893,libman wood floor sponge mop,libman,roller style mop specifically designed safely ...,bullet 01 tear resistant cellulose coated spon...,use sponge mop,fan yu use sponge mop,2.67
18905,defiant 110å white motion sensing outdoor secu...,defiant,defiant 110 outdoor motion sensing security li...,adjustable detection sensitivity yes adjustabl...,honeywell motion sensor outdoor lights,honeywell motion sensor outdoor lights,2.0
32031,black decker 36 volt lithium ion battery,black decker,black decker lbxr 36 40 volt extended run time...,bullet 01 lithium ion battery always ready bul...,black decker 36 v,black hand decker 36 v,2.67
67726,klein tools 6 piece trim set,klein tools,designed allow electricians efficiently handle...,bullet 01 quick portable solution electricians...,veneer trim tool,veneer trim tool,2.33


**REPLACING EMPTY STRINGS**

* Brand feature has some empty strings. This could have happened while we were removing stop words as some brands had the first word of title. This can cause problem when we store the file hence we are replacing them.
* Even for search terms, this is observed as some 20 search terms were 'To' and 'or' which got removed during stop word removal. It's not observed in corrected_search as in corrected_search before stopword removal 'To' was converted to 'top' and 'or' to 'r'

In [None]:
cleaned_df_train['brand'] = cleaned_df_train['brand'].replace(to_replace =[""], value ="missing_brand")
cleaned_df_train['search'] = cleaned_df_train['search'].replace(to_replace =[""], value ="missing_search")
cleaned_df_train['corrected_search'] = cleaned_df_train['corrected_search'].replace(to_replace =[""], value ="missing_search")

cleaned_df2_train['brand'] = cleaned_df2_train['brand'].replace(to_replace =[""], value ="missing_brand")
cleaned_df2_train['search'] = cleaned_df2_train['search'].replace(to_replace =[""], value ="missing_search")
cleaned_df2_train['corrected_search'] = cleaned_df2_train['corrected_search'].replace(to_replace =[""], value ="missing_search")

In [None]:
cleaned_df_test['brand'] = cleaned_df_test['brand'].replace(to_replace =[""], value ="missing_brand")
cleaned_df_test['search'] = cleaned_df_test['search'].replace(to_replace =[""], value ="missing_search")
cleaned_df_test['corrected_search'] = cleaned_df_test['corrected_search'].replace(to_replace =[""], value ="missing_search")

cleaned_df2_test['brand'] = cleaned_df2_test['brand'].replace(to_replace =[""], value ="missing_brand")
cleaned_df2_test['search'] = cleaned_df2_test['search'].replace(to_replace =[""], value ="missing_search")
cleaned_df2_test['corrected_search'] = cleaned_df2_test['corrected_search'].replace(to_replace =[""], value ="missing_search")

**Removing the word 'bullet' from the 'attributes' feature**
* Only removing the instance which is followed by 2 digits

In [None]:
cleaned_df_train['attributes'] = cleaned_df_train['attributes'].apply(lambda x: re.sub('bullet \d\d ', '', x))
cleaned_df2_train['attributes'] = cleaned_df2_train['attributes'].apply(lambda x: re.sub('bullet \d\d ', '', x))

cleaned_df_train['description'] = cleaned_df_train['description'].apply(lambda x: re.sub('bullet \d\d ', '', x))
cleaned_df2_train['description'] = cleaned_df2_train['description'].apply(lambda x: re.sub('bullet \d\d ', '', x))

#Renaming 'search' as 'raw_search' and storing the dataframe
cleaned_df_train.rename(columns={"search": "raw_search"}, inplace=True)
cleaned_df2_train.rename(columns={"search": "raw_search"}, inplace=True)

In [None]:
cleaned_df_test['attributes'] = cleaned_df_test['attributes'].apply(lambda x: re.sub('bullet \d\d ', '', x))
cleaned_df2_test['attributes'] = cleaned_df2_test['attributes'].apply(lambda x: re.sub('bullet \d\d ', '', x))

cleaned_df_test['description'] = cleaned_df_test['description'].apply(lambda x: re.sub('bullet \d\d ', '', x))
cleaned_df2_test['description'] = cleaned_df2_test['description'].apply(lambda x: re.sub('bullet \d\d ', '', x))

#Renaming 'search' as 'raw_search' and storing the dataframe
cleaned_df_test.rename(columns={"search": "raw_search"}, inplace=True)
cleaned_df2_test.rename(columns={"search": "raw_search"}, inplace=True)

**Rechecking the indexes**
* index should be [59217, 49176, 26412, 3850, 48569] and [70761, 65893, 18905, 32031, 67726]


In [None]:
print(cleaned_df_train.index[:5], cleaned_df_test.index[:5])

Int64Index([59217, 49176, 26412, 3850, 48569], dtype='int64') Int64Index([70761, 65893, 18905, 32031, 67726], dtype='int64')


**SAVING THE DATAFRAMES**

In [None]:
#relevance class for stratified sampling
cleaned_df_train['relevance_class'] = train_df['relevance_class']
cleaned_df2_train['relevance_class'] = train_df['relevance_class']

cleaned_df_train.to_pickle('preprocessing/cleaned_df_train.pkl')
cleaned_df2_train.to_pickle('preprocessing/cleaned_df2_train.pkl')

In [None]:
#relevance class for stratified sampling
cleaned_df_test['relevance_class'] = test_df['relevance_class']
cleaned_df2_test['relevance_class'] = test_df['relevance_class']

cleaned_df_test.to_pickle('preprocessing/cleaned_df_test.pkl')
cleaned_df2_test.to_pickle('preprocessing/cleaned_df2_test.pkl')