# Load Train set

In [4]:
import pandas as pd

# Raw training rows; the file is read as ISO-8859-1.
train = pd.read_csv('data/train.csv', encoding="ISO-8859-1")
# Per-product description/attribute table, joined on product_uid below.
product_descriptions_with_attributes = pd.read_csv('product_descriptions_with_attributes.csv')
# Left join: every training row is kept even if its product has no row in the attribute table.
train = train.merge(product_descriptions_with_attributes, how='left', on=['product_uid'])
# Target is the 'relevance' column; the features are every other column.
y_train = train['relevance']
X_train = train.loc[:, train.columns != 'relevance']

# Cleaning Pipeline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re
from tqdm import tqdm
import pkg_resources
from symspellpy import SymSpell, Verbosity
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
import string

class OverallStringClean(BaseEstimator, TransformerMixin):
    """Stateless transformer that text-normalizes every column of a DataFrame.

    Every cell is converted with ``str()`` and passed through ``clean_str``,
    so non-string values come out as their lower-cased string form (a missing
    value becomes the literal text ``"nan"``).
    """

    def clean_str(self, string):
        """Return ``string`` lower-cased with punctuation split into tokens.

        Parameters
        ----------
        string : str
            Raw text.

        Returns
        -------
        str
            Text in which contractions and ``, ! ( ) ?`` are separated from
            neighbouring words by single spaces, stripped and lower-cased.
        """
        # Anything outside letters, digits and / . ( ) , ! ? ' ` becomes a space.
        string = re.sub(r"[^A-Za-z0-9/.(),!?\'\`]", " ", string)
        # Detach common English contractions from the preceding word.
        string = re.sub(r"'s", " 's", string)
        string = re.sub(r"'ve", " 've", string)
        string = re.sub(r"n't", " n't", string)
        string = re.sub(r"'re", " 're", string)
        string = re.sub(r"'d", " 'd", string)
        string = re.sub(r"'ll", " 'll", string)
        # Surround , ! ( ) ? with spaces. The previous replacement strings
        # " \( ", " \) " and " \? " were left untouched by re.sub, so a literal
        # backslash ended up in the output; the back-reference form avoids that.
        string = re.sub(r"([,!()?])", r" \1 ", string)
        # Collapse the runs of whitespace introduced above.
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def fit(self, X, y=None):
        """Nothing to learn; present for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose every column is cleaned.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame with the same columns, all holding cleaned strings.
        """
        X = X.copy()
        # Iterate over the columns of the frame being transformed, not over a
        # notebook-level global, so the transformer works on any frame.
        for column in X.columns:
            X[column] = X[column].map(lambda value: self.clean_str(str(value)))
        return X

class CheckSpell(BaseEstimator, TransformerMixin):
    """Transformer that spell-corrects the ``search_term`` column with SymSpell."""

    def __init__(self):
        """Build a SymSpell instance and load the dictionaries shipped with symspellpy.

        Raises
        ------
        FileNotFoundError
            If either packaged dictionary file cannot be loaded.
        """
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        # The loaders report a missing file through their return value; check it
        # so an unusable (empty) dictionary is not used silently.
        if not self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
            raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
        if not self.sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
            raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

    def check_spell(self, input_term):
        """Return the top compound-lookup suggestion for ``input_term``.

        Falls back to ``input_term`` unchanged if SymSpell returns no suggestion.
        """
        suggestions = self.sym_spell.lookup_compound(input_term, max_edit_distance=2)
        if not suggestions:
            return input_term
        # Use the public ``term`` property rather than the private ``_term`` field.
        return suggestions[0].term

    def fit(self, X, y=None):
        """Nothing to learn; present for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``search_term`` spell-corrected.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``search_term`` column of strings.
        y : ignored
        """
        X = X.copy()
        X['search_term'] = X['search_term'].map(self.check_spell)
        return X

class StemmingUnitsSpecialCharacters(BaseEstimator, TransformerMixin):
    """Normalize units, strip markup/special characters and Porter-stem text.

    Applied to the ``product_title`` and ``search_term`` columns.
    """

    # Regex (pattern, replacement) pairs applied in order to normalize
    # "<number> <unit>" spellings.
    # NOTE(review): the unit alternations have no trailing word boundary, so a
    # unit prefix of a longer word (e.g. "2 inside") is still rewritten.
    _UNIT_RULES = (
        # Join a decimal number that was split around its point: "1 . 5" -> "1.5".
        (r"([0-9])( *)\.( *)([0-9])", r"\1.\4"),
        (r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. "),
        (r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. "),
        (r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. "),
        (r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. "),
        (r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. "),
        # Flow rates must run before the plain gallon rule; otherwise that rule
        # consumes "gallons" first and these patterns can never match.
        (r"([0-9]+)( *)(gallons per minute|gallon per minute|gal per minute|gallons/min\.|gallons/min)\.?", r"\1 gal. per min. "),
        (r"([0-9]+)( *)(gallons per hour|gallon per hour|gal per hour|gallons/hour|gallons/hr)\.?", r"\1 gal. per hr "),
        (r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. "),
        (r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. "),
        (r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. "),
        (r"([0-9]+)( *)(millimeters|milimeters|mm)\.?", r"\1mm. "),
        (r"([0-9]+)( *)(°|degrees|degree)\.?", r"\1 deg. "),
        # Longest alternative first: with "v" first, "12 volts" became "12 volt. olts".
        (r"([0-9]+)( *)(volts|volt|v)\.?", r"\1 volt. "),
        (r"([0-9]+)( *)(wattage|watts|watt)\.?", r"\1 watt. "),
        (r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1 amp. "),
        (r"([0-9]+)( *)(quarts|quart|qquart)\.?", r"\1 qt. "),
        # "hrs" without the unescaped dot, which matched any character.
        (r"([0-9]+)( *)(hours|hour|hrs)\.?", r"\1 hr "),
    )

    # Literal substring replacements (HTML entities and markup fragments).
    _LITERAL_RULES = (
        ("$", " "),
        ("?", " "),
        ("&nbsp;", " "),
        ("&amp;", "&"),
        ("&#39;", "'"),
        ("/>/Agt/>", ""),
        ("</a<gt/", ""),
        ("gt/>", ""),
        ("/>", ""),
        ("<br", ""),
    )

    # Regex clean-up rules. These were previously passed to str.replace, which
    # treats its argument literally, so none of them ever matched.
    _REGEX_RULES = (
        # Remaining HTML-like tags.
        (r"<.+?>", ""),
        # Text between parentheses/brackets. Runs before the punctuation rule
        # below, which would otherwise remove the delimiters first.
        (r"[ ]?[\[(].+?[\])]", ""),
        # Size annotations (also before the punctuation rule, which eats ':').
        (r"size: .+$", ""),
        (r"size [0-9]+[.]?[0-9]+\b", ""),
        # Runs of separators / special characters collapse to one space.
        (r"[ &<>)(_,;:!?+^~@#$]+", " "),
        # Possessive suffix, then any remaining quote characters.
        (r"'s\b", ""),
        (r"[']+", ""),
        (r'["]+', ""),
    )

    def __init__(self):
        self.stemmer = PorterStemmer()

    def str_stem(self, s):
        """Return ``s`` unit-normalized, cleaned, lower-cased and stemmed.

        Parameters
        ----------
        s : object
            Text to process.

        Returns
        -------
        str
            Space-joined stemmed tokens, or the literal ``"null"`` when ``s``
            is not a ``str``.
        """
        if not isinstance(s, str):
            return "null"

        for pattern, replacement in self._UNIT_RULES:
            s = re.sub(pattern, replacement, s)
        for old, new in self._LITERAL_RULES:
            s = s.replace(old, new)
        for pattern, replacement in self._REGEX_RULES:
            s = re.sub(pattern, replacement, s)
        s = s.replace("-", " ").replace("+", " ")

        # Per token: blank out any character outside letters, digits, '-', '.'
        # and '/', then stem.
        return " ".join(
            self.stemmer.stem(re.sub('[^A-Za-z0-9-./]', ' ', word))
            for word in s.lower().split()
        )

    def fit(self, X, y=None):
        """Nothing to learn; present for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``product_title`` and ``search_term`` stemmed.

        The input frame is left unmodified.
        """
        X = X.copy()
        for column in ('product_title', 'search_term'):
            X[column] = X[column].apply(self.str_stem)
        return X

class RemoveStopwords(BaseEstimator, TransformerMixin):
    """Strip punctuation and English stop words from title and search-term text."""

    # Translation table deleting every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Let nltk look for its corpora in the working directory; avoid adding
        # the same entry again each time the transformer is constructed.
        if "./" not in nltk.data.path:
            nltk.data.path.append("./")
        self.stopwords = stopwords.words('english')

    def _clean(self, text, stopword_set):
        """Remove punctuation, lower-case, and drop tokens found in ``stopword_set``."""
        text = text.translate(self._PUNCTUATION_TABLE).lower()
        return ' '.join(token for token in text.split() if token not in stopword_set)

    def clean_puntuation_stopwords(self, text):
        """Return ``text`` lower-cased without punctuation characters or stop words.

        Parameters
        ----------
        text : str
            Text to clean.

        Returns
        -------
        str
            Remaining tokens joined by single spaces.
        """
        return self._clean(text, frozenset(self.stopwords))

    def fit(self, X, y=None):
        """Nothing to learn; present for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``product_title`` and ``search_term`` cleaned.

        The input frame is left unmodified.
        """
        X = X.copy()
        # Build the lookup set once per call: set membership is constant-time,
        # whereas the list in self.stopwords is scanned for every token.
        stopword_set = frozenset(self.stopwords)
        for column in ('product_title', 'search_term'):
            X[column] = X[column].apply(lambda text: self._clean(text, stopword_set))
        return X

# Cleaning stages run in this order: generic string clean-up, spell check of
# the search terms, unit/special-character normalization with stemming, and
# finally punctuation/stop-word removal.
clean_pipeline = Pipeline([
    ("overall_string_clean", OverallStringClean()),
    ("spellchecker", CheckSpell()),
    ('stem_units_and_special_characters', StemmingUnitsSpecialCharacters()),
    ("remove_stopwords", RemoveStopwords())
])

# The transformers assign into the frame they are given, so hand them a copy:
# X_train keeps the raw text and re-running this cell does not re-clean
# already cleaned data.
X_train_prep = clean_pipeline.fit_transform(X_train.copy(), y_train)
display(X_train_prep.head(5))

           id product_uid                                      product_title  \
0           2      100001                    simpson strong tie 12 gaug angl   
1           3      100001                    simpson strong tie 12 gaug angl   
2           9      100002  behr premium textur deckov 1gal sc 141 tugboat...   
3          16      100005  delta vero 1 handl shower onli faucet trim kit...   
4          17      100005  delta vero 1 handl shower onli faucet trim kit...   
...       ...         ...                                                ...   
74062  221457      206638  atlant windowpan 576 cd 192 dvd blu ray game m...   
74063  221458      206639  philip 40 watt halogen r20 flood light bulb 12...   
74064  221463      206641  schlage camelot activ age bronz handleset left...   
74065  221471      206648   plastec 11in x 24in rose garden wall decor steel   
74066  221473      206650  lichtenberg pool blue 918 millenni ryan heathe...   

                              search_te

In [17]:
# Put the relevance target back next to the cleaned features, column-wise.
train = pd.concat(
    [X_train_prep, y_train],
    axis=1,
)
display(train.head(2))

Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Certifications and Listings,Bullet09,Assembled Height (in.),Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11,relevance
0,2,100001,simpson strong tie 12 gaug angl,angl bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,3.0
1,3,100001,simpson strong tie 12 gaug angl,bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.5


In [18]:
# Persist the cleaned training frame; the index is not written to the file.
train.to_csv(
    'train_cleaned.csv',
    index=False,
)

# Clean test set

In [20]:
import pandas as pd

# Raw test rows, read with the same encoding as the training file.
X_test = pd.read_csv('data/test.csv', encoding="ISO-8859-1")
# Per-product description/attribute table, joined on product_uid below.
product_descriptions_with_attributes = pd.read_csv('product_descriptions_with_attributes.csv')
# Left join keeps every test row.
X_test = X_test.merge(product_descriptions_with_attributes, how='left', on=['product_uid'])

# Reuse the pipeline built for the training data, then write the result
# without the index.
X_test_prep = clean_pipeline.transform(X_test)
X_test_prep.to_csv('test_cleaned.csv', index=False)