In [1]:
from __future__ import division
import time
import unicodedata
import re

import pandas as pd
import numpy as np

#from sklearn import cross_validation
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_selection import SelectPercentile, f_classif
#from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#from nltk.stem import porter
#from nltk.stem.snowball import SnowballStemmer

In [2]:
# Start file load timer
start_time = time.time()

# Import all dataframes and build the train dataframe
df_train = pd.DataFrame.from_csv('train.csv', index_col=None, encoding="ISO-8859-1")
df_test = pd.DataFrame.from_csv('test.csv', index_col=None, encoding="ISO-8859-1")
df_prod_desc = pd.DataFrame.from_csv('product_descriptions.csv', index_col=None, encoding="ISO-8859-1")

# Add Brand column/feature, but a lot of rows are NaN
df_attrib = pd.DataFrame.from_csv('attributes.csv', index_col=None, encoding="ISO-8859-1")
df_brand = df_attrib[df_attrib.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})

# Find the length of the train dataset to use for train/test split that occurs later
num_train = df_train.shape[0]

# Concat train and test datasets before cleaning data
df = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# Add prod_desc and brand information
df = pd.merge(df, df_prod_desc, how='left', on='product_uid')
df = pd.merge(df, df_brand, how='left', on='product_uid')

print("--- Files Loaded: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- Files Loaded: 0.07 minutes ---


In [3]:
# Define necessary variables and functions
stemmer = porter.PorterStemmer()
stop_w = ['for', 'xbi', 'and', 'in', 'th','on','sku','with','what','from','that','less','er','ing'] #'electr','paint','pipe','light','kitchen','wood','outdoor','door','bathroom'
strNum = {'zero':0,'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9}

def str_stem(s):
    s = unicodedata.normalize('NFD', unicode(s)).encode('ascii', 'ignore')
    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) #Split words with a.A
    s = s.lower()
    s = s.replace("  ", " ")
    s = s.replace(",", "") #could be number / segment later
    s = s.replace("$", " ")
    s = s.replace("?", " ")
    s = s.replace("-", " ")
    s = s.replace("//", "/")
    s = s.replace("..", ".")
    s = s.replace(" / ", " ")
    s = s.replace(" \\ ", " ")
    s = s.replace(".", " . ")
    s = re.sub(r"(^\.|/)", r"", s)
    s = re.sub(r"(\.|/)$", r"", s)
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = s.replace(" x ", " xbi ")
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
    s = s.replace("*", " xbi ")
    s = s.replace(" by ", " xbi ")
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
    s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
    s = s.replace("°", " degrees ")
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = s.replace(" v ", " volts ")
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("  ", " ")
    s = s.replace(" . ", " ")
    s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s = s.lower()
    s = s.replace("toliet", "toilet")
    s = s.replace("airconditioner", "air conditioner")
    s = s.replace("vinal", "vinyl")
    s = s.replace("vynal", "vinyl")
    s = s.replace("skill", "skil")
    s = s.replace("snowbl", "snow bl")
    s = s.replace("plexigla", "plexi gla")
    s = s.replace("rustoleum", "rust oleum")
    s = s.replace("whirpool", "whirlpool")
    s = s.replace("whirlpoolga", "whirlpool ga")
    s = s.replace("whirlpoolstainless", "whirlpool stainless")
    return s


# Depends on segmentit
def seg_words(str1, str2):
    str2 = str2.lower()
    str2 = re.sub("[^a-z0-9./]", " ", str2)
    str2 = [z for z in set(str2.split()) if len(z) > 2]
    words = str1.lower().split(" ")
    s9 = []
    for word in words:
        if len(word) > 3:
            s1 = []
            s1 += segmentit(word, str2, True)
            if len(s9) > 1:
                s9 += [z for z in s1 if z not in ['er', 'ing', 's', 'less'] and len(z) > 1]
            else:
                s9.append(word)
        else:
            s9.append(word)
    return (" ".join(s9))

def segmentit(s, txt_arr, t):
    st = s
    r = []
    for j in xrange(len(s)):
        for word in txt_arr:
            if word == s[:-j]:
                r.append(s[:-j])
                #print(s[:-j],s[len(s)-j:])
                s = s[len(s)-j:]
                r += segmentit(s, txt_arr, False)
    if t:
        i = len(("").join(r))
        if not i == len(st):
            r.append(st[i:])
    return r


def str_common_word(str1, str2):
    words, cnt = str1.split(), 0
    for word in words:
        if str2.find(word) >= 0:
            cnt += 1
    return cnt


def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt


def levenshtein(str1, str2):
    if len(str1) < len(str2):
        return levenshtein(str2, str1)
    if len(str2) == 0:
        return len(str1)
    previous_row = range(len(str2) + 1)
    for i, c1 in enumerate(str1):
        current_row = [i + 1]
        for j, c2 in enumerate(str2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than str2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

In [4]:
# Apply stemmer function to clean features w/ text
start_time = time.time()
df['search_term'] = df['search_term'].map(lambda x: str_stem(x))
df['product_title'] = df['product_title'].map(lambda x: str_stem(x))
df['product_description'] = df['product_description'].map(lambda x: str_stem(x))
df['brand'] = df['brand'].map(lambda x: str_stem(x))
print("--- Stemming: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Stemming: 11.48 minutes ---


In [5]:
# Create the product_info feature
start_time = time.time()
df['product_info'] = df['search_term']+"|"+df['product_title'] +"|"+df['product_description']
print("--- Prod Info: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Prod Info: 0.01 minutes ---


In [6]:
# Calculate the Levenshtein edit distances
start_time = time.time()
df['leveneditdist_query_in_title'] = df['product_info'].map(lambda x: levenshtein(x.split('|')[1], x.split('|')[0]))
df['leveneditdist_query_in_product_description'] = df['product_info'].map(lambda x: levenshtein(x.split('|')[2], x.split('|')[0]))
print("--- Levenshtein edit distance: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Levenshtein edit distance: 37.94 minutes ---


In [7]:
# Calculate the length of the text features for later use in ratios
df['len_of_query'] = df['search_term'].map(lambda x: len(x.split())).astype(int)
df['len_of_title'] = df['product_title'].map(lambda x: len(x.split())).astype(int)
df['len_of_description'] = df['product_description'].map(lambda x: len(x.split())).astype(int)
df['len_of_brand'] = df['brand'].map(lambda x: len(x.split())).astype(int)
print("--- Len of: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Len of: 37.99 minutes ---


In [8]:
# Apply seg_words function to clean and prepare search_term feature
start_time = time.time()
df['search_term'] = df['product_info'].map(lambda x: seg_words(x.split('|')[0], x.split('|')[1]))
print("--- Search Term Segment: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- Search Term Segment: 0.15 minutes ---


In [9]:
# Apply seg_words function to clean data
start_time = time.time()
df['query_in_title'] = df['product_info'].map(lambda x: str_whole_word(x.split('|')[0], x.split('|')[1], 0))
df['query_in_description'] = df['product_info'].map(lambda x: str_whole_word(x.split('|')[0], x.split('|')[2], 0))
print("--- Query In: %s minutes ---" % round(((time.time() - start_time)/60), 2))
start_time = time.time()
df['query_last_word_in_title'] = df['product_info'].map(lambda x: str_common_word(x.split('|')[0].split(" ")[-1], x.split('|')[1]))
df['query_last_word_in_description'] = df['product_info'].map(lambda x: str_common_word(x.split('|')[0].split(" ")[-1], x.split('|')[2]))
print("--- Query Last Word In: %s minutes ---" % round(((time.time() - start_time)/60), 2))
start_time = time.time()
df['word_in_title'] = df['product_info'].map(lambda x: str_common_word(x.split('|')[0], x.split('|')[1]))
df['word_in_description'] = df['product_info'].map(lambda x: str_common_word(x.split('|')[0], x.split('|')[2]))
print("--- Word In: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Query In: 0.03 minutes ---
--- Query Last Word In: 0.03 minutes ---
--- Word In: 0.04 minutes ---


In [10]:
# Export df as a .csv
df.to_csv('df.csv')

In [None]:
# Define Stemmer
#stemmer = SnowballStemmer('english')
#def str_stemmer(s):
#    return " ".join([stemmer.stem(word) for word in s.lower().split()])

# Stem the features which will be used in training
#df['product_title'] = df['product_title'].map(lambda x:str_stemmer(x))

In [None]:
# Select the relevant columns (product_title, search_term, and relevance) from train df and convert to a list of dictionaries
#train_dict = df.ix[:, 'product_title':'relevance'].T.to_dict().values()

# Instantiate the vectorizer
#vectorizer = DictVectorizer()

# Vectorize the train data
#train_matrix = vectorizer.fit_transform(train_dict).toarray()