In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile, StringIO, requests
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.ensemble import RandomForestRegressor
import nltk.tokenize as tk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()

%pylab inline

Populating the interactive namespace from numpy and matplotlib




In [2]:
z = zipfile.ZipFile('data/train_home_depot.zip')
df = pd.read_csv(z.open('train.csv'))

In [3]:
z2 = zipfile.ZipFile('data/attributes.csv.zip')
df_attributes = pd.read_csv(z2.open('attributes.csv'))

In [4]:
z3 = zipfile.ZipFile('data/product_descriptions.csv.zip')
df_description = pd.read_csv(z3.open('product_descriptions.csv'))

In [8]:
df_description['product_description'][0]

'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws'

## Things to do
* featurize product title, product description and product attributes
* 

In [10]:
df['searchfix'] = df['search_term'].str.lower().str.decode('ISO-8859-1').str.encode('ascii', 'ignore').str.split()\
.apply(lambda x: [stemmer.stem(item) for item in x]) \
.apply(lambda x: [wordnet.lemmatize(item) for item in x])

In [11]:
df.tail()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,searchfix
74062,221457,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...,tv riser glass,1.0,"[tv, riser, glass]"
74063,221458,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.0,"[r20, halogen, light]"
74064,221463,206641,Schlage Camelot In-Active Aged Bronze Handlese...,schlage lock siena half dummy knob with,2.33,"[schlage, lock, siena, half, dummi, knob, with]"
74065,221471,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...,zen garden decor,3.0,"[zen, garden, decor]"
74066,221473,206650,LICHTENBERG Pool Blue No. 918 Millennial Ryan ...,fine sheer curtain 63 inches,2.33,"[fine, sheer, curtain, 63, inch]"


In [12]:
df['titlefix'] = df['product_title'].str.lower().str.decode('ISO-8859-1').str.encode('ascii', 'ignore')
def sum_title(df):
    return sum([word in df['titlefix'] for word in df['searchfix']])
df['count_title'] = df.apply(sum_title, axis=1)

In [13]:
def convert(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1).lower()


In [14]:
def num_word_descrp(x):
    """
    count the number of word in common between searchfix column and description in df_description
    """
    return sum([ word in df_description[df_description['product_uid'] == x['product_uid']]['product_description'].values[0].lower() for word in x['searchfix']])

In [15]:
df['count_common_description'] = df.apply(num_word_descrp, axis=1)

In [15]:
df_description.iloc[1]['product_description']

'BEHR Premium Textured DECKOVER is an innovative solid color coating. It will bring your old, weathered wood or concrete back to life. The advanced 100% acrylic resin formula creates a durable coating for your tired and worn out deck, rejuvenating to a whole new look.  For the best results, be sure to properly prepare the surface using other applicable BEHR products displayed above.California residents: see&nbsp;Proposition 65 informationRevives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks100% acrylic solid color coatingResists cracking and peeling and conceals splinters and cracks up to 1/4 in.Provides a durable, mildew resistant finishCovers up to 75 sq. ft. in 2 coats per gallonCreates a textured, slip-resistant finishFor best results, prepare with the appropriate BEHR product for your wood or concrete surfaceActual paint colors may vary from on-screen and printer representationsColors available to be tinted in 

In [16]:
convert(df_description.iloc[1]['product_description'])

'behr  premium  textured deckover is an innovative solid color coating.  it will bring your old, weathered wood or concrete back to life.  the advanced 100% acrylic resin formula creates a durable coating for your tired and worn out deck, rejuvenating to a whole new look.   for the best results, be sure to properly prepare the surface using other applicable behr products displayed above. california residents: see&nbsp; proposition 65 information revives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks100% acrylic solid color coating resists cracking and peeling and conceals splinters and cracks up to 1/4 in. provides a durable, mildew resistant finish covers up to 75 sq. ft. in 2 coats per gallon creates a textured, slip-resistant finish for best results, prepare with the appropriate behr product for your wood or concrete surface actual paint colors may vary from on-screen and printer representations colors available t

In [17]:
df.head(10)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,searchfix,titlefix,count_title,count_common_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"[angl, bracket]",simpson strong-tie 12-gauge angle,1,1
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"[l, bracket]",simpson strong-tie 12-gauge angle,1,1
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"[deck, over]",behr premium textured deckover 1-gal. #sc-141 ...,2,2
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,"[rain, shower, head]",delta vero 1-handle shower only faucet trim ki...,1,1
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,"[shower, onli, faucet]",delta vero 1-handle shower only faucet trim ki...,2,2
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.0,"[convect, otr]",whirlpool 1.9 cu. ft. over the range convectio...,1,2
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67,"[microwav, over, stove]",whirlpool 1.9 cu. ft. over the range convectio...,2,2
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.0,[microwav],whirlpool 1.9 cu. ft. over the range convectio...,1,1
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,"[emerg, light]",lithonia lighting quantum 2-light black led em...,2,2
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.0,"[mdf, 3/4]",house of fara 3/4 in. x 3 in. x 8 ft. mdf flut...,2,2


In [18]:
brandnames = df_attributes[df_attributes.name == "MFG Brand Name"][['product_uid', 'value']]
brandnames.index = brandnames['product_uid']
brandnames.value = brandnames.value.str.lower()
# df_search = df[['product_uid', 'search_term']]
# df_search['search_term'] = df_search.search_term.str.split()

In [19]:
df_search2 = df.join(brandnames, on='product_uid', lsuffix='l', rsuffix='r').drop(['product_uidl', 'product_uidr'], axis=1)
df_search2.fillna('NaN', inplace=True)
df_search2['cnt'] = df_search2.apply(lambda row: sum(row.value.find(word.encode('ascii', 'ignore')) > 0 for word in row.searchfix), axis=1)


In [20]:
df_search2.head()

Unnamed: 0,id,product_title,search_term,relevance,searchfix,titlefix,count_title,count_common_description,value,cnt
0,2,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"[angl, bracket]",simpson strong-tie 12-gauge angle,1,1,simpson strong-tie,0
1,3,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"[l, bracket]",simpson strong-tie 12-gauge angle,1,1,simpson strong-tie,0
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"[deck, over]",behr premium textured deckover 1-gal. #sc-141 ...,2,2,behr premium textured deckover,2
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,"[rain, shower, head]",delta vero 1-handle shower only faucet trim ki...,1,1,delta,0
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,"[shower, onli, faucet]",delta vero 1-handle shower only faucet trim ki...,2,2,delta,0


# Modeling using Random Forest

In [21]:
X = df_search2[['count_title','count_common_description','cnt']]
y = df_search2['relevance']

In [22]:
RF_mod = RandomForestRegressor(50)
cross_val_score(RF_mod, X, y, cv=5, scoring='r2')


array([ 0.05747352,  0.05582343,  0.03605256,  0.04660999, -0.1266012 ])

In [5]:
z4 = zipfile.ZipFile('data/test.csv.zip')
df4 = pd.read_csv(z4.open('test.csv'))

In [44]:
for i in df['product_title']:
    z = i.find(". 0")
    if z>=0:
        print i

dewalt tstak 44lb. 0-drawer deep box with flat top
kraus all-in-on undermount stainless steel 32.3in. 0-hole doubl bowl kitchen sink
yosemit home decor undermount stainless steel 23in. 0-hole singl bowl kitchen sinkin. satin
elkay lusterton undermount stainless steel 31in. 0-hole singl bowl kitchen sinkin. satin
elkay lusterton undermount stainless steel 31-3/4 xbi 16-1/2 xbi 7-1/2in. 0-hole doubl bowl kitchen sink
elkay signatur plus undermount stainless steel 24in. 0-hole singl bowl kitchen sink
builder edg paint head metal screwsin. 023 wicker (12-pack)
builder edg paint head metal screwsin. 078 wineberri (12-pack)
3m hand-mask 6ft. xbi 90ft. 0.35 mil pre-fold mask film
vigo undermount 32in. 0-hole singl bowl kitchen sink with grid and strainerin. stainless steel
builder edg paint head metal screwsin. 020 cream (12-pack)
sterl carthag undermount stainless steel 32in. 0-hole doubl bowl kitchen sink
schon all-in-on undermount stainless steel 30in. 0-hole doubl bowl kitchen sink with f

In [30]:

#function to clean strings
def str_stem(s):
    if isinstance(s, str):
        s = s.decode('ISO-8859-1').encode('ascii', 'ignore').lower()
        
        # 
        s = s.replace(".",". ")
        s = s.replace(". 0",".0")
        s = s.replace(". 1",".1")
        s = s.replace(". 2",".2")
        s = s.replace(". 3",".3")
        s = s.replace(". 4",".4")
        s = s.replace(". 5",".5")
        s = s.replace(". 6",".6")
        s = s.replace(". 7",".7")
        s = s.replace(". 8",".8")
        s = s.replace(". 9",".9")
        s = s.replace("  "," ")

        s = s.replace("'","in.")
        s = s.replace("inches","in.")
        s = s.replace("inch","in.")
        s = s.replace(" in ","in. ")
        s = s.replace(" in.","in.")

        s = s.replace("''","ft.")
        s = s.replace(" feet ","ft. ")
        s = s.replace("feet","ft.")
        s = s.replace("foot","ft.")
        s = s.replace(" ft ","ft. ")
        s = s.replace(" ft.","ft.")

        s = s.replace(" pounds ","lb. ")
        s = s.replace(" pound ","lb. ")
        s = s.replace("pound","lb.")
        s = s.replace(" lb ","lb. ")
        s = s.replace(" lb.","lb.")
        s = s.replace(" lbs ","lb. ")
        s = s.replace("lbs.","lb.")

        s = s.replace(" x "," xby ")
        s = s.replace("*"," xby ")
        s = s.replace(" by "," xby")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")

        s = s.replace(" sq ft","sq.ft. ")
        s = s.replace("sq ft","sq.ft. ")
        s = s.replace("sqft","sq.ft. ")
        s = s.replace(" sqft ","sq.ft. ")
        s = s.replace("sq. ft","sq.ft. ")
        s = s.replace("sq ft.","sq.ft. ")
        s = s.replace("sq feet","sq.ft. ")
        s = s.replace("square feet","sq.ft. ")

        s = s.replace(" gallons ","gal. ")
        s = s.replace(" gallon ","gal. ")
        s = s.replace("gallons","gal.")
        s = s.replace("gallon","gal.")
        s = s.replace(" gal ","gal. ")
        s = s.replace(" gal","gal.")

        s = s.replace("ounces","oz.")
        s = s.replace("ounce","oz.")
        s = s.replace(" oz.","oz. ")
        s = s.replace(" oz ","oz. ")

        s = s.replace("centimeters","cm.")
        s = s.replace(" cm.","cm.")
        s = s.replace(" cm ","cm. ")

        s = s.replace("milimeters","mm.")
        s = s.replace(" mm.","mm.")
        s = s.replace(" mm ","mm. ")

        s = s.replace("°","deg. ")
        s = s.replace("degrees","deg. ")
        s = s.replace("degree","deg. ")

        s = s.replace("volts","volt. ")
        s = s.replace("volt","volt. ")

        s = s.replace("watts","watt. ")
        s = s.replace("watt","watt. ")

        s = s.replace("ampere","amp. ")
        s = s.replace("amps","amp. ")
        s = s.replace(" amp ","amp. ")

        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")

        s = s.replace("  "," ")
        s = s.replace("..",".")
        #s = (" ").join([stemmer.stem(z) for z in s.lower().split(" ")])
        s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
        return s.lower()
    else:
        return "null"


In [24]:
def data_cleaning(df_org):
    
    df = df_org.copy()
    
    # stem and lematize search term
    df['searchfix'] = df['search_term'].str.lower().str.decode('ISO-8859-1').str.encode('ascii', 'ignore').str.split()\
    .apply(lambda x: [stemmer.stem(item) for item in x]) \
    .apply(lambda x: [wordnet.lemmatize(item) for item in x])
    # count common word between search term and title

    df['titlefix'] = df['product_title'].str.lower().str.decode('ISO-8859-1').str.encode('ascii', 'ignore')
    def sum_title(df):
        return sum([word in df['titlefix'] for word in df['searchfix']])
    df['count_title'] = df.apply(sum_title, axis=1)

    
    # count common word between search term and df_description
    def num_word_descrp(x):
        """
        count the number of word in common between searchfix column and description in df_description
        """
        return sum([ word in df_description[df_description['product_uid'] == x['product_uid']]['product_description'].values[0].lower() for word in x['searchfix']])

    df['count_common_description'] = df.apply(num_word_descrp, axis=1)


    # count common word between search term and brand name fro df_attributes
    brandnames = df_attributes[df_attributes.name == "MFG Brand Name"][['product_uid', 'value']]
    brandnames.index = brandnames['product_uid']
    brandnames.value = brandnames.value.str.lower()

    df_search2 = df.join(brandnames, on='product_uid', lsuffix='l', rsuffix='r')#.drop(['product_uidl', 'product_uidr'], axis=1)
    df_search2.fillna('NaN', inplace=True)
    df_search2['cnt'] = df_search2.apply(lambda row: sum(row.value.find(word) > 0 for word in row.search_term), axis=1)


    return df_search2
    


In [25]:
df_train = data_cleaning(df)

In [26]:
df_test = data_cleaning(df4)

In [27]:
def modeling(estimator, df, df_test, submission=False):
    X = df['count_title']
    y = df['relevance']
    X_test = df_test['count_title']
    estimator.fit(X, y)
    
    result = estimator.predict(X_test)
    
    output_df= pd.DataFrame(df_test['id'],columns=['id',"relevance"])
    output_df['relevance'] = result
    output_df['relevance'] = output_df['relevance'].apply(lambda x: 3 if x>3 else x)
    output_df['relevance'] = output_df['relevance'].apply(lambda x: 1 if x<1 else x) 
    
    
    if submission:
        output_df.to_csv('submission.csv',index=False)
        
    return estimator

In [None]:
df_train['attribute_names']  = df_train.apply(lambda df: ' '.join(df_attributes[df_attributes['product_uid']==df['id']]['name']))

In [None]:
df_attributes[df_attributes['product_uid']==df_train.iloc[0]['product_uidl']]['name']


In [32]:
t = TfidfVectorizer(stop_words='english')

In [31]:
df['product_title'] = df['product_title'].apply(str_stem)

In [33]:
df_tfidf = df['product_title'].unique() 
df_tfidf

array([u'simpson strong-ti 12-gaug angl',
       u'behr premium textur deckov 1-gal. #sc-141 tugboat wood and concret coat',
       u'delta vero 1-handl shower onli faucet trim kitin. chrome (valv not included)',
       ...,
       u'schlage camelot in-act age bronz handleset with left-hand accent lever',
       u'plastec 11in. xbi 24in. rose garden wall decor steel',
       u'lichtenberg pool blue no. 918 millenni ryan heather textur sheer curtain panel, 40in. w xbi 63in. l'], dtype=object)

In [34]:
df_tfidf = pd.Series(df['product_title'].unique()).str.lower().str.decode('ISO-8859-1').str.encode('ascii', 'ignore').str.split()\
    .apply(lambda x: [stemmer.stem(item) for item in x]) \
    .apply(lambda x: [wordnet.lemmatize(item) for item in x])

In [35]:
df_tfidf = [" ".join(title) for title in df_tfidf]

In [36]:
title_vect = t.fit_transform(df_tfidf)


<53468x17863 sparse matrix of type '<type 'numpy.float64'>'
	with 521911 stored elements in Compressed Sparse Row format>

In [37]:
t.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'00003g',
 u'000d_',
 u'000ft',
 u'000gal',
 u'000in',
 u'000lb',
 u'001',
 u'002',
 u'003',
 u'00304',
 u'004',
 u'005',
 u'006',
 u'006in',
 u'0070a',
 u'007e',
 u'008',
 u'009',
 u'00in',
 u'01',
 u'010',
 u'010c',
 u'011',
 u'012',
 u'012in',
 u'012j',
 u'013',
 u'014a',
 u'014c',
 u'015b',
 u'016',
 u'016in',
 u'018',
 u'018b',
 u'01e',
 u'01f',
 u'01s',
 u'02',
 u'020',
 u'021',
 u'022',
 u'022in',
 u'023',
 u'024in',
 u'025in',
 u'027',
 u'028',
 u'02ft',
 u'02in',
 u'03',
 u'030',
 u'0301172',
 u'0306140',
 u'032',
 u'0323016',
 u'032in',
 u'035in',
 u'036',
 u'0393in',
 u'04',
 u'04014',
 u'04050',
 u'040in',
 u'041a',
 u'043in',
 u'044',
 u'045',
 u'045in',
 u'047',
 u'047107',
 u'047b',
 u'049',
 u'0498096',
 u'04in',
 u'05',
 u'050',
 u'050in',
 u'051a',
 u'051c',
 u'052',
 u'05307',
 u'05in',
 u'05mm',
 u'06',
 u'0625in',
 u'062in',
 u'063in',
 u'065',
 u'065in',
 u'066a',
 u'069',
 u'06in',
 u'06oz',
 u'07',
 u'075in',
 u'076',
 u'077',
 u'078