In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from fuzzywuzzy import fuzz
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [19]:
df = pd.read_csv('train.csv')
df = df.fillna('')
df['product_title'] = df['product_title'].astype(str)
df['product_description'] = df['product_description'].astype(str)

In [37]:
X,y = df.drop(['median_relevance','relevance_variance'],axis=1), df.median_relevance

In [None]:
X['query_title_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_title']), axis=1)
X['query_desc_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_description']), axis=1)

In [60]:
#Ordinal Encoding of 'Query'
enc = LabelEncoder()
ordinal_query = enc.fit_transform(df['query'])
X['ordinal_query'] = ordinal_query

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [61]:

features = ['query_title_partial',
    'query_desc_partial', 'ordinal_query']
X = X[features]

In [65]:
enc.classes_[121]

'led christmas lights'

In [85]:
#Cross validation loop - check Andrew Ng videos
kf = StratifiedKFold(y = y, n_folds=3, shuffle=True, random_state=0)
fold = 0
total_accuracy = 0.
for tr,ts in kf:
    fold += 1
    xtr, ytr = X.iloc[tr], y.iloc[tr]
    xts, yts = X.iloc[ts], y.iloc[ts]
    
    clf = RandomForestClassifier(n_estimators=1000,min_samples_leaf=8,criterion='gini',max_features='sqrt',n_jobs=-1,random_state=0)
    
    clf.fit(xtr,ytr)
    
    fold_acc = clf.score(xts,yts)
    total_accuracy += fold_acc
    print fold, clf.score(xts,yts)
    
print 'Mean Accuracy over 3 folds', total_accuracy / 3.

1 0.613998818665
2 0.608978145304
3 0.60838747785
Mean Accuracy over 3 folds 0.61045481394


In [None]:
#Accuracy over different min_samples_leaf values
leaf=1 - Mean Accuracy over 3 folds 0.564284307935
leaf=2 - Mean Accuracy over 3 folds 0.598641464855
leaf=4 - Mean Accuracy over 3 folds 0.60838747785
leaf=8 - Mean Accuracy over 3 folds 0.61045481394

In [None]:
#Accuracy with 1000 trees over different seeds
Mean Accuracy over 3 folds 0.565564087419
Mean Accuracy over 3 folds 0.566056310297
Mean Accuracy over 3 folds 0.566253199449

In [None]:
#Accuracy with 100 trees over different seeds
Mean Accuracy over 3 folds 0.563792085056
Mean Accuracy over 3 folds 0.559854302028
Mean Accuracy over 3 folds 0.565662531994

In [20]:
df.head(3) #""" use the title and description to predict relevance based on query  """

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471


In [21]:
df['query_title_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_title']), axis=1)
df['query_desc_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_description']), axis=1)

In [22]:
df.head(3)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_title_partial,query_desc_partial
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0,26,16
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0,37,0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,31,0


In [23]:
#Thinking: should I convert my ratios based on their variance to a number between zero and 1?
# What are some common sense features I can make?
# Are there simple ways to automatically detect features in this case, or is it really about building similarity ratios?

In [24]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
features = [
    'query_title_partial',
    'query_desc_partial'
]

In [25]:
clf = RandomForestClassifier()
clf.fit(df[features], df['median_relevance'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
pd.crosstab(clf.predict(df[features]), df['median_relevance'])

median_relevance,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96,27,26,43
2,33,204,48,102
3,27,39,219,94
4,618,1206,1444,5932


In [27]:
clf.score(df[features], df['median_relevance'])

0.63506595786572162

In [28]:
#Does the above mean that I got my own data set correct 64% of the time?

In [29]:
df_test = pd.read_csv("test.csv")
df_test = df_test.fillna('')

In [30]:
df_test.head(3)

Unnamed: 0,id,query,product_title,product_description
0,3,electric griddle,Star-Max 48 in Electric Griddle,
1,6,phillips coffee maker,Philips SENSEO HD7810 WHITE Single Serve Pod C...,
2,9,san francisco 49ers,2013 San Francisco 49ers Clock,A 2013 San Francisco 49ers clock is the ultima...


In [31]:
df_test['query_title_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_title']), axis=1)
df_test['query_desc_partial'] = df.apply(lambda x: fuzz.ratio(x['query'], x['product_description']), axis=1)

In [32]:
df_test.head()

Unnamed: 0,id,query,product_title,product_description,query_title_partial,query_desc_partial
0,3,electric griddle,Star-Max 48 in Electric Griddle,,26,16
1,6,phillips coffee maker,Philips SENSEO HD7810 WHITE Single Serve Pod C...,,37,0
2,9,san francisco 49ers,2013 San Francisco 49ers Clock,A 2013 San Francisco 49ers clock is the ultima...,31,0
3,11,aveeno shampoo,AVEENO 10.5FLOZ NRSH SHINE SH,"Water, Ammonium Lauryl Sulfate, Dimethicone, S...",16,1
4,12,flea and tick control for dogs,Merial Frontline Plus Flea and Tick Control fo...,,24,0


In [33]:
#Now that I have my 'features' how do I get my prediction in the new dataframe?
#How do I look at how effective each feature was, etc....?
#What are so tweakable params for the forests?
#Hints at effective features for text similarity