In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer; str_stemmer applies it to each whitespace-separated token.
stemmer = SnowballStemmer('english')

In [2]:
import re
from nltk.corpus import stopwords
# English stop words from the NLTK corpus, held as a set for fast membership tests.
# str_common_word removes these from the query tokens before counting matches.
comwords = set(stopwords.words('english'))
from sklearn.metrics import mean_squared_error, make_scorer
import random
from sklearn import pipeline, grid_search
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_blobs

In [3]:
def fmean_squared_error(ground_truth, predictions):
    """Return the square root of the mean squared error of ``predictions``.

    Args:
        ground_truth: True target values, passed as the first argument to
            sklearn's ``mean_squared_error``.
        predictions: Predicted values, passed as the second argument.

    Returns:
        ``mean_squared_error(ground_truth, predictions)`` raised to the power 0.5.
    """
    return mean_squared_error(ground_truth, predictions) ** 0.5

# Scorer built from fmean_squared_error; greater_is_better=False declares it a loss (lower is better).
# NOTE(review): the name looks like a transposition of "RMSE"; it is kept because the
# commented-out grid-search cell refers to it as RSME.
RSME  = make_scorer(fmean_squared_error, greater_is_better=False)

In [4]:
# Load the train and test tables, decoding both files as ISO-8859-1.
df_train = pd.read_csv('../csv/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('../csv/test.csv', encoding="ISO-8859-1")
# df_attr = pd.read_csv('../input/attributes.csv')
# Product descriptions; merged onto the train/test rows by product_uid in a later cell.
# NOTE(review): no encoding is passed here, unlike the two reads above - confirm that is intended.
df_pro_desc = pd.read_csv('../csv/product_descriptions.csv')

In [5]:
# Number of training rows; used later to slice the combined frame back into
# its training part (first num_train rows) and its submission part (the rest).
num_train = df_train.shape[0]

In [7]:
def str_stemmer(s):
    """Lower-case ``s`` and stem each whitespace-separated token.

    Args:
        s: Text to normalise.

    Returns:
        The stemmed tokens joined by single spaces, so any run of whitespace
        in the input collapses to one space.
    """
    stemmed_tokens = []
    for token in s.lower().split():
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)

# def str_common_word(str1, str2):
# return sum(int(str2.find(word)>=0) for word in re.split('[^a-z]',str1))

def str_common_word(str1, str2, stop_words=None):
    """Count the tokens of ``str1`` that appear as whole words in ``str2``.

    Both strings are tokenised on runs of non-word characters. Stop words are
    removed from ``str1`` only. A token that occurs several times in ``str1``
    is counted once per occurrence.

    Args:
        str1: Query text whose tokens are looked up.
        str2: Text that is searched for whole-word matches.
        stop_words: Optional container of tokens to ignore in ``str1``.
            Defaults to the module-level ``comwords`` set.

    Returns:
        int: Number of non-stop-word tokens of ``str1`` present in ``str2``.
    """
    if stop_words is None:
        stop_words = comwords

    # Tokenise str2 once (not once per query word) and keep it as a set for
    # constant-time membership tests.
    target_words = set(re.split(r'\W+', str2))
    # re.split yields '' when a string starts or ends with punctuation; an
    # empty token on both sides must not be counted as a shared word.
    target_words.discard('')

    return sum(
        1
        for word in re.split(r'\W+', str1)
        if word and word not in stop_words and word in target_words
    )

def two_grammer(str1, str2):
    """Count matching adjacent word pairs (2-grams) between two strings.

    Both strings are tokenised on runs of non-word characters. Every 2-gram
    position of ``str1`` is compared with every 2-gram position of ``str2``,
    so a 2-gram repeated in either string is counted once per pairing.

    Args:
        str1: First text (the search query in this notebook).
        str2: Second text (product title or description in this notebook).

    Returns:
        int: Number of (position in str1, position in str2) pairs whose
        2-grams are equal.
    """
    # re.split yields '' at either end when a string starts or ends with
    # punctuation; dropping those prevents a bogus (word, '') 2-gram match.
    words1 = [word for word in re.split(r'\W+', str1) if word]
    words2 = [word for word in re.split(r'\W+', str2) if word]

    # Tally the 2-grams of str2 once, then look up each 2-gram of str1.
    gram_counts = {}
    for gram in zip(words2, words2[1:]):
        gram_counts[gram] = gram_counts.get(gram, 0) + 1

    return sum(gram_counts.get(gram, 0) for gram in zip(words1, words1[1:]))

def three_grammer(str1, str2):
    """Count matching runs of three adjacent words (3-grams) between two strings.

    Both strings are tokenised on runs of non-word characters. Every 3-gram
    position of ``str1`` is compared with every 3-gram position of ``str2``,
    so a 3-gram repeated in either string is counted once per pairing.

    Args:
        str1: First text (the search query in this notebook).
        str2: Second text (product title or description in this notebook).

    Returns:
        int: Number of (position in str1, position in str2) pairs whose
        3-grams are equal.
    """
    # re.split yields '' at either end when a string starts or ends with
    # punctuation; dropping those prevents a bogus (w1, w2, '') 3-gram match.
    words1 = [word for word in re.split(r'\W+', str1) if word]
    words2 = [word for word in re.split(r'\W+', str2) if word]

    # Tally the 3-grams of str2 once, then look up each 3-gram of str1.
    gram_counts = {}
    for gram in zip(words2, words2[1:], words2[2:]):
        gram_counts[gram] = gram_counts.get(gram, 0) + 1

    return sum(
        gram_counts.get(gram, 0)
        for gram in zip(words1, words1[1:], words1[2:])
    )

In [527]:
# Testing string regular expression functionality.

string1 = "The Woodgrain Millwork 1/2 in. x 3-1/4 in. LWM 6.23 Primed MDF base is the perfect addition to dress up any room.  The Woodgrain Millwork Primed MDF LWM 623 base is used cover any imperfections where floor and wall meet.  Base Moulding can also be used as starter point for a more elaborate built up base boards.   Combing other "
string2 = "duck pizza chips4 sandwich"
# Split on runs of characters that are neither digits nor '/'. Inside the
# character class, '/-/' is a one-character range, i.e. just '/'.
# NOTE(review): the output recorded under this cell contains '.' tokens, which
# this pattern would split on - it presumably came from an earlier pattern.
nonwords1=re.split('[^0-9/-/]+',string1)
# Split on every single digit.
nonwords2=re.split('[0-9]',string2)

# Parenthesised so the cell is valid on Python 3 as well as Python 2.
print(nonwords1)
#sum(int(nonword in nonwords1 for word in nonwords2))
#['this', 'is'] in words1
#sum(int("this is".find(word)>=0) for word in "this and".split())

['', '1/2', '.', '3', '1/4', '.', '6.23', '.', '623', '.', '.', '']


In [8]:
# Stack the train and test rows under a fresh 0..n-1 index, then attach each
# product's description by product_uid (left join, so every row is kept).
df_all = (
    pd.concat((df_train, df_test), axis=0, ignore_index=True)
    .merge(df_pro_desc, how='left', on='product_uid')
)

In [10]:
# Keep an independent snapshot of the unstemmed frame. A plain assignment
# would only alias df_all, so the column updates below would change the
# "original" as well.
df_all_original = df_all.copy()

# Lower-case and stem the three free-text columns in place.
for text_column in ('search_term', 'product_title', 'product_description'):
    df_all[text_column] = df_all[text_column].map(str_stemmer)

In [11]:
# Feature: number of whitespace-separated tokens in the search term.
query_token_counts = df_all['search_term'].map(lambda term: len(term.split()))
df_all['len_of_query'] = query_token_counts.astype(np.int64)

In [12]:
# Pack query, title and description into one tab-separated string per row so
# that a single Series.map call can see all three fields at once.
field_sep = "\t"
df_all['product_info'] = (
    df_all['search_term'] + field_sep
    + df_all['product_title'] + field_sep
    + df_all['product_description']
)

In [13]:
def _query_title_overlap(info):
    """Apply str_common_word to the query (field 0) and title (field 1) of a product_info string."""
    fields = info.split('\t')
    return str_common_word(fields[0], fields[1])


# Feature: query tokens that also occur in the product title.
df_all['word_in_title'] = df_all['product_info'].map(_query_title_overlap)

In [14]:
def _query_description_overlap(info):
    """Apply str_common_word to the query (field 0) and description (field 2) of a product_info string."""
    fields = info.split('\t')
    return str_common_word(fields[0], fields[2])


# Feature: query tokens that also occur in the product description.
df_all['word_in_description'] = df_all['product_info'].map(_query_description_overlap)

In [15]:
def _gram_feature(counter, field_index):
    """Build a mapper comparing the query (field 0) with another field of a product_info string."""
    def mapper(info):
        fields = info.split('\t')
        return counter(fields[0], fields[field_index])
    return mapper


# (new column, n-gram counter, product_info field: 1 = title, 2 = description).
# Columns are created in this order, which fixes their position in df_all.
for column, counter, field_index in (
    ('two_grams_in_title', two_grammer, 1),
    ('two_grams_in_desc', two_grammer, 2),
    ('three_grams_in_title', three_grammer, 1),
    ('three_grams_in_desc', three_grammer, 2),
):
    df_all[column] = df_all['product_info'].map(_gram_feature(counter, field_index))

In [20]:
# Write the full frame (text columns and engineered features) to disk.
df_all.to_csv("df_all2.csv", encoding="ISO-8859-1")
# Second reference to the same frame, not a copy. The next cell rebinds df_all
# to the result of drop(), so this name still sees the text columns afterwards.
df_all_play=df_all

In [21]:
# Drop the free-text columns now that the count features have been derived from them.
# NOTE(review): product_uid is not dropped here or when X_train is built, so it
# reaches the model as a feature - confirm that is intended.
df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)

In [22]:
# The first num_train rows of df_all came from train.csv (it was concatenated first).
# df_train1 is only referenced by the commented-out sampling line below.
df_train1 = df_all.iloc[:num_train]
#rows = random.sample(df_train1.index, 5000)

# Training rows (this rebinds the df_train loaded from CSV to the feature frame).
df_train = df_all.iloc[:num_train]
#df_test = df_all.iloc[rows]
# Remaining rows came from test.csv; these are the rows to predict for the submission.
df_submit = df_all.iloc[num_train:]
# Ids of the submission rows, written next to the predictions in the output file.
id_test = df_submit['id']

In [23]:
# Columns that are not model inputs: the row identifier and the target.
non_feature_columns = ['id', 'relevance']

# Target vector and feature matrix for the training rows.
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_columns, axis=1).values
#X_test = df_test.drop(['id','relevance'],axis=1).values
#y_test_data = df_test['relevance'].values

# Feature matrix for the submission rows, with the same columns removed.
X_submit = df_submit.drop(non_feature_columns, axis=1).values

In [24]:
#First Regressor Choice
rf = RandomForestRegressor(n_estimators=15, max_depth=7, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
#y_test = clf.predict(X_test)
y_pred = clf.predict(X_submit)

In [None]:
#Custom Regressor Choice
rf = RandomForestRegressor(n_estimators=30, max_depth=None, random_state=0, verbose=1, n_jobs=3)
clf = BaggingRegressor(rf, n_estimators=45)
clf.fit(X_train, y_train)
#y_test = clf.predict(X_test)
y_pred = clf.predict(X_submit)

In [30]:
# Cross-validate whichever clf is currently bound, using 3 parallel jobs.
# No scoring argument is given, so the estimator's own score method is used
# (presumably R^2 for a regressor - not the RSME scorer defined earlier).
# Parenthesised print so the cell is valid on Python 3 as well as Python 2.
print(cross_val_score(clf, X_train, y_train, n_jobs=3))

[ 0.05740798  0.02344712 -0.09096177]


In [None]:
# Sanity check of cross_val_score on a synthetic classification problem:
# 10,000 points with 10 features drawn around 100 blob centres.
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)
# min_samples_split=2 is the smallest meaningful value (a node needs two
# samples to be split); current scikit-learn releases reject the integer 1.
clfclass = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clfclass, X, y)
# Parenthesised print so the cell is valid on Python 3 as well as Python 2.
print(scores)

In [None]:
#rfr = RandomForestRegressor()
#clf = pipeline.Pipeline([('rfr', rfr)])
#param_grid = {'rfr__n_estimators' : list(range(22,26,1)), 'rfr__max_depth': list(range(6,9,1))}
#model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 2, verbose = 20, scoring=RSME)
#model.fit(X_train, y_train)

In [537]:
# Root mean squared error between y_test and y_test_data.
# NOTE(review): both names are assigned only in commented-out lines of earlier
# cells, so this cell raises NameError on a fresh kernel; the recorded output
# presumably comes from an earlier session in which a hold-out split was active.
mean_squared_error(y_test,y_test_data)**0.5

0.48749692596475092

In [486]:
# Write the submission file: one row per test id with its predicted relevance.
# y_pred comes from whichever regressor cell was run most recently.
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission_V9.csv',index=False)

In [547]:
# Preview the first rows of the feature frame.
df_all.head()

Unnamed: 0,id,relevance,len_of_query,word_in_title,word_in_description,two_grams_in_title,two_grams_in_desc,three_grams_in_title,three_grams_in_desc,product_uid
0,2,3.0,2,1,1,0,0,0,0,100001
1,3,2.5,2,0,0,0,0,0,0,100001
2,9,3.0,2,0,1,0,0,0,0,100002
3,16,2.33,3,1,1,0,0,0,0,100005
4,17,2.67,3,2,2,2,0,1,0,100005
