In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Build the English Snowball stemmer that reduces each word to its stem
stemmer = SnowballStemmer(language='english')

In [3]:
# Read the input CSVs from ../data into DataFrames.
# train.csv and test.csv are decoded as ISO-8859-1; attributes.csv and
# product_descriptions.csv are read with pandas' default encoding.
df_train = pd.read_csv('../data/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('../data/test.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv('../data/attributes.csv')
df_pro_desc = pd.read_csv('../data/product_descriptions.csv')

In [5]:
# Number of rows in the training DataFrame
num_train = len(df_train)

In [20]:
# Stack the train rows on top of the test rows (row-wise, like rbind) and
# renumber the index from 0. Stacking vertically is axis=0, the default;
# axis=1 would place the frames side by side (like cbind).
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [21]:
# Left-join the product descriptions onto df_all by product_uid:
# every row of df_all is kept and gains the columns of df_pro_desc.
df_all = df_all.merge(df_pro_desc, how='left', on='product_uid')

In [22]:
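# str_stemmer lower-cases a string, stems every whitespace-separated word,
# and joins the stems back together with single spaces.
# Example: "angle bracket"
# angle -> angl, bracket -> bracket
# the two stems are joined by " ", giving "angl bracket"
# (similar to paste(stem1, stem2, sep=" "))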
def str_stemmer(s):
    """Return `s` lower-cased with every whitespace-separated word stemmed.

    The words are stemmed with the module-level `stemmer` and re-joined
    with single spaces, so any original run of whitespace collapses to " ".
    """
    stems = map(stemmer.stem, s.lower().split())
    return " ".join(stems)

In [26]:
# Replace each text column with its stemmed version; str_stemmer is passed
# directly to map, which calls it once per cell value.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

In [31]:
# Count the whitespace-separated terms in each search query:
# .str.split() turns every query into a list of terms, .str.len() takes the
# length of that list, and astype(np.int64) stores the counts as 64-bit ints.
df_all['len_of_query'] = df_all['search_term'].str.split().str.len().astype(np.int64)

In [33]:
# Concatenate the three text fields into one string per row, using a tab as
# the delimiter so the pieces can be recovered later with split('\t').
df_all['product_info'] = df_all['search_term'].str.cat(
    [df_all['product_title'], df_all['product_description']],
    sep="\t",
)

In [35]:
# str2.find(word) returns the index at which word first occurs in str2
# For example, "abcde".find("cd") returns 2
# If the substring is not present, e.g. "abcde".find("z"), it returns -1
# int(str2.find(word) >= 0) is therefore 1 if word occurs in str2 and 0 otherwise
def str_common_word(str1, str2):
    """Count the whitespace-separated words of `str1` that occur in `str2`.

    Matching is by substring (`str2.find`), so a word also counts when it
    appears inside a longer word of `str2`. Repeated words in `str1` are
    counted once per occurrence.
    """
    hits = 0
    for token in str1.split():
        # find() yields -1 only when token is absent from str2
        if str2.find(token) != -1:
            hits += 1
    return hits

In [36]:
# product_info is "search_term\tproduct_title\tproduct_description";
# split each row once and count the query words found in the title
# (piece 1) and in the description (piece 2).
df_all['word_in_title'] = [
    str_common_word(pieces[0], pieces[1])
    for pieces in df_all['product_info'].str.split('\t')
]
df_all['word_in_description'] = [
    str_common_word(pieces[0], pieces[2])
    for pieces in df_all['product_info'].str.split('\t')
]

In [38]:
# Drop the text columns, keeping the features derived from them.
# columns=[...] is equivalent to passing the labels with axis=1;
# axis=0 (the default) would drop rows instead.
df_all = df_all.drop(
    columns=['search_term', 'product_title', 'product_description', 'product_info']
)