# Mercari Price Suggestion Challenge
- Task: Build an algorithm that automatically suggests the right product prices. 
- Data: User-inputted text descriptions of their products, including details like product category name, brand name, and item condition.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import time
import numpy as np
import pandas as pd
import pickle
import xgboost
from scipy import sparse
import xgboost

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Import Data

In [None]:
# Read the tab-separated train and test sets and report how long loading took.
print('Importing Data')
current_t = time.time()
train_data = pd.read_csv('data/train.tsv', sep='\t')
test_data = pd.read_csv('data/test.tsv', sep='\t')
print("Data Imported. Time elapsed: " + str(int(time.time() - current_t)) + "s")

Importing Data


## Feature Engineering

### Categorical data

In [None]:
def category(data):
    """Split ``category_name`` into five hierarchy-level columns, in place.

    The ``category_name`` strings are split on ``'/'`` and the pieces are
    written to the new columns ``main_cat``, ``subcat1``, ``subcat2``,
    ``subcat3`` and ``subcat4``. A level that a row (or the whole data set)
    does not have is left missing.

    Args:
        data: DataFrame with a string column ``category_name``. It is
            modified in place.

    Returns:
        None.
    """
    levels = data["category_name"].str.split('/', expand=True)
    # expand=True only creates as many columns as the deepest category found,
    # so reindex to guarantee that all five positions exist; absent levels
    # become all-NaN columns instead of raising KeyError.
    levels = levels.reindex(columns=range(5))
    level_names = ("main_cat", "subcat1", "subcat2", "subcat3", "subcat4")
    for position, column in enumerate(level_names):
        data[column] = levels[position]
        
def missing_data(data, _value='None'):
    """Replace every missing value in ``data`` with ``_value``, in place.

    Args:
        data: DataFrame to clean. It is modified in place.
        _value: Replacement written into every missing cell of every column.

    Returns:
        None.
    """
    for col in data.columns:
        # Assign the filled column back. Calling fillna(inplace=True) on
        # data[col] is a chained in-place update, which acts on a temporary
        # and leaves `data` unchanged when pandas copy-on-write is active.
        data[col] = data[col].fillna(_value)

In [None]:
# NOTE(review): train_features / test_features are not defined in any cell
# shown in this file -- presumably they are derived from train_data /
# test_data; confirm before running.

# Training frame: split the category hierarchy, then fill missing values.
category(train_features)
missing_data(train_features)

# Test frame: same two steps.
category(test_features)
missing_data(test_features)

In [None]:
## convert categorical var to numeric
le = preprocessing.LabelEncoder()
def cat_to_num(train, test, columns=None, unseen_code=0):
    """Label-encode categorical columns, adding ``<col>_le`` to both frames.

    For each column the module-level encoder ``le`` is fitted on the training
    values only. The test values are translated with the resulting
    label -> code table.

    Args:
        train: Training DataFrame; receives the new ``<col>_le`` columns.
        test: Test DataFrame; receives the new ``<col>_le`` columns.
        columns: Names of the columns to encode. Defaults to the brand column
            and the five category-level columns.
        unseen_code: Integer written for test labels that never occur in
            ``train``. With the default of 0 such rows share a code with the
            first (sorted) training label; pass a value outside the encoder's
            range (e.g. -1) to keep them distinguishable.

    Returns:
        None. Both frames are modified in place and one progress line is
        printed per column.
    """
    if columns is None:
        columns = ['brand_name','main_cat','subcat1','subcat2','subcat3','subcat4']
    suf = "_le"
    for col in columns:
        train[col+suf] = le.fit_transform(train[col])
        # The encoder's code for a label is its position in le.classes_.
        codes = {label: code for code, label in enumerate(le.classes_)}
        test[col+suf] = test[col].map(codes).fillna(unseen_code).astype(int)

        print("{} is transformed to {}".format(col,col+suf))

In [None]:
# Add the label-encoded "<column>_le" columns to both frames; the encoder is
# fitted on the training frame only.
cat_to_num(train_features,test_features)

In [None]:
## Length of item description (result of len() on each description)
train_features['Length_of_item_description'] = train_features['item_description'].map(len)
test_features['Length_of_item_description'] = test_features['item_description'].map(len)

In [None]:
## Combine numeric features
def numeric_to_features(data):
    """Return the numeric model inputs of ``data`` as one tuple per row.

    Each tuple holds, in this order: ``shipping``, ``item_condition_id``,
    ``main_cat_le``, ``subcat1_le``, ``subcat2_le``, ``subcat3_le``,
    ``subcat4_le``, ``Length_of_item_description``, ``brand_name_le``.

    Args:
        data: DataFrame containing all nine columns listed above.

    Returns:
        A list with one 9-tuple per row of ``data``; an empty list when
        ``data`` has no rows.
    """
    feature_columns = [
        'shipping', 'item_condition_id', 'main_cat_le',
        'subcat1_le', 'subcat2_le', 'subcat3_le',
        'subcat4_le', 'Length_of_item_description',
        'brand_name_le',
    ]
    # Iterate column-wise instead of calling a Python lambda for every row:
    # this is much cheaper on large frames and yields [] for an empty frame
    # (a row-wise DataFrame.apply returns the column labels in that case).
    return list(data[feature_columns].itertuples(index=False, name=None))


In [None]:
# Build the per-row numeric feature tuples for the training and test frames.
train_numeric_features, test_numeric_features = map(
    numeric_to_features, (train_features, test_features)
)
print("Numeric Features created")

### Text Feature

In [None]:
def text_process(data):
    """Join each row's description and name into one text string.

    Every row yields ``"<item_description> <name>"``. The text is not
    case-folded or otherwise normalised here.

    Args:
        data: DataFrame with ``item_description`` and ``name`` columns.

    Returns:
        A list with one string per row of ``data``; an empty list when
        ``data`` has no rows.
    """
    # Pair the two columns directly instead of calling a Python lambda per row:
    # this is much cheaper on large frames and yields [] for an empty frame
    # (a row-wise DataFrame.apply returns the column labels in that case).
    return ['%s %s' % pair for pair in zip(data['item_description'], data['name'])]

In [None]:
# Build the combined "description + name" text for the training and test frames.
train_text, test_text = map(text_process, (train_features, test_features))
print("Text Features created")

In [None]:
# TF-IDF features over the combined description + name text.
# The fitted vectorizer can be cached between runs with:
    # pickle.dump(tfidf,open('vectorizer.pkl', "bw",-1))
    # tfidf=pickle.load(open('vectorizer.pkl','br'))

# TODO: check if we should use max_features parameter
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 3),
    stop_words='english',
    max_features=5000,
)
# Fit the vocabulary on the training text only, then reuse it for the test text.
train_text_features = tfidf.fit_transform(train_text)
test_text_features = tfidf.transform(test_text)

In [None]:
# Column-wise stack for sparse data: numeric columns first, then the TF-IDF columns.
print("Stacking features")
train_final_features = sparse.hstack([train_numeric_features, train_text_features])
test_final_features = sparse.hstack([test_numeric_features, test_text_features])

In [None]:
# Disabled cell: the code below sits inside a string literal, so it does not run.
# If enabled it would pickle the stacked train/test feature matrices and the
# training labels to disk.
'''
# save the features
pickle.dump(train_final_features,open('train_features.pkl', "bw"))
pickle.dump(test_final_features,open('test_features.pkl', "bw"))
pickle.dump(train_labels,open('train_labels.pkl', "bw"))
'''

## Pick and Tune the Algorithms

An algorithm may be highly sensitive to some of its features. The choice of good parameters may have a dominant effect on the algorithm's performance. 

In this study, I use GridSearchCV to fine-tune the algorithm. I start with the default parameters and vary each one up and down. Based on the GridSearchCV results, I then adjust the search ranges again. For example, if GridSearchCV chooses the smallest value in the search list for a parameter, I add a smaller value to that list.

### Load the data

In [None]:
# Disabled cell: the code below sits inside a string literal, so it does not run.
# If enabled it would reload the pickled feature matrices and labels from disk.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles produced by this notebook.
'''
train_final_features = pickle.load(open('train_features.pkl','br'))
test_final_features = pickle.load(open('test_features.pkl','br'))
train_labels = pickle.load(open('train_labels.pkl','br'))
'''

In [None]:
# Gradient-boosted tree regressor with hand-picked hyperparameters.
xgb = xgboost.XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=1,
    min_child_weight=1,
    max_depth=20,
    nthread=4,
    seed=1505,
)

# Training inputs and targets.
# NOTE(review): train_labels is not defined in any cell shown in this file --
# confirm where it is created.
X = train_final_features
Y = train_labels

In [None]:
# Fit the regressor on the full training set and report the wall-clock time.
current_t = time.time()
xgb.fit(X, Y)
print("Modeling complete. Time elapsed: ", int(time.time() - current_t), "s", sep="")

In [None]:
# gridsearchcv
# Disabled cell: the code below sits inside a string literal, so it does not run.
# It has been repaired so that it is valid if re-enabled: the dict entries are
# comma-separated, the estimator is referenced through the imported `xgboost`
# module, and the search is fitted on the X / Y defined in the model cell.
'''
print("Initiating grid search")
xgb = xgboost.XGBRegressor(n_jobs=4)
param_grid = {
    "n_estimators": [1000],
    "max_depth": [38],  # 17-11
    "min_child_weight": [11],
    "gamma": [0, 0.1, 0.2, 0.5],
    "learning_rate": [.055, .060, .065],  # learning_rate = 0.1 or smaller
    "subsample": [i/10.0 for i in [2, 4, 8]],  # subsample = 30%-80% of training set
    # "subsample": [.8],      # earlier duplicate key; the entry above would override it
    # "tree_depth": [2, 8],   # not a recognised XGBoost parameter; tree depth is max_depth
}
CV_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, verbose=1)

CV_xgb.fit(X, Y)

print(CV_xgb.best_params_, CV_xgb.best_score_)
print("All tasks complete.")

'''

### Test

In [None]:
# Disabled cell: the code below sits inside a string literal, so it does not run.
# If enabled it defines rmsle() (root mean squared error between log1p of the
# two inputs) and test_reg(), which splits the data, refits the regressor on
# the training part, clips negative predictions to 0 and prints the RMSLE on
# the held-out part.
# NOTE(review): test_size=0.8 holds out 80% of the rows, so the model is fitted
# on only 20% -- confirm this is intentional.
# NOTE(review): the submission cell applies np.exp to predictions, which
# suggests the labels are already log-scaled; rmsle() would then take log1p of
# log values -- confirm which scale the labels are in.
'''
# vectorized error calc
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))
    
# test
def test_reg(reg, features, labels):
    features_train, features_test, labels_train, labels_test = train_test_split(\
                features, labels, test_size=0.8, random_state=0)
    ### fit the classifier using training set, and test on test set
    reg.fit(features_train, (labels_train))
    y_true = labels_test
    y_pred = (reg.predict(features_test))
    y_pred[y_pred<0]=0
    jag=rmsle(y_true,y_pred)
    print(jag)


test_reg(xgb, train_final_features, train_labels)
'''

## Save the results

In [None]:
outfile_name = 'submit.csv'

# Predict on the test matrix and exponentiate the raw model output.
# NOTE(review): np.exp presumably inverts a log transform applied to the
# training labels -- confirm against the cell that creates train_labels.
pred_label = np.exp(xgb.predict(test_final_features))

# One 'price' column; the default 0..n-1 row index is written as 'test_id'.
pred_label = pd.DataFrame(np.array(pred_label), columns=['price'])
pred_label.index.name = 'test_id'
pred_label.to_csv(outfile_name, encoding='utf-8')
print('Modeling done!')