In [1]:
import pandas as pd
import numpy as np
import dill
import time

seed = 101 # Lucky seed

In [2]:
def dump_dill(fname, obj):
    with open(fname, 'wb') as f:
        dill.dump(obj, f)
    return None

def split_cat(text):
    try:
        result = text.split('/')
    except:
        result = ['missing','missing','missing']
    return result[:3]

In [3]:
df = pd.read_table('data/train.tsv')
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Let's start by transforming the price (deskew).

In [4]:
df['price'] = np.log1p(df['price'].values)
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,2.397895,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,3.970292,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,2.397895,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,3.583519,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,3.806662,0,Complete with certificate of authenticity


Now let's split the categories.

In [5]:
cats = np.vstack(df['category_name'].map(split_cat).values)
df['cat_0'] = cats[:,0]
df['cat_1'] = cats[:,1]
df['cat_2'] = cats[:,2]
del df['category_name']
df.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,cat_0,cat_1,cat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,2.397895,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,3.970292,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Target,2.397895,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,,3.583519,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,,3.806662,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


Fill missing values in brand_name and item_description.

In [6]:
df['brand_name'].fillna('missing', inplace=True)
df['item_description'].fillna('missing', inplace=True)

Combine name and item_description into full_description.

In [7]:
df['full_description'] = df['name'] + ' ' + df['item_description']
del df['name']
del df['item_description']

Train LDA model to vectorize full_description.

In [8]:
from bayes_opt import BayesianOptimization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline

def target(**params):
    n_components = int(params['n_components'])
    min_df = params['min_df']
    max_df = params['max_df']
    vect = TfidfVectorizer(ngram_range=(1,2), min_df=min_df, max_df=max_df)
    X_vect = vect.fit_transform(df['full_description'].values)
    if n_components >= len(vect.vocabulary_):
        n_components = len(vect.vocabulary_)-1
    X_vect = TruncatedSVD(n_components=n_components).fit_transform(X_vect)
    model = SGDRegressor(max_iter=10, loss='squared_loss', alpha=0.0)
    score = cross_val_score(model, X_vect, df['price'].values, scoring='r2', cv=3).mean()
    return score

params = {'n_components':(10,300),
          'min_df':(0.0,0.1),
          'max_df':(0.5,1.0)}
bo = BayesianOptimization(target, params, random_state=seed)
bo.gp.set_params(alpha=1e-8)
bo.maximize(init_points=5, n_iter=5, acq='ucb', kappa=5)

[31mInitialization[0m
[94m----------------------------------------------------------------------[0m
 Step |   Time |      Value |    max_df |    min_df |   n_components | 
    1 | 01m29s | [35m   0.03897[0m | [32m   0.7771[0m | [32m   0.0834[0m | [32m      159.7556[0m | 
    2 | 02m17s | [35m   0.13788[0m | [32m   0.6761[0m | [32m   0.0307[0m | [32m      175.4936[0m | 
    3 | 01m24s |    0.02588 |    0.5909 |    0.0894 |        18.2575 | 
    4 | 01m33s |    0.04103 |    0.8928 |    0.0722 |        59.7413 | 
    5 | 03m02s | [35m   0.15712[0m | [32m   0.9827[0m | [32m   0.0190[0m | [32m      208.7303[0m | 
[31mBayesian Optimization[0m
[94m----------------------------------------------------------------------[0m
 Step |   Time |      Value |    max_df |    min_df |   n_components | 
    6 | 25m23s | [35m   0.23410[0m | [32m   0.5000[0m | [32m   0.0000[0m | [32m      300.0000[0m | 
    7 | 01m31s |    0.03365 |    0.5000 |    0.1000 |       263.9

In [9]:
bo.res['max']

{'max_params': {'max_df': 0.5, 'min_df': 0.0, 'n_components': 300.0},
 'max_val': 0.2340952203930706}

In [10]:
vect = TfidfVectorizer(ngram_range=(1,2),
                       min_df=bo.res['max']['max_params']['min_df'],
                       max_df=bo.res['max']['max_params']['max_df'])
lsa = TruncatedSVD(n_components=int(bo.res['max']['max_params']['n_components']), random_state=seed)
pipe = Pipeline([('vect',vect), ('lsa',lsa)])
start = time.time()
pipe.fit(df['full_description'].values, df['price'].values)
end = time.time()
print('Time to train TFIDF + LSA: %0.2fs' % (end - start))

Time to train TFIDF + LSA: 1506.42s


In [11]:
vocab = np.array(pipe.named_steps['vect'].get_feature_names())
idx_0 = pipe.named_steps['lsa'].components_[0].argsort()[:10]
idx_4 = pipe.named_steps['lsa'].components_[4].argsort()[:10]
idx_20 = pipe.named_steps['lsa'].components_[20].argsort()[:10]

In [12]:
vocab[idx_0]

array(['sethious', 'sethious sethious', 'brandyclothes', 'mypinkcloset',
       'nymphetshop', 'nymphetshop brandyclothes', 'stuff2',
       'alextan brandyclothes', 'alextan', 'camila96x brandyclothes'],
      dtype='<U99')

In [13]:
vocab[idx_4]

array(['size', 'condition', 'worn', 'lularoe', 'leggings', 'nike',
       'great condition', 'good', 'great', 'good condition'],
      dtype='<U99')

In [14]:
vocab[idx_20]

array(['great', 'shirt', 'great condition', 'kors', 'michael',
       'michael kors', 'bundle', 'shorts', 'used', 'and'],
      dtype='<U99')

Convert cat_0, cat_1, and cat_2 to population-ordered numeric codes.

In [15]:
feat_dicts = {}
for cat in ['cat_0', 'cat_1', 'cat_2', 'brand_name']:
    names, counts = np.unique(df[cat].values, return_counts=True)
    enc_dict = {k:v for (v,k) in enumerate(names[np.argsort(counts)])}
    feat_dicts[cat] = enc_dict
feat_dicts.keys()

dict_keys(['cat_0', 'cat_1', 'cat_2', 'brand_name'])

In [16]:
for feat, feat_dict in feat_dicts.items():
    df[feat] = df[feat].map(feat_dict)
df.head()

Unnamed: 0,train_id,item_condition_id,brand_name,price,shipping,cat_0,cat_1,cat_2,full_description
0,0,3,4809,2.397895,1,6,95,847,MLB Cincinnati Reds T Shirt Size XL No descrip...
1,1,3,4223,3.970292,0,7,70,670,Razer BlackWidow Chroma Keyboard This keyboard...
2,2,1,4727,2.397895,1,10,111,858,AVA-VIV Blouse Adorable top with a hint of lac...
3,3,1,4809,3.583519,1,5,98,840,Leather Horse Statues New with tags. Leather h...
4,4,1,4809,3.806662,0,10,109,854,24K GOLD plated rose Complete with certificate...


Combine all processed features into training set.

In [17]:
start = time.time()
X_text = pipe.transform(df['full_description'])
end = time.time()
print('Time to transform item_description: %0.2fs' % (end - start))

Time to transform item_description: 94.05s


In [18]:
X_train = np.hstack([df[['item_condition_id', 'shipping', 'cat_0', 'cat_1', 'cat_2', 'brand_name']].values, X_text])
y_train = df['price'].values

Save processed training set to a dill file.

In [19]:
dump_dill('mercari.dill', {'X_train':X_train, 'y_train':y_train})