In [1]:
import pandas as pd
import numpy as np
import dill
import time

seed = 101  # Random seed; passed as random_state to TruncatedSVD in the LSA pipeline.

In [2]:
def dump_dill(fname, obj):
    """Serialize ``obj`` with dill and write it to the file ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Always returns None.
    """
    with open(fname, 'wb') as sink:
        dill.dump(obj, sink)

def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    text : str or any
        Category path such as 'Men/Tops/T-shirts'. Any non-string value
        (for example NaN or None for a missing category) is treated as
        missing.

    Returns
    -------
    list of str
        Always three elements. Levels beyond the third are dropped and
        absent levels are filled with 'missing', so the results can be
        stacked into a rectangular array.
    """
    placeholder = 'missing'
    # Explicit type check instead of a bare ``except`` so that unrelated
    # errors are not silently swallowed.
    if not isinstance(text, str):
        return [placeholder] * 3
    levels = text.split('/')[:3]
    # Pad short paths; otherwise callers that stack the rows get ragged input.
    levels.extend([placeholder] * (3 - len(levels)))
    return levels

In [3]:
# Load the tab-separated training data and preview the first rows.
df = pd.read_csv('data/train.tsv', sep='\t')
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Let's start by log-transforming the price with log1p to reduce its skew.

In [4]:
# Replace price with log(1 + price).
# NOTE(review): this overwrites the column in place, so re-running the cell
# without reloading df applies the transform a second time.
df['price'] = np.log1p(df['price'])
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,2.397895,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,3.970292,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,2.397895,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,3.583519,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,3.806662,0,Complete with certificate of authenticity


Now let's split the categories.

In [5]:
# Stack the per-row category levels into a 2-D array (one column per level),
# copy each level into its own column, then drop the original path column.
cats = np.vstack(df['category_name'].map(split_cat).values)
for level, column in enumerate(['cat_0', 'cat_1', 'cat_2']):
    df[column] = cats[:, level]
df = df.drop(columns=['category_name'])
df.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,cat_0,cat_1,cat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,2.397895,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,3.970292,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Target,2.397895,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,,3.583519,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,,3.806662,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


Fill missing values in brand_name and item_description.

In [6]:
# Assign the filled columns back rather than calling fillna(..., inplace=True)
# on df[col]: the in-place call acts on an intermediate Series, which recent
# pandas versions warn about and which leaves df unchanged under Copy-on-Write.
for text_col in ['brand_name', 'item_description']:
    df[text_col] = df[text_col].fillna('missing')

Combine name, brand_name, and item_description into a single string.

In [7]:
# Join the three text fields with single spaces into one document per row,
# then drop the source columns now that they are merged.
df['full_description'] = df['name'] + ' ' + df['brand_name'] + ' ' + df['item_description']
df = df.drop(columns=['name', 'brand_name', 'item_description'])
df.head()

Unnamed: 0,train_id,item_condition_id,price,shipping,cat_0,cat_1,cat_2,full_description
0,0,3,2.397895,1,Men,Tops,T-shirts,MLB Cincinnati Reds T Shirt Size XL missing No...
1,1,3,3.970292,0,Electronics,Computers & Tablets,Components & Parts,Razer BlackWidow Chroma Keyboard Razer This ke...
2,2,1,2.397895,1,Women,Tops & Blouses,Blouse,AVA-VIV Blouse Target Adorable top with a hint...
3,3,1,3.583519,1,Home,Home Décor,Home Décor Accents,Leather Horse Statues missing New with tags. L...
4,4,1,3.806662,0,Women,Jewelry,Necklaces,24K GOLD plated rose missing Complete with cer...


Train LSA model to vectorize full_description.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

# LSA pipeline: TF-IDF over unigrams and bigrams with English stop words
# removed, followed by a 50-component truncated SVD seeded with `seed`.
# (The unused default-configured TfidfVectorizer()/TruncatedSVD() instances
# were removed; only the steps inside the pipeline are ever fitted.)
pipe = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
                 ('lsa', TruncatedSVD(n_components=50, random_state=seed))])

start = time.time()
# NOTE(review): neither step looks supervised, so the price target passed
# here is presumably unused -- confirm before relying on it.
pipe.fit(df['full_description'].values, df['price'].values)
end = time.time()
print('Time to train LSA: %0.2fs' % (end - start))

Time to train LSA: 214.81s


In [9]:
# Vocabulary terms indexed by TF-IDF feature position.
# get_feature_names() was removed from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back only when it is unavailable.
vect_step = pipe.named_steps['vect']
if hasattr(vect_step, 'get_feature_names_out'):
    vocab = np.asarray(vect_step.get_feature_names_out())
else:
    vocab = np.array(vect_step.get_feature_names())

# For components 0, 1 and 5: indices of the ten terms with the smallest
# (most negative) loadings, since argsort sorts ascending.
idx_0 = pipe.named_steps['lsa'].components_[0].argsort()[:10]
idx_1 = pipe.named_steps['lsa'].components_[1].argsort()[:10]
idx_5 = pipe.named_steps['lsa'].components_[5].argsort()[:10]

In [10]:
# Terms at the ten lowest-sorted loadings of LSA component 0.
vocab[idx_0]

array(['heatherfitch heatherfitch', 'heatherfitch', 'sɪᴅᴇ ᴏғ',
       'ᴇɴɢʟᴀɴᴅ ᴘɪɴᴋ', 'ғʟᴏᴡᴇʀs ᴀʀᴏᴜɴᴅ', 'ғʟᴏᴡᴇʀs', 'ᴇᴠᴇɴʟʏ',
       'ᴇᴠᴇɴʟʏ ᴀʀᴏᴜɴᴅ', 'ᴊᴀsᴘᴇʀᴡᴀʀᴇ ʜᴇᴀʀᴛ', 'ᴊᴀsᴘᴇʀᴡᴀʀᴇ ᴘɪɴᴋ'],
      dtype='<U99')

In [11]:
# Terms at the ten lowest-sorted loadings of LSA component 1.
vocab[idx_1]

array(['lularoe', 'nike', 'missing', 'new', 'size', 'brand', 'brand new',
       'free', 'leggings', 'missing description'],
      dtype='<U99')

In [12]:
# Terms at the ten lowest-sorted loadings of LSA component 5.
vocab[idx_5]

array(['american', 'eagle', 'american eagle', 'missing description',
       'missing', 'description', 'pink', 'iphone', 'kors', 'michael'],
      dtype='<U99')

Convert cat_0, cat_1, and cat_2 to frequency-ordered integer codes (the rarest category gets code 0, the most common gets the highest code).

In [13]:
# For each category level, map every distinct value to an integer code
# assigned in ascending order of how often the value occurs.
cat_dicts = {}
for cat in ['cat_0', 'cat_1', 'cat_2']:
    names, counts = np.unique(df[cat].values, return_counts=True)
    by_frequency = names[np.argsort(counts)]
    cat_dicts[cat] = dict(zip(by_frequency, range(len(by_frequency))))
cat_dicts.keys()

dict_keys(['cat_0', 'cat_1', 'cat_2'])

In [14]:
# Replace each category column's values with their integer codes.
for column in cat_dicts:
    df[column] = df[column].map(cat_dicts[column])
df.head()

Unnamed: 0,train_id,item_condition_id,price,shipping,cat_0,cat_1,cat_2,full_description
0,0,3,2.397895,1,6,95,847,MLB Cincinnati Reds T Shirt Size XL missing No...
1,1,3,3.970292,0,7,70,670,Razer BlackWidow Chroma Keyboard Razer This ke...
2,2,1,2.397895,1,10,111,858,AVA-VIV Blouse Target Adorable top with a hint...
3,3,1,3.583519,1,5,98,840,Leather Horse Statues missing New with tags. L...
4,4,1,3.806662,0,10,109,854,24K GOLD plated rose missing Complete with cer...


Combine all processed features into training set.

In [15]:
# Project every description through the fitted LSA pipeline and report
# the wall-clock time taken.
start = time.time()
X_text = pipe.transform(df['full_description'])
elapsed = time.time() - start
print('Time to transform full_description: %0.2fs' % elapsed)

Time to transform full_description: 65.56s


In [16]:
# Feature matrix: the five tabular columns followed by the LSA text features.
tabular_cols = ['item_condition_id', 'shipping', 'cat_0', 'cat_1', 'cat_2']
X_train = np.concatenate([df[tabular_cols].values, X_text], axis=1)
# Target: the price column as stored in df at this point.
y_train = df['price'].values

Save processed training set to a dill file.

In [17]:
# Bundle features and target under named keys and write them to disk.
processed = {'X_train': X_train, 'y_train': y_train}
dump_dill('mercari.dill', processed)