# For Mercari Price Suggestion Challenge

In [222]:
%matplotlib inline

In [223]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [224]:
import math
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import pickle
import pprint
import random
import scipy as sp
from scipy import stats, integrate
import seaborn as sns

In [225]:
# Configure seaborn's global plotting style once for every figure below.
sns.set(color_codes=True)

In [226]:
# Seed both generators: `random.seed` only affects Python's own RNG, while
# pandas' `.sample()` without `random_state` draws from NumPy's global RNG.
random.seed(37)
np.random.seed(37)

In [227]:
def IQR(x):
    """Return the ``(lower, upper)`` outlier fences of ``x``.

    The fences lie 1.5 interquartile ranges below the 25th percentile and
    1.5 interquartile ranges above the 75th percentile.
    """
    lower_quartile, upper_quartile = np.percentile(x, [25, 75])
    whisker = 1.5 * (upper_quartile - lower_quartile)
    return lower_quartile - whisker, upper_quartile + whisker

## Data exploration

In [297]:
# Column dtypes handed to `pd.read_csv`. Text columns use the builtin `str`:
# the `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24.
dtypes = {
    "train_id": np.int64,
    "name": str,
    "item_condition_id": np.uint8,
    "category_name": str,
    "brand_name": str,
    "price": np.float64,
    "shipping": np.uint8,
    "item_description": str,
}

In [298]:
# Both files are tab-separated and share the dtype mapping defined above.
orig_train_data = pd.read_csv(
    "./data/train.tsv",
    sep='\t',
    dtype=dtypes,
)
test_data = pd.read_csv(
    "./data/test.tsv",
    sep='\t',
    dtype=dtypes,
)

In [299]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
train_data = orig_train_data

In [300]:
# Report the size of each frame first, then the dtype of every column.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print("there are {} rows in train_data and each row has {} columns".format(
    train_rows, train_cols))
print("there are {} rows in test_data and each row has {} columns".format(
    test_rows, test_cols))

for heading, frame in (("train_data columns and their types: ", train_data),
                       ("- test_data columns and their types: ", test_data)):
    print(heading)
    print(frame.dtypes)

there are 1482535 rows in train_data and each row has 8 columns
there are 693359 rows in test_data and each row has 7 columns
train_data columns and their types: 
train_id               int64
name                  object
item_condition_id      uint8
category_name         object
brand_name            object
price                float64
shipping               uint8
item_description      object
dtype: object
- test_data columns and their types: 
test_id               int64
name                 object
item_condition_id     uint8
category_name        object
brand_name           object
shipping              uint8
item_description     object
dtype: object


In [305]:
# Show three randomly chosen training rows as a quick sanity check.
train_data.sample(n=3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
297351,297351,Nike Lunarglide 7- size 6.5,3,Women/Shoes/Athletic,Nike,36.0,0,Pink grey and orange Nikes. Worn once but they...
885669,885669,Elena locket with real Vervain,1,Women/Jewelry/Necklaces,,10.0,1,Vampire Diaries Elena locket with real Vervain...
1195941,1195941,iPhone 6 iPhone 6s Case,1,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,For iPhone 6 iPhone 6s Cute Baby Pacifier Milk...


In [232]:
# Show three randomly chosen test rows as a quick sanity check.
test_data.sample(n=3)

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
426633,426633,UGG Boots,1,Women/Shoes/Boots,UGG Australia,0,I am selling a NEW pair of UGG Boots.They are ...
545255,545255,Vera Bradley Large makeup bag,3,Women/Women's Handbags/Cosmetic Bags,Vera Bradley,1,Large bag double zipper Zipper pouch inside al...
197893,197893,Women's Cardigan Sweater HOLD,2,Women/Sweaters/Cardigan,Mossimo,1,MOSSIMO •Women's XS cardigan sweater •Black •X...


*test_id* and *train_id* can be ignored. *item_condition_id* and *shipping* are actually categorical data. *name*, *category_name*, *brand_name* and *item_description* are text data.
Since *price* is the only continuous variable and it is the output, there seems to be no need to check the distribution of the input data or to normalize it.

Create subsets of train_data and test_data for trial exploration.

In [306]:
# Fixed-size random subsets for fast experiments; random_state=37 keeps the
# same 16384 rows selected on every run.
sub_train_data = train_data.sample(16384, random_state=37)
sub_test_data = test_data.sample(16384, random_state=37)

## Data Processing

In [307]:
# Missing values per column (swap in sub_train_data to inspect the subset).
train_data.isna().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

### Drop *train_id*

In [308]:
def drop_train_id(data):
    """Return a new frame equal to ``data`` without its ``train_id`` column."""
    return data.drop('train_id', axis=1)

In [309]:
# Apply the same step to the full frame and to the exploration subset.
train_data = drop_train_id(train_data)
sub_train_data = drop_train_id(sub_train_data)

### Drop NA item_description

In [310]:
def drop_missing_item_desc(data):
    """Return ``data`` without the rows whose ``item_description`` is missing."""
    missing_mask = data['item_description'].isna()
    rows_to_drop = data.index[missing_mask]
    return data.drop(index=rows_to_drop)

In [311]:
# Apply the same step to the full frame and to the exploration subset.
train_data = drop_missing_item_desc(train_data)
sub_train_data = drop_missing_item_desc(sub_train_data)

### category *item_condition_id* and *shipping*

In [314]:
def category_fields(data):
    """Convert ``item_condition_id`` and ``shipping`` to categorical dtype.

    The conversion is written back into ``data`` itself. The categories of
    both columns are printed and the same (mutated) frame is returned.
    """
    for column in ('item_condition_id', 'shipping'):
        data[column] = data[column].astype('category')
    print(data['item_condition_id'].cat.categories)
    print(data['shipping'].cat.categories)
    return data

In [315]:
# Convert both frames; each call prints the categories it found.
sub_train_data = category_fields(sub_train_data)
train_data = category_fields(train_data)

UInt64Index([1, 2, 3, 4, 5], dtype='uint64')
UInt64Index([0, 1], dtype='uint64')
UInt64Index([1, 2, 3, 4, 5], dtype='uint64')
UInt64Index([0, 1], dtype='uint64')


In [316]:
missing_condition_count = train_data['item_condition_id'].isna().sum()
print("there are " + str(missing_condition_count)
      + " items missing item_condition_id")

there are 0 items missing item_condition_id


In [317]:
missing_shipping_count = train_data['shipping'].isna().sum()
print("there are " + str(missing_shipping_count)
      + " items missing shipping")

there are 0 items missing shipping


### *category_name*

Normally, *category_name* has three segments in a form like 'a/b/c', but the *category_name* of some products contains additional '/' characters, like *Electronics/Computers & Tablets/iPad/Tablet/eB...*. Splitting such a *category_name* on '/' gives more than three segments. It is reasonable to keep the first two segments and merge the rest. For example, *Electronics/Computers & Tablets/iPad/Tablet/eB...* is split into *Electronics*, *Computers & Tablets*, and *iPad/Tablet/eB...*

In [318]:
def filter_long_category(data, max_separators=2):
    """Return the distinct ``category_name`` values that have extra levels.

    A regular name looks like ``'a/b/c'``, i.e. it has two ``'/'``
    separators. Any name with more than ``max_separators`` separators is
    reported, which covers 4-level names (``'a/b/c/d'``) as well as 5-level
    ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column ``category_name``.
    max_separators : int, default 2
        Largest number of ``'/'`` characters a regular name may contain.

    Returns
    -------
    numpy.ndarray
        Unique long category names, in order of first appearance.
    """
    separator_counts = data['category_name'].str.count('/')
    # Missing names produce NaN counts; `NaN > n` is False, so they are skipped.
    long_category_product = data[separator_counts > max_separators]
    return long_category_product['category_name'].unique()

In [319]:
# List the long category names present in the exploration subset.
print(filter_long_category(sub_train_data))

['Electronics/Computers & Tablets/iPad/Tablet/eBook Readers'
 'Electronics/Computers & Tablets/iPad/Tablet/eBook Access']


In [320]:
def split_category(data):
    """Split ``category_name`` on ``'/'`` into columns ``c1``, ``c2``, ``c3``.

    Only the first two separators are used, so any deeper levels stay joined
    inside ``c3`` (``'a/b/c/d/e'`` -> ``'a'``, ``'b'``, ``'c/d/e'``). Rows
    with a missing ``category_name`` get missing values in all three columns.

    The three columns are written into ``data`` itself, which is returned.
    """
    # n=2 caps the split at three pieces. Re-joining fixed columns 2, 3 and 4
    # instead turned 4-level names into NaN (str + NaN) and raised KeyError
    # whenever the frame held no 4- or 5-level name at all.
    splited = data['category_name'].str.split('/', n=2, expand=True)
    # Guarantee columns 0..2 even when every name has fewer than three levels.
    splited = splited.reindex(columns=range(3))
    data['c1'], data['c2'], data['c3'] = splited[0], splited[1], splited[2]
    return data

In [321]:
# Add the c1/c2/c3 columns to both the subset and the full frame.
sub_train_data= split_category(sub_train_data)
train_data = split_category(train_data)

### process text data

In [322]:
# Every free-text column, including the c1/c2/c3 columns split out of
# category_name. Used as the default column set by the text-cleaning helpers.
all_text_fields = [
    'name',
    'category_name',
    'brand_name',
    'item_description',
    'c1',
    'c2',
    'c3',
]

In [323]:
def lowercase(data, fields=all_text_fields):
    """Lower-case each column named in ``fields``; mutates and returns ``data``."""
    for column in fields:
        lowered = data[column].str.lower()
        data[column] = lowered
    return data

In [324]:
# Lower-case the text columns of both the subset and the full frame.
sub_train_data = lowercase(sub_train_data)
train_data = lowercase(train_data)

In [325]:
def remove_punctuation(data, fields=all_text_fields):
    """Strip punctuation from the given text columns.

    Each string is tokenized with the regex ``\\w+`` and the tokens are
    re-joined with single spaces. ``None`` and NaN become ``None``; any other
    non-string value is reported on stdout and kept unchanged.

    The columns are rewritten inside ``data`` itself, which is returned.
    """
    # Build the tokenizer once rather than once per cell value.
    word_tokenizer = RegexpTokenizer(r'\w+')

    def remove(sentence):
        if sentence is None:
            return None

        if not isinstance(sentence, str):
            if math.isnan(sentence):
                return None

            # Unexpected type: report it and leave the value untouched.
            print('{} type is {}'.format(sentence, type(sentence)))
            return sentence

        return ' '.join(word_tokenizer.tokenize(sentence))

    for field in fields:
        data[field] = data[field].apply(remove)

    return data

In [326]:
def remove_stopwords(data, fields=all_text_fields):
    """Remove English stop words from the given text columns.

    Each string is split with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the rest are re-joined with single
    spaces. ``None`` and NaN become ``None``; any other non-string value is
    reported on stdout and kept unchanged.

    The columns are rewritten inside ``data`` itself, which is returned.
    """
    # Load the stop-word list once into a set. Calling
    # `stopwords.words('english')` inside the comprehension re-read the corpus
    # and did a linear list scan for every single token.
    english_stopwords = frozenset(stopwords.words('english'))

    def remove(sentence):
        if sentence is None:
            return None

        if not isinstance(sentence, str):
            if math.isnan(sentence):
                return None

            # Unexpected type: report it and leave the value untouched.
            print('{} type is {}'.format(sentence, type(sentence)))
            return sentence

        filtered_words = [w for w in word_tokenize(sentence)
                          if w not in english_stopwords]
        return ' '.join(filtered_words)

    for field in fields:
        data[field] = data[field].apply(remove)

    return data

In [347]:
# Only the subset is cleaned here; the full-frame call is left disabled.
sub_train_data = remove_punctuation(sub_train_data)
#train_data = remove_punctuation(train_data)

In [348]:
# Only the subset is cleaned here; the full-frame call is left disabled.
sub_train_data = remove_stopwords(sub_train_data)
#train_data = remove_stopwords(train_data)

### brand name

In [None]:
# Brand abbreviations mapped to the full brand name they stand for.
abbreviation_dict = dict(vs='victoria secret')

In [419]:
def build_brand_name_list(data):
    """Return the sorted, de-duplicated brand names found in ``data``.

    Missing brands are skipped and spaces inside a brand name are removed
    (``'a b'`` -> ``'ab'``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``brand_name`` column.

    Returns
    -------
    list of str
    """
    brands = data['brand_name'].dropna()
    # De-duplicate *after* removing spaces so 'a b' and 'ab' collapse into a
    # single entry, and sort so the list (and anything pickled from it) is
    # identical on every run instead of following set iteration order.
    # NOTE(review): `contains` splits its key words on ' '; with the spaces
    # removed here a multi-word brand becomes one token - confirm intended.
    return sorted({w.replace(' ', '') for w in brands})

In [420]:
# Brand vocabulary built from the exploration subset, not the full frame.
known_brand_list = build_brand_name_list(sub_train_data)

In [421]:
# Size of the brand vocabulary built above.
print('there are {} known brand names'.format(len(known_brand_list)))

there are 943 known brand names


In [422]:
# Persist the brand vocabulary to disk in pickle format.
with open('brand_name_list', 'wb') as brand_file:
    pickle.dump(known_brand_list, brand_file)

*name* usually contains *brand_name* information, so *name* can be used to predict a missing *brand_name*.

In [423]:
def match_in_list(sent, items=None):
    """Return every entry of ``items`` that occurs in ``sent``.

    Parameters
    ----------
    sent : str
        Space-separated text to search, e.g. a product name.
    items : iterable of str, optional
        Candidate phrases. Defaults to the module-level ``known_brand_list``.

    Returns
    -------
    list of str
        Matching entries in the order they appear in ``items``. Empty when
        nothing matches or when ``sent`` is not a string (e.g. NaN).
    """
    if items is None:
        # Resolve the default at call time so rebuilding `known_brand_list`
        # in another cell takes effect without re-running this definition.
        items = known_brand_list
    if not isinstance(sent, str):
        return []
    return [item for item in items if contains(sent, item)[0]]

def contains(sent, key_words):
    """Find where the phrase ``key_words`` occurs as whole words in ``sent``.

    Both arguments are split on single spaces and every start position is
    checked. Looking up only the first occurrence of each word missed a
    phrase that begins at a later repeat of its first word
    (``'new york new balance'`` vs ``'new balance'``).

    Parameters
    ----------
    sent : str
        Text to search.
    key_words : str
        Word or space-separated phrase to look for.

    Returns
    -------
    tuple of (bool, list of int)
        Whether the phrase occurs at all, and the word index of every
        position in ``sent`` where an occurrence starts.
    """
    sent = sent.split(' ')
    key_words = key_words.split(' ')
    width = len(key_words)

    matched_index = [
        start for start in range(len(sent) - width + 1)
        if sent[start:start + width] == key_words
    ]

    return len(matched_index) != 0, matched_index

In [437]:
# Spot-check the matcher on one random row. No random_state is passed, so a
# different row is drawn on each run.
s = sub_train_data.sample()
# to_string(index=False) renders the one-row Series as text without its index.
name = s['name'].to_string(index=False)
print('name is: ' + name)
print('actual brand name is ' + s['brand_name'].to_string(index=False))
print('predict brand name is ' + str(match_in_list(name)))

name is: iphone 5c
actual brand name is apple
predict brand name is []


### missing data

to fix the missing problem first

In [214]:
# The column counted is category_name, so the messages name that column
# (they previously said item_condition_id).
print("there are " + str(pd.isna(train_data['category_name']).sum())
      + " items missing category_name")
print("there are " + str(pd.isna(sub_train_data['category_name']).sum())
      + " items missing category_name in sub_train_data")

there are 6327 items missing item_condition_id
there are 58 items missing item_condition_id in sub_train_data


In [215]:
# Five reproducible examples of subset rows that have no category_name.
sub_train_data[pd.isna(sub_train_data['category_name'])].sample(n=5, random_state=37)

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
116650,boys size 8 reg,3,,gap,10.0,0,boys size 8 regular gap
1148297,lacoste 6,3,,lacoste,7.0,0,says 6 adult size
541465,henna kit,1,,,9.0,0,never used gift style
236657,rock revival jeans,3,,rock revival,56.0,0,size 30 waist 30 inseam great pain jeans rips ...
489216,coach sandals size 9,1,,coach,26.0,0,description yet


In [216]:
n_unique_categories = len(sub_train_data['category_name'].unique())
print("there are {} unique category_names".format(n_unique_categories))

there are 687 unique category_names


There are too many categories, and it seems tough for a simple classification algorithm to handle them.

In [None]:
# Level 1: the distinct top-level categories.
unique_c1 = sub_train_data['c1'].unique()
print("there are {} unique c1 name: {}".format(len(unique_c1), unique_c1))

print()
# Level 2: the distinct c2 values found under each c1.
unique_c2_c1 = {
    c1_name: sub_train_data.loc[sub_train_data['c1'] == c1_name, 'c2'].unique()
    for c1_name in unique_c1
}
for c1_name in unique_c2_c1:
    c2_names = unique_c2_c1[c1_name]
    pprint.pprint("there are {} unique c2 name under c1 {}: {}".format(
        len(c2_names), c1_name, c2_names))
unique_c2 = sub_train_data['c2'].unique()
print("there are {} unique c2 name".format(len(unique_c2)))
print()

# Level 3: the distinct c3 values found under each c2.
unique_c3_c2 = {
    c2_name: sub_train_data.loc[sub_train_data['c2'] == c2_name, 'c3'].unique()
    for c2_name in unique_c2
}
for c2_name in unique_c3_c2:
    c3_names = unique_c3_c2[c2_name]
    pprint.pprint("there are {} unique c3 name under c2 {}: {}".format(
        len(c3_names), c2_name, c3_names))
unique_c3 = sub_train_data['c3'].unique()
print("there are {} unique c3 name".format(len(unique_c3)))
print()

Since 687 distinct *category_name* values are too many for a classification algorithm to handle directly, the plan is to classify into *c1* first (there are only 11 of them), then into *c2* within each *c1*, and then into *c3* within each *c2*.

In [None]:
# Five reproducible examples of subset rows whose c1 is missing.
sub_train_data[sub_train_data['c1'].isna()].sample(n=5, random_state=37)

In [None]:
# Plain Python lists of the item names and of their c1 values.
# NOTE(review): presumably the inputs and labels for a c1 classifier - confirm.
data = list(sub_train_data['name'])
target = list(sub_train_data['c1'])

### distribution of *price*

In [None]:
# Summary statistics of price in the exploration subset.
sub_train_data['price'].describe()

There are several products whose price is 0. It is unclear whether these are missing data or their real prices.

In [None]:
# Three reproducible examples of subset rows priced at exactly 0.
sub_train_data[sub_train_data['price'] == 0].sample(n=3, random_state=37)

Since the zero-price products include "cell phone", "shoes", "clothes" and so on, they are most likely missing data.

In [None]:
def drop_missing_price(data):
    """Return the rows of ``data`` whose ``price`` is not zero."""
    has_price = data['price'].ne(0)
    return data.loc[has_price]

In [None]:
# Remove zero-price rows from both the subset and the full frame.
sub_train_data = drop_missing_price(sub_train_data)
train_data = drop_missing_price(train_data)

In [None]:
# Side by side: distribution of price (left) and its box plot (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

price_values = sub_train_data['price']
sns.distplot(price_values, ax=ax1)
sns.boxplot(x=price_values, ax=ax2)

plt.show()

A long tail distribution shape. Use log to normalize it.

In [None]:
def log_price(data):
    """Replace ``price`` with its natural log; mutates and returns ``data``."""
    logged = np.log(data['price'])
    data['price'] = logged
    return data

In [None]:
# Log-transform price in both frames. Re-running this cell applies the log
# a second time, because the price column is overwritten.
sub_train_data= log_price(sub_train_data)
train_data = log_price(train_data)

In [None]:
# Same pair of plots as before, now on the log-transformed price.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

price_values = sub_train_data['price']
sns.distplot(price_values, ax=ax1)
sns.boxplot(x=price_values, ax=ax2)

plt.show()

A number of items have prices greater than 5.0, which look like a lot of outliers. But it is not right to lump all goods together, like using paper prices as a standard to measure necklace prices. It might be a better idea to detect price outliers within some kind of category.

In [None]:
# Enlarge fonts for this big figure only; restored at the end of the cell.
sns.set(font_scale=2.5)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(40, 24))

# Top: price by item condition. Bottom: price by shipping flag.
for column, axis in (("item_condition_id", ax1), ("shipping", ax2)):
    sns.boxplot(x=column, y='price', data=sub_train_data, ax=axis)

plt.show()
sns.set(font_scale=1)

It turns out that:
- the average price of a product with a higher item_condition_id value is lower than that of a product with a lower item_condition_id value. So item_condition_id 5 means the worst condition and 1 means the best
- the average price of shipping products is higher than that of non-shipping products

### *category_name* -- c1/c2/c3

In [None]:
# Enlarge fonts for this figure only; restored at the end of the cell.
sns.set(font_scale=2.5)
plt.figure(figsize=(24,8))
ax = sns.boxplot(x="c1", y='price', data=sub_train_data.dropna())
# Rotate the labels seaborn generated instead of overwriting them. Labels
# taken from `sub_train_data['c1'].unique()` include NaN and follow the order
# of the un-dropped frame, so they need not line up with the plotted boxes.
plt.setp(ax.get_xticklabels(), rotation=30)
sns.set(font_scale=1)

highest prices of *men* stuff and *electronics* are greater than 1.75 and both lowest are less than 0.25, especially *men* stuff lowest price are almost 0.5, while most categories lowest prices are lower than 0.25 and close to 0.125. prices of *sports & outdoors* are in a small range of (0.75, 1.25]. *women* stuff prices are concentrate in the range of (1.0, 1.6]. *beauty* stuff prices are concentrate in the range of (1.0, 1.25). other categories prices distribute in whole range.

In [None]:
# Enlarge fonts for this tall figure only; restored at the end of the cell.
sns.set(font_scale=3)
fig = plt.figure(figsize=(24, 64))
# One row of points per c2 value, coloured by the parent c1.
sns.stripplot(
    x="price",
    y='c2',
    hue='c1',
    data=sub_train_data,
    size=8,
)
sns.set(font_scale=1)

In [None]:
# Enlarge fonts for this tall figure only; restored at the end of the cell.
sns.set(font_scale=2.5)
fig = plt.figure(figsize=(32,64))

ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
ax3 = fig.add_subplot(313)

# Tick labels come from seaborn. The earlier manual `set_xticklabels` calls
# ran before anything was drawn, and on ax2/ax3 they put category names on
# the x axis, which holds the numeric price there.
sns.stripplot(x="c1", y='price', data=sub_train_data, ax=ax1)
# Only the c1 axis needs rotated labels; do it after plotting so the labels
# belong to the categories that were actually drawn.
plt.setp(ax1.get_xticklabels(), rotation=30)

sns.stripplot(x='price', y='c2', data=sub_train_data, ax=ax2)

sns.stripplot(x='price', y='c3', data=sub_train_data, ax=ax3)

plt.show()
sns.set(font_scale=1)

#### missing data

In [None]:
# Count the missing values of the four text columns in one pass.
missing_counts = train_data[
    ['name', 'category_name', 'brand_name', 'item_description']
].isna().sum()

print("there are " + str(missing_counts['name'])
      + " items missing in name")
print("there are " + str(missing_counts['category_name'])
      + " items missing in category_name")
print("there are " + str(missing_counts['brand_name'])
      + " items missing in brand_name")
print("there are " + str(missing_counts['item_description'])
      + " items missing in item_description")

In [None]:
# All subset rows that have no category_name.
sub_train_data[sub_train_data['category_name'].isna()]