In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
from sklearn.metrics import mean_squared_error
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Mercari Kaggle Dataset
- What is Mercari?
    - Mercari is a website that allows people to sell/buy light used or new-ish products. Their slogan is literally "Sell or buy. Almost anything".
- What type of data is in this dataset
    - This dataset consist of item name, brand, description, category, condition and if shipping for the item is included or not. Most of the columns are pure text and need to be cleaned
- Purpose?
    - Predict the price of the items given the features mentioned above


# Methods

- stem_str_item: function to clean text in item description
- clean_str_brand_name: function to clean text in brand name
- impute_category_name: function to clean text in category name
- rmsle: calculates the root mean square log error
- word_freq: gets the count of unique words within a column

In [4]:
# Stems words to their root words and removes all characters that are not alphabets
def stem_str_item(str):
    ret_str = ""
    for w in word_tokenize(str.lower()):
        if w not in stop_words and w.isalpha() and len(w) > 1:
            ret_str = ret_str + " " + ps.stem(w)
    ret_str = re.sub("[^a-zA-Z]", " ", ret_str)
    return ret_str.strip()

In [5]:
# Stems words to their root words and removes all characters that are not alphabets
def clean_str_brand_name(str):
    ret_str = ""
    for w in str.lower():
        if w.isalnum() and len(w) > 0:
            ret_str = ret_str + w
#     ret_str = re.sub("[^a-zA-Z]", "", ret_str)
    return ret_str.strip()

In [6]:
def impute_category_name(row):
    if pd.isnull(row['category_name'])== True:
        if row['clean_brand_name'] in brand_cat_dict.keys():
            return brand_cat_dict[row['clean_brand_name']]
        else:
            return "No category name"
    else:
        return row['category_name']

In [7]:
# code copied from https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y_pred, y_test) : 
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [8]:
# Gets the count of most frequent words give a dataframe
def word_freq(df, col):
    word_frequency = {}
    word_frequency_lst = []
    for index,row in df.iterrows(): 
        for w in list(set(row[col].split(' '))):
            if w not in word_frequency:
                word_frequency[w] = 1
            else:
                word_frequency[w] += 1

    for key, value in word_frequency.items():
        temp = [key, value]
        word_frequency_lst.append(temp)
    word_freq_df = pd.DataFrame(word_frequency_lst, columns=["unique_word", 'frequency'])
    word_freq_df = word_freq_df.sort_values(['frequency'], ascending=False)
    return word_freq_df

# Read Data

In [9]:
unclean_train_data = pd.read_table('/Users/joashc/Downloads/mercari-price-suggestion-challenge/train.tsv')
unclean_train_data.shape

  """Entry point for launching an IPython kernel.


(1482535, 8)

In [10]:
unclean_train_data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


# Clean Data

### Removing Unwanted Columns and Duplicate Rows

In [11]:
unclean_train_data_v2 = unclean_train_data.drop(columns='train_id')

In [12]:
unclean_train_data_v3 = unclean_train_data_v2.drop_duplicates()
unclean_train_data_v3.shape

(1482486, 7)

### Dealing with Nulls

In [13]:
unclean_train_data_v3.isnull().sum()

name                      0
item_condition_id         0
category_name          6327
brand_name           632641
price                     0
shipping                  0
item_description          4
dtype: int64

#### Brand Name Nulls

In [15]:
print(round((unclean_train_data_v3.brand_name.isnull().sum()/unclean_train_data_v3.shape[0])*100, 2), 
      '% of the brand names',
      'in the dataset are null.')

42.67 % of the brand names in the dataset are null.


In [12]:
brand_value_counts = (pd.DataFrame(unclean_train_data_v3.brand_name.value_counts())
    .reset_index()
    .rename(columns={'index':'brand',
                    'brand_name':'count_rows'}))

print('Number of brands:', brand_value_counts.shape[0]-1)
brand_value_counts.head()

Number of brands: 4808


Unnamed: 0,brand,count_rows
0,PINK,54088
1,Nike,54043
2,Victoria's Secret,48035
3,LuLaRoe,31024
4,Apple,17322


Fill brand name nulls with "nobrandname"

In [13]:
unclean_train_data_v3['brand_name'] = unclean_train_data_v3['brand_name'].fillna("nobrandname")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
unclean_train_data_v3.isnull().sum()

name                    0
item_condition_id       0
category_name        6327
brand_name              0
price                   0
shipping                0
item_description        4
dtype: int64

#### Item Description Nulls

In [15]:
print('Approx', unclean_train_data_v3.item_description.isnull().sum()/unclean_train_data_v3.shape[0], 
      '% of the dataset contains item_description that are null.')
print()
print('I am not deleting these rows because if the test data contains nulls in the item_description,',
     'I want to impute those values.')

Approx 2.698170505488753e-06 % of the dataset contains item_description that are null.

I am not deleting these rows because if the test data contains nulls in the item_description, I want to impute those values.


In [16]:
print('Unique item descriptions:', unclean_train_data_v3.item_description.nunique())

Unique item descriptions: 1281426


In [17]:
pd.DataFrame(unclean_train_data_v3.item_description.value_counts()).head()

Unnamed: 0,item_description
No description yet,82489
New,4099
Brand new,3058
Good condition,1274
Great condition,1158


Fill null item descriptions with "No description yet" as there are 82489 items that currently do not have any descriptions. This is ~6% of the dataset.

In [18]:
unclean_train_data_v3['item_description'] = unclean_train_data_v3['item_description'].fillna('No description yet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
unclean_train_data_v3.isnull().sum()

name                    0
item_condition_id       0
category_name        6327
brand_name              0
price                   0
shipping                0
item_description        0
dtype: int64

For imputing the rest of the nulls for category_name, I want to use the item_description and brand_name columns. I first have to clean these columns to deal with the rest of the nulls.

### Initial Text Cleaning
- item_description
    - Make everything lower case
    - Tokenize
    - Stem
    - Take out non-alphabetical characters
    - Removal of stopwords
- brand_name
    - Make everything lower case
    - Tokenize
    - Take out spaces
    - Take out special characters
- item_name
    - Same steps as item description

In [38]:
%%time
unclean_train_data_v3['stemmed_item_description'] = unclean_train_data_v3['item_description'].apply(
    lambda x: stem_str_item(str(x)))

CPU times: user 13min 29s, sys: 4.17 s, total: 13min 34s
Wall time: 13min 57s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [62]:
# replace nulls after cleaning with default talenized value
unclean_train_data_v3['stemmed_item_description'] = np.where(
    unclean_train_data_v3['stemmed_item_description'].isnull()==True,
'descript yet', unclean_train_data_v3['stemmed_item_description'])

In [14]:
%%time
unclean_train_data_v3['clean_brand_name'] = unclean_train_data_v3['brand_name'].apply(
    lambda x: clean_str_brand_name(str(x)))

CPU times: user 2.82 s, sys: 47.8 ms, total: 2.87 s
Wall time: 2.88 s


In [54]:
unclean_train_data_v3.isnull().sum()

name                           0
item_condition_id              0
category_name               5899
brand_name                     0
price                          0
shipping                       0
item_description               0
stemmed_item_description    3320
clean_brand_name               0
dtype: int64

### Clean category_name Column

In [67]:
# replace / with a space
unclean_train_data_v3['category_name'] = unclean_train_data_v3['category_name'].str.replace('/',' ')

In [71]:
%%time
unclean_train_data_v3['clean_category_name'] = unclean_train_data_v3['category_name'].apply(
    lambda x: stem_str_item(str(x)))

CPU times: user 3min 48s, sys: 1.28 s, total: 3min 49s
Wall time: 3min 52s


In [74]:
unclean_train_data_v3['clean_category_name'].isnull().sum()

0

In [66]:
unclean_train_data_v3['clean_category_name'] = unclean_train_data_v3['clean_category_name'].fillna('unknown')

### Clean name column

In [52]:
# replace / with a space
unclean_train_data_v3['name'] = unclean_train_data_v3['name'].str.replace('/',' ')

In [53]:
%%time
unclean_train_data_v3['clean_item_name'] = unclean_train_data_v3['name'].apply(
    lambda x: stem_str_item(str(x)))

CPU times: user 3min 39s, sys: 1.17 s, total: 3min 40s
Wall time: 3min 42s


In [68]:
unclean_train_data_v3['clean_item_name'].isnull().sum()

0

In [67]:
unclean_train_data_v3['clean_item_name'] = unclean_train_data_v3['clean_item_name'].fillna('unknown')

In [62]:
# unclean_train_data_v3[unclean_train_data_v3['clean_item_name'].isnull()==True].name.value_counts()

### Assigning Catgeories and Sub Categories based on Mercari site

In [20]:
unclean_train_data_v3.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [22]:
mercari_category = pd.read_csv('~/mercari_price_suggestion/mercari_price_suggestion/mercari_category.csv')
print(mercari_category.shape)
mercari_category.head(10)

(161, 2)


Unnamed: 0,category,sub_category
0,Women,Dresses
1,Women,Tops & blouses
2,Women,Sweaters
3,Women,Jeans
4,Women,Pants
5,Women,Skirts
6,Women,Coats & jackets
7,Women,Suits & blazers
8,Women,Athletic apparel
9,Women,Swimwear


#### Assigning Categories

In [21]:
mercari_category_lst = ['Women', 'Toys', 'Vintage', 'Electronics', 'Home', 'Beauty',
      'Handmade', 'Sports',  'Men', 'Kids', 'Other']

In [22]:
def assign_category(row, cat_lst=mercari_category_lst):
    for cat in cat_lst:
        if row['category_name'].lower().find(cat.lower()) >=0:
            return cat

In [23]:
unclean_train_data_v3['assigned_category'] = unclean_train_data_v3.apply(assign_category, axis=1)
unclean_train_data_v3.shape

(1482486, 13)

In [24]:
unclean_train_data_v3.assigned_category.isnull().sum()

457

In [25]:
unclean_train_data_v3['assigned_category'] = unclean_train_data_v3['assigned_category'].fillna('Other')

#### Assigning Sub Categories

In [26]:
sub_category_dict = {}
for cat in mercari_category.category.unique():
    sub_category_dict[cat] = []

for index, row in mercari_category.iterrows():
    sub_category_dict[row['category']].append(row['sub_category'])    

In [27]:
def assign_sub_category(row, sub_cat_dict=sub_category_dict):
    sub_cat_lst = sub_category_dict[row['assigned_category']]
    for sub_cat in sub_cat_lst:
        if row['category_name'].lower().find(sub_cat.lower()) >=0:
            return sub_cat

In [28]:
%%time
unclean_train_data_v3['assigned_sub_category'] = unclean_train_data_v3.apply(assign_sub_category, axis=1)
print(unclean_train_data_v3.shape)

(1482486, 13)
CPU times: user 1min 27s, sys: 713 ms, total: 1min 28s
Wall time: 1min 29s


In [29]:
unclean_train_data_v3.assigned_sub_category.isnull().sum()

103216

Could work on better resolving the categories. But they are good enough for now. Will impute the nulls with "Other"

In [47]:
unclean_train_data_v3['assigned_sub_category'] = unclean_train_data_v3['assigned_sub_category'].fillna('Other')

In [57]:
unclean_train_data_v3.isnull().sum()

name                           0
item_condition_id              0
category_name                  0
brand_name                     0
price                          0
shipping                       0
item_description               0
stemmed_item_description       0
clean_brand_name               0
clean_category_name         7932
clean_item_name             4246
assigned_category              0
assigned_sub_category          0
dtype: int64

# Future Improvements

- Clean text data better before tokenizing words
    - eg Replace "/" or "-" with a space
    - Take out special characters
- Get extra context out of the description of the item, name or category (size, oz, etc)
- Re-order list in category and sub categories to assign the correct values
    - Sport category didnt get the sub categories well
- Acquire brand name from item name if the brand name is null
- Expand the original category column better