link: https://www.kaggle.com/kswamy15/mercari-using-pytorch

In [1]:
import torch
from torch.autograd import Variable
from torch import optim
from torch.optim import lr_scheduler
from torch import nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn.functional as F

In [2]:
import numpy as np
import pandas as pd
import time

In [3]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import math

In [4]:
# Load the Mercari train and test sets; both files are tab-separated.
train_df = pd.read_csv('data/mercari/train.tsv', sep='\t')
test_df = pd.read_csv('data/mercari/test.tsv', sep='\t')

In [5]:
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [6]:
train_df.shape

(1482535, 8)

In [7]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [8]:
# Per-column missing-value counts for each dataset, each followed by a divider line.
for heading, frame in (
    ('Train columns with null values:\n', train_df),
    ('Test/Validation columns with null values:\n', test_df),
):
    print(heading, frame.isnull().sum())
    print("-"*10)

# Summary statistics over every column, not only the numeric ones.
train_df.describe(include='all')

Train columns with null values:
 train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64
----------
Test/Validation columns with null values:
 test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64
----------


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
count,1482535.0,1482535,1482535.0,1476208,849853,1482535.0,1482535.0,1482531
unique,,1225273,,1287,4809,,,1281426
top,,Bundle,,"Women/Athletic Apparel/Pants, Tights, Leggings",PINK,,,No description yet
freq,,2232,,60177,54088,,,82489
mean,741267.0,,1.90738,,,26.73752,0.4472744,
std,427971.1,,0.9031586,,,38.58607,0.4972124,
min,0.0,,1.0,,,0.0,0.0,
25%,370633.5,,1.0,,,10.0,0.0,
50%,741267.0,,2.0,,,17.0,0.0,
75%,1111900.0,,3.0,,,29.0,1.0,


Calculate what fraction of the brand names is missing.

In [9]:
# Fraction of training rows with no brand name, computed from the data itself
# instead of from a count typed in by hand.
print(train_df['brand_name'].isnull().sum() / train_df.shape[0])

0.4267568725190299


Around 43% of the brand names are missing, hence we should probably not consider this column.

In [10]:
# Drop the brand column from both sets. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the column
# has already been removed from the rebound frames.
train_df = train_df.drop(['brand_name'], axis=1, errors='ignore')
test_df = test_df.drop(['brand_name'], axis=1, errors='ignore')

# categories

We need to check how many categories there are

In [35]:
# Stack the train rows on top of the test rows. The index is not reset here,
# so each frame keeps its own index labels in the combined result.
frames = [train_df, test_df]
combined_df = pd.concat(objs=frames, axis=0)

In [39]:
# Start from the raw category column of the combined train+test frame.
combined_cat_df = combined_df['category_name']
def split_cat(text):
    """Split a '/'-separated category path into a list of its levels.

    Returns None when `text` cannot be split that way (for example a
    missing value that is not a string).
    """
    try:
        return text.split("/")
    except (AttributeError, TypeError):
        # Only "this value is not a splittable string" is expected here;
        # anything else (including KeyboardInterrupt) must propagate.
        return None

# Turn every category path into its list of levels.
combined_cat_df = combined_cat_df.apply(split_cat)

In [54]:
def no_of_cats(cat_list):
    """Return the number of category levels in `cat_list`, or 0 if it has no length."""
    try:
        return len(cat_list)
    except TypeError:
        # len() raises TypeError for unsized values such as None or NaN;
        # other exceptions are not expected and are left to propagate.
        return 0
    
# Per-row level counts as a one-column frame. Note that this rebinds the name
# `no_of_cats` from the helper function to the resulting DataFrame.
no_of_cats = pd.DataFrame(combined_cat_df.apply(no_of_cats))

In [75]:
# Position (not index label) of the row with the most category levels. Going
# through the underlying array makes the result positional on every pandas
# version, which is what the .iloc lookups below require; combined_df was
# built with pd.concat without resetting the index, so labels can repeat.
index_whr_max_categories = int(no_of_cats['category_name'].values.argmax())
print(index_whr_max_categories)
max_num_of_categories = len(split_cat(combined_df['category_name'].iloc[index_whr_max_categories]))
print('there are a maximum of {} categories and this is happened in row:'.format(max_num_of_categories))
combined_df.iloc[[index_whr_max_categories]]

239
there are a maximum of 5 categories and this is happened in row:


Unnamed: 0,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
239,Electronics/Computers & Tablets/iPad/Tablet/eB...,1,Zag invisible shield for IPad air,Zagg invisible shield for IPad air,10.0,1,,239.0


In [None]:
def split_cat(text):
    """Split a '/'-separated category path into its levels.

    A splittable value yields a list of levels. A value that cannot be
    split (for example a missing category that is not a string) yields the
    placeholder tuple ("None", "None", "None").

    NOTE: this redefines the `split_cat` declared in an earlier cell, which
    returns None instead of the placeholder for unsplittable values.
    """
    try:
        return text.split("/")
    except (AttributeError, TypeError):
        # Only "this value is not a splittable string" is expected here;
        # anything else (including KeyboardInterrupt) must propagate.
        return ("None", "None", "None")

In [11]:
# Preview the rows that have at least one missing value; only the first few
# are shown so the cell does not render the whole filtered frame.
train_df[train_df.isnull().any(axis=1)].head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description
122,122,Bundle,3,,59.0,0,Sizes and info of clothes can be found on thei...
155,155,3 Nora Roberts Books,3,,10.0,1,For aferg16.
258,258,ACER Laptop,5,,14.0,0,"Powers on, no screen display, no external dama..."
984,984,AUTHENTIC BRWN MICHAEL KORS MAKEUP STAIN,4,,18.0,0,No description yet
1185,1185,Teenage Mutant Ninja Turtle Van/ Extras,3,,9.0,0,"Teenage Mutant Ninja Turtle Van, April O'Neil ..."
1274,1274,Black Ribbed Off-the-Shoulder Crop Top,2,,15.0,0,From the silence + noise brand sold at Urban O...
1320,1320,Nice Condition Pet Escort,2,,12.0,0,Nice condition I will wipe down before selling
1419,1419,3 Toy Bundle,1,,34.0,0,"* brand new/NIB * ** price firm, no free shipp..."
1491,1491,Baby Sign,3,,3.0,1,Magnetic baby sign
1521,1521,Nolan Ryan Patch Baseball Card,2,,3.0,1,No description yet


In [12]:
# Listings whose name is exactly one of the values below.
value_list = ['iPhone']
train_df.loc[train_df['name'].isin(value_list)]

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description
161166,161166,iPhone,1,Electronics/Cell Phones & Accessories/Cell Pho...,4.0,1,Luxury 360° Hybrid Acrylic Hard Case Original ...
331692,331692,iPhone,2,Electronics/Cell Phones & Accessories/Cell Pho...,76.0,0,iCloud lock!!! Screen protector no scratches! ...
502203,502203,iPhone,3,Electronics/Cell Phones & Accessories/Cell Pho...,26.0,1,No description yet
901734,901734,iPhone,1,Electronics/Cell Phones & Accessories/Cell Pho...,17.0,1,Apple iPhone headphones for iPhones compatible...
968525,968525,iPhone,2,Electronics/Cell Phones & Accessories/Cell Pho...,19.0,0,Two iPhone 4s they are locked so they are to b...
1198530,1198530,iPhone,2,Electronics/Cell Phones & Accessories/Cell Pho...,116.0,0,iPhone 5c no cracks brand new unlocked (no SIM...
1220558,1220558,iPhone,4,Electronics/Cell Phones & Accessories/Cell Pho...,16.0,1,"iPhone 4 Cracked front Back up for sale, had i..."
1385743,1385743,iPhone,4,Electronics/Cell Phones & Accessories/Cell Pho...,20.0,1,It works won't turn on need a battery and new ...
1473558,1473558,iPhone,3,Electronics/Cell Phones & Accessories/Cell Pho...,80.0,1,iPhone 6 model number A1586. Found never could...


For the missing category names, we should try an unsupervised learning approach so that at least some of the missing values can be filled in.

# Running NLP on the categories

We will first try to classify the documents and see if we can get some meaningful classification based on that.

Idea is to use only the name to predict the category name

So we will drop all the remaining columns

In [14]:
print(train_df.columns.tolist())

['train_id', 'name', 'item_condition_id', 'category_name', 'price', 'shipping', 'item_description']


In [15]:
from copy import deepcopy
# Work on a deep copy so the column drops applied to category_df for the
# category model do not alter train_df.
category_df = deepcopy(train_df)

In [17]:
# Keep only the text fields and the category label; the id, condition, price
# and shipping columns are dropped.
category_df = category_df.drop(
    labels=['train_id', 'item_condition_id', 'price', 'shipping'],
    axis='columns',
)

In [19]:
# Peek at two random rows; the fixed seed makes the displayed rows the same
# on every run.
category_df.sample(2, random_state=42)

Unnamed: 0,name,category_name,item_description
141410,Color pop,Beauty/Makeup/Lips,4 bundled color pop lip Matte gloss . Haven't ...
1141493,Bluetooth sunglasses✨✨✨ FOR KIMARI✨✨,Electronics/Cell Phones & Accessories/Headsets,"Brand new, with case and cord, also free shipp..."


In [31]:
# Rows with no category are set aside as the ones to predict; the labelled
# rows are divided 80/20 into train and held-out test sets (seeded split).
predict_category_df = category_df[category_df['category_name'].isnull()]
train_test_categry_df = category_df[category_df['category_name'].notnull()]
train_categry_df, test_categry_df = train_test_split(
    train_test_categry_df,
    test_size=0.2,
    random_state=42,
)
print('separated into predict, train and test')
print(category_df.shape, predict_category_df.shape, train_categry_df.shape, test_categry_df.shape)
# Sanity check: the three parts should add up to the full row count.
print(sum(part.shape[0] for part in (predict_category_df, train_categry_df, test_categry_df)))

separated into predict, train and test
(1482535, 3) (6327, 3) (1180966, 3) (295242, 3)
1482535


In [33]:
# Features are the two free-text columns; the target is the category path.
X_train_category_df = train_categry_df.loc[:, ['name', 'item_description']]
y_train_category_df = train_categry_df.loc[:, ['category_name']]
X_test_category_df = test_categry_df.loc[:, ['name', 'item_description']]
y_test_category_df = test_categry_df.loc[:, ['category_name']]
print('separate to x and y')
print(
    X_train_category_df.shape,
    y_train_category_df.shape,
    X_test_category_df.shape,
    y_test_category_df.shape,
)

separate to x and y
(1180966, 2) (1180966, 1) (295242, 2) (295242, 1)


Category names are hierarchical: parent -> subcategory -> sub-subcategory, etc. We need to find how many categories there are.

In [34]:
# Preview the training labels; only the first rows are shown rather than the
# whole frame.
y_train_category_df.head(10)

Unnamed: 0,category_name
943192,Kids/Boys 0-24 Mos/Shoes
397137,Women/Shoes/Athletic
697816,Kids/Toys/Hobbies
533679,Women/Women's Accessories/Scarves & Wraps
301180,Sports & Outdoors/Team Sports/Football
1109668,Electronics/Video Games & Consoles/Games
1045872,Home/Other/Other
1019048,Women/Dresses/Knee-Length
555202,Beauty/Tools & Accessories/Makeup Brushes & Tools
498191,Home/Kitchen & Dining/Coffee & Tea Accessories


In [22]:
# Feature/target views over the full category frame (unlabelled rows included).
X_category_df = category_df.loc[:, ['name', 'item_description']]
y_category_df = category_df.loc[:, ['category_name']]