In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import datetime
import pprint
pp = pprint.PrettyPrinter(indent=2)
from bokeh.plotting import figure 
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, ranges, Range1d, LabelSet, Label, Whisker
output_notebook()
pd.__version__

'0.23.4'

In [2]:
df = pd.read_csv("Dataset_N.csv", sep=';')
df.dropna(inplace=True)
df.sample(5)

Unnamed: 0,V1,V2,V3,V4,V5
4423,28234542,USB MEMORY,VERBATIM STORE 'N' GO OTG MICRO DRIVE 32 GB US...,479,VERBATIM
863,23631295,USB MEMORY,"PINSTRIPE USB DRIVE 4GB STORE USB-STICK 2.0, ...",SPEICHERSTICKS,VERBATIM
2677,66611734,BICYCLES,1679308000 PROPHETE ATB 24 EINSTEIGER 6.1,SPORT&FITNESS_FAHRRAEDER_KINDER-&JUGENDRAEDER,PROPHETE
2560,29868925,WASHINGMACHINES,CWF14K21 PACKGR:5.00,STAND_STANDWAESCHEPFLEGE_STANDWASCHMASCHINE-FR...,CONSTRUCTA
1179,62236775,CONTACT LENSES,BIOFINITY TORIC 3ER BOX3 STÜCKUNISEX,LINSEN,COOPER


## Initial details

The dataset has 2 usable features in V3 and V4 and the labels for classification in V2. We can use different methods to achieve classification. Lets have a look to the category distribution first.

In [3]:
cnt_category = df['V2'].value_counts()
cnt_category.head()

CONTACT LENSES     1767
USB MEMORY         1722
WASHINGMACHINES    1596
BICYCLES           1570
Name: V2, dtype: int64

In [4]:
x = cnt_category.index.tolist()
y = cnt_category.values

source = ColumnDataSource(data=dict(x=x, y=y))

x_label = "Categories"
y_label = "Category Count"
title = "Category Counts"

plot = figure(
    plot_width=950, 
    plot_height=300,
    x_axis_label = x_label,
    y_axis_label = y_label,
    x_minor_ticks=2,
    x_range = source.data["x"],
    y_range = Range1d(0, 1000))

plot.title.text = title
plot.title.align = "center"
plot.title.text_color = "black"
plot.title.text_font_size = "15px"

labels = LabelSet(x='x', y='y', text='y', level='glyph',
        x_offset=-20, y_offset=5, source=source, render_mode='canvas')

plot.vbar(source=source,x='x',top='y',bottom=0, width=0.3, color="navy", alpha=0.5)
plot.toolbar.logo = None
plot.toolbar_location = None
plot.add_layout(labels)
show(plot)

## Data quality

The data has even number of categories which makes it perfect for machine learning. If the dataset did not have equal number than in the training data we should have made up for it. NaN rows are removed.

## Predicting product category

Input: Text

Fields: V3 + V4

Example: “GB DATATRAVELER 101 8GB DATATRAVELER 101 DATA...	HARD-/SOFTWARE - ARTGR HARD-/SOFTWARE”

Output: product category

(These are located in V2)

Example: USB MEMORY

In order to make predictions a bit more efficient we create a category_id that is a number and combine two columns into one (V4+V5).


In [5]:
df['Text'] = df.V3.str.cat(df.V4)
df.sample(2)

Unnamed: 0,V1,V2,V3,V4,V5,Text
867,42314255,BICYCLES,FUJI SPORTIF 2.3 TRIPLE LTD SCHWARZ 56 CM,1_5_1,FUJI,FUJI SPORTIF 2.3 TRIPLE LTD SCHWARZ 56 CM1_5_1
7715,42327967,BICYCLES,MERIDA SCULTURA 400 SE SCHWARZ/BLAU 52 CM,1_5_1,MERIDA,MERIDA SCULTURA 400 SE SCHWARZ/BLAU 52 CM1_5_1


In [6]:
df.drop(columns=['V1', 'V3', 'V4', 'V5'], inplace=True)
df.sample(2)

Unnamed: 0,V2,Text
3381,CONTACT LENSES,"ACUVUE OASYS 2-WOCHENLINSEN WEICH, 12 STUECK /..."
4527,WASHINGMACHINES,"WAK28227 WEISS WASCHMASCHINE, 7 KG FUELLMENGE..."


In [7]:
df.V2 = pd.Categorical(df.V2)
df['category_id'] = df.V2.cat.codes
df.sample(5)

Unnamed: 0,V2,Text,category_id
2368,CONTACT LENSES,FOCUS DAILIES TORIC TORISCHE TAGESLINSEN WEICH...,1
686,BICYCLES,BERGAMONT REVOX 3.0 SCHWARZ/BLAU/ORANGE 60 CM1...,0
6185,BICYCLES,1605446001 28 FITNESSBIKE VIKING TRIESTE DAME...,0
6677,WASHINGMACHINES,WAP 28480 MKGROSSGERSOLO_WAESCHEPFLEGE_WASCHVO...,3
358,WASHINGMACHINES,HGG WAM AEG L74484WVFL LAVAMAT WEIÃŸUNASSIGNED,3


In [8]:
category_dict = {
    0: 'BICYCLES',
    1: 'CONTACT LENSES',
    2: 'USB MEMORY',
    3: 'WASHINGMACHINES',
}

## Ready for modeling

Data is cleaned, we have a numerical category_id and the fields that we are going to use for classification input are combined.

### Text Representation

The classifiers and learning algorithms can not work on raw text documents. One common approach for extracting features from text is to use the bag of words model: a model where for each document, a product description in our case, the presence (and often the frequency) of words is used.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['category_id'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

## Testing our model

In [11]:
category_dict.get(clf.predict(count_vect.transform(["FAHRRAEDER>>KINDERFAHRRAEDER>>KINDERRAEDER20"]))[0])

'BICYCLES'

In [12]:
category_dict.get(clf.predict(count_vect.transform(["CRUZER GLIDE 8GB CRUZER GLIDE 8GB "]))[0])

'USB MEMORY'

In [13]:
category_dict.get(clf.predict(count_vect.transform(["SOFLENS TORIC6 STÜCKUNISEXLINSEN	"]))[0])

'CONTACT LENSES'

In [16]:
df.shape

(6655, 3)