In [4]:
! pip install stop_words

Collecting stop_words
  Downloading stop-words-2015.2.23.1.tar.gz
Building wheels for collected packages: stop-words
  Running setup.py bdist_wheel for stop-words ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/22/74/80/77275c2f9f2f1d9841b51e169a38985640a10fbd2711d10791
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2015.2.23.1


In [18]:
# Import libraries
import numpy as np
import pandas as pd
import sklearn.metrics
from autosklearn.pipeline.components.classification.k_nearest_neighbors import KNearestNeighborsClassifier
import sklearn.model_selection
from stop_words import get_stop_words
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.externals import joblib

In [19]:
# Load Dataset
df = pd.read_csv('../data/external/dataset.csv')

In [20]:
# Split data set into train and test
X = df[['name', 'description']]
y = df.category
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)
X_train.shape
X_test.shape

(495, 2)

In [21]:
# Preprocess text data 
X_train.loc[:,"name"] = X_train.name.apply(lambda x : str.lower(x))
X_train.loc[:,"description"] = X_train.description.apply(lambda x : str.lower(x))
X_train['description'] = X_train['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (get_stop_words('portuguese'))]))
X_train.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(1485, 2)

In [22]:
# Encode train categorical variables as numeric values
labels = LabelEncoder()
y_train_label = labels.fit_transform(y_train)
y_train_label.shape

(1485,)

In [23]:
# Tokenize train text
count_vect = CountVectorizer(lowercase=False)
X_train_counts = count_vect.fit_transform(X_train.name, X_train.description)
X_train_counts.shape

(1485, 1760)

In [24]:
# From occurrences to frequencies
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1485, 1760)

In [25]:
# Training classifier
clf = KNearestNeighborsClassifier(len(y.unique()), 'distance', 1)
clf.fit(X_train_tfidf, y_train_label)

<autosklearn.pipeline.components.classification.k_nearest_neighbors.KNearestNeighborsClassifier at 0x7f8b61303898>

In [26]:
# Create product dataframe
product = {"name": "Blusão Bichinho Chic Capuz Marrom","description": "Blusão Blusão Bichinho Chic Capuz Marrom Capuz, é uma Linda peça feito em soft brush, super macio e quentinho. Possui cordão para regulagem no pescoço e lindo bordado exclusiva Bichinho Chic."}
df_product = pd.DataFrame([dict(product)])

In [27]:
# Tokenize product data
X_prod_counts = count_vect.transform(df_product)
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
description    1 non-null object
name           1 non-null object
dtypes: object(2)
memory usage: 96.0+ bytes


In [28]:
# From occurrences to frequencies
X_prod_tfidf = tfidf_transformer.transform(X_prod_counts)
X_prod_tfidf.shape

(2, 1760)

In [29]:
# Predict category from product name and description
y_hat = clf.predict(X_prod_tfidf)
y_hat

array([72, 72])

In [30]:
# Encode test categorical variables as numeric values
y_test_label = labels.fit_transform(y_test)
y_test_label.shape

(495,)

In [None]:
# Evaluation of the model's performance
X_test.loc[:,"name"] = X_test.name.apply(lambda x : str.lower(x))
X_test.loc[:,"description"] = X_test.description.apply(lambda x : str.lower(x))
X_test['description'] = X_test['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (get_stop_words('portuguese'))]))
X_pred_counts = count_vect.fit_transform(X_test.name, X_test.description)
X_pred_tfidf = tfidf_transformer.transform(X_pred_counts)
y_pred = clf.predict(X_pred_tfidf)
target_names = ['name', 'description']
print(classification_report(y_test_label, y_pred, target_names=target_names))

In [None]:
# Model persistence
joblib.dump(clf, '../models/'+'classifier.pkl') 