In [142]:
import math
import numpy
import pandas
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [143]:
def print_cross_validation_accuracy(model, x, y, cross_val):
    scores = cross_val_score(model, x, y, cv = cross_val)
    print('Cross Validation Accuracy: ', scores)
    print('Average Cros-Val Accuracy: ','%0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

In [144]:
# Preprocess data, remove labels whose number of instance less than 8
products_data = pandas.DataFrame(pandas.read_csv("products.csv", sep = ','))
print(products_data.shape)
products_data = products_data[
                              products_data.groupby('label_category')['label_category']
                                           .transform('count')
                                           .ge(10)
                             ]
# Remove rows having unmeaningful value in 'store' col
for index, row in products_data.iterrows():
    if '#' in str(row['store']):
        products_data.drop(index, inplace=True)
        
products_label = products_data[['label_category']].values.ravel()

#---------------------------------------TEXT CLASSIFIER------------------------------------------------
# Text part consists of strings in 3 columns: name, top_desc, feature_desc
products_data_text = list()
for index, row in products_data.iterrows():
    products_data_text.append(                         
                          str(row['name']) +         
                          str(row['top_description']) + 
                          str(row['feature_description']))
print(products_data.shape)

(1118, 35)
(954, 35)


In [145]:
X_text = TfidfVectorizer().fit_transform(products_data_text).toarray()
y_text = products_label
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, 
                                                                        y_text, 
                                                                        test_size=0.33, 
                                                                        random_state=1511849)
text_clf = Pipeline([('clf', LinearSVC())])
text_clf.fit(X_text_train, y_text_train)

Pipeline(memory=None,
     steps=[('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [146]:
predict_text = text_clf.predict(X_text_test)
print('Accuracy Score: ',accuracy_score(predict_text, y_text_test))

Accuracy Score:  0.8444444444444444


In [147]:
print_cross_validation_accuracy(model = text_clf, x = X_text, y = y_text, cross_val = 5)

Cross Validation Accuracy:  [0.82352941 0.875      0.87958115 0.89502762 0.83146067]
Average Cros-Val Accuracy:  0.86 (+/- 0.06)


In [148]:
#---------------------------------------TABULAR CLASSIFIER------------------------------------------------
# Tabular part consists of cols in label_to_pick list
label_to_pick = ['store','brand', 'base_category', 'location', 'price/0/origin']
products_data_tabular = products_data.copy()[label_to_pick]

# Replace all NaN values
for index, row in products_data_tabular.iterrows():
    if math.isnan(row['price/0/origin']):
        products_data_tabular.loc[index, 'price/0/origin'] = 0 
print(products_data_tabular.shape)
print(products_data_tabular.head())

(954, 5)
          store       brand    base_category      location  price/0/origin
2           NaN      KOKOMI  Bách Hóa Online     Việt Nam          81000.0
3  Tiki Trading       Sasco  Bách Hóa Online     Việt Nam         335000.0
4  Tiki Trading      Micoem  Bách Hóa Online     Việt Nam         182000.0
5  Tiki Trading  Milaganics  Bách Hóa Online     Việt Nam         300000.0
6  Tiki Trading      Haribo  Bách Hóa Online   Thổ Nhĩ Kỳ          24000.0


In [149]:
# Separate to multi-columns with binary values
products_data_tabular_dummies = pandas.get_dummies(
    products_data_tabular[['store','brand', 'base_category', 'location']])
products_data_tabular_dummies['price/0/origin'] = products_data_tabular['price/0/origin']
print(products_data_tabular_dummies.shape)
#print(products_data_tabular_dummies.head())

(954, 662)


In [150]:
X_tabular = products_data_tabular_dummies
y_tabular = products_label
X_tabular_train, X_tabular_test, y_tabular_train, y_tabular_test = train_test_split(
                                                                        X_tabular, 
                                                                        y_tabular, 
                                                                        test_size=0.33, 
                                                                        random_state=1511849)
random_forest_clf = RandomForestClassifier(n_estimators=127, 
                                           criterion = 'gini', 
                                           oob_score=True, 
                                           max_features = 'log2',
                                           random_state=15118)
random_forest_clf.fit(X_tabular_train, y_tabular_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=127, n_jobs=1,
            oob_score=True, random_state=15118, verbose=0,
            warm_start=False)

In [151]:
predict_tabular = random_forest_clf.predict(X_tabular_test)
print('Accuracy Score: ',accuracy_score(predict_tabular, y_tabular_test))

Accuracy Score:  0.6730158730158731


In [152]:
print_cross_validation_accuracy(model = random_forest_clf, 
                                x = products_data_tabular_dummies, 
                                y = products_label, 
                                cross_val = 5)

Cross Validation Accuracy:  [0.65196078 0.735      0.65968586 0.71270718 0.56741573]
Average Cros-Val Accuracy:  0.67 (+/- 0.12)
