In [64]:
import pandas
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [60]:
def print_cross_validation_accuracy(model, x, y, cross_val):
    scores = cross_val_score(model, x, y, cv = cross_val)
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [3]:
products_data = pandas.DataFrame(pandas.read_csv("products.csv", sep = ','))
products_data = products_data[products_data.groupby('label_category')['label_category'].transform('count').ge(8)]
products_label = products_data[['label_category']].values.ravel()
products_data_text=[]
for index, row in products_data.iterrows():
    products_data_text.append(                         
                          str(row['name']) +         
                          str(row['top_description']) + 
                          str(row['feature_description']))

In [4]:
X_text = TfidfVectorizer().fit_transform(products_data_text).toarray()
y_text = products_label
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size=0.33, random_state=1511849)

#knn_clf = KNeighborsClassifier(n_neighbors=7)

text_clf = Pipeline([('clf', LinearSVC())])
text_clf.fit(X_text_train, y_text_train)

print_cross_validation_accuracy(model = text_clf, x = X_text, y = y_text, cross_val = 5)

[0.81938326 0.85520362 0.87793427 0.87378641 0.80808081]
Accuracy: 0.85 (+/- 0.06)


In [65]:
products_data_tabular = products_data.copy()
products_data_tabular.drop(axis='columns', inplace=True, labels='name')
products_data_tabular.drop(axis='columns', inplace=True, labels='label_category')
products_data_tabular.drop(axis='columns', inplace=True, labels='url')
products_data_tabular.drop(axis='columns', inplace=True, labels='top_description')
products_data_tabular.drop(axis='columns', inplace=True, labels='feature_description')
products_data_tabular.drop(axis='columns', inplace=True, labels='_id')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/0/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/1/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/1/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/1/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/2/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/2/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/2/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/3/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/3/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/3/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/4/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/4/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/4/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/5/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/5/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/5/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/6/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/6/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/6/origin')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/7/date')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/7/current')
products_data_tabular.drop(axis='columns', inplace=True, labels='price/7/origin')
print(products_data_tabular.head())

                 store       brand    base_category  __v    location  \
0  Ho&#xE0;ng V&#x169;      Damtuh  Bách Hóa Online    4   Hàn Quốc    
2                  NaN      KOKOMI  Bách Hóa Online    6   Việt Nam    
3         Tiki Trading       Sasco  Bách Hóa Online    1   Việt Nam    
4         Tiki Trading      Micoem  Bách Hóa Online    4   Việt Nam    
5         Tiki Trading  Milaganics  Bách Hóa Online    6   Việt Nam    

   price/0/current  price/0/origin  
0         199000.0        399000.0  
2          70000.0         81000.0  
3         317000.0        335000.0  
4         174000.0        182000.0  
5         139000.0        300000.0  


In [66]:
store_list = []
brand_list = []
base_category_list = []
location_list = []
for index, row in products_data_tabular.iterrows():
    current_store = str(row['store'])
    current_brand = str(row['brand'])
    current_base_category = str(row['base_category'])
    current_location = str(row['location'])
    
    if current_store not in store_list:
        store_list.append(current_store)
    if current_brand not in brand_list:
        brand_list.append(current_brand)    
    if current_base_category not in base_category_list:
        base_category_list.append(current_base_category)   
    if current_location not in location_list:
        location_list.append(current_location) 
    if math.isnan(row['price/0/current']):
        products_data_tabular.loc[index, 'price/0/current'] = 0
    if math.isnan(row['price/0/current']):
        products_data_tabular.loc[index, 'price/0/origin'] = 0
    products_data_tabular.loc[index, 'store'] = store_list.index(current_store)
    products_data_tabular.loc[index, 'brand'] = brand_list.index(current_brand)
    products_data_tabular.loc[index, 'base_category'] = base_category_list.index(current_base_category)
    products_data_tabular.loc[index, 'location'] = location_list.index(current_location)

In [56]:
print(products_data_tabular)

      store  brand  base_category  __v  location  price/0/current  \
0         0      0              0    4         0         199000.0   
2         1      1              0    6         1          70000.0   
3         2      2              0    1         1         317000.0   
4         2      3              0    4         1         174000.0   
5         2      4              0    6         1         139000.0   
6         2      5              0    4         2          22000.0   
7         2      6              0    3         3          62000.0   
8         1      7              0    3         0         154000.0   
9         1      8              0    2         1          57000.0   
10        2      9              0    4         4          84000.0   
11        2     10              0    3         1         261000.0   
12        3     11              0    2         5         109000.0   
13        2     12              0    5         6         169000.0   
14        1     13              0 

In [58]:
random_forest_clf = RandomForestClassifier(n_estimators=550, oob_score=True, random_state=1511849)

In [67]:
print_cross_validation_accuracy(model = random_forest_clf, x = products_data_tabular, y = products_label, cross_val = 5)

[0.52422907 0.61085973 0.5915493  0.59223301 0.47979798]
Accuracy: 0.56 (+/- 0.10)
