In [1]:
import pandas
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
def print_cross_validation_accuracy(model, x, y, cross_val):
    scores = cross_val_score(model, x, y, cv = cross_val)
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [3]:
products_data = pandas.DataFrame(pandas.read_csv("products.csv", sep = ','))
products_data = products_data[products_data.groupby('label_category')['label_category'].transform('count').ge(8)]
products_label = products_data[['label_category']].values.ravel()
products_data_text=[]
for index, row in products_data.iterrows():
    products_data_text.append(                         
                          str(row['name']) +         
                          str(row['top_description']) + 
                          str(row['feature_description']))

In [4]:
X_text = TfidfVectorizer().fit_transform(products_data_text).toarray()
y_text = products_label
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size=0.33, random_state=1511849)

#knn_clf = KNeighborsClassifier(n_neighbors=7)

text_clf = Pipeline([('clf', LinearSVC())])
text_clf.fit(X_text_train, y_text_train)

print_cross_validation_accuracy(model = text_clf, x = X_text, y = y_text, cross_val = 5)

[0.81938326 0.85520362 0.87793427 0.87378641 0.80808081]
Accuracy: 0.85 (+/- 0.06)


In [102]:
label_to_pick = ['store','brand', 'base_category', 'location', 'price/0/origin']
# '__v'
products_data_tabular = products_data.copy()[label_to_pick]
print(products_data_tabular.head())

                 store       brand    base_category    location  \
0  Ho&#xE0;ng V&#x169;      Damtuh  Bách Hóa Online   Hàn Quốc    
2                  NaN      KOKOMI  Bách Hóa Online   Việt Nam    
3         Tiki Trading       Sasco  Bách Hóa Online   Việt Nam    
4         Tiki Trading      Micoem  Bách Hóa Online   Việt Nam    
5         Tiki Trading  Milaganics  Bách Hóa Online   Việt Nam    

   price/0/origin  
0        399000.0  
2         81000.0  
3        335000.0  
4        182000.0  
5        300000.0  


In [103]:
store_list = []
brand_list = []
base_category_list = []
location_list = []
for index, row in products_data_tabular.iterrows():
    current_store = str(row['store'])
    current_brand = str(row['brand'])
    current_base_category = str(row['base_category'])
    current_location = str(row['location'])    
    if current_store not in store_list:
        store_list.append(current_store)
    if current_brand not in brand_list:
        brand_list.append(current_brand)    
    if current_base_category not in base_category_list:
        base_category_list.append(current_base_category)   
    if current_location not in location_list:
        location_list.append(current_location)        
#     if math.isnan(row['price/0/current']):
#         products_data_tabular.loc[index, 'price/0/current'] = 0
    if math.isnan(row['price/0/origin']):
        products_data_tabular.loc[index, 'price/0/origin'] = 0         
    products_data_tabular.loc[index, 'store'] = store_list.index(current_store)
    products_data_tabular.loc[index, 'brand'] = brand_list.index(current_brand)
    products_data_tabular.loc[index, 'base_category'] = base_category_list.index(current_base_category)
    products_data_tabular.loc[index, 'location'] = location_list.index(current_location)

In [105]:
random_forest_clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=15118)
print_cross_validation_accuracy(model = random_forest_clf, x = products_data_tabular, y = products_label, cross_val = 10)



[0.49180328 0.61666667 0.63157895 0.70909091 0.72641509 0.63809524
 0.66336634 0.60606061 0.57291667 0.40217391]
Accuracy: 0.61 (+/- 0.19)
