In [2]:
import StringIO
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, accuracy_score, r2_score
from sklearn.svm import SVC
import re
from sklearn.model_selection import GridSearchCV
import numpy as np



In [3]:
data = pandas.read_csv('./data_seta/kontur_train.csv', index_col='id')
data = data.fillna('')
data.head()

Unnamed: 0_level_0,name,tare
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Котлеты МЛМ из говядины 335г,коробка
1,Победа Вкуса конфеты Мишки в лесу 250г(КФ ПОБЕ...,коробка
2,"ТВОРОГ (ЮНИМИЛК) ""ПРОСТОКВАШИНО"" ЗЕРНЕНЫЙ 130Г...",стаканчик
3,Сыр Плавленый Веселый Молочник с Грибами 190г ...,контейнер
4,Жевательный мармелад Маша и медведь буквы 100г,пакет без формы


In [4]:
tares = {}

for i, tare in enumerate(list(data['tare'].unique())):
    tares[tare] = i
tares

{'коробка': 0,
 'стаканчик': 1,
 'контейнер': 2,
 'пакет без формы': 3,
 'бутылка': 4,
 'лоток': 5,
 'вакуумная упаковка': 6,
 'без упаковки': 7,
 'туба': 8,
 'обертка': 9,
 'пакет прямоугольный': 10,
 'усадочная упаковка': 11,
 'банка металлическая': 12,
 'пачка': 13,
 'гофрокороб': 14,
 'упаковка с газовым наполнением': 15,
 'тортница': 16,
 'банка неметаллическая': 17,
 'ведро': 18,
 'колбасная оболочка': 19,
 'ячеистая упаковка': 20}

In [5]:
def filter_not_valuable_words(phrase):
    words = phrase.split(' ')
    valuable_words = []

    for word in words:
        word = re.sub('[^a-я%/]', ' ', word)
        if len(word) > 2:
            valuable_words.append(word)
        else:
            if word == 'г' or word == 'гр':
                valuable_words.append('грамм')
            if '%' in word:
                valuable_words.append('процент')
            if word == 'л':
                valuable_words.append('литр')
            if word == 'ву':
                valuable_words.append('вакуумная')
            if word == 'вс':
                pass
            if 'жб' in word:
                valuable_words.append('банкаметаллическая')
    return ' '.join(valuable_words)

In [6]:
y_data = data.replace({'tare': tares})['tare'].to_numpy()
X_data = data['name'].str.lower().replace('[0-9/]', '', regex=True).map(filter_not_valuable_words)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.33, random_state=42)


In [8]:
tdif = TfidfVectorizer()
X_train_1 = tdif.fit_transform(X_train)
X_train_1.data

array([0.44567024, 0.0977727 , 0.41914361, ..., 0.55635561, 0.58595115,
       0.28402534])

In [9]:
vectorizer = CountVectorizer()
X_train_2 = vectorizer.fit_transform(X_train)

In [10]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)


In [11]:
rfc1=RandomForestClassifier(random_state=42, n_estimators= 35)
rfc1.fit(X_train_1, y_train)

RandomForestClassifier(n_estimators=35, random_state=42)

In [12]:
rfc2=RandomForestClassifier(random_state=42, n_estimators= 35)
rfc2.fit(X_train_2, y_train)

RandomForestClassifier(n_estimators=35, random_state=42)

In [25]:
X_test_1 = tdif.transform(X_test)
X_test_2 = vectorizer.transform(X_test)

y_pred_forest_1 = rfc1.predict(X_test_1)
y_pred_forest_2 = rfc2.predict(X_test_2)


score_1 = accuracy_score(y_test, y_pred_forest_1)
score_2 = accuracy_score(y_test, y_pred_forest_2)
score_1
score_2

0.801923363649918

In [14]:
print(rfc1.feature_importances_)


[9.51335672e-06 3.62192428e-05 1.15972616e-05 ... 7.87861990e-06
 6.33543667e-07 3.04668308e-06]


In [23]:
from sklearn.tree import export_graphviz
from os import system
import pydotplus
from io import StringIO
from graphviz import Source
from IPython.display import Image

estimator_1 = rfc1.estimators_[1]
#
classnames = list(data['tare'].unique())

# # Export as dot file
dotfile = StringIO()

export_graphviz(estimator_1, out_file=dotfile)
graph=pydotplus.graph_from_dot_data(dotfile.getvalue())
graph.write_png("dtree.png")
graph.write_pdf("dtree.pdf")

True