# Возьмем готовую разбивку номенклатур по категориям

In [1]:
import pandas
import numpy as np
import codecs
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import cross_validation, grid_search, linear_model, metrics

# Load the item-name / category table; column labels are supplied explicitly
# via `names=`.
colnames = ['name', 'id', 'class']
data = pandas.read_csv('nameCatIdCatName.csv', names=colnames, encoding='utf8')
# dropna() returns a new frame instead of modifying `data` in place, so the
# result must be assigned back for rows with missing values to be removed.
data = data.dropna()
# Plain Python lists: the 'name' column is the model input, 'class' the label.
names = data['name'].tolist()
classes = data['class'].tolist()



# Найдем параметры регрессии

In [16]:
# Turn the raw item names into a TF-IDF matrix; fitting the vocabulary and
# transforming the texts are written as two explicit steps on the same data.
vectorizer = TfidfVectorizer()
vectorizer.fit(names)
corpus = vectorizer.transform(names)

# LogisticRegressionCV selects its regularisation strength by cross-validation
# while fitting; n_jobs=-1 uses all available cores.
clf = LogisticRegressionCV(max_iter=100, refit=True, n_jobs=-1)
clf.fit(corpus, classes)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

# Построим по ним модель и проверим

In [2]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split reproducible between runs.
split = cross_validation.train_test_split(
    names, classes, test_size=0.3, random_state=0)
train_data, test_data, train_labels, test_labels = split

In [4]:
# Text classifier: TF-IDF features feeding a class-balanced, one-vs-rest
# logistic regression.
clf = LogisticRegression(
    penalty='l2', C=10.0, class_weight='balanced', dual=False,
    fit_intercept=True, intercept_scaling=1.0,
    solver='lbfgs', multi_class='ovr', max_iter=100, tol=0.0001,
    random_state=None, n_jobs=-1)
steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', clf),
]
log = Pipeline(steps)
log.fit(train_data, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_id...enalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Same TF-IDF front end, but with a decision tree as the final estimator.
# NOTE: this rebinds `clf` and `log`, so any later use of `log` refers to the
# decision-tree pipeline rather than the logistic-regression one.
clf = DecisionTreeClassifier(min_samples_leaf=4, max_depth=2048)
steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', clf),
]
log = Pipeline(steps)
log.fit(train_data, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_id...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [24]:
# Evaluate whichever pipeline `log` currently refers to on the held-out split.
score = log.score(X=test_data, y=test_labels)
print(score)

0.742105263158


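# Полученная точность 93% (примечание: показанный выше вывод 0.742 относится к дереву решений из ячейки In [23]; запуск, давший 93%, здесь не приведён)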

In [None]:
# Hyper-parameter grid for the 'classifier' step of the `log` pipeline; the
# `classifier__` prefix routes each entry to that step.
log_params = {
    'classifier__solver': ['newton-cg', 'sag', 'lbfgs'],
    'classifier__max_iter': [100, 200],
    'classifier__C': [0.001, 0.1, 1, 10, 100],
    'classifier__class_weight': ['balanced', None],
    'classifier__multi_class': ['ovr', 'multinomial'],
}
# NOTE(review): these are logistic-regression options, so `log` has to be the
# LogisticRegression pipeline when this cell runs - confirm it has not been
# rebound to another estimator beforehand.
cv = cross_validation.StratifiedShuffleSplit(
    train_labels, n_iter=10, test_size=0.2, random_state=0)
grid_cv = grid_search.GridSearchCV(log, log_params, scoring='accuracy', cv=cv)
grid_cv.fit(train_data, train_labels)
# print() calls rather than print statements: consistent with the other cells
# and valid on both Python 2 and Python 3.
print(grid_cv.best_score_)
print(grid_cv.best_params_)
# Value noted by the author, presumably best_score_ from an earlier run: 0.854

# Раз уж мы тут все собрались, то посмотрим, что все это значит для наших номенклатур

In [31]:
# Read the real item names; the `with` block guarantees the file handle is
# closed once the lines are loaded (it was previously left open).
with codecs.open("nom3utf.csv", 'r', 'utf8') as content:
    # Strip every backslash from each line; line endings are kept as read.
    lines = [x.replace("\\", "") for x in content.readlines()]
# Draw 5 lines at random (np.random.choice samples with replacement by default).
real_test = np.random.choice(np.array(lines), size=5)

# Логистическая регрессия:

In [34]:
# For every sampled item, print the item followed by each (probability, class)
# pair whose predicted probability exceeds 0.3, then a blank separator.
predicts = log.predict_proba(real_test)
for item, class_probs in zip(real_test, predicts):
    print(item)
    likely = [(prob, label)
              for prob, label in zip(class_probs, log.classes_)
              if prob > 0.3]
    for prob, label in likely:
        print(prob)
        print(label)
    print('\n')

Серетид пор. д/инг. 25мкг+50мкг/доза 120доз 

0.941164512651
ЛЕКАРСТВА ПО РЕЦЕПТАМ


Но-шпа таб. 40мг №100

0.889259218141
СРЕДСТВА ОТ БОЛИ


SCHOLL GELACTIV WORK СТЕЛЬКИ Д/АКТИВ РАБОТЫ Д/МУЖ Рекитт Бенкизер Хелскэр (Великобритания) Лимитед

0.811672990605
УХОД ЗА БОЛЬНЫМИ


SENI ПЕЛЕНКА SOFT BASIC 90X60 N10

0.48853554977
УХОД ЗА БОЛЬНЫМИ


Флюанксол табл. п.о. 1мг конт. N50 Lundbeck

0.822047732727
ЛЕКАРСТВА ПО РЕЦЕПТАМ




# Ну, не так уж и плохо