In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import heapq

In [3]:
all_categories = fetch_20newsgroups().target_names
all_categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

Возьмём темы из одного раздела, возможно, их будет сложнее отличать друг от друга

In [4]:
# Three newsgroups from the same "sci.*" section; the same selection is used for both splits.
categories = ['sci.electronics', 'sci.space', 'sci.med']

# Fetch the train and test subsets with identical settings. `remove=...` asks the
# loader to drop headers, footers and quoted replies from each post.
train_data, test_data = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )
    for subset in ('train', 'test')
)

Для векторизации текстов воспользуемся CountVectorizer, он представляет документ как мешок слов. Можно всячески варьировать извлечение признаков (убирать редкие слова, убирать частые слова, убирать слова общей лексики, брать биграммы и т.д.)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
count_vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2)) 

In [8]:
sparse_feature_matrix = count_vectorizer.fit_transform(train_data.data)
sparse_feature_matrix

<1778x10885 sparse matrix of type '<class 'numpy.int64'>'
	with 216486 stored elements in Compressed Sparse Row format>

In [9]:
# Reverse lookup for the fitted vocabulary: feature (column) index -> term.
num_2_words = dict(
    (index, term)
    for term, index in count_vectorizer.vocabulary_.items()
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

Обучим логистическую регрессию для предсказания темы документа

In [11]:
algo = LogisticRegression()
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Слова с наибольшим положительным весом являются характерными словами темы

In [12]:
# Print the 10 highest-weighted terms of every class, one comma-separated line per class.
#
# The rows of `coef_` are walked directly instead of being indexed by the class
# label: label-based indexing only works while the labels happen to be the
# integers 0..n_rows-1, which is an accident of this dataset's encoding.
W = algo.coef_.shape[1]  # number of features (columns of the weight matrix)
for class_weights in algo.coef_:
    topic_words = [
        num_2_words[w_num]
        for w_num in heapq.nlargest(10, range(W), key=lambda w: class_weights[w])
    ]
    print(',  '.join(topic_words))


circuit,  electronics,  power,  chips,  parts,  the number,  them,  used,  tv,  ve
msg,  medical,  my,  blood,  disease,  doctor,  health,  treatment,  your,  needles
space,  orbit,  nasa,  thanks for,  launch,  earth,  sorry,  moon,  spacecraft,  solar


Сравним качество на фолдах с качеством на трейне и на отложенном тесте

In [13]:
# 5-fold accuracy of a default logistic regression on the already-vectorized
# training matrix (the vectorizer itself was fitted once on all training documents).
algo = LogisticRegression()
arr = cross_val_score(
    algo,
    sparse_feature_matrix,
    train_data.target,
    scoring='accuracy',
    cv=5,
)
# Per-fold scores on the first line, their mean on the second.
print(arr, np.mean(arr), sep='\n')

[0.8487395  0.84550562 0.83426966 0.83943662 0.82768362]
0.8391270024469429


Почему это неправильная кроссвалидация?

In [14]:
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
accuracy_score(algo.predict(sparse_feature_matrix), train_data.target)

0.9803149606299213

In [16]:
accuracy_score(algo.predict(count_vectorizer.transform(test_data.data)), test_data.target)

0.7928994082840237

Мы видим переобучение: признаков (10885) намного больше, чем обучающих документов (1778), — это проклятие размерности

In [17]:
# L1-regularised logistic regression with strong regularisation (C=0.1).
# The solver is pinned to 'liblinear' - the solver shown in the recorded estimator
# repr - because penalty='l1' is only accepted by some solvers and the library
# default is not guaranteed to be one of them in every scikit-learn release.
algo = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
arr = cross_val_score(algo, sparse_feature_matrix, train_data.target, cv=5, scoring='accuracy')
print(arr)
print(np.mean(arr))

[0.72829132 0.74719101 0.73033708 0.74647887 0.71186441]
0.7328325372866697


In [18]:
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
accuracy_score(algo.predict(sparse_feature_matrix), train_data.target)

0.7935883014623172

In [20]:
accuracy_score(algo.predict(count_vectorizer.transform(test_data.data)), test_data.target)

0.6813186813186813

Добавление регуляризатора уменьшает разрыв между качеством на трейне и на тесте, но ухудшает само качество. Поиграйтесь дома с параметрами регуляризации, чтобы получить максимальное качество.

In [21]:
# Sweep the inverse regularisation strength C of an L1 logistic regression and
# record, for every value, the mean 5-fold CV accuracy plus the accuracy on the
# full training set and on the held-out test set.
C = [0.35, 0.5, 0.6, 0.9]
train_accuracy = []
test_accuracy = []
mean_CV_accuracy = []

# The fitted vectorizer does not change inside the loop, so the test documents
# are vectorized once instead of once per value of C.
test_feature_matrix = count_vectorizer.transform(test_data.data)

for c in C:
    # solver is pinned because penalty='l1' is not accepted by every solver;
    # 'liblinear' is the one shown in the recorded estimator repr.
    algo = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    arr = cross_val_score(algo, sparse_feature_matrix, train_data.target, cv=5, scoring='accuracy')
    mean_CV_accuracy.append(np.mean(arr))
    algo.fit(sparse_feature_matrix, train_data.target)
    # accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the numbers are unchanged.
    train_accuracy.append(accuracy_score(train_data.target, algo.predict(sparse_feature_matrix)))
    test_accuracy.append(accuracy_score(test_data.target, algo.predict(test_feature_matrix)))

print('\t\t C:', C)
print('Mean CV accuracy:', mean_CV_accuracy)
print('Train accuracy:\t', train_accuracy)
print('Test accuracy:\t', test_accuracy)

		 C: [0.35, 0.5, 0.6, 0.9]
Mean CV accuracy: [0.7902074603475387, 0.7980869430232577, 0.8003231184674148, 0.8065203106571136]
Train accuracy:	 [0.907199100112486, 0.9358830146231721, 0.9471316085489314, 0.9623172103487064]
Test accuracy:	 [0.7404902789518174, 0.7531699070160609, 0.7540152155536771, 0.7599323753169906]


Чтобы не делать векторизацию и обучение раздельно, есть удобный класс Pipeline. Он позволяет объединить в цепочку последовательность действий

In [22]:
from sklearn.pipeline import Pipeline

In [23]:
# Chain vectorization and classification into one estimator that is fitted on raw text.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(ngram_range=(1, 2), min_df=5)),
        ("algo", LogisticRegression()),
    ]
)

In [24]:
pipeline.fit(train_data.data, train_data.target)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [25]:
accuracy_score(pipeline.predict(train_data.data), train_data.target)

0.9803149606299213

In [26]:
accuracy_score(pipeline.predict(test_data.data), test_data.target)

0.7928994082840237

Значения такие же, как мы получали ранее, делая шаги раздельно.

In [27]:
from sklearn.pipeline import make_pipeline

При кроссвалидации нужно, чтобы CountVectorizer не обучался на тесте (иначе объекты становятся зависимыми). Pipeline позволяет это просто сделать.

In [28]:
# Cross-validate the whole pipeline on raw documents (5 folds), so vectorization
# is part of the estimator being evaluated rather than a step done beforehand.
pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 2), min_df=5),
    LogisticRegression(),
)
arr = cross_val_score(
    pipeline,
    train_data.data,
    train_data.target,
    scoring='accuracy',
    cv=5,
)
# Per-fold scores on the first line, their mean on the second.
print(arr, np.mean(arr), sep='\n')

[0.83753501 0.84550562 0.82303371 0.83943662 0.83050847]
0.835203886828576


In [29]:
# Same pipeline as the 5-fold run, evaluated with 3 folds instead.
vectorizer_step = CountVectorizer(min_df=5, ngram_range=(1, 2))
pipeline = make_pipeline(vectorizer_step, LogisticRegression())
arr = cross_val_score(pipeline, train_data.data, train_data.target, scoring='accuracy', cv=3)
for line in (arr, np.mean(arr)):
    print(line)

[0.80269815 0.81618887 0.79898649]
0.8059578338878507


В Pipeline можно добавлять новые шаги препроцессинга данных

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

Warning-и в данном случае — это нормально, не пугайтесь. Это будет исправлено в следующих версиях библиотеки sklearn

In [31]:
# Counts -> TF-IDF re-weighting -> logistic regression, cross-validated on raw text (5 folds).
pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 2), min_df=5),
    TfidfTransformer(),
    LogisticRegression(),
)
arr = cross_val_score(
    pipeline,
    train_data.data,
    train_data.target,
    scoring='accuracy',
    cv=5,
)
# Per-fold scores on the first line, their mean on the second.
print(arr, np.mean(arr), sep='\n')

[0.87114846 0.87078652 0.84831461 0.85633803 0.83898305]
0.8571141323991462


In [32]:
pipeline.fit(train_data.data, train_data.target)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
accuracy_score(pipeline.predict(train_data.data), train_data.target)

0.96962879640045

In [34]:
accuracy_score(pipeline.predict(test_data.data), test_data.target)

0.8241758241758241

Качество стало немного лучше

In [35]:
?TfidfTransformer

# Задание

1. Поиграйтесь с параметрами регуляризации, параметрами CountVectorizer и TfidfTransformer, чтобы получить максимальное качество. (нужно будет отправить на проверку, checker будет выложен позже)
2. Постройте список важных слов и словосочетаний для каждой темы (на основе значений коэффициентов). Это чисто по фану

In [37]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# Bag-of-words counts -> TF-IDF (L2-normalised rows) -> logistic regression.
# The grid below searches over penalty='l1' as well as 'l2', so the solver is
# pinned to 'liblinear' (the solver shown in the recorded estimator reprs):
# L1 is not accepted by every solver, and the library default may differ
# between scikit-learn releases.
logit_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('logit', LogisticRegression(solver='liblinear')),
])

# Search space, keyed as <step name>__<parameter>.
# 2 * 2 * 2 * 3 * 5 * 2 = 240 combinations, each fitted once per CV fold.
logit_pipe_params = {
    'vect__min_df': [3, 5],
    'vect__ngram_range': [(1, 3), (1, 4)],
    'vect__stop_words': [None, 'english'],
    'vect__max_features': [7000, 10000, 20000],
    'logit__C': np.logspace(0, 2, 5),
    'logit__penalty': ['l1', 'l2'],
}
def my_score(model, X=None, y=None):
    """Score a fitted model by how small its train/test accuracy gap is.

    The callable follows the ``scorer(estimator, X, y)`` calling convention
    that GridSearchCV uses for custom ``scoring`` callables; with the previous
    one-argument signature that call raised a TypeError.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` on raw documents.
    X, y : validation fold passed in by the search. They are accepted for
        signature compatibility only and are not used, because the gap is
        measured on the notebook-level ``train_data`` / ``test_data``.

    Returns
    -------
    float
        ``-abs(train_accuracy - test_accuracy)``. The gap is negated because
        the search keeps the candidate with the HIGHEST score, so a smaller
        gap must map to a larger value.
    """
    # NOTE(review): this consults test_data during model selection, so the test
    # score reported afterwards is optimistic; consider scoring on (X, y) instead.
    train_accuracy = accuracy_score(train_data.target, model.predict(train_data.data))
    test_accuracy = accuracy_score(test_data.target, model.predict(test_data.data))
    return -abs(train_accuracy - test_accuracy)

# Exhaustive search over `logit_pipe_params`, scored with the custom `my_score`
# callable on the folds produced by `skf`; n_jobs=-1 requests all available cores.
opt_params = GridSearchCV(
    estimator=logit_pipe,
    param_grid=logit_pipe_params,
    scoring=my_score,
    cv=skf,
    n_jobs=-1,
    return_train_score=True,
)
#arr = cross_val_score(logit_pipe, train_data.data, train_data.target, cv=5, scoring='accuracy', fit_params=logit_pipe_params)

In [None]:
%%time
opt_params.fit(train_data.data, train_data.target)

In [113]:
accuracy_score(opt_params.predict(train_data.data), train_data.target)

0.9808773903262092

Предыдущий результат:

0.9758155230596175

0.9713160854893138

In [114]:
accuracy_score(opt_params.predict(test_data.data), test_data.target)

0.8554522400676247

Предыдущий результат:

0.8588334742180896

0.8503803888419273

In [115]:
opt_params.best_params_

{'logit__C': 10.0,
 'logit__penalty': 'l2',
 'vect__max_features': 10000,
 'vect__min_df': 3,
 'vect__ngram_range': (1, 3),
 'vect__stop_words': 'english'}

По этим параметрам чекер выдал - 0.8647885525628333

In [116]:
?LogisticRegression

Vowpal Wabbit on GitHub: https://github.com/JohnLangford/vowpal_wabbit

Vowpal Wabbit Tutorial: https://github.com/JohnLangford/vowpal_wabbit/wiki/Tutorial

In [None]:
from vowpalwabbit.sklearn_vw import VWClassifier

In [None]:
# Same counts -> TF-IDF front end as before, with Vowpal Wabbit's sklearn wrapper as the classifier.
# NOTE: the markdown note after this cell reports that this does not work for this 3-class target.
pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 2), min_df=5),
    TfidfTransformer(),
    VWClassifier(),
)
arr = cross_val_score(
    pipeline,
    train_data.data,
    train_data.target,
    scoring='accuracy',
    cv=5,
)
print(arr, np.mean(arr), sep='\n')

не работает :( VWClassifier только для бинарной классификации

In [None]:
import re


def _write_vw_file(path, texts, targets):
    """Write one example per line to `path` as ``<label> | <tokens>``.

    Parameters
    ----------
    path : file to create (an existing file is overwritten).
    texts : iterable of raw document strings.
    targets : iterable of integer class indices, parallel to `texts`.

    The label written is ``target + 1`` (presumably because vw's multiclass
    modes expect labels starting at 1 - confirm against the vw documentation).
    Tokens are the lower-cased runs of word characters found in the text,
    joined by single spaces.
    """
    # Explicit UTF-8 so that non-ASCII word characters are written the same way
    # regardless of the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for text, target in zip(texts, targets):
            # Raw string: '\w' in a normal string literal is an invalid escape sequence.
            tokens = re.findall(r'\w+', text.lower())
            f.write('{} | {}\n'.format(target + 1, ' '.join(tokens)))


_write_vw_file('train', train_data.data, train_data.target)
_write_vw_file('test', test_data.data, test_data.target)

In [None]:
!rm train.cache
!vw -d train  -c --passes 10 -f vw.model --oaa 3

In [None]:
!vw -i vw.model -t test -p test.out

In [None]:
# Accuracy of the vw predictions: the first token of each line in `test` is compared,
# as an integer, with the value on the same line of `test.out`.
with open('test', 'r') as f_features, open('test.out', 'r') as f_predictions:
    outcomes = [
        int(line_features.split()[0]) == int(line_predictions)
        for line_features, line_predictions in zip(f_features, f_predictions)
    ]
count = len(outcomes)
hits = sum(outcomes)

1. * hits / count

In [None]:
!rm train.cache
!vw -d train  -c --passes 10 -f vw.model --ect 3 --quiet
!vw -i vw.model -t test -p test.out --quiet

count = 0
hits = 0
with open('test', 'r') as f_features, open('test.out', 'r') as f_predictions:
    for line_features, line_predictions in zip(f_features, f_predictions):
        count += 1
        hits += int(line_features.split()[0]) == int(line_predictions)
        
1. * hits / count

In [None]:
!rm train.cache
!vw -d train  -c --passes 10 -f vw.model --csoaa 3 --quiet
!vw -i vw.model -t test -p test.out --quiet

count = 0
hits = 0
with open('test', 'r') as f_features, open('test.out', 'r') as f_predictions:
    for line_features, line_predictions in zip(f_features, f_predictions):
        count += 1
        hits += int(line_features.split()[0]) == int(line_predictions)
        
1. * hits / count