В качестве набора данных используется [Twenty Newsgroups](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html). Данные будут загружены посредством sklearn.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [2]:
# Lift pandas' column limit so wide DataFrames are displayed in full.
pd.set_option("display.max_columns", None)

In [3]:
# Fetch the dataset with default arguments just to read the newsgroup names,
# then keep the first 10 of them as the categories used in this experiment.
data = fetch_20newsgroups()
categories = data.target_names[:10]

In [4]:
# Load the train and test splits with identical options: only the selected
# categories, with headers, footers and quoted replies removed from each post.
train, test = (
    fetch_20newsgroups(subset=split,
                       remove=('headers', 'footers', 'quotes'),
                       categories=categories)
    for split in ('train', 'test')
)

In [5]:
# Fit a TF-IDF vectorizer (default settings) on the training texts and
# transform them into the training feature matrix in a single step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(train.data)

In [6]:
# Grid-search the regularisation strength C of a linear-kernel SVM using
# GridSearchCV's default cross-validation settings on the training data.
parameters = {
    'kernel': ['linear'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
svc = SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters)
# Last expression of the cell: fitting returns the search object, which the
# notebook then displays.
clf.fit(vectors, train.target)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'kernel': ['linear']})

In [7]:
# Show the hyperparameter combination selected by the grid search.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [8]:
# Keep a handle on the estimator the search exposes as its best one.
best_estimator = clf.best_estimator_

In [9]:
# Vectorize the test texts with the already-fitted vectorizer (transform only,
# no refit), so the test matrix uses the vocabulary learned on the train split.
vectors_test = vectorizer.transform(test.data)

In [10]:
# Predict labels for the held-out test split and report the share of
# predictions that match the true labels.
pred = best_estimator.predict(vectors_test)
accuracy_score(y_true=test.target, y_pred=pred)

0.7263294422827496

In [11]:
# Inspect the learned linear weights; the output below shows a sparse
# 45 x 67469 matrix.
# NOTE(review): presumably one row per pair of the 10 classes (one-vs-one)
# and one column per vocabulary term - confirm against the SVC documentation.
best_estimator.coef_

<45x67469 sparse matrix of type '<class 'numpy.float64'>'
	with 594784 stored elements in Compressed Sparse Row format>

In [12]:
# Rank vocabulary terms by the magnitude of their learned weights.
# Densify the sparse coefficient matrix once and reuse it for both sorts.
abs_weights = np.absolute(best_estimator.coef_.toarray())
# Absolute weights sorted in ascending order within each row
W = np.sort(abs_weights)
# Vocabulary indices ordered by ascending absolute weight, row by row
indices = np.argsort(abs_weights)
# Keep the 10 largest-magnitude indices of row 0 only.
# NOTE(review): row 0 presumably corresponds to the first pair of classes in
# the one-vs-one scheme - confirm against the SVC documentation.
indices = np.array(indices[0, -10:])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
# The names are looked up once instead of once per index.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
[feature_names[i] for i in indices]

['3d',
 'atheism',
 'deletion',
 'that',
 'people',
 'who',
 'god',
 'image',
 'religion',
 'graphics']