In [36]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [37]:
# Fetch the 'all' subset of 20 Newsgroups, restricted to the two categories
# that the classifier below has to separate.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [38]:
# Inputs (the fetched posts) and targets used for training below.
X, y = newsgroups.data, newsgroups.target

In [39]:
# Turn the raw posts into TF-IDF features.
# The vectorizer is fitted on newsgroups.data rather than on X itself: X is
# rebound to the transformed matrix here, so fitting on X would make this cell
# fail when it is run a second time (it would receive its own output as input).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)

In [41]:
# 5-fold splitter with shuffling and a fixed seed; passed to the grid search below.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=241,
)

In [46]:
# Candidate values of C: 10^-5, 10^-4, ..., 10^5 (11 values).
grid = {'C': np.power(10.0, np.arange(-5, 6))}

# Linear-kernel SVM whose C is tuned by cross-validated accuracy.
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X, y)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   33.9s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241), n_jobs=-1,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy', verbose=1)

In [47]:
# Display the parameter setting the grid search reports as best.
gs.best_params_

{'C': 1.0}

In [51]:
# Train the final linear SVM on all of the data.
# C is read from the grid-search result instead of being copied in by hand, so
# this cell stays correct if the search is re-run and selects a different value.
final_model = SVC(kernel='linear', random_state=241, C=gs.best_params_['C'])
final_model.fit(X, y)

SVC(kernel='linear', random_state=241)

Now we need to get the 10 words with the largest absolute weights in the trained model.

In [63]:
# Feature indices ordered by ascending absolute weight of the fitted model.
# coef_ is sparse here, so it is densified and flattened to 1-D before sorting.
coefs = np.argsort(np.abs(final_model.coef_.toarray().ravel()))

In [69]:
# Single-row frame of absolute weights, with its columns reordered so the
# weights in row 0 ascend from left to right.
result = (
    pd.DataFrame(final_model.coef_.todense())
    .abs()
    .sort_values(by=0, axis=1)
)

# Column labels of the 10 right-most (largest) entries; these labels are the
# original feature indices.
values = result.columns[-10:]

In [75]:
# Map the selected feature indices back to vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()
words = [features[item] for item in values]

In [77]:
# Sort the selected words; the bare name on the last line displays the result.
words_sorted = np.sort(np.asarray(words))
words_sorted

array(['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion',
       'sci', 'sky', 'space'], dtype='<U8')

In [82]:
# Write the sorted words to res.txt as a single space-separated line.
# The context manager closes the file even if the write raises, and str.join
# places separators only between words (no trailing space).
with open('res.txt', 'w') as f:
    f.write(' '.join(words_sorted))