In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer


# Fetch the 'all' subset of 20 Newsgroups, restricted to the two topics that
# the rest of the notebook separates with a linear SVM.
topics = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=topics)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [30]:
from sklearn.model_selection import train_test_split
# Raw post texts and their class labels, taken straight from the fetched dataset.
X, y = newsgroups.data, newsgroups.target


In [33]:
# Learn the TF-IDF vocabulary and weights on the whole corpus and vectorize it.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(raw_documents=X)
# NOTE(review): X_test is produced from the very same documents as X_train, so
# it is not a held-out set; it only re-applies the fitted vectorizer to X.
X_test = vectorizer.transform(raw_documents=X)

In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths: powers of ten from 1e-5 up to 1e5.
grid = {'C': 10.0 ** np.arange(-5, 6)}
# Shuffled 5-fold split with a fixed seed so the fold assignment is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
# Score every C by cross-validated accuracy on the TF-IDF matrix.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X_train, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=241,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [39]:
# Parameter setting that achieved the best mean cross-validated score.
best_params = gs.best_params_
print(best_params)

{'C': 1.0}


In [45]:
# Mean cross-validated accuracy for each candidate C, in grid order.
mean_scores = gs.cv_results_['mean_test_score']
print(mean_scores)

[0.55263158 0.55263158 0.55263158 0.55263158 0.95016797 0.99328108
 0.99328108 0.99328108 0.99328108 0.99328108 0.99328108]


In [54]:
# Print the candidate parameter sets and their mean scores, in whatever order
# the two entries appear inside cv_results_.
labels = {'mean_test_score': 'mean_test_score:', 'params': 'params: '}
for key, value in gs.cv_results_.items():
    if key in labels:
        print(labels[key], value)

params:  [{'C': 1e-05}, {'C': 0.0001}, {'C': 0.001}, {'C': 0.01}, {'C': 0.1}, {'C': 1.0}, {'C': 10.0}, {'C': 100.0}, {'C': 1000.0}, {'C': 10000.0}, {'C': 100000.0}]
mean_test_score: [0.55263158 0.55263158 0.55263158 0.55263158 0.95016797 0.99328108
 0.99328108 0.99328108 0.99328108 0.99328108 0.99328108]


In [55]:
# Refit a linear SVM on the full TF-IDF matrix with the C selected by the grid
# search. Reading it from gs.best_params_ (instead of hard-coding the value that
# was printed above) keeps this cell correct if the grid, data or CV scheme changes.
clf = SVC(kernel='linear', random_state=241, **gs.best_params_)
clf.fit(X_train, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=241,
    shrinking=True, tol=0.001, verbose=False)

In [56]:
# Weight vector of the fitted linear SVM; the output below shows it is stored
# as a 1 x n_features sparse matrix.
coef = clf.coef_
coef

<1x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 18404 stored elements in Compressed Sparse Row format>

In [78]:
import scipy.sparse
# Turn the sparse 1 x n_features weight row into a single-column frame indexed
# by feature position, keeping only the magnitude of each weight.
coef_frame = pd.DataFrame.sparse.from_spmatrix(coef)
df = coef_frame.T.abs()

In [79]:
# Display the absolute coefficients (pandas abbreviates the long frame).
df

Unnamed: 0,0
0,0.292581
1,0.123148
2,0.000000
3,0.000000
4,0.000000
...,...
28377,0.009864
28378,0.009864
28379,0.019729
28380,0.058313


In [80]:
# Rank features by absolute weight, largest first, and keep the top ten.
ranked = df.sort_values(by=[0], ascending=False)
df10 = ranked.head(10)
df10

Unnamed: 0,0
24019,2.663165
12871,1.920379
5088,1.25469
5093,1.24918
17802,1.201611
23673,1.180132
21850,1.139081
5776,1.130612
15606,1.097094
22936,1.029307


In [81]:
# Feature positions of the ten largest-magnitude coefficients.
# NOTE(review): "indeces" is a misspelling of "indices"; the name is kept
# because a later cell reads it.
indeces = df10.index

In [82]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new accessor
# and fall back to the old one so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = vectorizer.get_feature_names_out()
else:
    feature_mapping = vectorizer.get_feature_names()

# Map each top-weight feature position back to its token, alphabetically.
# str() keeps the printed list as plain Python strings even when the mapping
# is a numpy array rather than a list.
words = sorted(str(feature_mapping[i]) for i in indeces)
print(words)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']


In [83]:
# Space-separated answer string built from the sorted top words.
result = ' '.join(str(word) for word in words)
result

'atheism atheists bible god keith moon religion sci sky space'