In [122]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [123]:
# Pull every available post (subset='all') for the two topics being contrasted.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [124]:
# TF-IDF vectorizer with all-default settings; it is fitted on the corpus in the next cell.
vectorizer = TfidfVectorizer()

In [125]:
# Fit the vectorizer on every document and build the document-term matrix in one pass.
# The trailing bare `X` displays the matrix summary as the cell output.
X = vectorizer.fit_transform(newsgroups.data)
X

<1786x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [126]:
# Class labels for the documents (0/1 for the two selected categories).
# NOTE(review): presumably 0 = alt.atheism and 1 = sci.space — confirm via newsgroups.target_names.
y = newsgroups.target
y

array([0, 0, 1, ..., 1, 1, 0])

In [127]:
# `vectorizer` and `X` were already fitted on this same corpus in the cells
# above, so the TF-IDF pass is not repeated; only the learned vocabulary is
# extracted. feature_names[j] is the token behind column j of X.
feature_names = vectorizer.get_feature_names_out()

In [128]:
# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=241,
)
cv

KFold(n_splits=5, random_state=241, shuffle=True)

In [131]:
# Linear-kernel SVM used as the base estimator; C is not set here because
# the grid search below sweeps it.
clf = svm.SVC(
    kernel='linear',
    random_state=241,
)
clf

SVC(kernel='linear', random_state=241)

In [132]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Score each C by cross-validated accuracy on the folds defined by `cv`.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy')

In [133]:
# GridSearchCV refits the best configuration on the whole of X, y by default
# (refit=True), so that fitted model is reused here rather than training an
# identical SVC a second time.
clf = gs.best_estimator_
clf

SVC(kernel='linear', random_state=241)

In [134]:
# Absolute weight of every feature in the fitted linear model.
# coef_ is sparse here because the model was trained on a sparse X; it is
# converted to a plain 1-D ndarray with toarray().ravel() instead of going
# through the discouraged np.matrix type (.todense().A1).
coefs = np.abs(clf.coef_.toarray().ravel())
coefs

array([0.29258057, 0.12314757, 0.        , ..., 0.01972862, 0.05831336,
       0.00297347])

In [135]:
# argsort is ascending, so the last ten positions are the ten largest |weights|.
ind_t10 = coefs.argsort()[-10:]
ind_t10

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871,
       24019])

In [138]:
# Map the selected column indices back to their tokens, in alphabetical order.
words = sorted(feature_names[i] for i in ind_t10)

In [137]:
# Save the answer as one comma-separated line. The context manager closes
# the file even if the write raises.
with open('q1.txt', 'w') as f:
    f.write(','.join(words))