In [112]:
import pandas as pd
import numpy as np


from sklearn.model_selection import GridSearchCV, KFold

from sklearn.svm import SVC

from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Load every post (train and test splits together) from the two newsgroups
# that form the binary classification task.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [7]:
# Raw post texts and their class labels, as provided by the dataset bunch.
train, y = newsgroups.data, newsgroups.target

In [113]:
# Turn the raw texts into a sparse TF-IDF document-term matrix.
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method
# only when running on a release that predates the new one.
# `.tolist()` keeps `features` a plain list of str, as the old method returned.
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out().tolist()
else:
    features = tfidf.get_feature_names()



In [114]:
# Vocabulary size: how many feature names the fitted vectorizer reported.
len(features)

28382

In [115]:
# Shape of the matrix returned by `tfidf.fit_transform(train)`.
X_train.shape

(1786, 28382)

In [116]:
# Candidate values for the SVM regularisation strength C:
# eleven powers of ten, from 1e-5 up to 1e5.
svm_grid = dict(C=10.0 ** np.arange(-5, 6))
# Five shuffled folds; the fixed seed keeps the split reproducible.
cv = KFold(
    n_splits=5,
    random_state=241,
    shuffle=True,
)

In [117]:
# Linear-kernel SVM whose C is tuned by grid search on cross-validated accuracy.
svm = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(
    estimator=svm,
    param_grid=svm_grid,
    scoring='accuracy',
    cv=cv,
    verbose=3,
)

In [118]:
# Run the grid search: every C candidate is evaluated on every CV fold.
gs.fit(X=X_train, y=y)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ...........................C=1e-05;, score=0.545 total time=   3.4s
[CV 2/5] END ...........................C=1e-05;, score=0.580 total time=   3.5s
[CV 3/5] END ...........................C=1e-05;, score=0.571 total time=   3.3s
[CV 4/5] END ...........................C=1e-05;, score=0.501 total time=   3.1s
[CV 5/5] END ...........................C=1e-05;, score=0.566 total time=   3.3s
[CV 1/5] END ..........................C=0.0001;, score=0.545 total time=   3.3s
[CV 2/5] END ..........................C=0.0001;, score=0.580 total time=   3.4s
[CV 3/5] END ..........................C=0.0001;, score=0.571 total time=   3.3s
[CV 4/5] END ..........................C=0.0001;, score=0.501 total time=   3.1s
[CV 5/5] END ..........................C=0.0001;, score=0.566 total time=   3.2s
[CV 1/5] END ...........................C=0.001;, score=0.545 total time=   3.3s
[CV 2/5] END ...........................C=0.001;

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy', verbose=3)

In [119]:
# Winning hyper-parameters together with their mean cross-validated accuracy.
(gs.best_params_, gs.best_score_)

({'C': 1.0}, 0.9932804406678872)

In [120]:
# Fresh classifier configured with the C value selected by the grid search.
best_svm = SVC(
    C=gs.best_params_['C'],
    kernel='linear',
    random_state=241,
)

In [156]:
%%time
best_svm.fit(X_train, y)

CPU times: user 1.87 s, sys: 23.5 ms, total: 1.89 s
Wall time: 1.94 s


SVC(kernel='linear', random_state=241)

In [157]:
# Rank vocabulary terms by the absolute value of their SVM coefficient,
# largest first. `coef_` is converted from sparse to a dense array and
# transposed so that each term becomes one row.
best_words = (
    pd.DataFrame(
        data=np.abs(best_svm.coef_.toarray().T),
        index=features,
        columns=['svm_coefs'],
    )
    .sort_values('svm_coefs', ascending=False)
)

In [158]:
# The ten terms with the largest absolute coefficients.
best_words.iloc[:10]

Unnamed: 0,svm_coefs
space,2.663165
god,1.920379
atheism,1.25469
atheists,1.24918
moon,1.201611
sky,1.180132
religion,1.139081
bible,1.130612
keith,1.097094
sci,1.029307


In [159]:
# Top-ten terms in alphabetical order.
names = sorted(best_words.head(10).index)
names

['atheism',
 'atheists',
 'bible',
 'god',
 'keith',
 'moon',
 'religion',
 'sci',
 'sky',
 'space']

In [161]:
# Save the selected words on one line, separated by single spaces.
# `' '.join` puts the separator only between words, so the file no longer
# ends with a stray trailing space. The encoding is set explicitly so the
# output does not depend on the platform default.
with open('2.answer-text-analysis.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(names))