In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV

In [3]:
def save(name, data):
    """Echo an answer to stdout and write it to a file.

    Parameters
    ----------
    name : str
        Path of the file to create or overwrite; also shown in the log line.
    data : str
        Text written to the file verbatim (no trailing newline is added).
    """
    # A single pre-formatted argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, and yields the same
    # "save <name> : <data>" line in either case.
    print("save %s : %s" % (name, data))
    with open(name, 'w') as f:
        f.write(data)
        
def data_target(csv):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    csv : pandas.DataFrame
        Frame whose first column (by position) is the target and whose
        remaining columns are the features.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is a DataFrame with every
        column except the first and ``target`` is the first column as a
        Series.
    """
    # ``.ix`` is deprecated and absent from current pandas releases. The split
    # here is positional, so ``.iloc`` states the intent directly; for a frame
    # with default integer column labels the result is the same.
    return csv.iloc[:, 1:], csv.iloc[:, 0]

# Seed passed as `random_state` to the KFold splitter and the SVC estimators
# in later cells.
RND = 241

In [4]:
# The CSV has no header row, so pandas assigns integer column labels.
data = pd.read_csv('svm-data.csv', header=None)
# Column 0 becomes the target `y`; the remaining columns become `X`.
X, y = data_target(data)
# Bare last expression: display the loaded frame as the cell output.
data

Unnamed: 0,0,1,2
0,0,0.7,0.29
1,1,0.23,0.55
2,0,0.72,0.42
3,0,0.98,0.68
4,0,0.48,0.39
5,1,0.34,0.73
6,0,0.44,0.06
7,1,0.4,0.74
8,0,0.18,0.18
9,1,0.53,0.53


In [5]:
# Linear-kernel SVM on the toy dataset. The seed comes from the shared RND
# constant, like every other estimator in this notebook, instead of repeating
# the literal.
sv = SVC(C=100000, random_state=RND, kernel='linear')
sv.fit(X, y)

# Each support-vector index is shifted by one before saving, so the answer
# uses 1-based object numbers, written in ascending order.
save("1", " ".join(str(i + 1) for i in sorted(sv.support_)))

save 1 : 4 5 10


In [6]:
# Only these two newsgroups are requested; subset='all' is passed so the
# train and test portions are both returned.
categories = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=categories)

In [7]:
# NOTE: X and y are rebound here; from this cell on they refer to the
# newsgroups documents and their labels, not to the svm-data.csv frame.
X = newsgroups.data
y = newsgroups.target

# Fit the TF-IDF vocabulary on the documents and transform them in one step.
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X)

In [15]:
# Candidate values for C: powers of ten from 1e-5 through 1e5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split over all samples, seeded with RND.
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=RND)

# Linear SVM whose C is chosen by accuracy over the folds above.
clf = SVC(kernel='linear', random_state=RND)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)

# Kept as the last expression so the fitted search object is displayed.
gs.fit(X_vect, y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=1786, n_folds=5, shuffle=True, random_state=241),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [16]:
# Scan the grid-search results for the highest mean validation score.
# The strict ">" keeps the first entry when several scores tie.
best_cv, best_C = 0, 0
for a in gs.grid_scores_:
    if a.mean_validation_score > best_cv:
        best_cv = a.mean_validation_score  # cross-validation score
        best_C = a.parameters['C']
    # A single pre-formatted argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print("%s %s" % (a.mean_validation_score, a.parameters))
print("Best: %s %s" % (best_C, best_cv))

0.552631578947 {'C': 1.0000000000000001e-05}
0.552631578947 {'C': 0.0001}
0.552631578947 {'C': 0.001}
0.552631578947 {'C': 0.01}
0.950167973124 {'C': 0.10000000000000001}
0.993281075028 {'C': 1.0}
0.993281075028 {'C': 10.0}
0.993281075028 {'C': 100.0}
0.993281075028 {'C': 1000.0}
0.993281075028 {'C': 10000.0}
0.993281075028 {'C': 100000.0}
Best: 1.0 0.993281075028


In [17]:
# Train a linear SVM on the full TF-IDF matrix using the selected C.
best_sv = SVC(kernel='linear', C=best_C, random_state=RND)
# Last expression: the fitted estimator is shown as the cell output.
best_sv.fit(X_vect, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [68]:
# Vocabulary terms, addressed by the same column indices as the weights.
feature_names = vectorizer.get_feature_names()

# Flatten the sparse weight matrix to a dense 1-D array of magnitudes.
abs_weights = np.abs(best_sv.coef_.toarray().ravel())

# argsort is ascending, so the last ten indices have the largest magnitudes.
top10 = np.argsort(abs_weights)[-10:]

# Save the ten words alphabetically, separated by single spaces.
save("2", " ".join(sorted(feature_names[i] for i in top10)))

save 2 : atheism atheists bible god keith moon religion sci sky space
