# Опорные объекты

In [14]:
import pandas as pd
from sklearn.svm import SVC

In [8]:
# Read the SVM sample; header=None because the file has no header row,
# so the columns are numbered rather than named.
data = pd.read_csv(filepath_or_buffer='data/svm-data.csv', header=None)
data.head(5)

Unnamed: 0,0,1,2
0,0.0,0.7,0.29
1,1.0,0.23,0.55
2,0.0,0.72,0.42
3,0.0,0.98,0.68
4,0.0,0.48,0.39


In [15]:
# Column 0 is used as the target; every column from 1 onwards is a feature.
X, y = data.loc[:, 1:], data[0]

In [18]:
# Linear SVM with a very large C; fit() returns the estimator itself,
# so construction and fitting can be chained.
clf = SVC(kernel='linear', C=100000, random_state=241).fit(X, y)
clf

SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Show the fitted model's `support_` attribute (indices of the support vectors
# among the training rows).
clf.support_

array([3, 4, 9], dtype=int32)

# Анализ текстов

In [36]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold
import pandas as pd
import numpy as np

In [29]:
# Restrict 20 Newsgroups to the two topics being separated; subset='all'
# requests the whole collection rather than only one of its predefined splits.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [30]:
# Raw documents and their class labels from the fetched bunch.
X, y = newsgroups['data'], newsgroups['target']

In [50]:
# The vectorizer's first parameter is `input` (a mode such as 'content' or
# 'filename'), not the corpus. Passing the documents there is silently ignored
# by old scikit-learn releases and rejected by current ones. The documents
# are supplied to fit_transform() instead.
tfidf = TfidfVectorizer()
X_transformed = tfidf.fit_transform(X)

In [46]:
# Search C over the powers of ten 1e-5 .. 1e5 with 5-fold shuffled CV,
# scoring each candidate by accuracy.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': 10.0 ** np.arange(-5, 6)}
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X_transformed, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [47]:
# Value of C selected by the grid search above.
best_C = gs.best_params_['C']

In [48]:
# Report the selected regularisation constant.
print(best_C)

1.0


In [52]:
# Refit on the full corpus with the selected C. The TF-IDF matrix of X was
# already produced by fit_transform() as X_transformed, so it is reused here
# instead of vectorizing the whole corpus a second time.
clf = SVC(kernel='linear', random_state=241, C=best_C)
clf.fit(X_transformed, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [146]:
# Vocabulary lookup: position i holds the term of feature column i.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    # Convert to plain str so the printed list looks the same on every version.
    feature_mapping = [str(term) for term in tfidf.get_feature_names_out()]
else:
    feature_mapping = tfidf.get_feature_names()

# The code reads coef_ through .data / .indices, i.e. as a sparse matrix:
# stored weights indexed by their feature column.
coef = pd.DataFrame(data=clf.coef_.data, index=clf.coef_.indices, columns=['data'])

# Ten features with the largest absolute weight, strongest first.
coef_top = coef['data'].abs().sort_values(ascending=False).head(10)

# Alphabetically sorted list of the corresponding words.
words = sorted(feature_mapping[i] for i in coef_top.index)
print(words)

# The same ten words, kept in descending order of |weight|.
top_words = coef_top.index.map(lambda i: feature_mapping[i])
print(top_words)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']
Index(['space', 'god', 'atheism', 'atheists', 'moon', 'sky', 'religion',
       'bible', 'keith', 'sci'],
      dtype='object')


# Логистическая регрессия

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import math

In [172]:
# Read the logistic-regression sample; header=None because the file has no
# header row, so the columns are numbered rather than named.
data = pd.read_csv(filepath_or_buffer='data/data-logistic.csv', header=None)
data.head(5)

Unnamed: 0,0,1,2
0,-1,-0.663827,-0.138526
1,1,1.994596,2.468025
2,-1,-1.247395,0.749425
3,1,2.309374,1.899836
4,1,0.849143,2.40775


In [182]:
# Column 0 is used as the target; columns 1 and onwards are the features.
X, y = data.loc[:, 1:], data[0]

In [184]:
def _gradient_step(j, w1, w2, y, X, k, C):
    """Shared implementation of one gradient-descent update for weight ``j``.

    Computes ``w_j + k * (1/l) * sum_i y_i * x_ij * (1 - 1/(1 + exp(-M_i)))
    - k * C * w_j`` with ``M_i = y_i * (w1 * x_i1 + w2 * x_i2)``.

    Args:
        j: 1 to update ``w1`` (feature ``X[1]``), anything else for ``w2``
            (feature ``X[2]``).
        w1, w2: current weights.
        y: labels, one per object.
        X: container whose ``X[1]`` and ``X[2]`` yield the two feature columns.
        k: step size.
        C: regularisation coefficient.

    Returns:
        The updated value of weight ``j``.

    Raises:
        ZeroDivisionError: if ``y`` is empty.
    """
    # Work on positional arrays: label-based lookups such as y[i] fail as soon
    # as the pandas index is not the default 0..l-1.
    labels = np.asarray(y, dtype=float)
    x1 = np.asarray(X[1], dtype=float)
    x2 = np.asarray(X[2], dtype=float)
    l = len(labels)

    margins = labels * (w1 * x1 + w2 * x2)
    # 1 - 1/(1 + exp(-M)) == 1/(1 + exp(M)) == exp(-log(1 + exp(M))).
    # logaddexp keeps this finite for margins of any size, where exp(-M)
    # overflows for strongly negative M.
    residual = np.exp(-np.logaddexp(0.0, margins))

    feature, w = (x1, w1) if j == 1 else (x2, w2)
    S = np.dot(labels * feature, residual)

    return w + (k * (1.0 / l) * S) - k * C * w


def fw1(w1, w2, y, X, k, C):
    """Return ``w1`` after one gradient-descent step with step ``k`` and regularisation ``C``."""
    return _gradient_step(1, w1, w2, y, X, k, C)


def fw2(w1, w2, y, X, k, C):
    """Return ``w2`` after one gradient-descent step with step ``k`` and regularisation ``C``."""
    return _gradient_step(2, w1, w2, y, X, k, C)

In [186]:
def grad(y, X, C=0.0, w1=0.0, w2=0.0, k=0.1, err=1e-5, i_max=10000):
    """Iterate the ``fw1``/``fw2`` updates until the weights stop moving.

    On every iteration both new weights are computed from the current pair
    (a simultaneous update). Iteration stops when the Euclidean distance
    between the new and the current pair is at most ``err``, or after
    ``i_max`` iterations, whichever comes first.

    Args:
        y: labels, passed through to ``fw1``/``fw2``.
        X: features, passed through to ``fw1``/``fw2``.
        C: regularisation coefficient.
        w1, w2: starting weights.
        k: step size.
        err: convergence threshold on the distance between consecutive
            weight pairs.
        i_max: upper bound on the number of iterations; at least one
            iteration is always performed.

    Returns:
        ``[w1, w2]`` -- the most recently computed weight pair.
    """
    for _ in range(max(int(i_max), 1)):
        w1_new, w2_new = fw1(w1, w2, y, X, k, C), fw2(w1, w2, y, X, k, C)
        e = math.sqrt((w1_new - w1) ** 2 + (w2_new - w2) ** 2)

        if e <= err:
            break
        w1, w2 = w1_new, w2_new

    return [w1_new, w2_new]

In [190]:
# Fit twice: with grad()'s default C=0.0, and with C=10.
w1, w2 = grad(y, X)
rw1, rw2 = grad(y, X, C=10.0)

In [192]:
def a(X, w1, w2):
    """Sigmoid of the linear score: ``1 / (1 + exp(-w1 * X[1] - w2 * X[2]))``.

    Args:
        X: anything indexable by ``1`` and ``2``; the two values may be
            scalars (a single row) or whole columns, in which case the result
            is computed element-wise.
        w1, w2: weights of the two features.

    Returns:
        The sigmoid value(s), in the same shape as ``X[1]``.
    """
    z = w1 * X[1] + w2 * X[2]
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). logaddexp keeps the
    # intermediate finite, whereas math.exp(-z) raises OverflowError once
    # z drops below roughly -709.
    return np.exp(-np.logaddexp(0.0, -z))


# Score every row with each weight pair; apply() forwards the extra
# positional arguments in `args` to a() after the row itself.
y_score = X.apply(a, axis=1, args=(w1, w2))
y_rscore = X.apply(a, axis=1, args=(rw1, rw2))

# ROC AUC for each of the two sets of scores.
auc, rauc = roc_auc_score(y, y_score), roc_auc_score(y, y_rscore)
print(auc,rauc)

0.9268571428571428 0.9362857142857142


# Метрики качества классификации

In [None]:
# Read the classification sample without treating any row as a header.
data = pd.read_csv(filepath_or_buffer='data/classification.csv', header=None)
data.head(5)