# Jubakit

### Step.1 データセットの読み込み

In [None]:
import pandas as pd

def load_dataset(path, label="Y"):
    """Read a CSV file and split it into a feature matrix and a label list.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object).
        label: Name of the column holding the target values.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a NumPy array built from every column
        except ``label`` and ``y`` is a plain Python list of the ``label``
        column's values.

    Raises:
        KeyError: If ``label`` is not a column of the CSV.
    """
    df = pd.read_csv(path)
    y = df[label].tolist()
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray and is available in both old and new pandas releases.
    X = df.drop(label, axis=1).values
    return X, y

# Load the training and test splits with the default label column;
# each call yields one (features, labels) pair.
(X_train, y_train), (X_test, y_test) = map(
    load_dataset,
    ("data/default_train.csv", "data/default_test.csv"),
)

### Step.2 データセットの変換

In [None]:
from jubakit.classifier import Dataset

# Wrap each (features, labels) pair in a jubakit Dataset, train split first.
dataset_train, dataset_test = (
    Dataset.from_array(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

### Step.3 学習と予測を行う

In [None]:
from jubakit.classifier import Classifier, Config

config = Config(method="AROW", parameter={"regularization_weight": 0.1})
# NOTE(review): per the Jubakit docs, Classifier.run starts a backing Jubatus
# service, so it is stopped explicitly below once we are done with it.
classifier = Classifier.run(config)

try:
    # Train: the returned iterator must be consumed for training to happen;
    # the yielded values themselves are not needed here.
    for _ in classifier.train(dataset_train):
        pass

    # Predict: keep only the first label of each result, converted to int so
    # it can be compared with the integer labels in y_test.
    # NOTE(review): result[0] is presumably the highest-scored (label, score)
    # pair -- confirm against the Jubakit documentation.
    y_pred = [
        int(result[0][0])
        for (_, _, result) in classifier.classify(dataset_test)
    ]
finally:
    # Always release the service, even if training or classification fails.
    classifier.stop()

# Aggregate the classification results (adapted from the classification tutorial).
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when undefined."""
    return float(numerator) / float(denominator) if denominator else 0.0


def analyze_results(labels, results, pos_label=1, neg_label=0):
    """Compute a confusion matrix and summary metrics for binary predictions.

    Args:
        labels: Iterable of ground-truth labels.
        results: Iterable of predicted labels, aligned with ``labels``.
        pos_label: Label value treated as the positive class.
        neg_label: Value used to name the negative row/column of the
            confusion matrix. Every label other than ``pos_label`` is counted
            as negative.

    Returns:
        Tuple ``(confusion, accuracy, precision, recall, f_value)``.
        ``confusion`` is a DataFrame laid out as ``[[tp, fp], [fn, tn]]``,
        i.e. rows are the predicted class and columns the actual class, both
        ordered ``[pos_label, neg_label]``. A metric whose denominator is
        zero (e.g. precision when nothing was predicted positive, or any
        metric on empty input) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    tp, fp, tn, fn = 0, 0, 0, 0
    for label, result in zip(labels, results):
        # Compare the individual ``label`` (not the whole ``labels``
        # sequence) so that NumPy arrays are handled as well as lists.
        if label == pos_label:
            if label == result:
                tp += 1
            else:
                fn += 1
        elif label == result:
            tn += 1
        else:
            fp += 1
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f_value = _safe_ratio(2.0 * recall * precision, recall + precision)
    # confusion matrix
    confusion = pd.DataFrame(
        [[tp, fp], [fn, tn]],
        index=[pos_label, neg_label],
        columns=[pos_label, neg_label],
    )
    return confusion, accuracy, precision, recall, f_value

# Score the predictions against the held-out labels.
confusion, accuracy, precision, recall, f_value = analyze_results(y_test, y_pred)

# Emit the whole report in one call: the confusion matrix first, then one
# line per metric rounded to three decimals.
print('\n'.join([
    'confusion matrix\n{0}\n'.format(confusion),
    'metric    : score',
    'accuracy  : {0:.3f}'.format(accuracy),
    'precision : {0:.3f}'.format(precision),
    'recall    : {0:.3f}'.format(recall),
    'f_value   : {0:.3f}'.format(f_value),
]))

### Step.4 scikit-learn wrapperを試してみる

scikit-learn と同じインターフェースでJubatusを使うことのできる機能がJubakitには搭載されています。

分類を行う場合には、線形分類器クラス`LinearClassifier`もしくは近傍探索クラス`NearestNeighborsClassifier`を使うことができます。

In [None]:
from jubakit.wrapper.classifier import LinearClassifier

# Same AROW configuration as the native-API example, expressed through the
# scikit-learn style estimator interface (fit / predict).
clf = LinearClassifier(method="AROW", regularization_weight=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the wrapper's predictions against the held-out labels.
confusion, accuracy, precision, recall, f_value = analyze_results(y_test, y_pred)

# Emit the whole report in one call: the confusion matrix first, then one
# line per metric rounded to three decimals.
print('\n'.join([
    'confusion matrix\n{0}\n'.format(confusion),
    'metric    : score',
    'accuracy  : {0:.3f}'.format(accuracy),
    'precision : {0:.3f}'.format(precision),
    'recall    : {0:.3f}'.format(recall),
    'f_value   : {0:.3f}'.format(f_value),
]))

### Step.5 パラメータチューニングをしてみる

scikit-learnのGridSearchCVを利用してパラメータチューニングを行うことができます。

In [None]:
from jubakit.wrapper.classifier import LinearClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: every combination of the two methods and the
# five regularization weights is evaluated by the grid search.
param_grid = dict(
    method=["AROW", "CW"],
    regularization_weight=[0.01, 0.1, 1.0, 10, 100],
)

# Search over a default-constructed estimator using 4-fold cross-validation.
clf = LinearClassifier()
searcher = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, verbose=2)
searcher.fit(X_train, y_train)

In [None]:
from operator import itemgetter
# Pair each parameter combination with its mean cross-validation score and
# list them from best to worst.
means = searcher.cv_results_['mean_test_score']
params = searcher.cv_results_['params']
grid_scores = sorted(zip(means, params), key=itemgetter(0), reverse=True)
for rank, (score, candidate) in enumerate(grid_scores, start=1):
    print('Rank:{0}\tScore:{1:.3f}\tParameter:{2}'.format(rank, score, candidate))