# チェック用スクリプト
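- check_hoge : 結果を視覚化する(2次元データ用)。例: check_classification, check_clustering
- evaluate_hoge / evaluation_hoge : 定量評価。今のところ classification と regression に対応(多次元データでも利用可能)。例: evaluation_classification, evaluate_regression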
## 分類

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.datasets import make_moons
# 学習器
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# scaler
from sklearn.preprocessing import StandardScaler
# model_selection
from sklearn.model_selection import train_test_split
def check_classification(X,y):
    """Fit a set of scikit-learn classifiers on 2-D data and plot their decision regions.

    The features are standardised, split 50/50 into train and test halves
    (``random_state=42``) and every classifier is fitted on the training half.
    The first panel shows the scaled points; each further panel shows one
    classifier's decision surface with the train points (opaque) and the test
    points (semi-transparent) drawn on top.

    With two classes the surface is the classifier's ``decision_function``
    (or the probability of the second class when no decision function exists).
    With more than two classes the predicted class regions are drawn instead,
    because the per-class score matrix cannot be shown as a single surface.

    Args:
        X: array-like of shape (n_samples, 2). Only two features can be drawn.
        y: array-like of numeric class labels; they are used directly as
            colour values for the scatter plots.

    Raises:
        ValueError: if ``X`` does not have exactly two feature columns.
    """
    h = .02  # step size in the mesh
    row = 2  # number of subplot rows

    # Panel title paired with the estimator it describes.
    classifiers = [
        ("LogisticRegression", LogisticRegression()),
        ("Nearest Neighbors", KNeighborsClassifier(3)),
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ("Neural Net", MLPClassifier(alpha=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("QDA", QuadraticDiscriminantAnalysis()),
    ]

    X = StandardScaler().fit_transform(X)
    if X.shape[1] != 2:
        raise ValueError(
            "check_classification can only visualise 2 features, got {}".format(X.shape[1])
        )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)

    classes = np.unique(y)
    multiclass = len(classes) > 2

    # Mesh covering the data with a small margin; the flattened point matrix is
    # identical for every classifier, so it is built once.
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]

    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF','#58BE89','#FBA848'])

    # One panel per classifier plus the leading "original" panel. plt.subplot
    # needs integer grid sizes, so the ceil result is converted explicitly.
    n_panels = len(classifiers) + 1
    n_cols = int(np.ceil(n_panels / row))

    plt.figure(figsize=(18, 6))

    # just plot the dataset first
    ax = plt.subplot(row, n_cols, 1)
    ax.set_title("original")
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='gray')
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)

    # Fit and plot every classifier.
    for panel, (name, clf) in enumerate(classifiers, start=2):
        ax = plt.subplot(row, n_cols, panel)
        clf.fit(X_train, y_train)

        if multiclass:
            # Draw the predicted class regions: map each predicted label to its
            # index in `classes` and put one contour level between neighbours.
            Z = np.searchsorted(classes, clf.predict(grid)).reshape(xx.shape)
            levels = np.arange(len(classes) + 1) - 0.5
            ax.contourf(xx, yy, Z, levels=levels, cmap=cm_bright, alpha=.3)
        else:
            # Assign a score to each point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(grid)
            else:
                Z = clf.predict_proba(grid)[:, 1]
            ax.contourf(xx, yy, Z.reshape(xx.shape), cmap=cm, alpha=.8)

        # scattering original data
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)

In [None]:
def evaluation_classification(X, y):
    """Print quantitative metrics for a set of scikit-learn classifiers.

    The features are standardised and split 50/50 into train and test halves
    (``random_state=42``). Each classifier is fitted on the training half and
    evaluated on the test half. For every classifier the confusion matrix,
    accuracy, classification report, Hamming loss, Jaccard similarity and
    Cohen's kappa are printed. When ``y`` holds exactly two distinct labels,
    the ROC AUC and average precision are printed as well, and the ROC and
    precision-recall curves are plotted.

    Args:
        X: array-like feature matrix with any number of feature columns.
        y: array-like of class labels. The binary-only section uses
            ``pos_label=1`` for the ROC curve, so it presumably expects the
            positive class to be labelled ``1``.

    Returns:
        None. All results are printed or plotted.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already imported `metrics`.
    from sklearn import metrics

    classifiers = [
        LogisticRegression(),
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(),
    ]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)
    is_binary = len(set(y)) == 2

    for clf in classifiers:
        print("\n",clf.__class__.__name__)
        clf.fit(X_train, y_train)
        pred_y = clf.predict(X_test)

        # `jaccard_similarity_score` was removed from scikit-learn; keep using
        # it where it still exists and otherwise fall back to `jaccard_score`
        # (unweighted mean of the per-class Jaccard indices).
        if hasattr(metrics, "jaccard_similarity_score"):
            jaccard = metrics.jaccard_similarity_score(y_test, pred_y)
        else:
            jaccard = metrics.jaccard_score(y_test, pred_y, average="macro")

        print("confusion matrix\n", metrics.confusion_matrix(y_test, pred_y))
        print("accuracy\n", metrics.accuracy_score(y_test, pred_y))
        print("classification report\n",metrics.classification_report(y_test,pred_y))
        print("hamming loss", metrics.hamming_loss(y_test, pred_y))
        print("jaccard similarity", jaccard)
        print("cohen kappa score", metrics.cohen_kappa_score(y_test, pred_y))

        if not is_binary:
            # ROC / precision-recall analysis below is only done for two classes.
            continue

        # Continuous score for the second class: signed margin when the model
        # has a decision function, otherwise the predicted probability.
        if hasattr(clf, "decision_function"):
            pred_y_score = clf.decision_function(X_test)
        else:
            pred_y_score = clf.predict_proba(X_test)[:,1]

        # ROC curve and the area under it
        fpr, tpr, _ = metrics.roc_curve(y_test, pred_y_score,pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print("auc area", roc_auc)

        # precision-recall curve and average precision
        precision, recall, _ = metrics.precision_recall_curve(y_test, pred_y_score)
        avg_precision = metrics.average_precision_score(y_test,pred_y_score)
        print("average precision score:", avg_precision)

        # Left panel: ROC curve with the chance diagonal for reference.
        plt.figure(figsize=(7,3.5))
        plt.subplot(121)
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve: \nAUC={:.2f}'.format(roc_auc))
        plt.legend(loc="lower right")

        # Right panel: precision-recall curve.
        plt.subplot(122)
        plt.plot(recall, precision, color='b', alpha=0.2, label="PR curve")
        plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall curve: \nAP={0:0.2f}'.format(avg_precision))
        plt.legend(loc="lower right")
        plt.show()

## 回帰

In [None]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn import model_selection, metrics
import numpy as np
# NOTE(review): `boston` is not referenced by any cell visible here.
# `load_boston` was removed in scikit-learn 1.2, so this line (and its import)
# fails on current releases.
boston = load_boston()
def evaluate_regression(X,y):
    """Fit a linear regression on 80% of the data and print test-set metrics.

    The data is split with ``train_size=0.8`` and no fixed random state, so
    the split (and therefore the printed numbers) can differ between calls.
    Printed, in order: explained variance, MAE, MSE, RMSE and R2.

    Args:
        X: feature matrix accepted by ``LinearRegression.fit``.
        y: regression targets aligned with ``X``.
    """
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,train_size=0.8)
    model = LinearRegression().fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # (label, scorer) pairs, evaluated and printed one at a time in this order.
    report = (
        ("Variance score", metrics.explained_variance_score),
        ("MAE : Mean absolute error", metrics.mean_absolute_error),
        ("MSE : Mean squared error", metrics.mean_squared_error),
        ("RMSE: Root mean squared error",
         lambda truth, pred: np.sqrt(metrics.mean_squared_error(truth, pred))),
        ("R2 score", metrics.r2_score),
    )
    for label, scorer in report:
        print(label, scorer(y_test, y_hat))

## クラスタリング

In [None]:
from sklearn import cluster
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
from sklearn import datasets 
def check_clustering(X, n_clusters, y=None):
    """Run several scikit-learn clustering algorithms on 2-D data and plot the results.

    The first panel shows the input points; each further panel shows the
    cluster assignment produced by one algorithm's ``fit_predict``. The name
    and repr of every algorithm are printed before it is fitted.

    Args:
        X: array of shape (n_samples, 2); columns 0 and 1 are plotted.
        n_clusters: number of clusters requested from the algorithms that take
            one (MeanShift, DBSCAN and AffinityPropagation choose their own).
        y: optional reference labels used only to colour the "original" panel.
            When omitted, a module-level ``y`` of matching length is used if
            one exists (earlier versions read that global unconditionally);
            otherwise the panel is drawn without label colours.
    """
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': n_clusters}

    if y is None:
        # Backward compatibility with the notebook workflow, where `y` was
        # picked up from the enclosing module instead of being passed in.
        y = globals().get("y")
    if y is not None and len(y) != len(X):
        # Labels that do not line up with X cannot colour its points.
        y = None

    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # mixture.GaussianMixture was disabled in the original list and is left out here.
    clustering_algorithms = [
        ("KMeans", cluster.KMeans(n_clusters=params["n_clusters"])),
        ('MiniBatchKMeans', cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])),
        ('AffinityPropagation', cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])),
        ('MeanShift', cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)),
        ('SpectralClustering', cluster.SpectralClustering(
            n_clusters=params['n_clusters'], eigen_solver='arpack',
            affinity="nearest_neighbors")),
        ('Ward', cluster.AgglomerativeClustering(
            n_clusters=params['n_clusters'], linkage='ward')),
        ('AgglomerativeClustering', cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock", n_clusters=params['n_clusters'])),
        ('DBSCAN', cluster.DBSCAN(eps=params['eps'])),
        ('Birch', cluster.Birch(n_clusters=params['n_clusters'])),
    ]

    cm_bright = ListedColormap(['#FF0000', '#0000FF','#58BE89','#FBA848'])

    # Two rows; the column count follows the number of panels (algorithms plus
    # the "original" panel) so the grid stays valid if the list changes.
    n_rows = 2
    n_panels = len(clustering_algorithms) + 1
    n_cols = -(-n_panels // n_rows)  # integer ceiling division

    plt.figure(figsize=(18, 8))
    ax = plt.subplot(n_rows, n_cols, 1)
    ax.set_title("original")
    if y is None:
        ax.scatter(X[:, 0], X[:, 1], edgecolors='gray')
    else:
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright, edgecolors='gray')

    for panel, (name, algo) in enumerate(clustering_algorithms, start=2):
        print(name,algo)
        pred_y = algo.fit_predict(X)
        plt.subplot(n_rows, n_cols, panel)
        plt.scatter(X[:,0],X[:,1], c=pred_y,cmap=cm_bright, alpha=0.8,edgecolors='gray')
        plt.title(name)

In [None]:
# Build a 2-D toy dataset for the visual checks below.
# NOTE: the make_moons result is overwritten by make_blobs on the very next
# line, so only the blobs dataset is actually used.
X, y = make_moons(n_samples=200, noise=0.1)
X, y = datasets.make_blobs(centers=2, n_samples=200, n_features=2)


# `y` stays defined at notebook level; check_clustering uses it to colour its
# "original" panel even though it is not passed in the call.
check_classification(X,y)
check_clustering(X,2)

In [None]:
# Run the quantitative classifier evaluation on scikit-learn's iris dataset.
# NOTE(review): iris presumably has more than two classes, in which case the
# binary-only ROC / precision-recall section of evaluation_classification is
# skipped — confirm.
from sklearn.datasets import load_iris
data = load_iris()
X = data.data
y = data.target
evaluation_classification(X,y)