## day9 評価とモデル選択

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# 学習器
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
# 評価指標
from sklearn import metrics
# モデル選択
from sklearn import model_selection

## 2クラス問題
breast_cancerを使ってみる

In [None]:
from sklearn.datasets import load_breast_cancer
# Load the bundled breast-cancer dataset (a two-class problem).
breast_cancer = load_breast_cancer()
# Keep 80% of the samples for training; the remainder is the test split.
# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    breast_cancer.data,
    breast_cancer.target,
    train_size=0.8,
)
# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, y_train)
# Hard class predictions for the test split.
pred_y = clf.predict(X_test)
# Continuous score for the curve-based metrics: column index 1 of predict_proba.
# ref : [how does this differ from decision_function?](https://stackoverflow.com/questions/36543137/whats-the-difference-between-predict-proba-and-decision-function-in-sklearn-py)
pred_y_score = clf.predict_proba(X_test)[:, 1]

def report(y_test, pred_y, pred_y_score):
    """Print binary-classification metrics and draw the ROC and PR curves.

    Args:
        y_test: True labels of the evaluated samples. Label ``1`` is used as
            the positive class for the ROC curve.
        pred_y: Predicted hard labels, aligned with ``y_test``.
        pred_y_score: One continuous score per sample (higher means "more
            positive"). It is passed to ``roc_curve``,
            ``precision_recall_curve`` and ``average_precision_score``.

    Returns:
        None. Everything is printed to stdout and shown with ``plt.show()``.
    """
    print("#################################")
    print("######CLASSIFICATION REPORT######")
    print("#################################")
    # Label-based metrics (computed from the hard predictions).
    print("confusion matrix\n", metrics.confusion_matrix(y_test, pred_y))
    print("accuracy\n", metrics.accuracy_score(y_test, pred_y))
    print("classification report\n", metrics.classification_report(y_test, pred_y))
    print("hamming loss", metrics.hamming_loss(y_test, pred_y))
    # jaccard_similarity_score no longer exists in current scikit-learn;
    # use jaccard_score when it is available and fall back to the old name
    # only on installations that predate it.
    jaccard = getattr(metrics, "jaccard_score", None)
    if jaccard is None:
        jaccard = metrics.jaccard_similarity_score
    print("jaccard similarity", jaccard(y_test, pred_y))
    # NOTE: log loss is deliberately not reported here. Callers in this
    # notebook may pass decision_function output as pred_y_score, which is
    # not a probability.
    print("cohen kappa score", metrics.cohen_kappa_score(y_test, pred_y))

    # Score-based metrics: ROC curve and the area under it.
    fpr, tpr, _ = metrics.roc_curve(y_test, pred_y_score, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    print("auc area", roc_auc)

    # Precision-recall curve and its average-precision summary.
    precision, recall, _ = metrics.precision_recall_curve(y_test, pred_y_score)
    avg_precision = metrics.average_precision_score(y_test, pred_y_score)
    print("average precision score:", avg_precision)

    # Left panel: ROC curve with the chance diagonal for reference.
    plt.figure(figsize=(7, 3.5))
    plt.subplot(121)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve: \nAUC={:.2f}'.format(roc_auc))
    plt.legend(loc="lower right")

    # Right panel: precision-recall curve with the area underneath shaded.
    plt.subplot(122)
    plt.plot(recall, precision, color='b', alpha=0.2, label="PR curve")
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: \nAP={0:0.2f}'.format(avg_precision))
    plt.legend(loc="lower right")
    plt.show()
# Evaluate the test-split predictions with the metrics and plots of report().
report(y_test=y_test, pred_y=pred_y, pred_y_score=pred_y_score)

## 実際に比べてみる


In [None]:
%matplotlib inline
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.datasets import make_moons
# Synthetic two-moons data: 200 samples generated with noise=0.2.
X, y = make_moons(noise=0.2, n_samples=200)
# 学習器
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# scaler
from sklearn.preprocessing import StandardScaler
# model_selection
from sklearn.model_selection import train_test_split
# Step size of the mesh used for the decision-surface plots.
h = 0.02

# Each display name is kept next to its estimator so the two cannot drift
# apart; the parallel lists `names` and `classifiers` are derived below.
_labelled_classifiers = [
    ("LogisticRegression", LogisticRegression()),
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(kernel=1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]
names, classifiers = (list(column) for column in zip(*_labelled_classifiers))

# Standardize the features, then split the samples 50/50 with a fixed seed.
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Plot window: the range of each feature padded by 0.5 on both sides.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Every mesh node as an (n_points, 2) array. It is identical for every
# classifier, so it is built once instead of inside the loop.
grid_points = np.c_[xx.ravel(), yy.ravel()]


def _continuous_score(estimator, samples):
    """Return decision_function(samples) if available, else predict_proba column 1."""
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(samples)
    return estimator.predict_proba(samples)[:, 1]


cm = plt.cm.RdBu
row = 2
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# One panel for the raw data plus one per classifier. subplot() requires
# integer grid sizes, so the ceil result is converted explicitly.
n_cols = int(np.ceil((len(classifiers) + 1) / row))
figure = plt.figure(figsize=(18, 6))

# First panel: just the dataset (training points outlined, test points faded).
ax = plt.subplot(row, n_cols, 1)
ax.set_title("original")
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='gray')
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)

# Fit each classifier, draw its decision surface, and keep its test-set
# predictions for later reporting.
cls_result = {}
for i, (name, clf) in enumerate(zip(names, classifiers)):
    ax = plt.subplot(row, n_cols, i + 2)
    clf.fit(X_train, y_train)

    # Colour every mesh node by the classifier's continuous score.
    Z = _continuous_score(clf, grid_points).reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

    # Overlay the original samples on the surface.
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors='k', alpha=0.6)

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(name)

    # Test-set predictions: hard labels plus the continuous score.
    pred_y = clf.predict(X_test)
    pred_y_score = _continuous_score(clf, X_test)
    cls_result[name] = {
        "pred_y": pred_y,
        "y_test": y_test,
        "pred_y_score": pred_y_score,
    }


In [None]:
# Print the full report for every classifier, in the order of `names`.
for name in names:
    print(name)
    stored = cls_result[name]
    pred_y, y_test, pred_y_score = (
        stored[field] for field in ("pred_y", "y_test", "pred_y_score")
    )
    report(y_test, pred_y, pred_y_score)

### Regression
- explained variance score
- mean absolute error (MAE)
- mean squared error (MSE)
- median absolute error
- R2 score

In [None]:
try:
    from sklearn.datasets import load_boston as _load_regression_data
except ImportError:
    # load_boston was removed from scikit-learn (1.2). Fall back to another
    # bundled regression dataset so this cell still runs; the printed numbers
    # then refer to the diabetes data, not the Boston housing data.
    from sklearn.datasets import load_diabetes as _load_regression_data
from sklearn.linear_model import LinearRegression
from sklearn import model_selection, metrics
import numpy as np

# The variable keeps its historical name even when the fallback dataset is used.
boston = _load_regression_data()
reg = LinearRegression()
# Keep 80% of the samples for training (no fixed seed, so the split varies per run).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    boston.data, boston.target, train_size=0.8
)
reg.fit(X_train, y_train)
pred_y = reg.predict(X_test)

# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, pred_y)
print("Variance score", metrics.explained_variance_score(y_test, pred_y))
print("MAE : Mean absolute error", metrics.mean_absolute_error(y_test, pred_y))
print("MSE : Mean squared error", mse)
print("RMSE: Root mean squared error", np.sqrt(mse))
print("R2 score", metrics.r2_score(y_test, pred_y))
print("Median absolute error", metrics.median_absolute_error(y_test, pred_y))


### Clustering

In [None]:
from sklearn.datasets import make_moons
# Two-moons sample for the clustering demo: 1000 points generated with noise=0.05.
data, target = make_moons(n_samples=1000, noise=0.05)
# One colour per true class; each point is coloured by its label.
base_colors = ["r", "g"]
colors = [base_colors[label] for label in target]
plt.figure(figsize=(5, 5))
plt.scatter(data[:, 0], data[:, 1], color=colors, alpha=0.5)

In [None]:
# http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import KMeans

# Hyper-parameters shared by the clustering algorithms below.
params = {
    'quantile': .3,
    'eps': .3,
    'damping': .9,
    'preference': -200,
    'n_neighbors': 10,
    'n_clusters': 2,
}
bandwidth = cluster.estimate_bandwidth(data, quantile=params['quantile'])
kmeans = cluster.KMeans(n_clusters=params["n_clusters"])
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward')
spectral = cluster.SpectralClustering(
    n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors"
)
dbscan = cluster.DBSCAN(eps=params['eps'])
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference']
)
# Newer scikit-learn names the distance argument `metric`; older releases
# only know `affinity`. Try the current spelling first and fall back when
# the constructor rejects it.
try:
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", metric="cityblock", n_clusters=params['n_clusters']
    )
except TypeError:
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=params['n_clusters']
    )
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full')

# `gmm` is constructed above but intentionally left out of the comparison.
clustering_algorithms = [
    ("KMeans", kmeans),
    ('MiniBatchKMeans', two_means),
    ('AffinityPropagation', affinity_propagation),
    ('MeanShift', ms),
    ('SpectralClustering', spectral),
    ('Ward', ward),
    ('AgglomerativeClustering', average_linkage),
    ('DBSCAN', dbscan),
    ('Birch', birch),
]

# Several of these algorithms choose the number of clusters themselves, and
# the result may contain the label -1. Extend the two base colours into a
# longer cyclic palette and draw negative labels in black, so indexing can
# neither run out of range nor wrap around to the last colour.
_cluster_palette = list(base_colors) + ["b", "c", "m", "y", "orange", "purple", "brown", "pink"]


def _label_colors(labels):
    """Map integer cluster labels to colour names (negative labels -> black)."""
    return [
        "k" if label < 0 else _cluster_palette[label % len(_cluster_palette)]
        for label in labels
    ]


row = 3
# subplot() needs an integer column count; round up so every algorithm fits.
n_cols = (len(clustering_algorithms) + row - 1) // row
plt.figure(figsize=(9, 9))
for i, (name, algo) in enumerate(clustering_algorithms):
    print(name, algo)
    pred_y = algo.fit_predict(data)
    c = _label_colors(pred_y)
    plt.subplot(row, n_cols, i + 1)
    plt.scatter(data[:, 0], data[:, 1], color=c, alpha=0.5)
    plt.title(name)