In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve,auc,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the source CSV into a DataFrame. The file is decoded as Shift-JIS and
# parsed with pandas' pure-Python parser engine.
df = pd.read_csv(
    '01_WineRank_2値.csv',
    engine='python',
    encoding='shift_jis',
)

In [None]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

In [None]:
# Show the table dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
(n_rows, n_cols)

# 特徴量（2変量）抽出

In [None]:
# Author's note: "since this is ROC, it only makes sense with two variables".
# NOTE(review): ROC analysis itself only needs a binary target; restricting the
# model to two features looks like a choice of this notebook - confirm intent.
#
# Author's note: positional columns 7 (density) and 10 (alcohol content) are the
# important ones, judged from random-forest feature importances (that analysis
# is not part of this section).
feature_columns = [7, 10]
df2_features = df.iloc[:, feature_columns].values

# The last column of the table is taken as the class label.
target_column = df.iloc[:, -1]
df2_y = target_column.values

# データ分割

In [None]:
# Hold out half of the rows for evaluating the ROC curves.
# - random_state pins the shuffle so that Restart & Run All reproduces the same
#   split (and therefore the same curves); 1 matches the seed used for the
#   estimators in this notebook.
# - stratify keeps the class proportions of df2_y the same in both halves, so
#   the test set's positive/negative balance is not left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df2_features,
    df2_y,
    test_size=0.5,
    random_state=1,
    stratify=df2_y,
)

# インスタンスの定義

In [None]:
# The three classifiers to compare. All of them are seeded with random_state=1
# so repeated runs produce the same fitted models.
# Logistic regression with L2 penalty; C=100 is the inverse regularization strength.
lr = LogisticRegression(penalty='l2', random_state=1, C=100)
# Decision tree whose leaves must each hold at least 400 training samples.
# The tree permutes candidate features while searching for splits, so it needs
# a seed too in order to break ties the same way on every run.
dt = DecisionTreeClassifier(min_samples_leaf=400, random_state=1)
# Random forest of 1000 trees, fitted using all available CPU cores (n_jobs=-1).
rf = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=-1)
# Display names, in the same order as the estimators above.
labels = ['Logistic Regression', 'Decision Tree', 'Random Forest']

# 評価の算出と、ROCの可視化

In [None]:
# Fit each classifier and draw its ROC curve on one shared Axes.
# Everything that touches the figure lives in this single cell and goes through
# the explicit `ax` object: with the inline backend a figure is rendered and
# closed when its cell finishes, so building the plot across several cells via
# the pyplot state machine would scatter it over separate figures.
fig, ax = plt.subplots(figsize=(7, 5))
colors = ['r', 'g', 'b']
positive_label = 'B'  # class treated as "positive" for the ROC computation

for clf, clf_label, color in zip([lr, dt, rf], labels, colors):
    clf.fit(X_train, y_train)
    probas = clf.predict_proba(X_test)
    # predict_proba columns are ordered like clf.classes_, so look the positive
    # class up explicitly instead of assuming it sits in column 1.
    # Raises ValueError if the positive label is not among the fitted classes.
    positive_col = list(clf.classes_).index(positive_label)
    fpr, tpr, _ = roc_curve(y_test, probas[:, positive_col], pos_label=positive_label)
    # Attach the label (with the area under the curve) to the line itself so the
    # legend cannot get out of sync with the plotted curves.
    ax.plot(fpr, tpr, color=color, label='%s (AUC = %.3f)' % (clf_label, auc(fpr, tpr)))

ax.set_xlabel('false positive rate')
ax.set_ylabel('true positive rate')
ax.legend(loc='lower right')
plt.show()