In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

### 정확도 계산 함수 모음

In [15]:
def print_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm,
                         index = ['l_fg','l_fist','l_tb','r_fg','r_fist','r_tb'], 
                         columns = ['l_fg','l_fist','l_tb','r_fg','r_fist','r_tb'])
    #Plotting the confusion matrix
    plt.figure(figsize=(5,4))
    sns.heatmap(cm_df, annot=True)
    plt.title('RandomForest')
    plt.ylabel('Actal Values')
    plt.xlabel('Predicted Values')
    plt.show()
    plt.clf()

def print_auc_roc(model, x_test):
    global y_test
    #Calculate the y_score
    y_score = model.predict_proba(x_test)
    #Binarize the output
    y_test_bin = label_binarize(y_test, classes=[0,1,2])
    n_classes = y_test_bin.shape[1]

    sum=0
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr_sum=[]
    tpr_sum=[]


    colors = ['blue', 'red', 'green']
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        fpr_sum.append(fpr[i])
        tpr_sum.append(tpr[i])
        #plt.plot(fpr[i], tpr[i], color=colors[i], lw=2)
        print('AUC for Class {}: {}'.format(i, auc(fpr[i], tpr[i])))
        sum+=auc(fpr[i], tpr[i])
        
    print("average sum:", sum/3)
    fpr_avg=[]
    tpr_avg=[]
    for i in range(max(fpr_sum[0].shape[0],fpr_sum[1].shape[0],fpr_sum[2].shape[0])):
        num=0
        sum2=0
        if i< fpr_sum[0].shape[0]:
            num+=1
            sum2+=fpr_sum[0][i]
        if i< fpr_sum[1].shape[0]:
            num+=1
            sum2+=fpr_sum[1][i]
        if i< fpr_sum[2].shape[0]:
            num+=1
            sum2+=fpr_sum[2][i]

        fpr_avg.append(sum2/num)

    for i in range(max(tpr_sum[0].shape[0],tpr_sum[1].shape[0],tpr_sum[2].shape[0])):
        num=0
        sum2=0
        if i< tpr_sum[0].shape[0]:
            num+=1
            sum2+=tpr_sum[0][i]
        if i< tpr_sum[1].shape[0]:
            num+=1
            sum2+=tpr_sum[1][i]
        if i< tpr_sum[2].shape[0]:
            num+=1
            sum2+=tpr_sum[2][i]

        tpr_avg.append(sum2/num)
        
    return fpr_avg, tpr_avg
    #plt.plot(fpr_avg, tpr_avg, color='blue', lw=2)

    #plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    #plt.xlabel('False Positive Rate')
    #plt.ylabel('True Positive Rate')
    #plt.title('Receiver Operating Characteristic Curves')
    #plt.show()

def print_pr_curve(model, x_test):
    #Calculate the y_score
    y_score = model.predict_proba(x_test)
    #Binarize the output
    y_test_bin = label_binarize(y_test, classes=[0,1,2])
    n_classes = y_test_bin.shape[1]

    sum=0
    pr = dict()
    rc = dict()
    #roc_auc = dict()
    pr_sum=[]
    rc_sum=[]

    pr_avg=[]
    rc_avg=[]


    colors = ['blue', 'red', 'green']
    for i in range(n_classes):
        pr[i], rc[i], _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
        base_rate=y_score[:, i].mean()
        pr_sum.append(pr[i])
        rc_sum.append(rc[i])
        #plt.plot(rc[i], pr[i], color=colors[i], lw=2)

    for i in range(max(pr_sum[0].shape[0],pr_sum[1].shape[0],pr_sum[2].shape[0])):
        num=0
        sum2=0
        if i< pr_sum[0].shape[0]:
            num+=1
            sum2+=pr_sum[0][i]
        if i< pr_sum[1].shape[0]:
            num+=1
            sum2+=pr_sum[1][i]
        if i< pr_sum[2].shape[0]:
            num+=1
            sum2+=pr_sum[2][i]

        pr_avg.append(sum2/num)

    for i in range(max(rc_sum[0].shape[0],rc_sum[1].shape[0],rc_sum[2].shape[0])):
        num=0
        sum2=0
        if i< rc_sum[0].shape[0]:
            num+=1
            sum2+=rc_sum[0][i]
        if i< rc_sum[1].shape[0]:
            num+=1
            sum2+=rc_sum[1][i]
        if i< rc_sum[2].shape[0]:
            num+=1
            sum2+=rc_sum[2][i]

        rc_avg.append(sum2/num)
        
    return pr_avg, rc_avg


    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    #plt.xlabel('Recall')
    #plt.ylabel('Precision')
    #plt.title('Precision-Recall Curve')
    #plt.show()

def print_feature_importances(model, train_data):
    importances=model.feature_importances_
    indices=np.argsort(importances)[::-1]

    print('Feature ranking:')

    for f in range(train_data.shape[1]):
        print('{}. feature {} ({:.3f})'.format(f+1, train_data.columns[indices][f], importances[indices[f]]))
    plt.figure(figsize=(10,8))
    plt.title('feature importances')
    plt.bar(range(train_data.shape[1]), importances[indices],
            color='r', align='center')
    for i,v in enumerate(range(train_data.shape[1])):
        plt.text(v, importances[indices][i],round(importances[indices][i],2), fontsize=9, color='black', horizontalalignment='center', verticalalignment='bottom')
    plt.xticks(range(train_data.shape[1]), train_data.columns[indices], rotation=45)
    plt.xlim([-1,train_data.shape[1]])
    plt.show()

### data 불러오기

In [4]:
pp=pd.read_csv('./data/total_pp-1_aug_keypoint.csv')
cnv=pd.read_csv('./data/cnv_total-2_aug_keypoint.csv')


### keypoint data와 label data 나누기

In [5]:
pp_X,pp_Y=pp.iloc[:,1:],pp.iloc[:,:1]
cnv_X,cnv_Y=cnv.iloc[:,1:],cnv.iloc[:,:1]

### test data, train data 나누기

In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
#train data와 test data로 나눔,비율은 80 20으로
pp_X_train, pp_X_test, pp_y_train, pp_y_test = train_test_split(pp_X,pp_Y, test_size=0.2, stratify=pp_Y)
cnv_X_train, cnv_X_test, cnv_y_train, cnv_y_test = train_test_split(cnv_X,cnv_Y, test_size=0.2, stratify=cnv_Y)

### 학습하기- xgboost

In [22]:
from xgboost import XGBClassifier

model=XGBClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
model.save_model('./model/xgboost_total_pp-1.model')

model=XGBClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
model.save_model('./model/xgboost_total_cnv-2.model')

# print_confusion_matrix(pp_y_test,pp_y_pred)
# a2,b2=print_auc_roc(model, pp_X_test)
# c2,d2=print_pr_curve(model, pp_X_test)

precision: 0.3976948101948101
recall: 0.37409145303882146
f1: 0.38004726777000597
precision: 0.5962167096324597
recall: 0.5948282722179781
f1: 0.5948569607018018


### 학습하기 - RandomForestCalssifier

In [32]:
from sklearn.ensemble import RandomForestClassifier
import joblib
import pickle

model=RandomForestClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
joblib.dump(model, './model/randomforest_total_pp-1.pkl')

model=RandomForestClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
joblib.dump(model, './model/randomforest_total_cnv-2.pkl')


  model.fit(pp_X_train, pp_y_train)


precision: 0.4016958501804269
recall: 0.37583995478732324
f1: 0.38324641932236864


  model.fit(cnv_X_train, cnv_y_train)


precision: 0.6051375332732557
recall: 0.5896318754774637
f1: 0.5937299259940118


['./model/randomforest_total_cnv-2.pkl']

### 학습하기- GradientBoosting

In [36]:
from sklearn.ensemble import GradientBoostingClassifier

model=GradientBoostingClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
joblib.dump(model, './model/GradientBoosting_total_pp-1.pkl')

model=GradientBoostingClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
joblib.dump(model, './model/GradientBoosting_total_cnv-2.pkl')

  y = column_or_1d(y, warn=True)


precision: 0.3290843926547092
recall: 0.28999744789218473
f1: 0.299386988954406


  y = column_or_1d(y, warn=True)


precision: 0.5564010938935522
recall: 0.5454524233935999
f1: 0.5482909855002878


['./model/GradientBoosting_total_cnv-2.pkl']

### 학습하기-AdaGradient Boosting 

In [37]:
from sklearn.ensemble import AdaBoostClassifier

model=AdaBoostClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
joblib.dump(model, './model/AdaGradientBoosting_total_pp-1.pkl')

model=AdaBoostClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
joblib.dump(model, './model/GAdaGradientBoosting_total_cnv-2.pkl')

  y = column_or_1d(y, warn=True)


precision: 0.25671887427017764
recall: 0.24307723649828913
f1: 0.2373401307280368


  y = column_or_1d(y, warn=True)


precision: 0.4254628390477448
recall: 0.42478288663950425
f1: 0.4239712438921137


['./model/GAdaGradientBoosting_total_cnv-2.pkl']

### 학습하기 - Bagging

In [38]:
from sklearn.ensemble import BaggingClassifier

model=BaggingClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
joblib.dump(model, './model/Bagging_total_pp-1.pkl')

model=BaggingClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
joblib.dump(model, './model/Bagging_total_cnv-2.pkl')

  y = column_or_1d(y, warn=True)


precision: 0.35741166797938345
recall: 0.3367876591560803
f1: 0.33758340545212756


  y = column_or_1d(y, warn=True)


precision: 0.5441163903134378
recall: 0.5386383530685002
f1: 0.5380236694138378


['./model/Bagging_total_cnv-2.pkl']

### 학습하기- ExtraTrees

In [39]:
from sklearn.ensemble import ExtraTreesClassifier

model=ExtraTreesClassifier()
model.fit(pp_X_train, pp_y_train)
pp_y_pred=model.predict(pp_X_test)
print('========pp========')
print('precision:', precision_score(pp_y_test, pp_y_pred, average='macro'))
print('recall:', recall_score(pp_y_test, pp_y_pred, average='macro'))
print('f1:', f1_score(pp_y_test, pp_y_pred, average='macro'))
joblib.dump(model, './model/ExtraTrees_total_pp-1.pkl')

model=ExtraTreesClassifier()
model.fit(cnv_X_train, cnv_y_train)
cnv_y_pred=model.predict(cnv_X_test)
print('========cnv========')
print('precision:', precision_score(cnv_y_test, cnv_y_pred, average='macro'))
print('recall:', recall_score(cnv_y_test, cnv_y_pred, average='macro'))
print('f1:', f1_score(cnv_y_test, cnv_y_pred, average='macro'))
joblib.dump(model, './model/ExtraTrees_total_cnv-2.pkl')


  model.fit(pp_X_train, pp_y_train)


precision: 0.42515128060793694
recall: 0.3905769234716603
f1: 0.3982953498429165


  model.fit(cnv_X_train, cnv_y_train)


precision: 0.6065021854152289
recall: 0.597504721585604
f1: 0.5996298486272096


['./model/ExtraTrees_total_cnv-2.pkl']