### Classification

- Binary Classification(SVM, Logistic Regression)
- Multi Classification(SVM, Logistic Regression)
- K-Fold cross validation
- 정확도(accuracy, precision, recall, f1_score) 추출 및 비교


In [2]:
import pymysql
import pandas as pd
import numpy as np
import sklearn.svm as svm

def load_score_data():
    xlsx_file = 'db_score_3_labels.xlsx'
    db_score = pd.read_excel(xlsx_file)
    
    conn = pymysql.connect(host='localhost', user='root', password='gusaud123', db='university')
    curs = conn.cursor(pymysql.cursors.DictCursor)
    
    drop_sql = """drop table if exists db_score"""
    curs.execute(drop_sql)
    conn.commit()
    
    import sqlalchemy
    
    database_username = 'root'
    database_password = 'gusaud123'
    database_ip = 'localhost'
    database_name = 'university'
    database_connection = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.
                                                    format(database_username, database_password,
                                                          database_ip, database_name))
    db_score.to_sql(con=database_connection, name='db_score', if_exists='replace')

# mysql 테이블 구축
#load_score_data()


In [12]:
def classification_performance_eval(y, y_predict):
    tp, tn, fp, fn = 0,0,0,0
    for y, yp in zip(y, y_predict):
        if y == 1 and yp == 1:
            tp += 1
        elif y == 1 and yp == -1:
            fn += 1 
        elif y == -1 and yp == 1:
            fp += 1
        else:
            tn += 1

    if (tp == 0):
        tp = 1
        
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1_score = 2*precision*recall/(precision+recall)
    
    
    return accuracy, precision, recall, f1_score


def Svm_Performance_train_test_split(X,y):
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state=42)
    
    svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
    svm_model.fit(X_train, y_train)
    y_predict = svm_model.predict(X_test) # 테스트
    
    accuracy, precision, recall, f1_score = classification_performance_eval(y_test, y_predict)
    
    print("Evaluation for SVM with test_split")    
    print("accuracy=%f" %accuracy)
    print("precision=%f" %precision)
    print("recall=%f" %recall)
    print("f1_score=%f" %f1_score)
    print('\n')
    

def LogisticRegression_Performance_train_test_split(X,y):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, shuffle = True, random_state=42)

    
    # logistic 모델 적용
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    
    y_predict = log_reg.predict(X_test)

    accuracy, precision, recall, f1_score = classification_performance_eval(y_test, y_predict)
    
    print("Evaluation for LogisticRegression with test_split")    
    print("accuracy=%f" %accuracy)
    print("precision=%f" %precision)
    print("recall=%f" %recall)
    print("f1_score=%f" %f1_score)
    print('\n')
    

def SVM_performance_k_fold_cross_validation(X, y):
    from sklearn.model_selection import KFold
    kf = KFold (n_splits=5, random_state=42, shuffle=True)
    
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    
    accuracy = []
    precision = []
    recall = []
    f1_score = []
    
    
    for train_index, test_index in kf.split(X): 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
            
        svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
        svm_model.fit(X_train, y_train)
        y_predict = svm_model.predict(X_test) # 테스트
       
    
        acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)    
        accuracy.append(acc)
        precision.append(prec)
        recall.append(rec)
        f1_score.append(f1)
    
    import statistics
    print("Evaluation for SVM with k_fold")
    print("average_accuracy =", statistics.mean(accuracy))
    print("average_precision =", statistics.mean(precision))
    print("average_recall =", statistics.mean(recall))
    print("average_f1_score =", statistics.mean(f1_score))
    print('\n')

def LogisticRegression_performance_k_fold_cross_validation(X, y):
    from sklearn.model_selection import KFold
    kf = KFold (n_splits=5, random_state=42, shuffle=True)
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    
    accuracy = []
    precision = []
    recall = []
    f1_score = []
    
    
    for train_index, test_index in kf.split(X): 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # logistic 모델 적용
        log_reg = LogisticRegression()
        log_reg.fit(X_train, y_train)

        y_predict = log_reg.predict(X_test)
        acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)    
        accuracy.append(acc)
        precision.append(prec)
        recall.append(rec)
        f1_score.append(f1)
    
    import statistics
    
    print("Evaluation for LogisticRegression with k_fold")
    print("average_accuracy =", statistics.mean(accuracy))
    print("average_precision =", statistics.mean(precision))
    print("average_recall =", statistics.mean(recall))
    print("average_f1_score =", statistics.mean(f1_score))
    print('\n')


# MySql에서 db_score데이터 불러오기
conn = pymysql.connect(host='localhost', user='root', password='gusaud123', db='university')
curs = conn.cursor(pymysql.cursors.DictCursor)

sql = "select * from db_score"
curs.execute(sql)
data = curs.fetchall()

curs.close()
conn.close()

X = [ (t['homework'], t['discussion'], t['final'] )  for t in data ]
X = np.array(X)


y = [ 1 if (t['grade'] == 'B') else -1 for t in data ]
y = np.array(y)


Svm_Performance_train_test_split(X,y)
LogisticRegression_Performance_train_test_split(X,y)
SVM_performance_k_fold_cross_validation(X, y)
LogisticRegression_performance_k_fold_cross_validation(X, y)

Evaluation for SVM with test_split
accuracy=0.709677
precision=0.500000
recall=0.555556
f1_score=0.526316


Evaluation for LogisticRegression with test_split
accuracy=0.718750
precision=1.000000
recall=0.100000
f1_score=0.181818


Evaluation for SVM with k_fold
average_accuracy = 0.683625730994152
average_precision = 0.6088888888888888
average_recall = 0.46095238095238095
average_f1_score = 0.44948384948384945


Evaluation for LogisticRegression with k_fold
average_accuracy = 0.6594736842105263
average_precision = 0.8666666666666667
average_recall = 0.15508658008658008
average_f1_score = 0.26015873015873014




In [7]:
def Multiclassification_performanceA_eval(y, y_predict):c
    a= y
    b= y
    c=y
    A = y_predict
    B = y_predict
    C = y_predict
    
    tpA, tnA, fpA, fnA = 0,0,0,0
    
    tpB, tnB, fpB, fnB = 0,0,0,0
    
    tpC, tnC, fpC, fnC = 0,0,0,0
    
    for y, yp in zip(a, A):
        
        if y == 1 and yp == 1:
            tpA += 1
        elif y == 1 and (yp == 2 or yp == 3):
            fnA += 1 
        elif (y == 2 or y == 3)and yp == 1 :
            fpA += 1
        elif (y == 2 or y == 3) and (yp == 2 or yp ==3):
            tnA += 1
    
    
    for y, yp in zip(b, B):
        
        if (y == 2 and yp == 2):
            tpB += 1
        elif y == 2 and (yp == 1 or yp == 3):
            fnB += 1 
        elif (y == 1 or y == 3)and yp == 2 :
            fpB += 1
        elif (y == 1 or y == 3) and (yp == 1 or yp ==3):
            tnB += 1

    for y, yp in zip(c, C):
        if (y == 3 and yp == 3):
            tpC += 1
        elif y == 3  and (yp == 1 or yp == 2):
            fnC += 1 
        elif (y == 1 or y == 2)and yp == 3 :
            fpC += 1
        elif (y == 1 or y == 2) and (yp == 1 or yp ==2):
            tnC += 1                
            
    if (tpA == 0):
        tpA = 1
    if (tpB == 0):
        tpB = 1
    if (tpC == 0):
        tpC = 1
        
    accuracy = (tpA + tpB  +tpC )/(tpA+tnA+fpA+fnA)
    
    precisionA = tpA/(tpA+fpA)
    recallA = tpA/(tpA+fnA)
    f1_scoreA = 2*precisionA*recallA/(precisionA+recallA)
    
    precisionB = tpB/(tpB+fpB)
    recallB = tpB/(tpB+fnB)
    f1_scoreB = 2*precisionB*recallB/(precisionB+recallB)
    
    precisionC = tpC/(tpC+fpC)
    recallC = tpC/(tpC+fnC)
    f1_scoreC = 2*precisionC*recallC/(precisionC+recallC)
    
    return accuracy, precisionA, recallA, f1_scoreA, precisionB, recallB, f1_scoreB, precisionC, recallC, f1_scoreC


def Svm_Performance_train_test_split(X,y):
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state=42)

    #데이터 스케일링
    sc = StandardScaler()
    sc.fit(X_train)

    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    
    svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
    svm_model.fit(X_train_std, y_train) # SVM 분류 모델 훈련
    
    y_predict = svm_model.predict(X_test_std) # 테스트
    
    
    accuracy, precisionA, recallA, f1_scoreA, precisionB, recallB, f1_scoreB, precisionC, recallC, f1_scoreC = Multiclassification_performanceA_eval(y_test, y_predict)
    
    
    print("Evaluation for SVM with test_split")
    print("accuracy=%f" %accuracy)
    print("precision(A)=%f" %precisionA)
    print("recall(A)=%f" %recallA)
    print("f1_score(A)=%f" %f1_scoreA)
    print('\n')
    print("precision(B)=%f" %precisionB)
    print("recall(B)=%f" %recallB)
    print("f1_score(B)=%f" %f1_scoreB)
    print('\n')
    print("precision(C)=%f" %precisionC)
    print("recall(C)=%f" %recallC)
    print("f1_score(C)=%f" %f1_scoreC)
    print('\n')

def LogisticRegression_Performance_train_test_split(X,y):
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, shuffle = True, random_state=42)

    
    #데이터 스케일링
    sc = StandardScaler()
    sc.fit(X_train)

    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    
    log_reg = LogisticRegression()
    log_reg.fit(X_train_std, y_train)
    
    y_predict = log_reg.predict(X_test_std)

    accuracy, precisionA, recallA, f1_scoreA, precisionB, recallB, f1_scoreB, precisionC, recallC, f1_scoreC = Multiclassification_performanceA_eval(y_test, y_predict)
    
    
    print("Evaluation for LogisticRegression with test_split")
    print("accuracy=%f" %accuracy)
    print("precision(A)=%f" %precisionA)
    print("recall(A)=%f" %recallA)
    print("f1_score(A)=%f" %f1_scoreA)
    print('\n')
    print("precision(B)=%f" %precisionB)
    print("recall(B)=%f" %recallB)
    print("f1_score(B)=%f" %f1_scoreB)
    print('\n')
    print("precision(C)=%f" %precisionC)
    print("recall(C)=%f" %recallC)
    print("f1_score(C)=%f" %f1_scoreC)
    print('\n')

    


def SVM_performance_k_fold_cross_validation(X, y):
    from sklearn.model_selection import KFold
    kf = KFold (n_splits=5, random_state=42, shuffle=True)
    
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    
    accuracy = []
    precisionA = []
    recallA = []
    f1_scoreA = []
    
    precisionB = []
    recallB = []
    f1_scoreB = []
    
    precisionC = []
    recallC = []
    f1_scoreC = []
    
    
    for train_index, test_index in kf.split(X): 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        #데이터 스케일링
        sc = StandardScaler()
        sc.fit(X_train)
    
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        
        #svm모델 적용
        svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
        svm_model.fit(X_train_std, y_train) # SVM 분류 모델 훈련

        y_predict = svm_model.predict(X_test_std) # 테스트        
    
        acc, precA, recA, f1A, precB, recB, f1B, precC, recC, f1C = Multiclassification_performanceA_eval(y_test, y_predict)
        
        accuracy.append(acc)
        precisionA.append(precA)
        recallA.append(recA)
        f1_scoreA.append(f1A)
        
        precisionB.append(precB)
        recallB.append(recB)
        f1_scoreB.append(f1B)
        
        precisionC.append(precC)
        recallC.append(recC)
        f1_scoreC.append(f1C)       
    
    import statistics
    print("Evaluation for SVM with k_fold")
    print("average_accuracy =", statistics.mean(accuracy))
    print("average_precision(A) =", statistics.mean(precisionA))
    print("average_recall(A) =", statistics.mean(recallA))
    print("average_f1_score(A) =", statistics.mean(f1_scoreA))
    print('\n')    
    print("average_precision(B) =", statistics.mean(precisionB))
    print("average_recall(B) =", statistics.mean(recallB))
    print("average_f1_score(B) =", statistics.mean(f1_scoreB))
    print('\n')
    print("average_precision(C) =", statistics.mean(precisionC))
    print("average_recall(C) =", statistics.mean(recallC))
    print("average_f1_score(C) =", statistics.mean(f1_scoreC))    
    print('\n')

def LogisticRegression_performance_k_fold_cross_validation(X, y):
    from sklearn.model_selection import KFold
    kf = KFold (n_splits=5, random_state=42, shuffle=True)
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    
    accuracy = []
    precisionA = []
    recallA = []
    f1_scoreA = []
    
    precisionB = []
    recallB = []
    f1_scoreB = []
    
    precisionC = []
    recallC = []
    f1_scoreC = []
    
    
    for train_index, test_index in kf.split(X): 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # 데이터 스케일링
        sc = StandardScaler()
        sc.fit(X_train)

        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)

        # logistic 모델 적용
        log_reg = LogisticRegression()
        log_reg.fit(X_train_std, y_train)

        y_predict = log_reg.predict(X_test_std)
        acc, precA, recA, f1A, precB, recB, f1B, precC, recC, f1C = Multiclassification_performanceA_eval(y_test, y_predict)
        
        accuracy.append(acc)
        precisionA.append(precA)
        recallA.append(recA)
        f1_scoreA.append(f1A)
        
        precisionB.append(precB)
        recallB.append(recB)
        f1_scoreB.append(f1B)
        
        precisionC.append(precC)
        recallC.append(recC)
        f1_scoreC.append(f1C)     
    
    import statistics
    
    print("Evaluation for LogisticRegression with k_fold")
    print("average_accuracy =", statistics.mean(accuracy))
    print("average_precision(A) =", statistics.mean(precisionA))
    print("average_recall(A) =", statistics.mean(recallA))
    print("average_f1_score(A) =", statistics.mean(f1_scoreA))
    print('\n')    
    print("average_precision(B) =", statistics.mean(precisionB))
    print("average_recall(B) =", statistics.mean(recallB))
    print("average_f1_score(B) =", statistics.mean(f1_scoreB))
    print('\n')
    print("average_precision(C) =", statistics.mean(precisionC))
    print("average_recall(C) =", statistics.mean(recallC))
    print("average_f1_score(C) =", statistics.mean(f1_scoreC))    
    print('\n')

# MySql에서 db_score데이터 불러오기
conn = pymysql.connect(host='localhost', user='root', password='gusaud123', db='university')
curs = conn.cursor(pymysql.cursors.DictCursor)

sql = "select * from db_score"
curs.execute(sql)
data = curs.fetchall()

curs.close()
conn.close()

X = [ (t['homework'], t['discussion'], t['final'] )  for t in data ]
X = np.array(X)

y = [ (t['grade']) for t in data ]
y = np.array(y)

for i in range (0,len(y)):
    if(y[i] == 'A'):
        y[i] = 1
    elif(y[i] == 'B'):
        y[i] = 2 
    else:
        y[i] = 3
     
y = np.array(y, dtype = np.int64)

Svm_Performance_train_test_split(X,y)
LogisticRegression_Performance_train_test_split(X,y)
SVM_performance_k_fold_cross_validation(X, y)
LogisticRegression_performance_k_fold_cross_validation(X, y)

Evaluation for SVM with test_split
accuracy=0.612903
precision(A)=0.687500
recall(A)=1.000000
f1_score(A)=0.814815


precision(B)=0.454545
recall(B)=0.555556
f1_score(B)=0.500000


precision(C)=0.750000
recall(C)=0.272727
f1_score(C)=0.400000


Evaluation for LogisticRegression with test_split
accuracy=0.709677
precision(A)=0.733333
recall(A)=1.000000
f1_score(A)=0.846154


precision(B)=0.545455
recall(B)=0.666667
f1_score(B)=0.600000


precision(C)=1.000000
recall(C)=0.454545
f1_score(C)=0.625000


Evaluation for SVM with k_fold
average_accuracy = 0.6374269005847953
average_precision(A) = 0.7361111111111112
average_recall(A) = 0.8292857142857143
average_f1_score(A) = 0.7080175706646294


average_precision(B) = 0.4676190476190476
average_recall(B) = 0.560952380952381
average_f1_score(B) = 0.5017366946778712


average_precision(C) = 0.7542857142857143
average_recall(C) = 0.685
average_f1_score(C) = 0.6446153846153846


Evaluation for LogisticRegression with k_fold
average_accuracy = 0.6