In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio

# (함수) DataFrame으로 불러오는 함수

In [2]:
def openDataFrame(path):
    return pd.read_csv(path, on_bad_lines='skip', encoding = "ISO-8859-1")

# 3.26 Wine Quality Prediction Analysis - Classification

In [3]:
path_wine = './winequality.csv'

In [4]:
df_wine = openDataFrame(path_wine)
df_wine

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 범주형 특성 처리

In [5]:
df_wine['type'] = (df_wine['type']=='white').astype(int)
df_wine

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,1,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# quality 조정 (4-, 8+)

In [6]:
df_wine['quality'] = df_wine['quality'].replace({3:4, 9:8})
df_wine

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,1,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# k-NN/median 이용한 결측치 채우기

In [7]:
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
import numpy as np

X = df_wine

# KNNImputer 객체 생성
imputer_KNN = KNNImputer(n_neighbors=2, weights="uniform")
imputer_simple = SimpleImputer(strategy="median")

# 결측치 대체
X_imputed_KNN = imputer_KNN.fit_transform(X)
X_imputed_simple = imputer_simple.fit_transform(X)

In [8]:
df_imputed_KNN = pd.DataFrame(X_imputed_KNN, columns=df_wine.columns, index=df_wine.index)
df_imputed_KNN

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1.0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450,8.8,6.0
1,1.0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490,9.5,6.0
2,1.0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440,10.1,6.0
3,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400,9.9,6.0
4,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0.0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580,10.5,5.0
6493,0.0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.845,11.2,6.0
6494,0.0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750,11.0,6.0
6495,0.0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710,10.2,5.0


In [9]:
df_imputed_simple = pd.DataFrame(X_imputed_simple, columns=df_wine.columns, index=df_wine.index)
df_imputed_simple

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1.0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
1,1.0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6.0
2,1.0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6.0
3,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
4,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0.0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5.0
6493,0.0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.51,11.2,6.0
6494,0.0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6.0
6495,0.0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5.0


In [10]:
df_imputed_simple.copy().drop(columns=['quality'])

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,1.0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,1.0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,1.0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0.0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
6493,0.0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.51,11.2
6494,0.0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
6495,0.0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [11]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced

# 1. DATA

## (함수) Dataframe split

In [12]:
def splitDataframe(dataframe, randomState=100):
    copy_df=dataframe.copy()
    # 데이터셋을 트레이닝 셋과 임시 셋으로 60:40 비율로 나눔
    X = copy_df.drop(columns=['quality'])
    y = copy_df['quality']
    X_train, X_tmp, y_train, y_tmp = train_test_split(X, y)
    
    # oversampling(ros), undersampling(rus)
    ros = RandomOverSampler(random_state=randomState)
    X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler(random_state=randomState)
    X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
    # 임시 셋을 밸리데이션 셋과 테스트 셋으로 50:50 비율로 나눔
    X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=100)
    myDict = {
        'X_train_ros': X_train_ros,
        'y_train_ros': y_train_ros,
        'X_train_rus': X_train_rus,
        'y_train_rus': y_train_rus,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test
    }
    return myDict    
    

In [13]:
df_split = splitDataframe(df_imputed_simple)

In [14]:
df_split['X_train_ros']

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,7.4,0.190,0.30,12.80,0.053,48.5,229.0,0.99860,3.14,0.49,9.1
1,1.0,6.6,0.230,0.26,1.30,0.045,16.0,128.0,0.99340,3.36,0.60,10.0
2,1.0,5.9,0.250,0.27,1.50,0.029,37.0,81.0,0.98920,3.20,0.46,12.2
3,1.0,6.5,0.220,0.19,4.50,0.096,16.0,115.0,0.99370,3.02,0.44,9.6
4,1.0,6.2,0.270,0.32,8.80,0.047,65.0,224.0,0.99610,3.17,0.47,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...
10755,1.0,7.5,0.420,0.34,4.30,0.040,34.0,108.0,0.99155,3.14,0.45,12.8
10756,1.0,5.8,0.315,0.27,1.55,0.026,15.0,70.0,0.98994,3.37,0.40,11.9
10757,1.0,5.2,0.300,0.34,1.50,0.038,18.0,96.0,0.98942,3.56,0.48,13.0
10758,1.0,6.8,0.280,0.43,7.60,0.030,30.0,110.0,0.99164,3.08,0.59,12.5


In [15]:
X_train_ros = df_split['X_train_ros']

In [16]:
y_train_ros = df_split['y_train_ros']

In [17]:
X_val = df_split['X_val']
y_val = df_split['y_val']

## importing modules

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.decomposition import KernelPCA


# 2. Cross Validation & Grid Search

### 파라미터:
    - C : SVM에서 규제 매개변수로 작을수록 규제가 크고, 클수록 규제가 작아집니다. GridSearchCV를 사용하여 최적의 규제 매개변수를 찾을 수 있습니다.
    - kernel : SVM에서 커널 함수를 지정합니다. 일반적으로 'linear', 'poly', 'rbf', 'sigmoid' 등의 값을 사용합니다. GridSearchCV를 사용하여 최적의 커널 함수를 찾을 수 있습니다.
    - gamma : SVM에서 RBF 커널을 사용할 때 사용하는 매개변수로 값이 작을수록 결정 경계가 부드러워지고, 클수록 결정 경계가 불규칙해집니다. GridSearchCV를 사용하여 최적의 gamma 값을 찾을 수 있습니다.

### 스코어링 옵션:

    - accuracy : 분류 문제에서 가장 일반적으로 사용되는 성능 지표입니다.
    - precision : 양성 클래스로 예측한 샘플 중에서 실제로 양성 클래스인 샘플의 비율입니다.
    - recall : 실제 양성 클래스인 샘플 중에서 양성 클래스로 예측한 샘플의 비율입니다.
    - f1-score : precision과 recall의 조화평균입니다.

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

### (함수) Data Split해서 적절하게 입력 형태로 주는 함수

In [20]:
def getSplitDataBundle(dataframe, randomState=100):
    copy_df=dataframe.copy()
    # 데이터셋을 트레이닝 셋과 임시 셋으로 60:40 비율로 나눔
    X = copy_df.drop(columns=['quality'])
    y = copy_df['quality']
    X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.4, random_state=100)
    
    # oversampling(ros), undersampling(rus)
    ros = RandomOverSampler(random_state=randomState)
    X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler(random_state=randomState)
    X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
    # 임시 셋을 밸리데이션 셋과 테스트 셋으로 50:50 비율로 나눔
    X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=100)
    
    myDict = {
        'over': {
            'X': X_train_ros,
            'y': y_train_ros,            
        },
        'under': {
            'X': X_train_rus,
            'y': y_train_rus,
        },
        'validation': {
            'X': X_val,
            'y': y_val,                        
        },
        'test': {
            'X': X_test,
            'y': y_test
            
        },
        'input_over': [X_train_ros, y_train_ros, X_val, y_val],
        'input_under': [X_train_rus, y_train_rus, X_val, y_val],
    }
    return myDict    


In [21]:
dataBundle = getSplitDataBundle(df_imputed_simple)

In [83]:
def showValidationResult(df_y_val, df_y_pred):
    delta = list(df_y_val - df_y_pred)
    correct = (delta == 0) 
    fig = px.scatter(pd.DataFrame({'delta': delta}), y="delta", width=800, height=400)
    fig.show()
    print('정답률:', round(delta.count(0)/len(delta), 2))

In [23]:
def eval_model(df_y_true, df_y_pred):
    y_true = list(df_y_true)
    y_pred = list(df_y_pred)

    # Confusion Matrix
    confusion_mat = confusion_matrix(y_true, y_pred, labels=[4, 5, 6, 7, 8])
    print(confusion_mat)
    
    # Total Accuracy
    accuracy = accuracy_score(y_true, y_pred) # 전체 정확도
    print(accuracy)
    
    # Class Accuracy & Macro / Weighted Accuracy
    class_names = [4, 5, 6, 7, 8] # 클래스 이름
    macro_accuracy = 0
    weighted_accuracy = 0

    for class_name in class_names:
        indices = [i for i, x in enumerate(y_true) if x == class_name] # 해당 클래스의 샘플 인덱스
        y_true_class = [y_true[i] for i in indices]
        y_pred_class = [y_pred[i] for i in indices]
        accuracy_class = accuracy_score(y_true_class, y_pred_class) # 해당 클래스의 정확도
        precision_class = precision_score(y_true, y_pred, labels=[class_name], average='micro')
        recall_class = recall_score(y_true, y_pred, labels=[class_name], average='micro')
        f1_class = f1_score(y_true, y_pred, labels=[class_name], average='micro')
        print("Class", class_name)
        print(class_name, "class Accuracy: ", accuracy_class)
        print(class_name, "class Precision: ", precision_class)
        print(class_name, "class Recall: ", recall_class)
        print(class_name, "class f1: ", f1_class)
        macro_accuracy += accuracy_class
        weighted_accuracy += accuracy_class * len(indices)

    macro_accuracy /= len(class_names) # 매크로 평균
    weighted_accuracy /= len(y_true) # 가중 평균
    print("Macro-Average Accuracy:", macro_accuracy)
    print("Weighted-Average Accuracy:", weighted_accuracy)

In [24]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # 입력 데이터에 대한 표준화 작업
    ('svm', SVC())  # SVM 모델
])
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('svm', SVC())],
 'verbose': False,
 'scaler': StandardScaler(),
 'svm': SVC(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'svm__C': 1.0,
 'svm__break_ties': False,
 'svm__cache_size': 200,
 'svm__class_weight': None,
 'svm__coef0': 0.0,
 'svm__decision_function_shape': 'ovr',
 'svm__degree': 3,
 'svm__gamma': 'scale',
 'svm__kernel': 'rbf',
 'svm__max_iter': -1,
 'svm__probability': False,
 'svm__random_state': None,
 'svm__shrinking': True,
 'svm__tol': 0.001,
 'svm__verbose': False}

In [44]:
def SVC_gridSearchCV_scoring(listXYXY, listC=[0.1, 10], listKernel=['poly', 'rbf'], listGamma=[0.1, 10], scoring='accuracy'):
    # data 선언
    X_train = listXYXY[0]
    y_train = listXYXY[1]
    X_val = listXYXY[2]
    y_val = listXYXY[3]
    
    # 스케일러 모델 파이프라인
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # 입력 데이터에 대한 표준화 작업
        ('svm', SVC())  # SVM 모델
    ])

    # 탐색할 하이퍼파라미터를 지정합니다.
    param_grid = {'svm__C': listC,
                  'svm__kernel': listKernel,
                  'svm__gamma': listGamma}

    # GridSearchCV를 실행합니다.
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scoring, cv=5)
    grid_search.fit(X_train, y_train)

    # 최적의 파라미터와 스코어링 점수를 출력합니다.
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 최적의 모델을 사용하여 예측합니다.
    y_pred = grid_search.predict(X_val)
    
    # 분류 성능 지표를 출력합니다.
#     print("Accuracy score: ", accuracy_score(y_val, y_pred))
#     print("Precision score: ", precision_score(y_val, y_pred))
#     print("Recall score: ", recall_score(y_val, y_pred))
#     print("F1 score: ", f1_score(y_val, y_pred))

    return pd.DataFrame({'y_true': y_val, 'y_pred': y_pred})

In [45]:
df_result = SVC_gridSearchCV_scoring(dataBundle['input_over'], [0.1, 1, 10], ['rbf'], [0.1, 1, 10], scoring='accuracy')
df_result

Best parameters:  {'svm__C': 1, 'svm__gamma': 10, 'svm__kernel': 'rbf'}
Best score:  0.8921441228588304


Unnamed: 0,y_true,y_pred
960,6.0,6.0
3668,5.0,6.0
1804,6.0,6.0
4723,6.0,6.0
4206,6.0,6.0
...,...,...
4823,7.0,6.0
875,7.0,6.0
3354,7.0,6.0
3220,5.0,6.0


In [46]:
# SVC_gridSearchCV_scoring(dataBundle['input_over'], [0.1, 1, 10], ['poly'], [0.1, 1, 10], scoring='accuracy')

In [47]:
y_true = df_result['y_true']
y_pred = df_result['y_pred']

In [48]:
y_pred.shape

(1299,)

In [63]:
showValidationResult(y_true, y_pred)

정답률: 0.6


In [50]:
eval_model(y_true, y_pred)

[[  3   0  45   0   0]
 [  0 123 292   0   0]
 [  0   7 577   0   0]
 [  0   0 152  62   0]
 [  0   0  27   0  11]]
0.5973826020015397
Class 4
4 class Accuracy:  0.0625
4 class Precision:  1.0
4 class Recall:  0.0625
4 class f1:  0.11764705882352941
Class 5
5 class Accuracy:  0.2963855421686747
5 class Precision:  0.9461538461538461
5 class Recall:  0.2963855421686747
5 class f1:  0.4513761467889908
Class 6
6 class Accuracy:  0.988013698630137
6 class Precision:  0.5279048490393413
6 class Recall:  0.988013698630137
6 class f1:  0.6881335718545021
Class 7
7 class Accuracy:  0.2897196261682243
7 class Precision:  1.0
7 class Recall:  0.2897196261682243
7 class f1:  0.4492753623188406
Class 8
8 class Accuracy:  0.2894736842105263
8 class Precision:  1.0
8 class Recall:  0.2894736842105263
8 class f1:  0.4489795918367347
Macro-Average Accuracy: 0.38521851023551246
Weighted-Average Accuracy: 0.5973826020015397


In [51]:
df_wine.shape

(6497, 13)

In [52]:
df_wine.shape[0]*0.6

3898.2

In [53]:
df_wine.shape[0]*0.2

1299.4

In [54]:
y_pred.shape[0]

1299

In [77]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

def KPCA_SVC_gridSearchCV_scoring(listXYXY, paramsGS, scoring='accuracy'):
    # data 선언
    X_train = listXYXY[0]
    y_train = listXYXY[1]
    X_val = listXYXY[2]
    y_val = listXYXY[3]
    
    Cs = paramsGS['svm__C']
    Kers = paramsGS['svm__kernel']
    gamms = paramsGS['svm__gamma']
    kpcaComps = paramsGS['kpca__n_components']
    kpcaGamms = paramsGS['kpca__gamma']
    kpcaKers = paramsGS['kpca__kernel']
    
    # 스케일러 모델 파이프라인
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # 입력 데이터에 대한 표준화 작업
        ("kpca", KernelPCA()),
        ('svm', SVC())  # SVM 모델
    ])

    # 탐색할 하이퍼파라미터를 지정합니다.
    param_grid = {'svm__C': Cs,
                  'svm__kernel': Kers,
                  'svm__gamma': gamms, 
                  'kpca__n_components': kpcaComps,
                  'kpca__gamma': kpcaGamms,
                  'kpca__kernel': kpcaKers,
                 }

    # GridSearchCV를 실행합니다.
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scoring, cv=5)
    grid_search.fit(X_train, y_train)

    # 최적의 파라미터와 스코어링 점수를 출력합니다.
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 최적의 모델을 사용하여 예측합니다.
    y_pred = grid_search.predict(X_val)

    return pd.DataFrame({'y_true': y_val, 'y_pred': y_pred})

In [69]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # 입력 데이터에 대한 표준화 작업
    ("kpca", KernelPCA()),
    ('svm', SVC())  # SVM 모델
])
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('kpca', KernelPCA()),
  ('svm', SVC())],
 'verbose': False,
 'scaler': StandardScaler(),
 'kpca': KernelPCA(),
 'svm': SVC(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'kpca__alpha': 1.0,
 'kpca__coef0': 1,
 'kpca__copy_X': True,
 'kpca__degree': 3,
 'kpca__eigen_solver': 'auto',
 'kpca__fit_inverse_transform': False,
 'kpca__gamma': None,
 'kpca__iterated_power': 'auto',
 'kpca__kernel': 'linear',
 'kpca__kernel_params': None,
 'kpca__max_iter': None,
 'kpca__n_components': None,
 'kpca__n_jobs': None,
 'kpca__random_state': None,
 'kpca__remove_zero_eig': False,
 'kpca__tol': 0,
 'svm__C': 1.0,
 'svm__break_ties': False,
 'svm__cache_size': 200,
 'svm__class_weight': None,
 'svm__coef0': 0.0,
 'svm__decision_function_shape': 'ovr',
 'svm__degree': 3,
 'svm__gamma': 'scale',
 'svm__kernel': 'rbf',
 'svm__max_iter': -1,
 'svm__probability': False,
 'svm__random_state': None,
 'svm__shrinki

In [82]:
myParamsGS = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 10], 
    'kpca__n_components': [9, 12],
    'kpca__gamma': [1, 10],
    'kpca__kernel': ['rbf'],
}

In [78]:
df_kpca_result = KPCA_SVC_gridSearchCV_scoring(dataBundle['input_over'], myParamsGS, scoring='accuracy')
df_kpca_result

Best parameters:  {'kpca__gamma': 0.01, 'kpca__kernel': 'rbf', 'kpca__n_components': 9, 'svm__C': 10, 'svm__gamma': 10, 'svm__kernel': 'rbf'}
Best score:  0.7754282339043118


Unnamed: 0,y_true,y_pred
960,6.0,7.0
3668,5.0,6.0
1804,6.0,5.0
4723,6.0,5.0
4206,6.0,5.0
...,...,...
4823,7.0,8.0
875,7.0,6.0
3354,7.0,7.0
3220,5.0,6.0


In [79]:
showValidationResult(df_kpca_result['y_true'], df_kpca_result['y_pred'])

정답률: 0.5


In [80]:
eval_model(df_kpca_result['y_true'], df_kpca_result['y_pred'])

[[ 16  21   6   4   1]
 [ 40 272  67  32   4]
 [ 25 158 223 145  33]
 [  5  10  49 126  24]
 [  3   0   5  17  13]]
0.5003849114703618
Class 4
4 class Accuracy:  0.3333333333333333
4 class Precision:  0.1797752808988764
4 class Recall:  0.3333333333333333
4 class f1:  0.23357664233576644
Class 5
5 class Accuracy:  0.655421686746988
5 class Precision:  0.5900216919739696
5 class Recall:  0.655421686746988
5 class f1:  0.6210045662100456
Class 6
6 class Accuracy:  0.3818493150684932
6 class Precision:  0.6371428571428571
6 class Recall:  0.3818493150684932
6 class f1:  0.47751605995717344
Class 7
7 class Accuracy:  0.5887850467289719
7 class Precision:  0.3888888888888889
7 class Recall:  0.5887850467289719
7 class f1:  0.46840148698884765
Class 8
8 class Accuracy:  0.34210526315789475
8 class Precision:  0.17333333333333334
8 class Recall:  0.34210526315789475
8 class f1:  0.2300884955752213
Macro-Average Accuracy: 0.46029892900713626
Weighted-Average Accuracy: 0.5003849114703618


0       6
1       6
2       6
3       6
4       6
       ..
6492    5
6493    6
6494    6
6495    5
6496    6
Name: quality, Length: 6497, dtype: int64