# (함수) DataFrame으로 불러오는 함수

In [1]:
import pandas as pd

In [2]:
def openDataFrame(path):
    return pd.read_csv(path, on_bad_lines='skip', encoding = "ISO-8859-1")

# 0. 원본 데이터 :  Wine Quality Prediction Analysis - Classification

In [3]:
path_wine = './winequality.csv'

In [4]:
df_wine = openDataFrame(path_wine)
df_wine

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 1. 전처리기(나만의 변환기들)

In [5]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

## 1.1 변환기: Type 이진화

In [6]:
class TypeBinaryConverter(BaseEstimator, TransformerMixin):
    def __init__(self, name='type'):
        self.name = name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.name] = (X[self.name] == 'white').astype(int)
        return X

## 1.2 변환기: quality 3->4, 9->8 변환

In [7]:
class MergeQuality(BaseEstimator, TransformerMixin):
    def __init__(self, name='quality'):
        self.name = name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.name] = X[self.name].replace({3:4, 9:8})
        return X

## 1.3 변환기: quality 4, ... ,8 이진그룹화 변환

In [8]:
class QualityGroups(BaseEstimator, TransformerMixin):
    def __init__(self, listGroups=['quality_4', 'quality_5', 'quality_6', 'quality_7', 'quality_8']):
        self.listGroups = listGroups
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for i, name in enumerate(self.listGroups):
            X[name] = (X['quality']==i+4).astype(int)
        return X

## 1.4 변환기: 특성들의 극단치 모두 제거

In [9]:
class DropOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, scope=5):
        self.scope = scope
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        myList = []
        for name in X.columns:
            if name not in ['type', 'quality']:
                Q1 = X[[name]].quantile(0.25)
                Q3 = X[[name]].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.scope * IQR
                upper_bound = Q3 + self.scope * IQR

                # 극단치가 있는 열과 행을 찾음
                outlier_cols = np.where((X[[name]] < lower_bound) | (X[[name]] > upper_bound))[1]
                outlier_rows = np.where((X[[name]] < lower_bound) | (X[[name]] > upper_bound))[0]

                # 극단치가 있는 행의 인덱스를 반환
                outlier_indices = pd.Series(outlier_rows).unique()    
                myList += list(outlier_indices)
        newDf = X.copy().drop(myList, axis=0)
        return newDf

## 1.5 변환기: log_scaler

In [10]:
import numpy as np

class LogScaler(BaseEstimator, TransformerMixin):
    def __init__(self, listNames=['chlorides', 'residual sugar']):
        self.listNames = listNames
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for name in self.listNames:
            X[name] = [np.log(x+1) for x in list(X[name])]
        return X

In [11]:
# logScaler = LogScaler()
# logScaler.transform(df_wine.iloc[:, 1:])

## 1.6 KNN이용한 결측치 채우기

In [12]:
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

## 1.7 DataFrame 포맷으로 되돌리기

In [13]:
class FormatDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, column_names=['type','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','quality_4','quality_5','quality_6','quality_7','quality_8']):
        self.column_names = column_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.column_names)

class FormatDataFrame_test(BaseEstimator, TransformerMixin):
    def __init__(self, column_names=['type','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']):
        self.column_names = column_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.column_names)

# 2. 전처리 파이프라인 구성

## 2.1 Pipeline을 이용한 전처리 작동 코드

In [14]:
from sklearn.pipeline import Pipeline

preproc_pipeline = Pipeline([
    ('binary_type', TypeBinaryConverter()), # type변수를 0,1 binary화
    ('drop_outliers', DropOutliers(scope=5)), # 이상치 제거
    ('merge_quality', MergeQuality()), # quality변수 3->4, 9->8 변환
    ('quality_groups', QualityGroups()), # quality변수 one-hot encode(binary 모델 구축을 위한)
    ('log_scaler', LogScaler()), # log정규화
    ('knn_imputer', KNNImputer(n_neighbors=2, weights="uniform")), # 결측치 KNN 메꾸기
    ('format_dataframe', FormatDataFrame()) # 데이터프레임으로 재정의
])

# 각단계 전처리를 끄고 싶으면(하지 않고 싶으면), 각 라인을 주석처리하면 됨.
# 예를 들어, 두 번째 줄 ('drop_outliers', DropOutliers(scope=5))을 주석처리하면 극단치 제거가 되지 않음.

In [30]:
preproc_pipeline_test = Pipeline([
    ('binary_type', TypeBinaryConverter()), # type변수를 0,1 binary화
    ('drop_outliers', DropOutliers(scope=5)), # 이상치 제거
    ('merge_quality', MergeQuality()), # quality변수 3->4, 9->8 변환
    #('quality_groups', QualityGroups()), # quality변수 one-hot encode(binary 모델 구축을 위한)
    ('log_scaler', LogScaler()), # log정규화
    ('knn_imputer', KNNImputer(n_neighbors=2, weights="uniform")), # 결측치 KNN 메꾸기
    ('format_dataframe', FormatDataFrame_test()) # 데이터프레임으로 재정의
])

df_test = preproc_pipeline_test.fit_transform(df_wine.copy())
df_test['quality'] = df_test['quality'].map({4: 0, 5: 1, 6: 2, 7: 3, 8: 4})

In [16]:
# 다음과 같이 일반적인 estimator처럼 fit_transform() 메소드로 전처리 가능
df_preproc = preproc_pipeline.fit_transform(df_wine.copy())

# 3. 전처리된 데이터프레임으로 작업하기

In [17]:
# 전처리된 데이터프레임 호출
df_preproc

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_4,quality_5,quality_6,quality_7,quality_8
0,1.0,7.0,0.270,0.36,3.077312,0.044017,45.0,170.0,1.00100,3.00,0.45,8.8,6.0,0.0,0.0,1.0,0.0,0.0
1,1.0,6.3,0.300,0.34,0.955511,0.047837,14.0,132.0,0.99400,3.30,0.49,9.5,6.0,0.0,0.0,1.0,0.0,0.0
2,1.0,8.1,0.280,0.40,2.066863,0.048790,30.0,97.0,0.99510,3.26,0.44,10.1,6.0,0.0,0.0,1.0,0.0,0.0
3,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9,6.0,0.0,0.0,1.0,0.0,0.0
4,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9,6.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6414,0.0,6.2,0.600,0.08,1.098612,0.086178,32.0,44.0,0.99490,3.45,0.58,10.5,5.0,0.0,1.0,0.0,0.0,0.0
6415,0.0,5.9,0.550,0.10,1.163151,0.060154,39.0,51.0,0.99512,3.52,0.93,11.2,6.0,0.0,0.0,1.0,0.0,0.0
6416,0.0,6.3,0.510,0.13,1.193922,0.073250,29.0,40.0,0.99574,3.42,0.75,11.0,6.0,0.0,0.0,1.0,0.0,0.0
6417,0.0,5.9,0.645,0.12,1.098612,0.072321,32.0,44.0,0.99547,3.57,0.71,10.2,5.0,0.0,1.0,0.0,0.0,0.0


In [18]:
# 이렇게 하면 X
X = df_preproc.iloc[:, :-6]


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,7.0,0.270,0.36,3.077312,0.044017,45.0,170.0,1.00100,3.00,0.45,8.8
1,1.0,6.3,0.300,0.34,0.955511,0.047837,14.0,132.0,0.99400,3.30,0.49,9.5
2,1.0,8.1,0.280,0.40,2.066863,0.048790,30.0,97.0,0.99510,3.26,0.44,10.1
3,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9
4,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6414,0.0,6.2,0.600,0.08,1.098612,0.086178,32.0,44.0,0.99490,3.45,0.58,10.5
6415,0.0,5.9,0.550,0.10,1.163151,0.060154,39.0,51.0,0.99512,3.52,0.93,11.2
6416,0.0,6.3,0.510,0.13,1.193922,0.073250,29.0,40.0,0.99574,3.42,0.75,11.0
6417,0.0,5.9,0.645,0.12,1.098612,0.072321,32.0,44.0,0.99547,3.57,0.71,10.2


In [19]:
# 이렇게 하면 quality_6의 목적변수
# y_6 = df_preproc[['quality_6']]
for i in range(4, 9):
    column_name = 'quality_' + str(i)
    globals()['y_'+str(i)] = df_preproc[[column_name]]

In [31]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# 데이터셋을 트레이닝 셋과 임시 셋으로 60:40 비율로 나눔
X_train, X_tmp, y_train, y_tmp = train_test_split(df_test.iloc[:, :11], df_test.iloc[:, 12:], test_size=0.4, random_state=100)

# oversampling(ros), undersampling(rus)
ros = SMOTE(k_neighbors=4)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

rus = RandomUnderSampler(random_state=100)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# 임시 셋을 밸리데이션 셋과 테스트 셋으로 50:50 비율로 나눔
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=100)

In [32]:
y_train

Unnamed: 0,quality
3569,3
3822,1
5274,1
3669,2
4555,3
...,...
6026,2
79,1
3927,2
5955,2


In [24]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import make_scorer

In [40]:
    xgb_orign = XGBClassifier()
    xgb_ros = XGBClassifier()
    xgb_rus = XGBClassifier()
# use_label_encoder=False, objective='multi:softprob', num_class=5
    # 탐색할 파라미터 그리드 정의
    param_distributions = {
        'max_depth': [3, 5, 7, 9, 11, 13],
        'learning_rate': np.logspace(-3, 0, 100),
        'n_estimators': [50, 100, 200, 300, 400],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'reg_alpha': np.logspace(-3, 3, 100),
        'reg_lambda': np.logspace(-3, 3, 100)
    }

    # 그리드 서치 객체 생성
    ran_search_orign = RandomizedSearchCV(estimator=xgb_orign, param_distributions=param_distributions, n_iter=100, cv=5, scoring='f1_weighted', n_jobs=-1)
    ran_search_ros = RandomizedSearchCV(estimator=xgb_ros, param_distributions=param_distributions, n_iter=100, cv=5, scoring='f1_weighted', n_jobs=-1)
    ran_search_rus = RandomizedSearchCV(estimator=xgb_rus, param_distributions=param_distributions, n_iter=100, cv=5, scoring='f1_weighted', n_jobs=-1)

    # 그리드 서치 수행
    ran_search_orign.fit(X_train, y_train)
    ran_search_ros.fit(X_train_ros, y_train_ros)
    ran_search_rus.fit(X_train_rus, y_train_rus)

    # 최적 파라미터와 점수 출력
    print('-------------------------------------------------------------------------------------------------------------------')
    print("orign data result: ", ran_search_orign.best_params_)
    print("orign data result: ", ran_search_orign.best_score_)
    print('-------------------------------------------------------------------------------------------------------------------')
    print("ros data result: ", ran_search_ros.best_params_)
    print("ros data result: ", ran_search_ros.best_score_)
    print('-------------------------------------------------------------------------------------------------------------------')
    print("rus data result: ", ran_search_rus.best_params_)
    print("rus data result: ", ran_search_rus.best_score_)
    print('-------------------------------------------------------------------------------------------------------------------')

-------------------------------------------------------------------------------------------------------------------
orign data result:  {'subsample': 0.7, 'reg_lambda': 0.0093260334688322, 'reg_alpha': 0.001, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.021544346900318846, 'gamma': 0.4, 'colsample_bytree': 0.6}
orign data result:  0.6096729576839672
-------------------------------------------------------------------------------------------------------------------
ros data result:  {'subsample': 0.7, 'reg_lambda': 0.012328467394420659, 'reg_alpha': 0.08697490026177834, 'n_estimators': 300, 'max_depth': 11, 'learning_rate': 0.04037017258596556, 'gamma': 0, 'colsample_bytree': 0.9}
ros data result:  0.8231513698226998
-------------------------------------------------------------------------------------------------------------------
rus data result:  {'subsample': 0.9, 'reg_lambda': 30.538555088334185, 'reg_alpha': 0.0023101297000831605, 'n_estimators': 300, 'max_depth': 13, 'le

In [41]:
# -------------------------------------------------------------------------------------------------------------------
# orign data result:  {'subsample': 0.7, 'reg_lambda': 0.0093260334688322, 'reg_alpha': 0.001, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.021544346900318846, 'gamma': 0.4, 'colsample_bytree': 0.6}
# orign data result:  0.6096729576839672
# -------------------------------------------------------------------------------------------------------------------
# ros data result:  {'subsample': 0.7, 'reg_lambda': 0.012328467394420659, 'reg_alpha': 0.08697490026177834, 'n_estimators': 300, 'max_depth': 11, 'learning_rate': 0.04037017258596556, 'gamma': 0, 'colsample_bytree': 0.9}
# ros data result:  0.8231513698226998
# -------------------------------------------------------------------------------------------------------------------

xgb_model_orign = XGBClassifier(subsample=0.7,
    reg_lambda=0.0093260334688322,
    reg_alpha=0.001,
    n_estimators=200,
    max_depth=9,
    learning_rate=0.021544346900318846,
    gamma=0.4,
    colsample_bytree=0.6)
xgb_model_ros = XGBClassifier(subsample=0.7,
    reg_lambda=0.012328467394420659,
    reg_alpha=0.08697490026177834,
    n_estimators=300,
    max_depth=11,
    learning_rate=0.04037017258596556,
    gamma=0,
    colsample_bytree=0.9)
xgb_model_orign.fit(X_train, y_train)
xgb_model_ros.fit(X_train_ros, y_train_ros)

xgb_ros_pred = xgb_model_ros.predict(X_val)
print(xgb_ros_pred)
xgb_orign_pred = xgb_model_orign.predict(X_val)
print(xgb_orign_pred)

[4 1 1 ... 2 2 1]
[2 1 1 ... 2 2 1]


In [42]:
# -------------------------------------------------------------------------------------------------------------------
# rus data result:  {'subsample': 0.9, 'reg_lambda': 30.538555088334185, 'reg_alpha': 0.0023101297000831605, 'n_estimators': 300, 'max_depth': 13, 'learning_rate': 0.30538555088334157, 'gamma': 0, 'colsample_bytree': 0.6}
# rus data result:  0.47339478970718024
# -------------------------------------------------------------------------------------------------------------------

xgb_model_rus = XGBClassifier(subsample=0.9,
    reg_lambda=30.538555088334185,
    reg_alpha=0.0023101297000831605,
    n_estimators=300,
    max_depth=13,
    learning_rate=0.30538555088334157,
    gamma=0,
    colsample_bytree=0.6)
xgb_model_rus.fit(X_train_rus, y_train_rus)
xgb_rus_pred = xgb_model_rus.predict(X_val)
print(xgb_rus_pred)

[3 3 1 ... 3 2 1]


In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, balanced_accuracy_score
import matplotlib.pyplot as plt

def eval_model(y_true, y_pred):
    # Confusion Matrix
    confusion_mat = confusion_matrix(y_true, y_pred, labels=[4,5,6,7,8])
    print(confusion_mat)
    
    # Total Accuracy
    accuracy = accuracy_score(y_true, y_pred) # 전체 정확도
    print(accuracy)
    
    # Class Accuracy & Macro / Weighted Accuracy
    class_names = [4,5,6,7,8] # 클래스 이름
    macro_accuracy = 0
    weighted_accuracy = 0

    for class_name in class_names:
        indices = [i for i, x in enumerate(y_true) if x == class_name] # 해당 클래스의 샘플 인덱스
        y_true_class = [y_true[i] for i in indices]
        y_pred_class = [y_pred[i] for i in indices]
        accuracy_class = accuracy_score(y_true_class, y_pred_class) # 해당 클래스의 정확도
        precision_class = precision_score(y_true, y_pred, labels=[class_name], average='micro')
        recall_class = recall_score(y_true, y_pred, labels=[class_name], average='micro')
        f1_class = f1_score(y_true, y_pred, labels=[class_name], average='micro')
        print("Class", class_name)
        print(class_name, "class Accuracy: ", accuracy_class)
        print(class_name, "class Precision: ", precision_class)
        print(class_name, "class Recall: ", recall_class)
        print(class_name, "class f1: ", f1_class)
        macro_accuracy += accuracy_class
        weighted_accuracy += accuracy_class * len(indices)

    macro_accuracy /= len(class_names) # 매크로 평균
    weighted_accuracy /= len(y_true) # 가중 평균
    print("Macro-Average Accuracy:", macro_accuracy)
    print("Weighted-Average Accuracy:", weighted_accuracy)

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def eval_model_v2(y_true, y_pred):
    class_names = [0, 1, 2, 3, 4]

    # Confusion Matrix
    confusion_mat = confusion_matrix(y_true, y_pred, labels=class_names)
    print("Confusion Matrix:\n", confusion_mat)

    # Total Accuracy
    total_accuracy = accuracy_score(y_true, y_pred)
    print("Total Accuracy:", total_accuracy)

    # Class-wise Metrics
    precision = precision_score(y_true, y_pred, labels=class_names, average=None)
    recall = recall_score(y_true, y_pred, labels=class_names, average=None)
    f1 = f1_score(y_true, y_pred, labels=class_names, average=None)

    for i, class_name in enumerate(class_names):
        print(f"Class {class_name}:")
        print(f"  Accuracy: {confusion_mat[i, i] / sum(confusion_mat[i, :])}")
        print(f"  Precision: {precision[i]}")
        print(f"  Recall: {recall[i]}")
        print(f"  F1-score: {f1[i]}")

    # Macro-Average Metrics
    macro_precision = sum(precision) / len(class_names)
    macro_recall = sum(recall) / len(class_names)
    macro_f1 = sum(f1) / len(class_names)
    print("\nMacro-Average Metrics:")
    print(f"  Macro-Average Precision: {macro_precision}")
    print(f"  Macro-Average Recall: {macro_recall}")
    print(f"  Macro-Average F1-score: {macro_f1}")

    # Weighted-Average Metrics
    weights = [sum(confusion_mat[i, :]) for i in range(len(class_names))]
    total_samples = sum(weights)
    weighted_precision = sum(precision[i] * weights[i] for i in range(len(class_names))) / total_samples
    weighted_recall = sum(recall[i] * weights[i] for i in range(len(class_names))) / total_samples
    weighted_f1 = sum(f1[i] * weights[i] for i in range(len(class_names))) / total_samples
    print("\nWeighted-Average Metrics:")
    print(f"  Weighted-Average Precision: {weighted_precision}")
    print(f"  Weighted-Average Recall: {weighted_recall}")
    print(f"  Weighted-Average F1-score: {weighted_f1}")

In [45]:
eval_model_v2(y_val, xgb_orign_pred)


Confusion Matrix:
 [[  1  31  17   0   0]
 [  2 274 121   2   0]
 [  0  96 439  29   0]
 [  0   9 117 102   0]
 [  0   2  20   8  14]]
Total Accuracy: 0.6464174454828661
Class 0:
  Accuracy: 0.02040816326530612
  Precision: 0.3333333333333333
  Recall: 0.02040816326530612
  F1-score: 0.03846153846153846
Class 1:
  Accuracy: 0.6867167919799498
  Precision: 0.6650485436893204
  Recall: 0.6867167919799498
  F1-score: 0.6757090012330457
Class 2:
  Accuracy: 0.7783687943262412
  Precision: 0.6148459383753502
  Recall: 0.7783687943262412
  F1-score: 0.6870109546165885
Class 3:
  Accuracy: 0.4473684210526316
  Precision: 0.723404255319149
  Recall: 0.4473684210526316
  F1-score: 0.5528455284552847
Class 4:
  Accuracy: 0.3181818181818182
  Precision: 1.0
  Recall: 0.3181818181818182
  F1-score: 0.4827586206896552

Macro-Average Metrics:
  Macro-Average Precision: 0.6673264141434305
  Macro-Average Recall: 0.4502087977611894
  Macro-Average F1-score: 0.4873571286912225

Weighted-Average Metrics

In [47]:
eval_model_v2(y_val, xgb_ros_pred)

Confusion Matrix:
 [[ 15  20  14   0   0]
 [ 17 274 101   7   0]
 [  8 106 389  53   8]
 [  0  13  85 121   9]
 [  1   0  17  10  16]]
Total Accuracy: 0.6347352024922118
Class 0:
  Accuracy: 0.30612244897959184
  Precision: 0.36585365853658536
  Recall: 0.30612244897959184
  F1-score: 0.3333333333333333
Class 1:
  Accuracy: 0.6867167919799498
  Precision: 0.6634382566585957
  Recall: 0.6867167919799498
  F1-score: 0.6748768472906403
Class 2:
  Accuracy: 0.6897163120567376
  Precision: 0.641914191419142
  Recall: 0.6897163120567376
  F1-score: 0.6649572649572649
Class 3:
  Accuracy: 0.5307017543859649
  Precision: 0.6335078534031413
  Recall: 0.5307017543859649
  F1-score: 0.5775656324582339
Class 4:
  Accuracy: 0.36363636363636365
  Precision: 0.48484848484848486
  Recall: 0.36363636363636365
  F1-score: 0.4155844155844156

Macro-Average Metrics:
  Macro-Average Precision: 0.5579124889731898
  Macro-Average Recall: 0.5153787342077216
  Macro-Average F1-score: 0.5332634987247775

Weight

In [46]:
eval_model_v2(y_val, xgb_rus_pred)

Confusion Matrix:
 [[ 29  10   8   2   0]
 [ 99 164  79  35  22]
 [ 69 145 158 101  91]
 [ 12  19  40  83  74]
 [  1   1   5  13  24]]
Total Accuracy: 0.35669781931464173
Class 0:
  Accuracy: 0.5918367346938775
  Precision: 0.1380952380952381
  Recall: 0.5918367346938775
  F1-score: 0.223938223938224
Class 1:
  Accuracy: 0.41102756892230574
  Precision: 0.4837758112094395
  Recall: 0.41102756892230574
  F1-score: 0.4444444444444444
Class 2:
  Accuracy: 0.2801418439716312
  Precision: 0.5448275862068965
  Recall: 0.2801418439716312
  F1-score: 0.37002341920374704
Class 3:
  Accuracy: 0.36403508771929827
  Precision: 0.3547008547008547
  Recall: 0.36403508771929827
  F1-score: 0.3593073593073593
Class 4:
  Accuracy: 0.5454545454545454
  Precision: 0.11374407582938388
  Recall: 0.5454545454545454
  F1-score: 0.18823529411764706

Macro-Average Metrics:
  Macro-Average Precision: 0.3270287132083626
  Macro-Average Recall: 0.4384991561523316
  Macro-Average F1-score: 0.31718974820228435

Wei