# (함수) DataFrame으로 불러오는 함수

In [17]:
import pandas as pd

In [18]:
def openDataFrame(path):
    return pd.read_csv(path, on_bad_lines='skip', encoding = "ISO-8859-1")

# 0. 원본 데이터 :  Wine Quality Prediction Analysis - Classification

In [19]:
path_wine = './winequality.csv'

In [20]:
df_wine = openDataFrame(path_wine)
df_wine

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 1. 전처리기(나만의 변환기들)

In [21]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

## 1.1 변환기: Type 이진화

In [22]:
class TypeBinaryConverter(BaseEstimator, TransformerMixin):
    def __init__(self, name='type'):
        self.name = name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.name] = (X[self.name] == 'white').astype(int)
        return X

## 1.2 변환기: quality 3->4, 9->8 변환

In [23]:
class MergeQuality(BaseEstimator, TransformerMixin):
    def __init__(self, name='quality'):
        self.name = name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.name] = X[self.name].replace({3:4, 9:8})
        return X

## 1.3 변환기: quality 4, ... ,8 이진그룹화 변환

In [24]:
class QualityGroups(BaseEstimator, TransformerMixin):
    def __init__(self, listGroups=['quality_4', 'quality_5', 'quality_6', 'quality_7', 'quality_8']):
        self.listGroups = listGroups
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for i, name in enumerate(self.listGroups):
            X[name] = (X['quality']==i+4).astype(int)
        return X

## 1.4 변환기: 특성들의 극단치 모두 제거

In [25]:
class DropOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, scope=5):
        self.scope = scope
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        myList = []
        for name in X.columns:
            if name not in ['type', 'quality']:
                Q1 = X[[name]].quantile(0.25)
                Q3 = X[[name]].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.scope * IQR
                upper_bound = Q3 + self.scope * IQR

                # 극단치가 있는 열과 행을 찾음
                outlier_cols = np.where((X[[name]] < lower_bound) | (X[[name]] > upper_bound))[1]
                outlier_rows = np.where((X[[name]] < lower_bound) | (X[[name]] > upper_bound))[0]

                # 극단치가 있는 행의 인덱스를 반환
                outlier_indices = pd.Series(outlier_rows).unique()    
                myList += list(outlier_indices)
        newDf = X.copy().drop(myList, axis=0)
        return newDf

## 1.5 변환기: log_scaler

In [26]:
import numpy as np

class LogScaler(BaseEstimator, TransformerMixin):
    def __init__(self, listNames=['chlorides', 'residual sugar']):
        self.listNames = listNames
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for name in self.listNames:
            X[name] = [np.log(x+1) for x in list(X[name])]
        return X

In [27]:
# logScaler = LogScaler()
# logScaler.transform(df_wine.iloc[:, 1:])

## 1.6 KNN이용한 결측치 채우기

In [28]:
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

## 1.7 DataFrame 포맷으로 되돌리기

In [32]:
class FormatDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, column_names=['type','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','quality_4','quality_5','quality_6','quality_7','quality_8']):
        self.column_names = column_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.column_names)

class FormatDataFrame_test(BaseEstimator, TransformerMixin):
    def __init__(self, column_names=['type','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']):
        self.column_names = column_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.column_names)

# 2. 전처리 파이프라인 구성

## 2.1 Pipeline을 이용한 전처리 작동 코드

In [30]:
from sklearn.pipeline import Pipeline

preproc_pipeline = Pipeline([
    ('binary_type', TypeBinaryConverter()), # type변수를 0,1 binary화
    ('drop_outliers', DropOutliers(scope=5)), # 이상치 제거
    ('merge_quality', MergeQuality()), # quality변수 3->4, 9->8 변환
    ('quality_groups', QualityGroups()), # quality변수 one-hot encode(binary 모델 구축을 위한)
    ('log_scaler', LogScaler()), # log정규화
    ('knn_imputer', KNNImputer(n_neighbors=2, weights="uniform")), # 결측치 KNN 메꾸기
    ('format_dataframe', FormatDataFrame()) # 데이터프레임으로 재정의
])

# 각단계 전처리를 끄고 싶으면(하지 않고 싶으면), 각 라인을 주석처리하면 됨.
# 예를 들어, 두 번째 줄 ('drop_outliers', DropOutliers(scope=5))을 주석처리하면 극단치 제거가 되지 않음.

In [33]:
preproc_pipeline_test = Pipeline([
    ('binary_type', TypeBinaryConverter()), # type변수를 0,1 binary화
    ('drop_outliers', DropOutliers(scope=5)), # 이상치 제거
    ('merge_quality', MergeQuality()), # quality변수 3->4, 9->8 변환
    #('quality_groups', QualityGroups()), # quality변수 one-hot encode(binary 모델 구축을 위한)
    ('log_scaler', LogScaler()), # log정규화
    ('knn_imputer', KNNImputer(n_neighbors=2, weights="uniform")), # 결측치 KNN 메꾸기
    ('format_dataframe', FormatDataFrame_test()) # 데이터프레임으로 재정의
])

df_test = preproc_pipeline_test.fit_transform(df_wine.copy())

In [34]:
# 다음과 같이 일반적인 estimator처럼 fit_transform() 메소드로 전처리 가능
df_preproc = preproc_pipeline.fit_transform(df_wine.copy())

# 3. 전처리된 데이터프레임으로 작업하기

In [35]:
# 전처리된 데이터프레임 호출
df_preproc

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_4,quality_5,quality_6,quality_7,quality_8
0,1.0,7.0,0.270,0.36,3.077312,0.044017,45.0,170.0,1.00100,3.00,0.45,8.8,6.0,0.0,0.0,1.0,0.0,0.0
1,1.0,6.3,0.300,0.34,0.955511,0.047837,14.0,132.0,0.99400,3.30,0.49,9.5,6.0,0.0,0.0,1.0,0.0,0.0
2,1.0,8.1,0.280,0.40,2.066863,0.048790,30.0,97.0,0.99510,3.26,0.44,10.1,6.0,0.0,0.0,1.0,0.0,0.0
3,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9,6.0,0.0,0.0,1.0,0.0,0.0
4,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9,6.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6414,0.0,6.2,0.600,0.08,1.098612,0.086178,32.0,44.0,0.99490,3.45,0.58,10.5,5.0,0.0,1.0,0.0,0.0,0.0
6415,0.0,5.9,0.550,0.10,1.163151,0.060154,39.0,51.0,0.99512,3.52,0.93,11.2,6.0,0.0,0.0,1.0,0.0,0.0
6416,0.0,6.3,0.510,0.13,1.193922,0.073250,29.0,40.0,0.99574,3.42,0.75,11.0,6.0,0.0,0.0,1.0,0.0,0.0
6417,0.0,5.9,0.645,0.12,1.098612,0.072321,32.0,44.0,0.99547,3.57,0.71,10.2,5.0,0.0,1.0,0.0,0.0,0.0


In [36]:
# 이렇게 하면 X
X = df_preproc.iloc[:, :-6]
X

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,7.0,0.270,0.36,3.077312,0.044017,45.0,170.0,1.00100,3.00,0.45,8.8
1,1.0,6.3,0.300,0.34,0.955511,0.047837,14.0,132.0,0.99400,3.30,0.49,9.5
2,1.0,8.1,0.280,0.40,2.066863,0.048790,30.0,97.0,0.99510,3.26,0.44,10.1
3,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9
4,1.0,7.2,0.230,0.32,2.251292,0.056380,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6414,0.0,6.2,0.600,0.08,1.098612,0.086178,32.0,44.0,0.99490,3.45,0.58,10.5
6415,0.0,5.9,0.550,0.10,1.163151,0.060154,39.0,51.0,0.99512,3.52,0.93,11.2
6416,0.0,6.3,0.510,0.13,1.193922,0.073250,29.0,40.0,0.99574,3.42,0.75,11.0
6417,0.0,5.9,0.645,0.12,1.098612,0.072321,32.0,44.0,0.99547,3.57,0.71,10.2


In [37]:
# 이렇게 하면 quality_6의 목적변수
# y_6 = df_preproc[['quality_6']]
for i in range(4, 9):
    column_name = 'quality_' + str(i)
    globals()['y_'+str(i)] = df_preproc[[column_name]]

In [38]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE

# 데이터셋을 트레이닝 셋과 임시 셋으로 60:40 비율로 나눔
X_train, X_tmp, y_train, y_tmp = train_test_split(df_test.iloc[:, :11], df_test.iloc[:, 12:], test_size=0.4, random_state=100)

# oversampling(ros), undersampling(rus)
ros = SMOTE(k_neighbors=4)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

rus = RandomUnderSampler(random_state=100)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# 임시 셋을 밸리데이션 셋과 테스트 셋으로 50:50 비율로 나눔
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=100)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import make_scorer


In [6]:

# 로지스틱 회귀(멀티클래스) 모델 정의
log_reg_orign = LogisticRegression(multi_class='auto')
log_reg_ros = LogisticRegression(multi_class='auto')
log_reg_rus = LogisticRegression(multi_class='auto')

# 탐색할 파라미터 그리드 정의
param_grid1 = {'C': [0.01, 0.1, 1, 10], 
              'max_iter': [100, 200, 300],
              'penalty': ['l2', 'None'],
              'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
              'class_weight': ['None', 'balanced']}

param_grid2 = {'C': [0.01, 0.1, 1, 10], 
              'max_iter': [100, 200, 300],
              'penalty': ['l1', 'elasticnet'],
              'solver': ['saga'],
              'class_weight': ['None', 'balanced']}

# 그리드 서치 객체 생성
grid_search_orign = GridSearchCV(log_reg_orign, param_grid=[param_grid1, param_grid2], cv=5, n_jobs=-1, scoring='f1_weighted')
grid_search_ros = GridSearchCV(log_reg_ros, param_grid=[param_grid1, param_grid2], cv=5, n_jobs=-1, scoring='f1_weighted')
grid_search_rus = GridSearchCV(log_reg_rus, param_grid=[param_grid1, param_grid2], cv=5, n_jobs=-1, scoring='f1_weighted')

# scoring 옵션 참고사항
## orign은 imbalanced 데이터로 이를 평가하기 위해서는 balanced/weighted의 개념을 사용, over 및 under의 경우 아무것도 적용하지 않음/macro의 개념을 사용
## orign은 다음을 사용 - 'balanced_accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc_ovr_weighted'
## ros,rus는 다음을 사용 - 'accruracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr'

# 그리드 서치 수행
grid_search_orign.fit(X_train, y_train)
grid_search_ros.fit(X_train_ros, y_train_ros)
grid_search_rus.fit(X_train_rus, y_train_rus)

# 최적 파라미터와 점수 출력
print('-------------------------------------------------------------------------------------------------------------------')
print("orign data result: ", grid_search_orign.best_params_)
print("orign data result: ", grid_search_orign.best_score_)
print('-------------------------------------------------------------------------------------------------------------------')
print("ros data result: ", grid_search_ros.best_params_)
print("ros data result: ", grid_search_ros.best_score_)
print('-------------------------------------------------------------------------------------------------------------------')
print("rus data result: ", grid_search_rus.best_params_)
print("rus data result: ", grid_search_rus.best_score_)
print('-------------------------------------------------------------------------------------------------------------------')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/mod

-------------------------------------------------------------------------------------------------------------------
orign data result:  {'C': 10, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
orign data result:  0.2961196234856458
-------------------------------------------------------------------------------------------------------------------
ros data result:  {'C': 10, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
ros data result:  0.38480427581202337
-------------------------------------------------------------------------------------------------------------------
rus data result:  {'C': 1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
rus data result:  0.32522148670258916
-------------------------------------------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
900 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11

In [39]:
# ros data result for f1_weighted:  {'C': 10, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
logit_model_orign = LogisticRegression(multi_class='auto', solver='newton-cg', C =10, max_iter=100, class_weight='balanced', penalty='l2')
logit_model_ros = LogisticRegression(multi_class='auto', solver='newton-cg', C =10, max_iter=100, class_weight='balanced', penalty='l2')
logit_model_orign.fit(X_train, y_train)
logit_model_ros.fit(X_train_ros, y_train_ros)

logit_ros_pred = logit_model_ros.predict(X_val)
print(logit_ros_pred)
logit_orign_pred = logit_model_orign.predict(X_val)
print(logit_orign_pred)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[7. 5. 5. ... 7. 7. 5.]
[7. 5. 5. ... 7. 7. 5.]


In [75]:
# rus data result:  {'C': 1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
# rus data result:  0.32522148670258916

logit_model_rus = LogisticRegression(solver='newton-cg', C =1, max_iter=100, class_weight='balanced', penalty='l2')
logit_model_rus.fit(X_train_rus, y_train_rus)
logit_rus_pred = logit_model_rus.predict(X_val)
print(logit_rus_pred)

[6. 5. 5. ... 6. 8. 5.]


  y = column_or_1d(y, warn=True)


In [67]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, balanced_accuracy_score
import matplotlib.pyplot as plt

def eval_model(y_true, y_pred):
    # Confusion Matrix
    confusion_mat = confusion_matrix(y_true, y_pred, labels=[4,5,6,7,8])
    print(confusion_mat)
    
    # Total Accuracy
    accuracy = accuracy_score(y_true, y_pred) # 전체 정확도
    print(accuracy)
    
    # Class Accuracy & Macro / Weighted Accuracy
    class_names = [4,5,6,7,8] # 클래스 이름
    macro_accuracy = 0
    weighted_accuracy = 0

    for class_name in class_names:
        indices = [i for i, x in enumerate(y_true) if x == class_name] # 해당 클래스의 샘플 인덱스
        y_true_class = [y_true[i] for i in indices]
        y_pred_class = [y_pred[i] for i in indices]
        accuracy_class = accuracy_score(y_true_class, y_pred_class) # 해당 클래스의 정확도
        precision_class = precision_score(y_true, y_pred, labels=[class_name], average='micro')
        recall_class = recall_score(y_true, y_pred, labels=[class_name], average='micro')
        f1_class = f1_score(y_true, y_pred, labels=[class_name], average='micro')
        print("Class", class_name)
        print(class_name, "class Accuracy: ", accuracy_class)
        print(class_name, "class Precision: ", precision_class)
        print(class_name, "class Recall: ", recall_class)
        print(class_name, "class f1: ", f1_class)
        macro_accuracy += accuracy_class
        weighted_accuracy += accuracy_class * len(indices)

    macro_accuracy /= len(class_names) # 매크로 평균
    weighted_accuracy /= len(y_true) # 가중 평균
    print("Macro-Average Accuracy:", macro_accuracy)
    print("Weighted-Average Accuracy:", weighted_accuracy)

In [64]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def eval_model_v2(y_true, y_pred):
    class_names = [4, 5, 6, 7, 8]

    # Confusion Matrix
    confusion_mat = confusion_matrix(y_true, y_pred, labels=class_names)
    print("Confusion Matrix:\n", confusion_mat)

    # Total Accuracy
    total_accuracy = accuracy_score(y_true, y_pred)
    print("Total Accuracy:", total_accuracy)

    # Class-wise Metrics
    precision = precision_score(y_true, y_pred, labels=class_names, average=None)
    recall = recall_score(y_true, y_pred, labels=class_names, average=None)
    f1 = f1_score(y_true, y_pred, labels=class_names, average=None)

    for i, class_name in enumerate(class_names):
        print(f"Class {class_name}:")
        print(f"  Accuracy: {confusion_mat[i, i] / sum(confusion_mat[i, :])}")
        print(f"  Precision: {precision[i]}")
        print(f"  Recall: {recall[i]}")
        print(f"  F1-score: {f1[i]}")

    # Macro-Average Metrics
    macro_precision = sum(precision) / len(class_names)
    macro_recall = sum(recall) / len(class_names)
    macro_f1 = sum(f1) / len(class_names)
    print("\nMacro-Average Metrics:")
    print(f"  Macro-Average Precision: {macro_precision}")
    print(f"  Macro-Average Recall: {macro_recall}")
    print(f"  Macro-Average F1-score: {macro_f1}")

    # Weighted-Average Metrics
    weights = [sum(confusion_mat[i, :]) for i in range(len(class_names))]
    total_samples = sum(weights)
    weighted_precision = sum(precision[i] * weights[i] for i in range(len(class_names))) / total_samples
    weighted_recall = sum(recall[i] * weights[i] for i in range(len(class_names))) / total_samples
    weighted_f1 = sum(f1[i] * weights[i] for i in range(len(class_names))) / total_samples
    print("\nWeighted-Average Metrics:")
    print(f"  Weighted-Average Precision: {weighted_precision}")
    print(f"  Weighted-Average Recall: {weighted_recall}")
    print(f"  Weighted-Average F1-score: {weighted_f1}")

In [69]:
eval_model_v2(y_val, logit_orign_pred)


Confusion Matrix:
 [[ 27  14   1   2   5]
 [ 96 177  37  26  63]
 [ 68 148  68 140 140]
 [ 15  24  19  82  88]
 [  4   4   3  16  17]]
Total Accuracy: 0.28894080996884736
Class 4:
  Accuracy: 0.5510204081632653
  Precision: 0.12857142857142856
  Recall: 0.5510204081632653
  F1-score: 0.20849420849420847
Class 5:
  Accuracy: 0.44360902255639095
  Precision: 0.4822888283378747
  Recall: 0.44360902255639095
  F1-score: 0.4621409921671018
Class 6:
  Accuracy: 0.12056737588652482
  Precision: 0.53125
  Recall: 0.12056737588652482
  F1-score: 0.19653179190751444
Class 7:
  Accuracy: 0.35964912280701755
  Precision: 0.3082706766917293
  Recall: 0.35964912280701755
  F1-score: 0.33198380566801616
Class 8:
  Accuracy: 0.38636363636363635
  Precision: 0.054313099041533544
  Recall: 0.38636363636363635
  F1-score: 0.09523809523809525

Macro-Average Metrics:
  Macro-Average Precision: 0.30093880652851324
  Macro-Average Recall: 0.372241913155367
  Macro-Average F1-score: 0.2588777786949873

Weight

In [73]:
eval_model_v2(y_val, logit_ros_pred)

Confusion Matrix:
 [[ 26  15   0   3   5]
 [ 94 174  38  30  63]
 [ 63 141  69 157 134]
 [ 14  23  16  96  79]
 [  3   4   2  19  16]]
Total Accuracy: 0.2967289719626168
Class 4:
  Accuracy: 0.5306122448979592
  Precision: 0.13
  Recall: 0.5306122448979592
  F1-score: 0.20883534136546186
Class 5:
  Accuracy: 0.43609022556390975
  Precision: 0.48739495798319327
  Recall: 0.43609022556390975
  F1-score: 0.46031746031746035
Class 6:
  Accuracy: 0.12234042553191489
  Precision: 0.552
  Recall: 0.12234042553191489
  F1-score: 0.20029027576197386
Class 7:
  Accuracy: 0.42105263157894735
  Precision: 0.31475409836065577
  Recall: 0.42105263157894735
  F1-score: 0.3602251407129456
Class 8:
  Accuracy: 0.36363636363636365
  Precision: 0.05387205387205387
  Recall: 0.36363636363636365
  F1-score: 0.093841642228739

Macro-Average Metrics:
  Macro-Average Precision: 0.3076042220431806
  Macro-Average Recall: 0.37474637824181894
  Macro-Average F1-score: 0.26470197207731616

Weighted-Average Metric

In [76]:
eval_model_v2(y_val, logit_rus_pred)

Confusion Matrix:
 [[ 30  10   2   2   5]
 [117 150  36  30  66]
 [ 93 116  76 116 163]
 [ 27  18  39  51  93]
 [  4   2   5  13  20]]
Total Accuracy: 0.2546728971962617
Class 4:
  Accuracy: 0.6122448979591837
  Precision: 0.11070110701107011
  Recall: 0.6122448979591837
  F1-score: 0.1875
Class 5:
  Accuracy: 0.37593984962406013
  Precision: 0.5067567567567568
  Recall: 0.37593984962406013
  F1-score: 0.4316546762589928
Class 6:
  Accuracy: 0.1347517730496454
  Precision: 0.4810126582278481
  Recall: 0.1347517730496454
  F1-score: 0.2105263157894737
Class 7:
  Accuracy: 0.2236842105263158
  Precision: 0.24056603773584906
  Recall: 0.2236842105263158
  F1-score: 0.2318181818181818
Class 8:
  Accuracy: 0.45454545454545453
  Precision: 0.05763688760806916
  Recall: 0.45454545454545453
  F1-score: 0.10230179028132994

Macro-Average Metrics:
  Macro-Average Precision: 0.2793346894679186
  Macro-Average Recall: 0.3602332371409319
  Macro-Average F1-score: 0.23276019282959562

Weighted-Avera