In [279]:
import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import random
from tqdm import tqdm
from sklearn.svm import SVC

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)


class RUSBoost:

    def __init__(self, base_estimator=None, n_estimators=500, learning_rate=1e-2, seed=42):
        self.base_estimator = base_estimator if base_estimator else DecisionTreeClassifier(max_depth=1)  #SVC() #LogisticRegression()
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.models = []  # List to store models
        set_seed(seed)

    def fit(self, X, y):
        self.estimator_weights = np.zeros(self.n_estimators)  # Initialize estimator weights
        self.estimator_errors = np.zeros(self.n_estimators)  # Initialize estimator errors
        sample_weights = np.full(X.shape[0], 1 / X.shape[0])  # create an array with shape X.shape[0] and value 1 / X.shape[0]

        for i in tqdm(range(self.n_estimators)):
            # Undersampling
            X_resampled, y_resampled = self.undersampling(X, y, sample_weights)
            # print(X_resampled.shape)

            # Train a base estimator on the resampled data
            model = self.base_estimator
            model.fit(X_resampled, y_resampled)
            self.models.append(model)

            # Update sample weights
            y_pred = model.predict(X)
            errors = np.abs(y_pred - y)
            self.estimator_errors[i] = np.sum(sample_weights * errors) / np.sum(sample_weights)  # Calculate error
            self.estimator_weights[i] = self.learning_rate * np.log((1 - self.estimator_errors[i]) / self.estimator_errors[i])  # Update weight

            # Update sample weights
            sample_weights *= np.exp(self.estimator_weights[i] * errors)
            sample_weights /= np.sum(sample_weights)  # Normalize weights

    def predict(self, X):
        predictions = np.zeros(len(X))
        for model, weight in zip(self.models, self.estimator_weights):
            predictions += weight * model.predict(X)
        return np.where(predictions >= 0.5 * sum(self.estimator_weights), 1, 0)

    def undersampling(self, X, y, sample_weights):

        # count the number of examples for each class
        class_counts = Counter(y)  # 440 + 358 --> 0 class vs 233  --> 1 class

        # find the minority class
        minority_class = min(class_counts, key=class_counts.get)
        minority_count = class_counts[minority_class]
        keep_list = [(i, X[i], label) for i, label in enumerate(y) if label == minority_class]

        # find other classes as major
        delete_list = [(i, X[i], label) for i, label in enumerate(y) if label != minority_class]  # 440 + 358 = 798
        delete_indices = random.sample(range(len(delete_list)), k=max(0, len(delete_list) - (len(delete_list)-minority_count)) ) # 440 + 358 - 233 = 545
        keep_list_not_minor = [delete_list[i] for i in delete_indices]

        all_list = keep_list + keep_list_not_minor
        all_list.sort(key=lambda x: x[2])

        X_resampled = np.array([item[1] for item in all_list])
        y_resampled = np.array([item[2] for item in all_list])
        # print(X_resampled.shape)

        return X_resampled, y_resampled


#CMC

In [2]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [3]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from ucimlrepo import fetch_ucirepo

import warnings
warnings.filterwarnings('ignore')

In [239]:
contraceptive_method_choice = fetch_ucirepo(id=30)

X = contraceptive_method_choice.data.features
y = contraceptive_method_choice.data.targets

print(contraceptive_method_choice.metadata, contraceptive_method_choice.variables)

{'uci_id': 30, 'name': 'Contraceptive Method Choice', 'repository_url': 'https://archive.ics.uci.edu/dataset/30/contraceptive+method+choice', 'data_url': 'https://archive.ics.uci.edu/static/public/30/data.csv', 'abstract': 'Dataset is a subset of the 1987 National Indonesia Contraceptive Prevalence Survey.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1473, 'num_features': 9, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Education Level', 'Other', 'Occupation'], 'target_col': ['contraceptive_method'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Fri Feb 16 2024', 'dataset_doi': '10.24432/C59W2D', 'creators': ['Tjen-Sien Lim'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a subset of the 1987 National Indonesia Contraceptive Prevalence Survey. The samples are married women who were either

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   wife_age                  1473 non-null   int64
 1   wife_edu                  1473 non-null   int64
 2   husband_edu               1473 non-null   int64
 3   num_children              1473 non-null   int64
 4   wife_religion             1473 non-null   int64
 5   wife_working              1473 non-null   int64
 6   husband_occupation        1473 non-null   int64
 7   standard_of_living_index  1473 non-null   int64
 8   media_exposure            1473 non-null   int64
dtypes: int64(9)
memory usage: 103.7 KB


In [6]:
X.value_counts()

wife_age  wife_edu  husband_edu  num_children  wife_religion  wife_working  husband_occupation  standard_of_living_index  media_exposure
36        4         4            3             1              1             1                   4                         0                 4
25        4         4            1             1              1             1                   4                         0                 4
24        4         4            1             1              1             1                   4                         0                 3
45        4         4            3             1              1             1                   4                         0                 3
46        4         4            4             1              1             1                   4                         0                 3
                                                                                                                                           ..
28        3

In [240]:
X = pd.get_dummies(X, columns = ['wife_edu', 'husband_edu', 'husband_occupation', 'standard_of_living_index'], drop_first = True)

In [8]:
X

Unnamed: 0,wife_age,num_children,wife_religion,wife_working,media_exposure,wife_edu_2,wife_edu_3,wife_edu_4,husband_edu_2,husband_edu_3,husband_edu_4,husband_occupation_2,husband_occupation_3,husband_occupation_4,standard_of_living_index_2,standard_of_living_index_3,standard_of_living_index_4
0,24,3,1,1,0,1,0,0,0,1,0,1,0,0,0,1,0
1,45,10,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1
2,43,7,1,1,0,1,0,0,0,1,0,0,1,0,0,0,1
3,42,9,1,1,0,0,1,0,1,0,0,0,1,0,0,1,0
4,36,8,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1468,33,2,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1
1469,33,3,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1
1470,39,8,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1
1471,33,4,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0


In [241]:
# labelling the target variable for xgb
le = LabelEncoder()
y = le.fit_transform(y)

In [242]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42, stratify=y)

In [243]:
# Then, scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Wine

In [257]:
wine = fetch_ucirepo(id=109)

X = wine.data.features
y = wine.data.targets

print(wine.metadata, wine.variables)

{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'published_in': 'Pattern Recognition', 'year': 1994, 'url': 'https://www.semanticscholar.org/paper/83dc3e4030d7b9fbdbb4bde03ce12ab70ca10528', 'do

In [228]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Alcohol                       178 non-null    float64
 1   Malicacid                     178 non-null    float64
 2   Ash                           178 non-null    float64
 3   Alcalinity_of_ash             178 non-null    float64
 4   Magnesium                     178 non-null    int64  
 5   Total_phenols                 178 non-null    float64
 6   Flavanoids                    178 non-null    float64
 7   Nonflavanoid_phenols          178 non-null    float64
 8   Proanthocyanins               178 non-null    float64
 9   Color_intensity               178 non-null    float64
 10  Hue                           178 non-null    float64
 11  0D280_0D315_of_diluted_wines  178 non-null    float64
 12  Proline                       178 non-null    int64  
dtypes: fl

In [229]:
# Check imbalance
y.value_counts()

class
2        71
1        59
3        48
dtype: int64

In [258]:
# labelling the target variable for xgb
le = LabelEncoder()
y = le.fit_transform(y)

In [259]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42, stratify=y)

In [260]:
# Then, scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Fitting

In [280]:
class MulticlassClassification:

    def __init__(self, seed):
        self.models = []
        self.seed = seed

    def fit(self, X, y):
        """
        Fits each model
        """
        for y_i in np.unique(y):
            # y_i - positive class for now
            # All other classes except y_i are negative

            # Choose x where y is positive class
            x_true = X[y == y_i]
            # Choose x where y is negative class
            x_false = X[y != y_i]
            # Concatanate
            x_true_false = np.vstack((x_true, x_false))

            # Set y to 1 where it is positive class
            y_true = np.ones(x_true.shape[0])
            # Set y to 0 where it is negative class
            y_false = np.zeros(x_false.shape[0])
            # Concatanate
            y_true_false = np.hstack((y_true, y_false))

            # Fit model and append to models list
            model = RUSBoost(seed=self.seed)
            # model = XGBoost()
            model.fit(x_true_false, y_true_false)
            self.models.append([y_i, model])


    def predict(self, X):
        y_pred = [[label, model.predict(X)] for label, model in self.models]

        output = []

        for i in range(X.shape[0]):
            max_label = None
            max_prob = -10**5
            for j in range(len(y_pred)):
                prob = y_pred[j][1][i]
                if prob > max_prob:
                    max_label = y_pred[j][0]
                    max_prob = prob
            output.append(max_label)

        return output

In [270]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score, precision_score, recall_score, f1_score, cohen_kappa_score, accuracy_score
from scipy.stats import hmean
from tqdm.notebook import tqdm

def gmean_score(y_true, y_pred):
    return hmean([precision_score(y_true, y_pred, average='weighted'), recall_score(y_true, y_pred, average='weighted')])

precision_list, recall_list, f1_list, gmean_list = [], [], [], []

for seed in tqdm([42, 1007, 3248, 8, 100]):
    clf_mcl = MulticlassClassification(seed=seed)
    clf_mcl.fit(X_train, y_train)

    y_pred = clf_mcl.predict(X_test)

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    gmean = gmean_score(y_test, y_pred)

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    gmean_list.append(gmean)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [262]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score, precision_score, recall_score, f1_score, cohen_kappa_score, accuracy_score
from scipy.stats import hmean
from tqdm.notebook import tqdm

def gmean_score(y_true, y_pred):
    return hmean([precision_score(y_true, y_pred, average='weighted'), recall_score(y_true, y_pred, average='weighted')])

precision_list, recall_list, f1_list, gmean_list = [], [], [], []

for seed in tqdm([42, 1007, 3248, 8, 100]):
    clf_mcl = MulticlassClassification(seed=seed)
    clf_mcl.fit(X_train, y_train)

    y_pred = clf_mcl.predict(X_test)

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    gmean = gmean_score(y_test, y_pred)

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    gmean_list.append(gmean)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [255]:
def print_metrics(precision_list, recall_list, f1_list, gmean_list):
    print("Precision:", np.round(np.mean(precision_list), 3), '+-',np.round(np.std(precision_list),3))
    print("Recall:", np.round(np.mean(recall_list),3), '+-',np.round(np.std(recall_list),3))
    print("F1-score:", np.round(np.mean(f1_list),3), '+-',np.round(np.std(f1_list),3))
    print("Gmean-score:", np.round(np.mean(gmean_list),3), '+-',np.round(np.std(gmean_list),3))

In [271]:
## CMC SVM
print_metrics(precision_list, recall_list, f1_list, gmean_list)

Precision: 0.111 +- 0.0
Recall: 0.333 +- 0.0
F1-score: 0.167 +- 0.0
Gmean-score: 0.167 +- 0.0


In [267]:
## CMC LogReg
print_metrics(precision_list, recall_list, f1_list, gmean_list)

Precision: 0.111 +- 0.0
Recall: 0.333 +- 0.0
F1-score: 0.167 +- 0.0
Gmean-score: 0.167 +- 0.0


In [256]:
## CMC DT
print_metrics(precision_list, recall_list, f1_list, gmean_list)

Precision: 0.314 +- 0.004
Recall: 0.39 +- 0.043
F1-score: 0.341 +- 0.018
Gmean-score: 0.347 +- 0.016


In [263]:
## WINE DT
print_metrics(precision_list, recall_list, f1_list, gmean_list)

Precision: 0.899 +- 0.026
Recall: 0.867 +- 0.03
F1-score: 0.869 +- 0.031
Gmean-score: 0.883 +- 0.028
