## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

Import packages

In [126]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, silhouette_score
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.stats import mode
from sklearn.cluster import KMeans
from collections import Counter


import warnings
warnings.filterwarnings("ignore")

### (a) Download the Anuran Calls (MFCCs) Data Set

In [43]:
df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


### (b) Train a classifier for each label

#### (i) Research

In [45]:
def exact_match_score(y_true, y_pred):
    matches = np.all(y_true == y_pred, axis=1)
    return np.mean(matches)

def hamming_score(y_true, y_pred):
    correct_labels = np.sum(y_true == y_pred)
    total_labels = np.size(y_true)
    return correct_labels / total_labels

def hamming_loss_manual(y_true, y_pred):
 
    incorrect_labels = np.sum(y_true != y_pred)
    total_labels = np.size(y_true)
    return incorrect_labels / total_labels

df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')

X = df.iloc[:, :-4].values  
y = df.iloc[:, -4:-1]  

label_encoders = {}
for column in y.columns:
    le = LabelEncoder()
    y[column] = le.fit_transform(y[column])  
    label_encoders[column] = le  

y = y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

classifiers = []
y_pred = np.zeros_like(y_test)

for i in range(y_train.shape[1]):  
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train[:, i])  
    classifiers.append(clf)
    y_pred[:, i] = clf.predict(X_test)  


exact_match = exact_match_score(y_test, y_pred)
hamming_loss_value = hamming_loss_manual(y_test, y_pred)  
hamming_score_value = hamming_score(y_test, y_pred)

print(f"Exact Match Ratio: {exact_match:.15f}")
print(f"Hamming Score: {hamming_score_value:.15f}")
print(f"Hamming Loss: {hamming_loss_value:.15f}")

Exact Match Ratio: 0.972672533580361
Hamming Score: 0.981318511656631
Hamming Loss: 0.018681488343369


**Exact Match Ratio**


Definition: The Exact Match Ratio measures the proportion of instances where all predicted labels exactly match all true labels. It’s the strictest evaluation metric for multi-label classification.

**Hamming Score**


Definition: The Hamming Score is the average accuracy of individual labels. It measures the fraction of correctly predicted labels (both positive and negative) across all labels and instances.


**Hamming Loss**


Definition: The Hamming Loss measures the fraction of incorrectly predicted labels across all instances. It quantifies the average mismatch between predicted and true labels.

#### (ii) Train a SVM for each of the labels

In [48]:
def evaluate_model_performance(y_true, y_pred):
    exact_matches = np.mean(np.all(y_true == y_pred, axis=1))
    total_labels = y_true.size
    incorrect_labels = np.sum(y_true != y_pred)
    hamming = incorrect_labels / total_labels
    return round(hamming, 4), round(exact_matches, 4)


def wide_range_search(X_train, y_train, c_values, gamma_values, accuracy_threshold):
    good_params = {'C': set(), 'gamma': set()}
    for c in c_values:
        for gamma in gamma_values:
            accuracies = []
            for i in range(y_train.shape[1]):
                svc = SVC(kernel='rbf', C=c, gamma=gamma, random_state=42)
                scores = cross_val_score(svc, X_train, y_train[:, i], cv=3, scoring='accuracy', n_jobs=-1)
                accuracies.append(scores.mean())
            if np.mean(accuracies) >= accuracy_threshold:
                good_params['C'].add(c)
                good_params['gamma'].add(gamma)
    
 
    reduced_c_values = np.linspace(min(good_params['C']), max(good_params['C']), num=10)
    reduced_gamma_values = np.linspace(min(good_params['gamma']), max(good_params['gamma']), num=10)
    return {'C': reduced_c_values, 'gamma': reduced_gamma_values}



def main_workflow():
   
    df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
    X = df.iloc[:, :-4].values
    y = df.iloc[:, -4:-1]

   
    label_encoders = {}
    for column in y.columns:
        le = LabelEncoder()
        y[column] = le.fit_transform(y[column])
        label_encoders[column] = le
    y = y.values

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    
    scaler = StandardScaler()
    X_train_standardized = scaler.fit_transform(X_train)
    X_test_standardized = scaler.transform(X_test)

    
    c_values = [10**i for i in range(-3, 7)]
    gamma_values = [i * 0.5 for i in range(1, 11)]
    accuracy_threshold = 0.7
    good_params = wide_range_search(X_train_standardized, y_train, c_values, gamma_values, accuracy_threshold)

    
    reduced_c_values = np.unique(good_params['C'])
    reduced_gamma_values = np.unique(good_params['gamma'])
    classifiers = []
    y_pred = np.zeros_like(y_test)

    for i in range(y_train.shape[1]):
        best_score = 0
        best_model = None
        for c in reduced_c_values:
            for gamma in reduced_gamma_values:
                svc = SVC(kernel='rbf', C=c, gamma=gamma, random_state=42)
                scores = cross_val_score(svc, X_train_standardized, y_train[:, i], cv=5, scoring='accuracy', n_jobs=-1)
                if scores.mean() > best_score:
                    best_score = scores.mean()
                    best_model = svc
        best_model.fit(X_train_standardized, y_train[:, i])
        classifiers.append(best_model)
        y_pred[:, i] = best_model.predict(X_test_standardized)

    
    hamming, exact_match = evaluate_model_performance(y_test, y_pred)
    print("\nResults:")
    print(f"Hamming Loss: {hamming}")
    print(f"Exact Match Ratio: {exact_match}")

if __name__ == "__main__":
    main_workflow()



Results:
Hamming Loss: 0.0462
Exact Match Ratio: 0.9254


#### (iii) Repeat 1(b)ii with L1-penalized SVMs

In [54]:
def evaluate_model_performance(y_true, y_pred):
    exact_matches = np.mean(np.all(y_true == y_pred, axis=1))
    total_labels = y_true.size
    incorrect_labels = np.sum(y_true != y_pred)
    hamming = incorrect_labels / total_labels
    return round(hamming, 4), round(exact_matches, 4)


def wide_range_search(X_train, y_train, c_values, accuracy_threshold):
    good_params = {'C': []}
    for c in c_values:
        accuracies = []
        for i in range(y_train.shape[1]):
            lsvc = LinearSVC(penalty='l1', dual=False, C=c, max_iter=10000, random_state=42)
            scores = cross_val_score(lsvc, X_train, y_train[:, i], cv=3, scoring='accuracy', n_jobs=-1)
            accuracies.append(scores.mean())
        if np.mean(accuracies) >= accuracy_threshold:
            good_params['C'].append(c)
    return good_params


def main_workflow_l1():
    
    df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
    X = df.iloc[:, :-4].values
    y = df.iloc[:, -4:-1]

    
    label_encoders = {}
    for column in y.columns:
        le = LabelEncoder()
        y[column] = le.fit_transform(y[column])
        label_encoders[column] = le
    y = y.values

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    
    scaler = StandardScaler()
    X_train_standardized = scaler.fit_transform(X_train)
    X_test_standardized = scaler.transform(X_test)

    
    c_values = [10**i for i in range(-3, 7)]  
    accuracy_threshold = 0.7
    good_params = wide_range_search(X_train_standardized, y_train, c_values, accuracy_threshold)

    
    reduced_c_values = np.logspace(np.log10(min(good_params['C'])), np.log10(max(good_params['C'])), num=10)

    
    param_grid = {'C': reduced_c_values}
    classifiers = []
    y_pred = np.zeros_like(y_test)

    for i in range(y_train.shape[1]):
        print(f"Training for label {i+1}/{y_train.shape[1]}...")
        lsvc = LinearSVC(penalty='l1', dual=False, max_iter=10000, random_state=42)
        grid_search = GridSearchCV(lsvc, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train_standardized, y_train[:, i])
        
        best_model = grid_search.best_estimator_
        classifiers.append(best_model)
        y_pred[:, i] = best_model.predict(X_test_standardized)
        
        print(f"Best C for label {i+1}: {grid_search.best_params_['C']}")

    
    hamming, exact_match = evaluate_model_performance(y_test, y_pred)
    print("\nResults:")
    print(f"Hamming Loss: {hamming}")
    print(f"Exact Match Ratio: {exact_match}")

if __name__ == "__main__":
    main_workflow_l1()

Training for label 1/3...
Best C for label 1: 1.0
Training for label 2/3...
Best C for label 2: 10.0
Training for label 3/3...
Best C for label 3: 10.0

Results:
Hamming Loss: 0.0568
Exact Match Ratio: 0.9129


#### (iv) Repeat 1(b)iii by using SMOTE or any other method for imbalance

In [58]:
def evaluate_model_performance(y_true, y_pred):
    exact_matches = np.mean(np.all(y_true == y_pred, axis=1))
    total_labels = y_true.size
    incorrect_labels = np.sum(y_true != y_pred)
    hamming = incorrect_labels / total_labels
    return round(hamming, 4), round(exact_matches, 4)

def wide_range_search(X_train, y_train, c_values, accuracy_threshold):
    good_params = {'C': []}
    for c in c_values:
        accuracies = []
        for i in range(y_train.shape[1]):
            
            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X_train, y_train[:, i])

            lsvc = LinearSVC(penalty='l1', dual=False, C=c, max_iter=10000, random_state=42)
            scores = cross_val_score(lsvc, X_resampled, y_resampled, cv=3, scoring='accuracy', n_jobs=-1)
            accuracies.append(scores.mean())
        if np.mean(accuracies) >= accuracy_threshold:
            good_params['C'].append(c)
    return good_params


def main_workflow_l1_smote():
   
    df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
    X = df.iloc[:, :-4].values
    y = df.iloc[:, -4:-1]

    
    label_encoders = {}
    for column in y.columns:
        le = LabelEncoder()
        y[column] = le.fit_transform(y[column])
        label_encoders[column] = le
    y = y.values

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

   
    scaler = StandardScaler()
    X_train_standardized = scaler.fit_transform(X_train)
    X_test_standardized = scaler.transform(X_test)

   
    c_values = [10**i for i in range(-3, 7)]  
    accuracy_threshold = 0.7
    good_params = wide_range_search(X_train_standardized, y_train, c_values, accuracy_threshold)

    
    reduced_c_values = np.logspace(np.log10(min(good_params['C'])), np.log10(max(good_params['C'])), num=10)

    
    param_grid = {'C': reduced_c_values}
    classifiers = []
    y_pred = np.zeros_like(y_test)

    for i in range(y_train.shape[1]):
        print(f"Training for label {i+1}/{y_train.shape[1]}...")

       
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_standardized, y_train[:, i])

        
        lsvc = LinearSVC(penalty='l1', dual=False, max_iter=10000, random_state=42)
        grid_search = GridSearchCV(lsvc, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_resampled, y_resampled)

        best_model = grid_search.best_estimator_
        classifiers.append(best_model)
        y_pred[:, i] = best_model.predict(X_test_standardized)

        print(f"Best C for label {i+1}: {grid_search.best_params_['C']}")

    
    hamming, exact_match = evaluate_model_performance(y_test, y_pred)
    print("\nResults:")
    print(f"Hamming Loss: {hamming}")
    print(f"Exact Match Ratio: {exact_match}")

if __name__ == "__main__":
    main_workflow_l1_smote()

Training for label 1/3...
Best C for label 1: 100.0
Training for label 2/3...
Best C for label 2: 100.0
Training for label 3/3...
Best C for label 3: 1000.0

Results:
Hamming Loss: 0.077
Exact Match Ratio: 0.8532


## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### (a) Use k-means clustering

In [123]:
def find_optimal_k(X, max_k=50):
    silhouette_scores = []
    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        silhouette_scores.append(silhouette_score(X, labels))
    optimal_k = np.argmax(silhouette_scores) + 2
    return optimal_k, silhouette_scores


def encode_labels(y):
    encoders = []
    y_encoded = np.zeros_like(y)
    for i in range(y.shape[1]):
        encoder = LabelEncoder()
        y_encoded[:, i] = encoder.fit_transform(y[:, i])
        encoders.append(encoder)
    return y_encoded, encoders


def find_majority_class(cluster_df, label_column):
    majority_classes = {}
    for cluster in cluster_df['Cluster'].unique():
        cluster_data = cluster_df[cluster_df['Cluster'] == cluster]
        majority_class = Counter(cluster_data[label_column]).most_common(1)[0][0]
        majority_classes[cluster] = majority_class
    return majority_classes


df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
X = df.iloc[:, :-4].values  
y = df.iloc[:, -4:-1].values  


y_encoded, encoders = encode_labels(y)


scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

optimal_k, silhouette_scores = find_optimal_k(X_standardized)
print(f"Optimal number of clusters (k): {optimal_k}")


kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_standardized)  
print(f"K-means clustering completed with {optimal_k} clusters.")


Optimal number of clusters (k): 4
K-means clustering completed with 4 clusters.


### (b) Determine which family is the majority

In [128]:
majority_family = find_majority_class(df, 'Family')
majority_genus = find_majority_class(df, 'Genus')
majority_species = find_majority_class(df, 'Species')

# Step 4: Print results
print("\nMajority Family in each cluster:")
print(majority_family)

print("\nMajority Genus in each cluster:")
print(majority_genus)

print("\nMajority Species in each cluster:")
print(majority_species)


Majority Family in each cluster:
{0: 'Hylidae', 2: 'Hylidae', 3: 'Leptodactylidae', 1: 'Leptodactylidae'}

Majority Genus in each cluster:
{0: 'Hypsiboas', 2: 'Hypsiboas', 3: 'Adenomera', 1: 'Adenomera'}

Majority Species in each cluster:
{0: 'HypsiboasCordobae', 2: 'HypsiboasCinerascens', 3: 'AdenomeraAndre', 1: 'AdenomeraHylaedactylus'}


In [130]:
def find_majority_class(cluster_df, label_column):
    majority_classes = {}
    for cluster in cluster_df['Cluster'].unique():
        cluster_data = cluster_df[cluster_df['Cluster'] == cluster]
        majority_class = Counter(cluster_data[label_column]).most_common(1)[0][0]
        majority_classes[cluster] = majority_class
    return majority_classes


majority_family = find_majority_class(df, 'Family')
majority_genus = find_majority_class(df, 'Genus')
majority_species = find_majority_class(df, 'Species')

# Print results
print("Majority Family in each cluster:")
print(majority_family)

print("\nMajority Genus in each cluster:")
print(majority_genus)

print("\nMajority Species in each cluster:")
print(majority_species)



Majority Family in each cluster:
{0: 'Hylidae', 2: 'Hylidae', 3: 'Leptodactylidae', 1: 'Leptodactylidae'}

Majority Genus in each cluster:
{0: 'Hypsiboas', 2: 'Hypsiboas', 3: 'Adenomera', 1: 'Adenomera'}

Majority Species in each cluster:
{0: 'HypsiboasCordobae', 2: 'HypsiboasCinerascens', 3: 'AdenomeraAndre', 1: 'AdenomeraHylaedactylus'}


In [117]:
def monte_carlo_simulation(X, y, k, num_iterations=50):
    hamming_distances = []
    for _ in range(num_iterations):
        kmeans = KMeans(n_clusters=k, random_state=None, n_init=10)
        predicted_clusters = kmeans.fit_predict(X)
        
        distance = calculate_hamming_distance(y, predicted_clusters)
        hamming_distances.append(distance)
    return np.mean(hamming_distances), np.std(hamming_distances)



def calculate_hamming_distance(true_labels, predicted_labels):
    mapped_labels = np.zeros_like(true_labels)  
    for cluster in np.unique(predicted_labels):
        mask = predicted_labels == cluster
        cluster_labels = true_labels[mask]

        if len(cluster_labels) > 0:
            cluster_labels = cluster_labels.astype(int)  
            cluster_modes = [mode(cluster_labels[:, i], keepdims=True).mode[0] for i in range(cluster_labels.shape[1])]

           
            for i in range(cluster_labels.shape[1]):
                mapped_labels[mask, i] = cluster_modes[i]

    
    return np.mean(true_labels != mapped_labels)

mean_hamming, std_hamming = monte_carlo_simulation(X_standardized, y_encoded, optimal_k)
print(f"\nMonte Carlo Simulation Results:")
print(f"Average Hamming Distance: {mean_hamming:.4f}")
print(f"Standard Deviation of Hamming Distance: {std_hamming:.4f}")



Monte Carlo Simulation Results:
Average Hamming Distance: 0.2434
Standard Deviation of Hamming Distance: 0.0089


### (c) Calculate the average Hamming distance, Hamming score, and Hamming loss

In [132]:
def assign_majority_labels(df, majority_family, majority_genus, majority_species):
    
    df['Assigned_Family'] = df['Cluster'].map(majority_family)
    df['Assigned_Genus'] = df['Cluster'].map(majority_genus)
    df['Assigned_Species'] = df['Cluster'].map(majority_species)
    return df

def calculate_hamming_metrics(df, true_labels):
   
    assigned_labels = df[['Assigned_Family', 'Assigned_Genus', 'Assigned_Species']].values
    
    
    hamming_distance = np.sum(true_labels != assigned_labels)
    
   
    total_labels = true_labels.size
    
    
    hamming_loss_value = hamming_distance / total_labels
    
   
    hamming_score = 1 - hamming_loss_value
    
    return hamming_distance, hamming_score, hamming_loss_value


df = assign_majority_labels(df, majority_family, majority_genus, majority_species)


true_labels = y 
hamming_distance, hamming_score, hamming_loss_value = calculate_hamming_metrics(df, true_labels)


print(f"Hamming Distance: {hamming_distance}")
print(f"Hamming Score: {hamming_score:.4f}")
print(f"Hamming Loss: {hamming_loss_value:.4f}")

Hamming Distance: 5021
Hamming Score: 0.7674
Hamming Loss: 0.2326


**References**


https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.hamming_loss.html

https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.LabelEncoder.html

https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.train_test_split.html

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5

https://scikit-learn.org/stable/modules/svm.html

https://www.kdnuggets.com/hyperparameter-tuning-gridsearchcv-and-randomizedsearchcv-explained

https://mmuratarat.github.io/2020-01-25/multilabel_classification_metrics

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

https://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html

https://arxiv.org/abs/1106.1813

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

https://scikit-learn.org/stable/modules/multiclass.html

https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_multilabel.html

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/

https://naeglelab.github.io/OpenEnsembles/Examples/Example_Kmeans_MajorityVote.html

https://scikit-learn.org/1.5/modules/generated/sklearn.cluster.KMeans.html

https://scikit-learn.org/dev/modules/generated/sklearn.metrics.silhouette_score.html

https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.StandardScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html

https://docs.python.org/3/library/collections.html#collections.Counter

