In [233]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.cluster import KMeans

### 1(a): Downloading dataset and choosing randomly 70% as Training Set ###

In [8]:
pathToDataset ='../data/Frogs_MFCCs.csv'
df = pd.read_csv(pathToDataset)
df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [10]:
X = df.iloc[:, :-4]  # Features (MFCCs)
Y = df.iloc[:, -4:-1]  # Labels (Family, Genus, Species)
Y

Unnamed: 0,Family,Genus,Species
0,Leptodactylidae,Adenomera,AdenomeraAndre
1,Leptodactylidae,Adenomera,AdenomeraAndre
2,Leptodactylidae,Adenomera,AdenomeraAndre
3,Leptodactylidae,Adenomera,AdenomeraAndre
4,Leptodactylidae,Adenomera,AdenomeraAndre
...,...,...,...
7190,Hylidae,Scinax,ScinaxRuber
7191,Hylidae,Scinax,ScinaxRuber
7192,Hylidae,Scinax,ScinaxRuber
7193,Hylidae,Scinax,ScinaxRuber


In [12]:
X

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.254341,0.022786,0.163320,0.012022,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,-0.145668,-0.059364,0.024206,-0.000861,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,-0.164675,-0.105600,0.030767,0.006457,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,-0.150025,-0.078615,0.024861,0.008696,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,-0.153120,-0.075320,0.022903,0.001924,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895


In [14]:
N = df.shape[0]
random.seed(N)

train_indexes = random.sample(range(N), int(N * 0.7))

test_indexes = []
for indx in range(N):
    if indx not in train_indexes:
        test_indexes.append(indx)

In [16]:
# split train and test
train = df.iloc[train_indexes , :].reset_index(drop=True)
test = df.iloc[test_indexes, :].reset_index(drop=True)

train_X = train.iloc[:,:-4]
train_Family = train['Family']
train_Genus = train['Genus']
train_Species = train['Species']

test_X = test.iloc[:,:-4]
test_Family = test['Family']
test_Genus = test['Genus']
test_Species = test['Species']

In [251]:
test

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.145130,-0.033660,0.284166,0.279537,0.175211,0.005791,-0.183329,-0.158483,0.192567,...,-0.055978,-0.048219,-0.056637,-0.022419,0.070085,0.021419,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.271326,0.027777,0.375738,0.385432,0.272457,0.098192,-0.173730,-0.157857,0.207181,...,-0.120723,-0.112607,-0.156933,-0.118527,-0.002471,0.002304,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.120565,-0.107235,0.316555,0.364437,0.307757,0.025992,-0.294179,-0.223236,0.268435,...,-0.051073,-0.052568,-0.111338,-0.040014,0.090204,0.088025,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154,1.0,-0.512599,-0.171956,0.325813,0.169600,0.421567,-0.123749,-0.298284,0.089382,0.243902,...,0.021225,0.157321,0.042847,0.006852,0.005439,-0.013693,Hylidae,Scinax,ScinaxRuber,60
2155,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
2156,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
2157,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60


### b(i) : Each instance has three labels: Families, Genus, and Species. Each of the labelshas multiple classes. We wish to solve a multi-class and multi-label problem.One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:
### (i)Research exact match and hamming score/ loss methods for evaluating multi- label classification and use them in evaluating the classifiers in this problem ###

***(1)Hamming Loss:
Measures the fraction of labels incorrectly predicted (either a false positive or a false negative).
Suitable for multi-label classification, as it treats each label independently.
Lower hamming loss indicates fewer label-level errors across all predictions.
Higher score reflects better overall label-wise accuracy.***


***(2):Exact Match:
Measures how many times the predicted label set matches the true label set completely.
A strict metric suitable when all labels need to be predicted accurately.
High exact match indicates models are accurately predicting all labels simultaneously
Advantages:
Gives a clear measure of "perfect" predictions.
Highlights total correctness of predictions.
Disadvantages:
Very sensitive to small errors.***

### b(ii)

In [235]:
tuning_params = {'C' : np.logspace(1, 4, 4), 'gamma' : np.logspace(-3, 6, 10)}
cv = StratifiedKFold(10, random_state=5036, shuffle=True)
classifier = SVC(kernel='rbf')

### Raw Attributes using Gaussian Kernel only : ###

In [237]:
#Family
cls = GridSearchCV(estimator=classifier, param_grid=tuning_params, cv=cv, verbose=1)
cls.fit(train_X, train_Family)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


In [239]:
print("The best parameter values: " + str(cls.best_params_))
print("\n\nPrediction Metrics: ")
y_familyPredGaussian = cls.predict(test_X)
print(classification_report(test_Family, y_familyPredGaussian))
print("Scores for different parameter options")
mean_score = cls.cv_results_['mean_test_score']
std_score = cls.cv_results_['std_test_score']
params = cls.cv_results_['params']
for i in range(len(params)):
        print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))

The best parameter values: {'C': 100.0, 'gamma': 1.0}


Prediction Metrics: 
                 precision    recall  f1-score   support

      Bufonidae       0.93      0.93      0.93        15
  Dendrobatidae       0.99      0.99      0.99       154
        Hylidae       0.99      0.98      0.99       673
Leptodactylidae       0.99      1.00      0.99      1317

       accuracy                           0.99      2159
      macro avg       0.98      0.98      0.98      2159
   weighted avg       0.99      0.99      0.99      2159

Scores for different parameter options
Score = 0.87 for {'C': 10.0, 'gamma': 0.001}
Score = 0.93 for {'C': 10.0, 'gamma': 0.01}
Score = 0.97 for {'C': 10.0, 'gamma': 0.1}
Score = 0.99 for {'C': 10.0, 'gamma': 1.0}
Score = 0.99 for {'C': 10.0, 'gamma': 10.0}
Score = 0.81 for {'C': 10.0, 'gamma': 100.0}
Score = 0.63 for {'C': 10.0, 'gamma': 1000.0}
Score = 0.62 for {'C': 10.0, 'gamma': 10000.0}
Score = 0.62 for {'C': 10.0, 'gamma': 100000.0}
Score = 0.62 for {'C

In [241]:
#Genus
clsGenusGaussian = GridSearchCV(estimator=classifier, param_grid=tuning_params, cv=cv, verbose=1)
clsGenusGaussian.fit(train_X, train_Genus)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


In [243]:
print("The best parameter values: " + str(clsGenusGaussian.best_params_))
print("\n\nPrediction Metrics: ")
y_genusPredGaussian = clsGenusGaussian.predict(test_X)
print(classification_report(test_Genus, y_genusPredGaussian))
print("Scores for different parameter options")
mean_score = clsGenusGaussian.cv_results_['mean_test_score']
std_score = clsGenusGaussian.cv_results_['std_test_score']
params = clsGenusGaussian.cv_results_['params']
for i in range(len(params)):
        print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))


The best parameter values: {'C': 100.0, 'gamma': 1.0}


Prediction Metrics: 
               precision    recall  f1-score   support

    Adenomera       0.99      1.00      1.00      1240
     Ameerega       0.99      0.99      0.99       154
Dendropsophus       0.97      0.95      0.96        91
    Hypsiboas       0.99      0.99      0.99       479
Leptodactylus       0.95      1.00      0.97        77
Osteocephalus       1.00      0.87      0.93        47
     Rhinella       1.00      0.93      0.97        15
       Scinax       1.00      0.98      0.99        56

     accuracy                           0.99      2159
    macro avg       0.99      0.96      0.97      2159
 weighted avg       0.99      0.99      0.99      2159

Scores for different parameter options
Score = 0.81 for {'C': 10.0, 'gamma': 0.001}
Score = 0.93 for {'C': 10.0, 'gamma': 0.01}
Score = 0.97 for {'C': 10.0, 'gamma': 0.1}
Score = 0.99 for {'C': 10.0, 'gamma': 1.0}
Score = 0.98 for {'C': 10.0, 'gamma': 10.0}
Sc

In [245]:
#trainSpecies
clsSpeciesGaussian = GridSearchCV(estimator=classifier, param_grid=tuning_params, cv=cv, verbose=1)
clsSpeciesGaussian.fit(train_X, train_Species)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


In [247]:
print("The best parameter values: " + str(clsSpeciesGaussian.best_params_))
print("\n\nPrediction Metrics: ")
y_speciesPredGaussian = clsSpeciesGaussian.predict(test_X)
print(classification_report(test_Species, y_speciesPredGaussian))
print("Scores for different parameter options")
mean_score = clsSpeciesGaussian.cv_results_['mean_test_score']
std_score = clsSpeciesGaussian.cv_results_['std_test_score']
params = clsSpeciesGaussian.cv_results_['params']
for i in range(len(params)):
        print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))

The best parameter values: {'C': 100.0, 'gamma': 1.0}


Prediction Metrics: 
                        precision    recall  f1-score   support

        AdenomeraAndre       0.99      0.98      0.99       182
AdenomeraHylaedactylus       0.99      1.00      1.00      1058
    Ameeregatrivittata       0.98      0.99      0.99       154
            HylaMinuta       0.97      0.95      0.96        91
  HypsiboasCinerascens       0.98      0.99      0.99       153
     HypsiboasCordobae       0.99      0.98      0.99       326
   LeptodactylusFuscus       0.97      1.00      0.99        77
 OsteocephalusOophagus       0.98      0.89      0.93        47
     Rhinellagranulosa       1.00      0.93      0.97        15
           ScinaxRuber       1.00      0.98      0.99        56

              accuracy                           0.99      2159
             macro avg       0.99      0.97      0.98      2159
          weighted avg       0.99      0.99      0.99      2159

Scores for different par

In [249]:
def hammingloss(y_true,y_pred):
    labels_misclassified = 0
    for actual, pred in zip(y_true.values, y_pred.values):
        err = (actual != pred)
        labels_misclassified += np.sum(err)
    hamming_loss = labels_misclassified / (y_true.shape[0] * y_true.shape[1])
    return np.round(hamming_loss, 4)
def exact_match_ratio(y_true,y_pred):
    exact_match_ratio = 0
    for actual, pred in zip(y_true.values, y_pred.values):
        match = (actual == pred)
        if sum(match) == y_true.shape[1]:
            exact_match_ratio += 1;
    exact_match_ratio /= y_true.shape[0]
    return np.round(exact_match_ratio, 4)

In [338]:
y_pred_multi = pd.DataFrame(columns=["Family","Genus","Species"])
y_pred_multi.loc[:, "Family"] = y_familyPredGaussian
y_pred_multi.loc[:, "Genus"] = y_genusPredGaussian
y_pred_multi.loc[:, "Species"] = y_speciesPredGaussian
y_true_multi=test.iloc[:, -4:-1]
   
print("Multilabel evaluation - Gaussian SVC (without Standardization)")
hamming_lossGaussian=hammingloss(y_true_multi,y_pred_multi)
exact_match_ratioGaussian=exact_match_ratio(y_true_multi,y_pred_multi)
print("Hamming Loss : "+str(hamming_lossGaussian))
print("Exact Match Ratio : "+str(exact_match_ratioGaussian))

Multilabel evaluation - Gaussian SVC (without Standardization)
Hamming Loss : 0.0096
Exact Match Ratio : 0.9851783232978231


### OneVsRestClassifier for raw features ###


***For Family***

In [342]:

# Tuning parameters
tuning_params = {'estimator__C': np.logspace(1, 4, 4), 
                 'estimator__gamma': np.logspace(-3, 6, 10)}

# Cross-validation strategy
cv = StratifiedKFold(10, random_state=5036, shuffle=True)

# Base SVM classifier with RBF kernel
base_classifier = SVC(kernel='rbf', probability=True)

# Wrap with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(base_classifier)

# GridSearchCV for OVR
grid_search_ovr = GridSearchCV(estimator=ovr_classifier, param_grid=tuning_params, cv=cv, verbose=1, n_jobs=-1)

# Example for "Family" label
print("Training for 'Family' label")
grid_search_ovr.fit(train_X, train_Family)
# Best hyperparameters
print("The best parameter values for 'Family':", grid_search_ovr.best_params_)

# Prediction and metrics
yTest_pred_OVR = grid_search_ovr.predict(test_X)
print("\n\nPrediction Metrics for 'Family':")
print(classification_report(test_Family, yTest_pred_OVR))
test_accuracy = accuracy_score(test_Family, yTest_pred_OVR)
test_hamming_loss = hamming_loss(test_Family, yTest_pred_OVR)
# Display grid search scores for different parameter combinations
print("\nScores for different parameter options:")
y_train_pred = grid_search_ovr.predict(train_X)
train_accuracy = accuracy_score(train_Family, y_train_pred)
train_hamming_loss = hamming_loss(train_Family, y_train_pred)
test_hamming_loss = hamming_loss(test_Family, yTest_pred_OVR)
print(f"Test Accuracy: {test_accuracy:.4f}")
mean_score = grid_search_ovr.cv_results_['mean_test_score']
std_score = grid_search_ovr.cv_results_['std_test_score']
params = grid_search_ovr.cv_results_['params']
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Train Hamming Loss: {train_hamming_loss:.4f}")
print(f"Test Hamming Loss: {test_hamming_loss:.4f}")
for i in range(len(params)):
    print(f"Score = {round(mean_score[i], 2)} ± {round(std_score[i], 2)} for {params[i]}")


Training for 'Family' label
Fitting 10 folds for each of 40 candidates, totalling 400 fits
The best parameter values for 'Family': {'estimator__C': 10.0, 'estimator__gamma': 1.0}


Prediction Metrics for 'Family':
                 precision    recall  f1-score   support

      Bufonidae       0.93      0.87      0.90        15
  Dendrobatidae       0.99      0.99      0.99       154
        Hylidae       0.99      0.98      0.99       673
Leptodactylidae       0.99      1.00      0.99      1317

       accuracy                           0.99      2159
      macro avg       0.97      0.96      0.97      2159
   weighted avg       0.99      0.99      0.99      2159


Scores for different parameter options:
Test Accuracy: 0.9893
Train Accuracy: 0.9980
Test Accuracy: 0.9893
Train Hamming Loss: 0.0020
Test Hamming Loss: 0.0107
Score = 0.85 ± 0.01 for {'estimator__C': 10.0, 'estimator__gamma': 0.001}
Score = 0.92 ± 0.01 for {'estimator__C': 10.0, 'estimator__gamma': 0.01}
Score = 0.96 ± 0.01

***For Genus***

In [345]:

# Tuning parameters
tuning_params = {'estimator__C': np.logspace(1, 4, 4), 
                 'estimator__gamma': np.logspace(-3, 6, 10)}

# Cross-validation strategy
cv = StratifiedKFold(10, random_state=5036, shuffle=True)

# Base SVM classifier with RBF kernel
base_classifier = SVC(kernel='rbf', probability=True)

# Wrap with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(base_classifier)

# GridSearchCV for OVR
grid_search_ovr = GridSearchCV(estimator=ovr_classifier, param_grid=tuning_params, cv=cv, verbose=1, n_jobs=-1)

# Example for "Family" label
print("Training for 'Genus' label")
grid_search_ovr.fit(train_X, train_Genus)
# Best hyperparameters
print("The best parameter values for 'Genus':", grid_search_ovr.best_params_)

# Prediction and metrics
yGenus_pred_OVR = grid_search_ovr.predict(test_X)
print("\n\nPrediction Metrics for 'Genus':")
print(classification_report(test_Genus, yGenus_pred_OVR))
test_accuracy = accuracy_score(test_Genus, yGenus_pred_OVR)
test_hamming_loss = hamming_loss(test_Genus, yGenus_pred_OVR)
# Display grid search scores for different parameter combinations
print("\nScores for different parameter options:")
y_train_pred = grid_search_ovr.predict(train_X)
train_accuracy = accuracy_score(train_Genus, y_train_pred)
train_hamming_loss = hamming_loss(train_Genus, y_train_pred)
test_hamming_loss = hamming_loss(test_Genus, yGenus_pred_OVR)
print(f"Test Accuracy: {test_accuracy:.4f}")
mean_score = grid_search_ovr.cv_results_['mean_test_score']
std_score = grid_search_ovr.cv_results_['std_test_score']
params = grid_search_ovr.cv_results_['params']
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Train Hamming Loss: {train_hamming_loss:.4f}")
print(f"Test Hamming Loss: {test_hamming_loss:.4f}")
for i in range(len(params)):
    print(f"Score = {round(mean_score[i], 2)} ± {round(std_score[i], 2)} for {params[i]}")

Training for 'Genus' label
Fitting 10 folds for each of 40 candidates, totalling 400 fits
The best parameter values for 'Genus': {'estimator__C': 1000.0, 'estimator__gamma': 1.0}


Prediction Metrics for 'Genus':
               precision    recall  f1-score   support

    Adenomera       0.99      1.00      1.00      1240
     Ameerega       0.99      1.00      0.99       154
Dendropsophus       0.98      0.95      0.96        91
    Hypsiboas       0.99      0.99      0.99       479
Leptodactylus       0.94      0.99      0.96        77
Osteocephalus       1.00      0.87      0.93        47
     Rhinella       1.00      0.93      0.97        15
       Scinax       1.00      0.98      0.99        56

     accuracy                           0.99      2159
    macro avg       0.99      0.96      0.97      2159
 weighted avg       0.99      0.99      0.99      2159


Scores for different parameter options:
Test Accuracy: 0.9893
Train Accuracy: 1.0000
Test Accuracy: 0.9893
Train Hamming Lo

### For Species

In [347]:
# Tuning parameters
tuning_params = {'estimator__C': np.logspace(1, 4, 4), 
                 'estimator__gamma': np.logspace(-3, 6, 10)}

# Cross-validation strategy
cv = StratifiedKFold(10, random_state=5036, shuffle=True)

# Base SVM classifier with RBF kernel
base_classifier = SVC(kernel='rbf', probability=True)

# Wrap with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(base_classifier)

# GridSearchCV for OVR
grid_search_ovr = GridSearchCV(estimator=ovr_classifier, param_grid=tuning_params, cv=cv, verbose=1, n_jobs=-1)

# Example for "Family" label
print("Training for 'Species' label")
grid_search_ovr.fit(train_X, train_Species)
# Best hyperparameters
print("The best parameter values for 'Species':", grid_search_ovr.best_params_)

# Prediction and metrics
ySpecies_pred_OVR = grid_search_ovr.predict(test_X)
print("\n\nPrediction Metrics for 'Species':")
print(classification_report(test_Species, ySpecies_pred_OVR))
test_accuracy = accuracy_score(test_Species, ySpecies_pred_OVR)
test_hamming_loss = hamming_loss(test_Species, ySpecies_pred_OVR)
# Display grid search scores for different parameter combinations
print("\nScores for different parameter options:")
y_train_pred = grid_search_ovr.predict(train_X)
train_accuracy = accuracy_score(train_Species, y_train_pred)
train_hamming_loss = hamming_loss(train_Species, y_train_pred)
test_hamming_loss = hamming_loss(test_Species, ySpecies_pred_OVR)
print(f"Test Accuracy: {test_accuracy:.4f}")
mean_score = grid_search_ovr.cv_results_['mean_test_score']
std_score = grid_search_ovr.cv_results_['std_test_score']
params = grid_search_ovr.cv_results_['params']
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Train Hamming Loss: {train_hamming_loss:.4f}")
print(f"Test Hamming Loss: {test_hamming_loss:.4f}")
for i in range(len(params)):
    print(f"Score = {round(mean_score[i], 2)} ± {round(std_score[i], 2)} for {params[i]}")

Training for 'Species' label
Fitting 10 folds for each of 40 candidates, totalling 400 fits
The best parameter values for 'Species': {'estimator__C': 10.0, 'estimator__gamma': 1.0}


Prediction Metrics for 'Species':
                        precision    recall  f1-score   support

        AdenomeraAndre       0.99      0.97      0.98       182
AdenomeraHylaedactylus       0.99      1.00      1.00      1058
    Ameeregatrivittata       0.99      1.00      0.99       154
            HylaMinuta       0.98      0.95      0.96        91
  HypsiboasCinerascens       0.97      0.99      0.98       153
     HypsiboasCordobae       0.99      0.98      0.99       326
   LeptodactylusFuscus       0.97      1.00      0.99        77
 OsteocephalusOophagus       0.98      0.87      0.92        47
     Rhinellagranulosa       1.00      1.00      1.00        15
           ScinaxRuber       0.98      0.98      0.98        56

              accuracy                           0.99      2159
             

In [349]:
y_pred_multi = pd.DataFrame(columns=["Family","Genus","Species"])
y_pred_multi.loc[:, "Family"] = yTest_pred_OVR
y_pred_multi.loc[:, "Genus"] = yGenus_pred_OVR
y_pred_multi.loc[:, "Species"] = ySpecies_pred_OVR
y_true_multi=test.iloc[:, -4:-1]
   
print("Multilabel evaluation - OnevsRest SVC (without Standardization)")
hamming_lossOnevsRest=hammingloss(y_true_multi,y_pred_multi)
exact_match_ratioOnevsRest=exact_match_ratio(y_true_multi,y_pred_multi)
print("Hamming Loss : "+str(hamming_lossOnevsRest))
print("Exact Match Ratio : "+str(exact_match_ratioOnevsRest))

Multilabel evaluation - OnevsRest SVC (without Standardization)
Hamming Loss : 0.0111
Exact Match Ratio : 0.984251968503937


### b(iii) - L1

In [287]:

def evaluate_l1_svm(train_X, train_y, test_X, test_y, label_name):
    print(f"\nTraining L1-penalized SVM for label: {label_name}")
    
    # Standardize the features
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    print(f"Features standardized for {label_name}.")
    
    # Define the L1-penalized SVM
    classifier = LinearSVC(penalty='l1', dual=False)
    
    # Tuning parameters
    tuning_params = {'C': np.logspace(1, 5, 10)}
    
    # Cross-validation strategy
    cv = StratifiedKFold(10, random_state=5036, shuffle=True)
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=classifier, param_grid=tuning_params, cv=cv, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(train_X, train_y)
    
    # Best parameters
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_
    print(f"Best parameters for {label_name}: {best_params}")
    print(f"Best cross-validation score for {label_name}: {best_cv_score:.4f}")
    
    # Final model
    best_model = grid_search.best_estimator_
    
    # Train predictions
    y_train_pred = best_model.predict(train_X)
    train_accuracy = accuracy_score(train_y, y_train_pred)
    
    # Test predictions
    y_pred = best_model.predict(test_X)
    test_accuracy = accuracy_score(test_y, y_pred)
    test_hamming_loss_value = hamming_loss(test_y, y_pred)  # Ensure function isn't shadowed
    test_exact_match_ratio = np.mean(np.array(y_pred) == np.array(test_y))
    
    # Results summary
    print(f"\nResults for {label_name}:")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Hamming Loss: {test_hamming_loss_value:.4f}")
    print(f"Test Exact Match Ratio: {test_exact_match_ratio:.4f}")
    print(f"\nClassification Report for {label_name}:\n")
    print(classification_report(test_y, y_pred))
    
    return y_pred





In [289]:
y_familyPredL1=evaluate_l1_svm(train_X, train_Family, test_X, test_Family, label_name="Family")


Training L1-penalized SVM for label: Family
Features standardized for Family.
Fitting 10 folds for each of 10 candidates, totalling 100 fits




Best parameters for Family: {'C': 10.0}
Best cross-validation score for Family: 0.9331

Results for Family:
Train Accuracy: 0.9353
Test Accuracy: 0.9412
Test Hamming Loss: 0.0588
Test Exact Match Ratio: 0.9412

Classification Report for Family:

                 precision    recall  f1-score   support

      Bufonidae       0.00      0.00      0.00        15
  Dendrobatidae       0.88      0.91      0.89       154
        Hylidae       0.94      0.90      0.92       673
Leptodactylidae       0.95      0.98      0.96      1317

       accuracy                           0.94      2159
      macro avg       0.69      0.70      0.69      2159
   weighted avg       0.93      0.94      0.94      2159





In [291]:
y_genusPredL1=evaluate_l1_svm(train_X, train_Genus, test_X, test_Genus, label_name="Genus")



Training L1-penalized SVM for label: Genus
Features standardized for Genus.
Fitting 10 folds for each of 10 candidates, totalling 100 fits




Best parameters for Genus: {'C': 10.0}
Best cross-validation score for Genus: 0.9508

Results for Genus:
Train Accuracy: 0.9541
Test Accuracy: 0.9491
Test Hamming Loss: 0.0509
Test Exact Match Ratio: 0.9491

Classification Report for Genus:

               precision    recall  f1-score   support

    Adenomera       0.96      0.99      0.98      1240
     Ameerega       0.93      0.95      0.94       154
Dendropsophus       0.94      0.65      0.77        91
    Hypsiboas       0.91      0.98      0.95       479
Leptodactylus       0.99      0.90      0.94        77
Osteocephalus       1.00      0.34      0.51        47
     Rhinella       0.92      0.73      0.81        15
       Scinax       0.94      0.91      0.93        56

     accuracy                           0.95      2159
    macro avg       0.95      0.81      0.85      2159
 weighted avg       0.95      0.95      0.94      2159





In [293]:
y_speciesPredL1=evaluate_l1_svm(train_X, train_Species, test_X, test_Species, label_name="Species")



Training L1-penalized SVM for label: Species
Features standardized for Species.
Fitting 10 folds for each of 10 candidates, totalling 100 fits




Best parameters for Species: {'C': 599.4842503189409}
Best cross-validation score for Species: 0.9601

Results for Species:
Train Accuracy: 0.9645
Test Accuracy: 0.9514
Test Hamming Loss: 0.0486
Test Exact Match Ratio: 0.9514

Classification Report for Species:

                        precision    recall  f1-score   support

        AdenomeraAndre       0.90      0.93      0.92       182
AdenomeraHylaedactylus       0.98      1.00      0.99      1058
    Ameeregatrivittata       0.94      0.95      0.94       154
            HylaMinuta       0.93      0.69      0.79        91
  HypsiboasCinerascens       0.91      0.95      0.93       153
     HypsiboasCordobae       0.92      0.96      0.94       326
   LeptodactylusFuscus       0.96      0.92      0.94        77
 OsteocephalusOophagus       1.00      0.47      0.64        47
     Rhinellagranulosa       0.74      0.93      0.82        15
           ScinaxRuber       0.95      0.96      0.96        56

              accuracy         



In [307]:
y_pred_multi_L1 = pd.DataFrame(columns=["Family","Genus","Species"])
y_pred_multi_L1.loc[:, "Family"] = y_familyPredL1
y_pred_multi_L1.loc[:, "Genus"] = y_genusPredL1
y_pred_multi_L1.loc[:, "Species"] = y_speciesPredL1
y_true_multi=test.iloc[:, -4:-1]
   
print("Multilabel evaluation - SVC with L1 Penalty")
hamming_loss_L1=hammingloss(y_true_multi,y_pred_multi_L1)
exact_match_ratio_L1=exact_match_ratio(y_true_multi,y_pred_multi_L1)
print("Hamming Loss : "+str(hamming_loss_L1))
print("Exact Match Ratio : "+str(exact_match_ratio_L1))

Multilabel evaluation - SVC with L1 Penalty
Hamming Loss : 0.0528
Exact Match Ratio : 0.9138490041685966


### (b)iv

In [123]:
scaler = StandardScaler()

scaledTrainX = scaler.fit_transform(train_X)
scaledTestX = scaler.fit_transform(test_X)

### Family Label : 

In [309]:
smote_tuning_params = {'classification__C' : np.logspace(1, 5, 10)}
classifierForSmote= Pipeline([('sampling', SMOTE()), ('classification', LinearSVC(penalty = 'l1', dual = False) )])
#For Label Family:
familySmotemodel = GridSearchCV(estimator = classifierForSmote, param_grid = smote_tuning_params, cv = cv, verbose = 1)
familySmotemodel.fit(scaledTrainX, train_Family)


Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [311]:
print("The best parameter values: " + str(familySmotemodel.best_params_))
print("\n\nPrediction Metrics: ")
y_familyPredSmote = familySmotemodel.predict(scaledTestX)
print(classification_report(test_Family, y_familyPredSmote))
print("Scores for different parameter options")
mean_score = familySmotemodel.cv_results_['mean_test_score']
std_score = familySmotemodel.cv_results_['std_test_score']
params = familySmotemodel.cv_results_['params']
hamming_loss_value = hamming_loss(test_Family, y_pred)
print(f"Hamming Loss: {hamming_loss_value:.4f}")
 #Exact Match Ratio
exact_match_ratio = accuracy_score(test_Family, y_pred)  # Same as accuracy in single-label tasks
print(f"Exact Match Ratio: {exact_match_ratio:.4f}")
for i in range(len(params)):
    print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))

The best parameter values: {'classification__C': 1668.100537200059}


Prediction Metrics: 
                 precision    recall  f1-score   support

      Bufonidae       0.23      1.00      0.37        15
  Dendrobatidae       0.79      0.98      0.88       154
        Hylidae       0.95      0.86      0.91       673
Leptodactylidae       0.96      0.95      0.95      1317

       accuracy                           0.92      2159
      macro avg       0.73      0.95      0.78      2159
   weighted avg       0.94      0.92      0.93      2159

Scores for different parameter options
Hamming Loss: 1.0000
Exact Match Ratio: 0.0000
Score = 0.91 for {'classification__C': 10.0}
Score = 0.91 for {'classification__C': 27.825594022071243}
Score = 0.91 for {'classification__C': 77.4263682681127}
Score = 0.91 for {'classification__C': 215.44346900318823}
Score = 0.92 for {'classification__C': 599.4842503189409}
Score = 0.92 for {'classification__C': 1668.100537200059}
Score = 0.91 for {'classific

### Genus Label

In [316]:
genusSmotemodel = GridSearchCV(estimator = classifierForSmote, param_grid = smote_tuning_params, cv = cv, verbose = 1)
genusSmotemodel.fit(scaledTrainX, train_Genus)

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [318]:
print("The best parameter values: " + str(genusSmotemodel.best_params_))
print("\n\nPrediction Metrics: ")
y_genusPredSmote = genusSmotemodel.predict(scaledTestX)
print(classification_report(test_Genus, y_genusPredSmote))
print("Scores for different parameter options")
mean_score = genusSmotemodel.cv_results_['mean_test_score']
std_score = genusSmotemodel.cv_results_['std_test_score']
params = genusSmotemodel.cv_results_['params']
hamming_loss_value = hamming_loss(test_Genus, y_genusPredSmote)
print(f"Hamming Loss: {hamming_loss_value:.4f}")
# Exact Match Ratio
exact_match_ratio = accuracy_score(test_Genus, y_genusPredSmote)  # Same as accuracy in single-label tasks
print(f"Exact Match Ratio: {exact_match_ratio:.4f}")
for i in range(len(params)):
    print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))

The best parameter values: {'classification__C': 4641.588833612777}


Prediction Metrics: 
               precision    recall  f1-score   support

    Adenomera       0.99      0.92      0.95      1240
     Ameerega       0.82      0.91      0.86       154
Dendropsophus       0.65      0.97      0.78        91
    Hypsiboas       0.97      0.89      0.93       479
Leptodactylus       0.92      0.91      0.92        77
Osteocephalus       0.57      0.79      0.66        47
     Rhinella       0.21      0.87      0.34        15
       Scinax       0.86      0.96      0.91        56

     accuracy                           0.91      2159
    macro avg       0.75      0.90      0.79      2159
 weighted avg       0.94      0.91      0.92      2159

Scores for different parameter options
Hamming Loss: 0.0912
Exact Match Ratio: 0.9088
Score = 0.91 for {'classification__C': 10.0}
Score = 0.91 for {'classification__C': 27.825594022071243}
Score = 0.91 for {'classification__C': 77.4263682681127}

### Species

In [149]:
speciesSmotemodel = GridSearchCV(estimator = classifierForSmote, param_grid = smote_tuning_params, cv = cv, verbose = 1)
speciesSmotemodel.fit(scaledTrainX, train_Species)

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [320]:
print("The best parameter values: " + str(speciesSmotemodel.best_params_))
print("\n\nPrediction Metrics: ")
y_speciesPredSmote = speciesSmotemodel.predict(scaledTestX)
print(classification_report(test_Species, y_speciesPredSmote))
print("Scores for different parameter options")
mean_score = speciesSmotemodel.cv_results_['mean_test_score']
std_score = speciesSmotemodel.cv_results_['std_test_score']
params = speciesSmotemodel.cv_results_['params']
hamming_loss_value = hamming_loss(test_Species, y_speciesPredSmote)
print(f"Hamming Loss: {hamming_loss_value:.4f}")
# Exact Match Ratio
exact_match_ratio = accuracy_score(test_Species, y_speciesPredSmote)  # Same as accuracy in single-label tasks
print(f"Exact Match Ratio: {exact_match_ratio:.4f}")
for i in range(len(params)):
    print("Score = " + str(round(mean_score[i], 2)) + " for " + str(params[i]))

The best parameter values: {'classification__C': 1668.100537200059}


Prediction Metrics: 
                        precision    recall  f1-score   support

        AdenomeraAndre       0.95      0.93      0.94       182
AdenomeraHylaedactylus       0.99      1.00      0.99      1058
    Ameeregatrivittata       0.94      0.89      0.92       154
            HylaMinuta       0.83      0.90      0.86        91
  HypsiboasCinerascens       0.92      0.92      0.92       153
     HypsiboasCordobae       0.95      0.90      0.92       326
   LeptodactylusFuscus       0.93      0.90      0.91        77
 OsteocephalusOophagus       0.80      0.70      0.75        47
     Rhinellagranulosa       0.37      0.93      0.53        15
           ScinaxRuber       0.89      0.96      0.92        56

              accuracy                           0.95      2159
             macro avg       0.86      0.90      0.87      2159
          weighted avg       0.95      0.95      0.95      2159

Scores for

In [336]:
y_pred_multi_smote = pd.DataFrame(columns=["Family","Genus","Species"])
y_pred_multi_smote.loc[:, "Family"] = y_familyPredSmote
y_pred_multi_smote.loc[:, "Genus"] = y_genusPredSmote
y_pred_multi_smote.loc[:, "Species"] = y_speciesPredSmote
y_true_multi=test.iloc[:, -4:-1]
   
print("Multilabel evaluation -  SVC with L1-Penalty and SMOTE")
hamming_loss_L1_smote=hammingloss(y_true_multi,y_pred_multi_smote)
exact_match_ratio_L1_smote=exact_match_ratio(y_true_multi,y_pred_multi_smote)
print("Hamming Loss : "+str(hamming_loss_L1_smote))
print("Exact Match Ratio : "+str(exact_match_ratio_L1_smote))

Multilabel evaluation -  SVC with L1-Penalty and SMOTE
Hamming Loss : 0.0735
Exact Match Ratio : 0.8527095877721167


In [355]:
report = {'Gaussian SVM without Standardization': [hamming_lossGaussian,exact_match_ratioGaussian],
          'OnevsRest SVM without Standardization':[hamming_lossOnevsRest,exact_match_ratioOnevsRest],
           'SVM with L1-Penalty': [hamming_loss_L1,exact_match_ratio_L1],
           'SVM with L1-Penalty and SMOTE': [hamming_loss_L1_smote,exact_match_ratio_L1_smote]}

summary_df = pd.DataFrame(report, index=['Hamming Loss','Exact Match Ratio'])
summary_df


Unnamed: 0,Gaussian SVM without Standardization,OnevsRest SVM without Standardization,SVM with L1-Penalty,SVM with L1-Penalty and SMOTE
Hamming Loss,0.0096,0.0111,0.0528,0.0735
Exact Match Ratio,0.985178,0.984252,0.913849,0.85271


***The obtained hamming loss and exact match values indicate that Gaussian SVM without standardization
has the best performance amongst the four.***

***In L1-penalized SVM, we observe an increase in hamming loss and a decrease in exact match in comparison to Gaussian kernel SVM***

***The exact match ratio decreases and Hamming loss increases when we apply SMOTE to remedy class imbalance with L1-penalized SVMs. This indicates that applying SMOTE does not improve the performance of L1-penalized SVM in this case.***

### K-Means Clustering on a Multi-Class and Multi-Label Data Set
### Monte-Carlo Simulation: ###

In [187]:
table = {'TotalIterations': [], 'Optimal-K': [], 
          'Hamming Distance': [], 'Hamming Score': [], 'Hamming Loss': [], 
          'Majority Triplets': []}
for i in range(1, 51):
    #initialize for each iteration
    optimalK = 2
    maxScore = 0 # track best silhouete score for best k
    table['TotalIterations'].append(i)
    print(i)
    for j in range(2, 51):
        rand_val = random.randint(1, 50)
        k_means = KMeans(n_clusters=j, random_state=rand_val).fit(X)
        s_score = silhouette_score(X, k_means.labels_)
        if s_score > maxScore:
            maxScore = s_score
            optimalK = j
    table['Optimal-K'].append(optimalK)
    # Step 2: Run KMeans with optimal K
    rand_val = random.randint(1, 50)
    k_means = KMeans(n_clusters=optimalK, random_state=rand_val).fit(X)
    cluster_labels = k_means.labels_
    clustered_df = pd.concat([X, Y, pd.DataFrame({'labels': cluster_labels.tolist()})], axis=1)
    # Step 3: Calculate majority triplets
    majority_triplets = []
    total_mismatches = 0
    
    for cluster_id in range(optimalK):
        cluster_data = clustered_df[clustered_df['labels'] == cluster_id]
        
        family_majority = cluster_data['Family'].value_counts().idxmax()
        genus_majority = cluster_data['Genus'].value_counts().idxmax()
        species_majority = cluster_data['Species'].value_counts().idxmax()
        
        majority_triplets.append((family_majority, genus_majority, species_majority))
        
        # Count mismatches for Hamming metrics
        mismatches = cluster_data[['Family', 'Genus', 'Species']] != [family_majority, genus_majority, species_majority]
        total_mismatches += mismatches.sum().sum()
    
    table['Majority Triplets'].append(majority_triplets)
    # Step 4: Calculate Hamming metrics
    n_samples = len(clustered_df)
    hamming_distance = total_mismatches / n_samples
    hamming_loss = total_mismatches / (3 * n_samples)
    hamming_score = 1 - hamming_loss
    
    table['Hamming Distance'].append(hamming_distance)
    table['Hamming Loss'].append(hamming_loss)
    table['Hamming Score'].append(hamming_score)

# Display results summary
final_df = pd.DataFrame(table)
print(final_df.describe())

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
       TotalIterations  Optimal-K  Hamming Distance  Hamming Score  \
count         50.00000  50.000000         50.000000      50.000000   
mean          25.50000   4.580000          0.680859       0.773047   
std           14.57738   1.196764          0.125582       0.041861   
min            1.00000   3.000000          0.445587       0.701784   
25%           13.25000   4.000000          0.591244       0.755779   
50%           25.50000   4.000000          0.700903       0.766366   
75%           37.75000   5.000000          0.732662       0.802919   
max           50.00000   8.000000          0.894649       0.851471   

       Hamming Loss  
count     50.000000  
mean       0.226953  
std        0.041861  
min        0.148529  
25%        0.197081  
50%        0.233634  
75%        0.244221  
max        0.298216  


In [189]:
final_df

Unnamed: 0,TotalIterations,Optimal-K,Hamming Distance,Hamming Score,Hamming Loss,Majority Triplets
0,1,4,0.716053,0.761316,0.238684,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
1,2,6,0.497012,0.834329,0.165671,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
2,3,4,0.736901,0.754366,0.245634,"[(Dendrobatidae, Ameerega, Ameeregatrivittata)..."
3,4,4,0.700903,0.766366,0.233634,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."
4,5,5,0.669354,0.776882,0.223118,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."
5,6,4,0.893676,0.702108,0.297892,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
6,7,5,0.503266,0.832245,0.167755,"[(Dendrobatidae, Ameerega, Ameeregatrivittata)..."
7,8,5,0.645448,0.784851,0.215149,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."
8,9,3,0.847255,0.717582,0.282418,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
9,10,6,0.598888,0.800371,0.199629,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."


In [201]:
print(final_df.iloc[1, 5])
print(len(final_df.iloc[1, 5]))

[('Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'), ('Leptodactylidae', 'Adenomera', 'AdenomeraAndre'), ('Leptodactylidae', 'Adenomera', 'AdenomeraAndre'), ('Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'), ('Dendrobatidae', 'Ameerega', 'Ameeregatrivittata'), ('Hylidae', 'Hypsiboas', 'HypsiboasCordobae')]
6


In [207]:
res = pd.DataFrame(table)
avg = res.describe().loc['mean']
std = res.describe().loc['std']

print("Average Hamming Distance: " + str(round(avg['Hamming Distance'], 4)))
print("Average Hamming Score: " + str(round(avg['Hamming Score'], 4)))
print("Average Hamming Loss: " + str(round(avg['Hamming Loss'], 4)))

print("Standard deviation of Hamming Distance: " + str(round(std['Hamming Distance'], 4)))
print("Standard deviation of Hamming Score: " + str(round(std['Hamming Score'], 4)))
print("Standard deviation of Hamming Loss: " + str(round(std['Hamming Loss'], 4)))

Average Hamming Distance: 0.6809
Average Hamming Score: 0.773
Average Hamming Loss: 0.227
Standard deviation of Hamming Distance: 0.1256
Standard deviation of Hamming Score: 0.0419
Standard deviation of Hamming Loss: 0.0419


In [231]:
print("Listing down majority triplets for each cluster in each iteration")
for idx, row in final_df.iterrows():
    majority_triplets = row['Majority Triplets']  # Extract the list of triplets
    # Create a DataFrame for the triplets
    triplet_df = pd.DataFrame(majority_triplets, columns=['Family', 'Genus', 'Species'])
    
    # Print the DataFrame for this row
    
    print(f"DataFrame for Majority Triplet inRow {idx + 1}:")
    print(triplet_df)
    print("\n")

Listing down majority triplets for each cluster in each iteration
DataFrame for Majority Triplet inRow 1:
            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
2          Hylidae  Hypsiboas       HypsiboasCordobae
3    Dendrobatidae   Ameerega      Ameeregatrivittata


DataFrame for Majority Triplet inRow 2:
            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1  Leptodactylidae  Adenomera          AdenomeraAndre
2  Leptodactylidae  Adenomera          AdenomeraAndre
3          Hylidae  Hypsiboas    HypsiboasCinerascens
4    Dendrobatidae   Ameerega      Ameeregatrivittata
5          Hylidae  Hypsiboas       HypsiboasCordobae


DataFrame for Majority Triplet inRow 3:
            Family      Genus                 Species
0    Dendrobatidae   Ameerega      Ameeregatrivittata
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus


### ISLR : 12.6.2 ###

### Check Github , Added in github directly ###