## Imports

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, hamming_loss, silhouette_score
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import random
import operator
import statistics

# Getting warnings even after adjusting max iterations to 1,000,000, so decided to filter out 
# as there were no other warnings when running through
import warnings
warnings.filterwarnings("ignore")

## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

(a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.
uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data
randomly as the training set.

In [5]:
df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1


In [6]:
train_df, test_df = train_test_split(df, test_size=0.3)

In [7]:
x_train = train_df.drop(columns=['Family','Genus','Species','RecordID'], axis=1)
y_train_fam = train_df['Family']
y_train_gen = train_df['Genus']
y_train_spe = train_df['Species']

x_test = test_df.drop(columns=['Family','Genus','Species','RecordID'], axis=1)
y_test_fam = test_df['Family']
y_test_gen = test_df['Genus']
y_test_spe = test_df['Species']

(b)
Each instance has three labels: Families, Genus, and Species. Each of the labels
has multiple classes. We wish to solve a multi-class and multi-label problem.
One of the most important approaches to multi-label classification is to train a
classifier for each label (binary relevance). We first try this approach:

i.
Research exact match and hamming score/ loss methods for evaluating multi-
label classification and use them in evaluating the classifiers in this problem.

Exact Match: Measurement of the percentage of predicted labels that exactly match the actual labels  
Hamming Loss: Fraction of observations for which actual labels do not match predicted labels

ii.
Train a SVM for each of the labels, using Gaussian kernels and one versus
all classifiers. Determine the weight of the SVM penalty and the width of
the Gaussian Kernel using 10 fold cross validation.
You are welcome to try
to solve the problem with both standardized
and raw attributes and report
the results.

In [8]:
# Only using 10 different values for each parameter due to long run times
c_values = np.logspace(-3, 6, 10)
gamma_values = np.linspace(0.1, 2, 10)

In [9]:
# Determining parameters for family label
svc = SVC(kernel='rbf')
params = {'C':c_values, 'gamma':gamma_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train, y_train_fam)

c_fam = grid.best_params_['C']
gamma_fam = grid.best_params_['gamma']

print("Best parameters for family label SVM:", grid.best_params_)

Best parameters for family label SVM: {'C': 100.0, 'gamma': 1.788888888888889}


In [10]:
# Determining parameters for genus label
svc = SVC(kernel='rbf')
params = {'C':c_values, 'gamma':gamma_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train, y_train_gen)

c_gen = grid.best_params_['C']
gamma_gen = grid.best_params_['gamma']

print("Best parameters for genus label SVM:", grid.best_params_)

Best parameters for genus label SVM: {'C': 100.0, 'gamma': 2.0}


In [11]:
# Determining parameters for species label
svc = SVC(kernel='rbf')
params = {'C':c_values, 'gamma':gamma_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train, y_train_spe)

c_spe = grid.best_params_['C']
gamma_spe = grid.best_params_['gamma']

print("Best parameters for species label SVM:", grid.best_params_)

Best parameters for species label SVM: {'C': 10.0, 'gamma': 2.0}


In [12]:
svc = SVC(kernel='rbf',C=c_fam,gamma=gamma_fam)
svc.fit(x_train, y_train_fam)
y_pred_fam = svc.predict(x_test)
exact_match_fam = accuracy_score(y_test_fam, y_pred_fam)
hamming_loss_fam = hamming_loss(y_test_fam, y_pred_fam)

print("Family Exact Match Score:", exact_match_fam)
print("Family Hamming Loss:", hamming_loss_fam)

Family Exact Match Score: 0.9958314034275128
Family Hamming Loss: 0.0041685965724872626


In [13]:
svc = SVC(kernel='rbf',C=c_gen,gamma=gamma_gen)
svc.fit(x_train, y_train_gen)
y_pred_gen = svc.predict(x_test)
exact_match_gen = accuracy_score(y_test_gen, y_pred_gen)
hamming_loss_gen = hamming_loss(y_test_gen, y_pred_gen)

print("Genus Exact Match Score:", exact_match_gen)
print("Genus Hamming Loss:", hamming_loss_gen)

Genus Exact Match Score: 0.9930523390458545
Genus Hamming Loss: 0.006947660954145438


In [14]:
svc = SVC(kernel='rbf',C=c_spe,gamma=gamma_spe)
svc.fit(x_train, y_train_spe)
y_pred_spe = svc.predict(x_test)
exact_match_spe = accuracy_score(y_test_spe, y_pred_spe)
hamming_loss_spe = hamming_loss(y_test_spe, y_pred_spe)

print("Species Exact Match Score:", exact_match_spe)
print("Species Hamming Loss:", hamming_loss_spe)

Species Exact Match Score: 0.9925891616489115
Species Hamming Loss: 0.007410838351088467


iii.
Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [15]:
# Standardizing features
scaler = preprocessing.MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [16]:
# Determining parameters for family label
svc = LinearSVC(penalty='l1', dual=False)
params = {'C':c_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train_scaled, y_train_fam)

c_fam = grid.best_params_['C']

print("Best parameters for family label SVM:", grid.best_params_)

Best parameters for family label SVM: {'C': 1000000.0}


In [17]:
# Determining parameters for genus label
svc = LinearSVC(penalty='l1', dual=False)
params = {'C':c_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train_scaled, y_train_gen)

c_gen = grid.best_params_['C']

print("Best parameters for genus label SVM:", grid.best_params_)

Best parameters for genus label SVM: {'C': 1000.0}


In [18]:
# Determining parameters for species label
svc = LinearSVC(penalty='l1', dual=False)
params = {'C':c_values}
grid = GridSearchCV(svc, param_grid=params, cv=10)
grid.fit(x_train_scaled, y_train_spe)

c_spe = grid.best_params_['C']

print("Best parameters for species label SVM:", grid.best_params_)

Best parameters for species label SVM: {'C': 10000.0}


In [19]:
svc = LinearSVC(penalty='l1', dual=False,C=c_fam, max_iter=1000000)
svc.fit(x_train_scaled, y_train_fam)
y_pred_fam = svc.predict(x_test_scaled)
exact_match_fam = accuracy_score(y_test_fam, y_pred_fam)
hamming_loss_fam = hamming_loss(y_test_fam, y_pred_fam)

print("Family Exact Match Score:", exact_match_fam)
print("Family Hamming Loss:", hamming_loss_fam)

Family Exact Match Score: 0.6975451597962019
Family Hamming Loss: 0.30245484020379804


In [20]:
svc = LinearSVC(penalty='l1', dual=False,C=c_gen, max_iter=1000000)
svc.fit(x_train_scaled, y_train_gen)
y_pred_gen = svc.predict(x_test_scaled)
exact_match_gen = accuracy_score(y_test_gen, y_pred_gen)
hamming_loss_gen = hamming_loss(y_test_gen, y_pred_gen)

print("Genus Exact Match Score:", exact_match_gen)
print("Family Hamming Loss:", hamming_loss_gen)

Genus Exact Match Score: 0.4478925428439092
Family Hamming Loss: 0.5521074571560908


In [21]:
svc = LinearSVC(penalty='l1', dual=False,C=c_spe, max_iter=1000000)
svc.fit(x_train_scaled, y_train_spe)
y_pred_spe = svc.predict(x_test_scaled)
exact_match_spe = accuracy_score(y_test_spe, y_pred_spe)
hamming_loss_spe = hamming_loss(y_test_spe, y_pred_spe)

print("Family Exact Match Score:", exact_match_spe)
print("Family Hamming Loss:", hamming_loss_spe)

Family Exact Match Score: 0.5752663270032422
Family Hamming Loss: 0.42473367299675774


iv.
Repeat 1(b)iii by using SMOTE or any other method you know to remedy
class imbalance. Report your conclusions about the classifiers you trained.

In [25]:
# Case Control Sampling with SMOTE
sm = SMOTE()
x_train_fam_smote, y_train_fam_smote = sm.fit_sample(x_train_scaled, y_train_fam)

svc = LinearSVC(penalty='l1', dual=False)

pipe = Pipeline(steps=[("smote", sm), ("svc", svc)])
params = {'svc__C':c_values}
grid = GridSearchCV(pipe, param_grid=params, cv=10)
grid.fit(x_train_fam_smote, y_train_fam_smote)

c_fam = grid.best_params_['svc__C']

print("Best parameters for family label SVM:", grid.best_params_)

Best parameters for family label SVM: {'svc__C': 100000.0}


In [26]:
sm = SMOTE()
x_train_gen_smote, y_train_gen_smote = sm.fit_sample(x_train_scaled, y_train_gen)

svc = LinearSVC(penalty='l1', dual=False)

pipe = Pipeline(steps=[("smote", sm), ("svc", svc)])
params = {'svc__C':c_values}
grid = GridSearchCV(pipe, param_grid=params, cv=10)
grid.fit(x_train_gen_smote, y_train_gen_smote)

c_gen = grid.best_params_['svc__C']

print("Best parameters for genus label SVM:", grid.best_params_)

Best parameters for genus label SVM: {'svc__C': 10.0}


In [27]:
sm = SMOTE()
x_train_spe_smote, y_train_spe_smote = sm.fit_sample(x_train_scaled, y_train_spe)

svc = LinearSVC(penalty='l1', dual=False)

pipe = Pipeline(steps=[("smote", sm), ("svc", svc)])
params = {'svc__C':c_values}
grid = GridSearchCV(pipe, param_grid=params, cv=10)
grid.fit(x_train_spe_smote, y_train_spe_smote)

c_spe = grid.best_params_['svc__C']

print("Best parameters for species label SVM:", grid.best_params_)

Best parameters for species label SVM: {'svc__C': 100000.0}


In [28]:
svc = LinearSVC(penalty='l1', dual=False,C=c_fam, max_iter=1000000)
svc.fit(x_train_fam_smote, y_train_fam_smote)
y_pred_fam = svc.predict(x_test_scaled)
exact_match_fam = accuracy_score(y_test_fam, y_pred_fam)
hamming_loss_fam = hamming_loss(y_test_fam, y_pred_fam)

print("Family Exact Match Score:", exact_match_fam)
print("Family Hamming Loss:", hamming_loss_fam)

Family Exact Match Score: 0.6127836961556276
Family Hamming Loss: 0.3872163038443724


In [29]:
svc = LinearSVC(penalty='l1', dual=False,C=c_gen, max_iter=1000000)
svc.fit(x_train_gen_smote, y_train_gen_smote)
y_pred_gen = svc.predict(x_test_scaled)
exact_match_gen = accuracy_score(y_test_gen, y_pred_gen)
hamming_loss_gen = hamming_loss(y_test_gen, y_pred_gen)

print("Genus Exact Match Score:", exact_match_gen)
print("Genus Hamming Loss:", hamming_loss_gen)

Genus Exact Match Score: 0.21954608615099583
Genus Hamming Loss: 0.7804539138490042


In [30]:
svc = LinearSVC(penalty='l1', dual=False,C=c_spe, max_iter=1000000)
svc.fit(x_train_spe_smote, y_train_spe_smote)
y_pred_spe = svc.predict(x_test_scaled)
exact_match_spe = accuracy_score(y_test_spe, y_pred_spe)
hamming_loss_spe = hamming_loss(y_test_spe, y_pred_spe)

print("Species Exact Match Score:", exact_match_spe)
print("Species Hamming Loss:", hamming_loss_spe)

Species Exact Match Score: 0.6058360352014822
Species Hamming Loss: 0.39416396479851784


## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set
Monte-Carlo Simulation:
Perform the following procedures 50 times, and report
the average and standard deviation of the 50 Hamming Distances that you calculate.

(a)
Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split
the data into train and test, as we are not performing supervised learning in this
exercise). Choose
k
∈ {
1
,
2
, . . . ,
50
}
automatically based on one of the methods
provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any
other method you know.

(b)
In each cluster, determine which family is the majority by reading the true labels.
Repeat for genus and species.

(c)
Now for each cluster you have a majority label triplet (family, genus, species).
Calculate the average Hamming distance, Hamming score, and Hamming loss
between the true labels and the labels assigned by clusters.

In [31]:
# Going to perform procedure 50 times and report all outputs at the end

# Initializing dataset and labels
x = df.drop(columns=['Family','Genus','Species','RecordID'], axis=1)
y = df[['Family','Genus','Species']]

k_best_list = [] #2a
maj_fam = {k:[] for k in range(50)} #2b
maj_gen = {k:[] for k in range(50)} #2b
maj_spe = {k:[] for k in range(50)} #2b
ham_loss_list = [] #2c
ham_score_list = [] #2c
ham_dist_list = [] #2c

for mcs in range(50):
    # Selecting k-value using silhouette score
    sil_scores = {}
    for k in range(2, 51):
        rand_state =random.randint(0, 1000)
        km = KMeans(n_clusters=k, random_state=rand_state).fit(x)
        labels = km.labels_
        sil_score = silhouette_score(x, labels)
        sil_scores[k] = sil_score
    # Sort scores and determine best k
    sil_scores_sort = sorted(sil_scores.items(), key=operator.itemgetter(1), reverse=True)
    k_best = sil_scores_sort[0][0]
    k_best_list.append(k_best)
    # K-means clustering with optimal k value
    rand_state =random.randint(0, 1000)
    km = KMeans(n_clusters=k_best, random_state=rand_state).fit(x)
    labels = km.labels_
    df_k = pd.concat([x,y,pd.DataFrame({'labels':labels.tolist()})],axis = 1)

    # Determining and assigning majority label
    df_k['Family_pred'] = ""
    df_k['Genus_pred'] = ""
    df_k['Species_pred'] = ""

    maj_label = {k:[] for k in range(k_best)}
    for k in range(k_best):
        label_counts = df_k[df_k['labels']==k]
        # Majority label is prediction
        maj_label[k].append(label_counts['Family'].value_counts().index[0])
        maj_label[k].append(label_counts['Genus'].value_counts().index[0])
        maj_label[k].append(label_counts['Species'].value_counts().index[0])
        maj_fam[mcs].append(label_counts['Family'].value_counts().index[0])
        maj_gen[mcs].append(label_counts['Family'].value_counts().index[0])
        maj_spe[mcs].append(label_counts['Family'].value_counts().index[0])

    for k in range(k_best):
        # Outputting true labels for majority label
        df_k['Family_pred'] = np.where(df_k['labels']==k,maj_label[k][0],df_k['Family_pred'])
        df_k['Genus_pred'] = np.where(df_k['labels']==k,maj_label[k][1],df_k['Genus_pred'])
        df_k['Species_pred'] = np.where(df_k['labels']==k,maj_label[k][2],df_k['Species_pred'])

    ham_loss_fam = hamming_loss(df_k['Family'],df_k['Family_pred'])
    ham_loss_gen = hamming_loss(df_k['Genus'],df_k['Genus_pred'])
    ham_loss_spe = hamming_loss(df_k['Species'],df_k['Species_pred'])
    
    ham_loss = (ham_loss_fam + ham_loss_gen + ham_loss_spe)/3
    ham_score = 1 - ham_loss
    ham_dist = ham_loss * df_k.shape[0]
    
    ham_loss_list.append(ham_loss)
    ham_score_list.append(ham_score)
    ham_dist_list.append(ham_dist)

ham_dist_mean = statistics.mean(ham_dist_list)
ham_dist_std = statistics.stdev(ham_dist_list)
print("Average Hamming Distance:", ham_dist_mean)
print("Standard Deviation of Hamming Distances:", ham_dist_std)

Average Hamming Distance: 1617.8799999999999
Standard Deviation of Hamming Distances: 44.780936177105055


In [32]:
# Using zip to convert to iterable object, and then update to assign majority triplet

maj_trip = {}
ite_obj = list(zip(list(maj_fam.values()),list(maj_gen.values()),list(maj_spe.values())))
for i in range(50):
    maj_trip.update({i:list(zip(ite_obj[i][0],ite_obj[i][1],ite_obj[i][2]))})
maj_trip_list = list(maj_trip.values())
maj_trip_list

[[('Hylidae', 'Hylidae', 'Hylidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae'),
  ('Dendrobatidae', 'Dendrobatidae', 'Dendrobatidae'),
  ('Hylidae', 'Hylidae', 'Hylidae')],
 [('Dendrobatidae', 'Dendrobatidae', 'Dendrobatidae'),
  ('Hylidae', 'Hylidae', 'Hylidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae')],
 [('Hylidae', 'Hylidae', 'Hylidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae')],
 [('Dendrobatidae', 'Dendrobatidae', 'Dendrobatidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae')],
 [('Dendrobatidae', 'Dendrobatidae', 'Dendrobatidae'),
  ('Leptodactylidae', 'Leptodactylidae', 'Leptodactylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae'),
  ('Hylidae', 'Hylidae', 'Hylidae')],
 [('Hylidae', 'Hylidae', 'Hylidae'),
  

In [33]:
df_results=pd.DataFrame({"Best K":k_best_list, "Majority Triplet":maj_trip_list,
                    "Hamming Loss":ham_loss_list, "Hamming Score":ham_score_list,
                   "Hamming Distance":ham_dist_list})
df_results

Unnamed: 0,Best K,Majority Triplet,Hamming Loss,Hamming Score,Hamming Distance
0,4,"[(Hylidae, Hylidae, Hylidae), (Leptodactylidae...",0.222423,0.777577,1600.333333
1,4,"[(Dendrobatidae, Dendrobatidae, Dendrobatidae)...",0.222423,0.777577,1600.333333
2,4,"[(Hylidae, Hylidae, Hylidae), (Leptodactylidae...",0.245263,0.754737,1764.666667
3,4,"[(Dendrobatidae, Dendrobatidae, Dendrobatidae)...",0.222423,0.777577,1600.333333
4,4,"[(Dendrobatidae, Dendrobatidae, Dendrobatidae)...",0.222423,0.777577,1600.333333
5,4,"[(Hylidae, Hylidae, Hylidae), (Leptodactylidae...",0.222423,0.777577,1600.333333
6,4,"[(Hylidae, Hylidae, Hylidae), (Leptodactylidae...",0.222423,0.777577,1600.333333
7,4,"[(Dendrobatidae, Dendrobatidae, Dendrobatidae)...",0.222423,0.777577,1600.333333
8,4,"[(Leptodactylidae, Leptodactylidae, Leptodacty...",0.221774,0.778226,1595.666667
9,4,"[(Leptodactylidae, Leptodactylidae, Leptodacty...",0.222423,0.777577,1600.333333
