Firt name: Kamiar (Kamy)\
Last name: Asgari (Gary)\
USC ID: 4590556658\
Github Username: kamiarasgari

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, silhouette_score
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

import sys
import os

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

# 1. Multi-class and Multi-Label Classification Using Support Vector Machines

## 1(a). Download the Anuran Calls ...

In [2]:
root = "../data/Frogs_MFCCs.csv"
mfcc_data = pd.read_csv(root)
mfcc_data

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [3]:
# split features and labels
labels_list = ['Family','Genus','Species']
X_full = mfcc_data.drop(labels_list, axis=1)
y_full = mfcc_data[labels_list]

# split train and test
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, random_state=42)

## 1(b). Each instance has three labels: Families, Genus, and Species. Each of the l...

### 1(b)-i. Research exact match and hamming score/ loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.

**Exact Match Ratio**:
Exact Match Ratio calculates the percentage of samples for which all the predicted labels match the true labels. In other words, it measures the accuracy of the entire set of predicted labels for each sample.

**Hamming Loss**:
Hamming Loss measures the fraction of labels that are incorrectly predicted. It calculates the average fraction of labels that are incorrect over all samples.

In [4]:
def multilableMetric(y_true, y_pred):
    
    temp = np.array(y_pred)!=np.array(y_true)
    
    exact_ratio = np.sum(np.all(temp,axis=1))/y_true.shape[0]
    hamming = np.sum(temp)/y_true.shape[0]/y_true.shape[1]
    
    hamming = np.round(hamming, 4)
    exact_ratio = 1-np.round(exact_ratio, 4)
    
    ans = {
        "Hamming Loss": [hamming],
        "Exact Match Ratio": [exact_ratio]
    }
    print(pd.DataFrame(data=ans))
    return hamming, exact_ratio

def multilabelEval(testX, y_true, classifiers):
    predictY = pd.DataFrame(columns=y_true.columns)
    for label in y_true.columns:
        clf = classifiers[label]
        test_pred = clf.predict(testX)
        predictY.loc[:, label] = test_pred
    hamming, exact_ratio = multilableMetric(y_true, predictY)
    return [hamming, exact_ratio]

### 1(b)-ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classif ...

In [5]:
def Searcher( GridSearchCV_param, X_train, y_train, X_test, y_test):
    # build and fit the grid search for given classifier
    clf = GridSearchCV(**GridSearchCV_param)
    clf.fit(X_train, y_train)
    
    # output the best option
    print("\nThe best parameter setting is:")
    print(clf.best_params_, "\n")
    
    # evaluation on single-label task
    test_pred = clf.predict(X_test)
    print(classification_report(y_test, test_pred))
    return clf

In [6]:
def reporter(label, classifier, X_test, y_test):
    print('Class:',label,'\n')
    print('The best parameter setting is:')
    print(classifier.best_params_,'\n')

    test_pred = classifier.predict(X_test)
    print('one-versus-rest performance on the test data:')
    print(classification_report(y_test, test_pred)) 
    print(50*'*')

In [7]:
# Gaussian SVC without standardization
gaussianSVC_classifiers = {}
gaussianSVC_param_grid ={
    labels_list[0] : {'C' : np.logspace(-1, 1, 11), 'gamma' : np.linspace(0, 1, 15)},
    labels_list[1] : {'C' : np.logspace(-1, 1, 11), 'gamma' : np.linspace(0, 1, 15)},
    labels_list[2] : {'C' : np.logspace(-1, 1, 11), 'gamma' : np.linspace(0, 1, 15)},   
}
cv = 10
classifier = SVC(kernel='rbf', decision_function_shape = 'ovr')

In [8]:
print('Gaussian SVC without Standardization:')
print(50*'*')
for label in labels_list:    
    searsher = GridSearchCV(classifier, gaussianSVC_param_grid[label], cv=cv, n_jobs=-1)
    searsher.fit(X_train, y_train[label])
    gaussianSVC_classifiers[label] = searsher
    reporter(label, searsher, X_test, y_test[label])

Gaussian SVC without Standardization:
**************************************************
Class: Family 

The best parameter setting is:
{'C': 0.3981071705534973, 'gamma': 0.21428571428571427} 

one-versus-rest performance on the test data:
                 precision    recall  f1-score   support

      Bufonidae       1.00      0.96      0.98        27
  Dendrobatidae       1.00      1.00      1.00       162
        Hylidae       1.00      1.00      1.00       623
Leptodactylidae       1.00      1.00      1.00      1347

       accuracy                           1.00      2159
      macro avg       1.00      0.99      1.00      2159
   weighted avg       1.00      1.00      1.00      2159

**************************************************
Class: Genus 

The best parameter setting is:
{'C': 0.251188643150958, 'gamma': 0.2857142857142857} 

one-versus-rest performance on the test data:
               precision    recall  f1-score   support

    Adenomera       1.00      1.00      1.00  

In [9]:
print('Multilabel evaluation of Gaussian SVC without Standardization')
summery_gaussianSVC = multilabelEval(X_test, y_test, classifiers= gaussianSVC_classifiers)

Multilabel evaluation of Gaussian SVC without Standardization
   Hamming Loss  Exact Match Ratio
0        0.0005             0.9995


### 1(b)-iii. Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross ...

In [10]:
# SVC with L1-penalty
L1_svm_classifiers = {}
L1_svm_param_grid ={
    labels_list[0] : {'C' : np.logspace(2, 3, 10)},
    labels_list[1] : {'C' : np.logspace(0, 2, 10)},
    labels_list[2] : {'C' : np.logspace(0, 2, 10)},   
}
cv = 10
classifier = LinearSVC(penalty='l1', multi_class = 'ovr', dual=False, max_iter=10000)

In [11]:
print('Standardize the attributes')

std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.fit_transform(X_test)

Standardize the attributes


In [12]:
print('L1-penalized SVM with Standardization:')
print(50*'*')
for label in labels_list:    
    searsher = GridSearchCV(classifier, L1_svm_param_grid[label], cv=cv, n_jobs=-1)
    searsher.fit(X_train_std, y_train[label])
    L1_svm_classifiers[label] = searsher
    reporter(label, searsher, X_test_std, y_test[label])

L1-penalized SVM with Standardization:
**************************************************


Class: Family 

The best parameter setting is:
{'C': 215.44346900318845} 

one-versus-rest performance on the test data:
                 precision    recall  f1-score   support

      Bufonidae       0.57      0.15      0.24        27
  Dendrobatidae       0.91      0.95      0.93       162
        Hylidae       0.94      0.95      0.95       623
Leptodactylidae       0.98      0.98      0.98      1347

       accuracy                           0.96      2159
      macro avg       0.85      0.76      0.77      2159
   weighted avg       0.95      0.96      0.96      2159

**************************************************
Class: Genus 

The best parameter setting is:
{'C': 7.742636826811269} 

one-versus-rest performance on the test data:
               precision    recall  f1-score   support

    Adenomera       1.00      0.99      0.99      1251
     Ameerega       0.95      0.96      0.95       162
Dendropsophus       0.98      0.94      0.96        84
    Hypsiboas       0.96     

In [13]:
print('Multilabel evaluation of Support Vector Classifier with L1-penalty')
summery_L1_svm = multilabelEval(X_test_std, y_test, classifiers= L1_svm_classifiers)

Multilabel evaluation of Support Vector Classifier with L1-penalty
   Hamming Loss  Exact Match Ratio
0        0.0252             0.9907


### 1(b)-iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [19]:
# SVC with L1-penalty
SMOTE_L1_svm_classifiers = {}
SMOTE_L1_svm_param_grid ={
    labels_list[0] : {'classification__C' : np.logspace(3, 6, 10)},
    labels_list[1] : {'classification__C' : np.logspace(0, 1, 10)},
    labels_list[2] : {'classification__C' : np.logspace(1, 2, 10)},   
}
cv = 10
classifier = LinearSVC(penalty='l1', multi_class = 'ovr', dual=False, max_iter=5000)
model_smote = Pipeline([('sampling', SMOTE()), ('classification', classifier)])

In [20]:
print('SMOTE L1-penalized SVM with Standardization:')
print(50*'*')
for label in labels_list:    
    searsher = GridSearchCV(model_smote, param_grid = SMOTE_L1_svm_param_grid[label], cv=cv, n_jobs=-1)
    searsher.fit(X_train_std, y_train[label])
    SMOTE_L1_svm_classifiers[label] = searsher
    reporter(label, searsher, X_test_std, y_test[label])

SMOTE L1-penalized SVM with Standardization:
**************************************************


Class: Family 

The best parameter setting is:
{'classification__C': 10000.0} 

one-versus-rest performance on the test data:
                 precision    recall  f1-score   support

      Bufonidae       0.83      0.89      0.86        27
  Dendrobatidae       0.79      0.97      0.87       162
        Hylidae       0.95      0.94      0.94       623
Leptodactylidae       0.98      0.95      0.97      1347

       accuracy                           0.95      2159
      macro avg       0.88      0.94      0.91      2159
   weighted avg       0.95      0.95      0.95      2159

**************************************************
Class: Genus 

The best parameter setting is:
{'classification__C': 4.641588833612778} 

one-versus-rest performance on the test data:
               precision    recall  f1-score   support

    Adenomera       1.00      0.99      0.99      1251
     Ameerega       0.95      0.98      0.96       162
Dendropsophus       0.94      0.96      0.95        84
    Hyps

In [21]:
print('Multilabel evaluation of SVM with L1 penalty and SMOTE')
summery_SMOTE_L1_svm= multilabelEval(X_test_std, y_test, classifiers=SMOTE_L1_svm_classifiers)

Multilabel evaluation of SVM with L1 penalty and SMOTE
   Hamming Loss  Exact Match Ratio
0        0.0301             0.9898


**Comparision**

In [22]:
row_labels = ['Hamming Loss', 'Exact Match Ratio']
dataa = {
    'Gaussian SVC without Standardization': summery_gaussianSVC,
    'SVC_L1': summery_L1_svm,
    'SVC_L1_SMOTE' : summery_SMOTE_L1_svm
        }

df = pd.DataFrame(dataa, index=row_labels)
df

Unnamed: 0,Gaussian SVC without Standardization,SVC_L1,SVC_L1_SMOTE
Hamming Loss,0.0005,0.0252,0.0301
Exact Match Ratio,0.9995,0.9907,0.9898


# 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set | Monte Carlo Simulation

In [29]:
def Optimal_K(K_range, X, rand):
    optimalK = min(K_range)
    max_score = 0
    for k in K_range:
        clusterer = KMeans(n_clusters=k, random_state=rand)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        if silhouette_avg > max_score:
            optimalK = k
            max_score = silhouette_avg
    print('\nThe optimal K is:' ,optimalK)
    return optimalK


def Majority_Labels(optimalK, cluster_labels, Y):
    cluster_major = pd.DataFrame(columns=Y.columns)
    for c in range(optimalK):
        idx, = np.where(cluster_labels == c)
        cluster_samples = Y.iloc[idx, :]
        row = []
        for label in Y.columns:
            cur_major = cluster_samples.loc[:, label].value_counts().index[0]
            row.append(cur_major)
        cluster_major.loc[c] = row
    return cluster_major


def hamming_loss_dist(cluster_major, cluster_labels, Y):
    missclf_labels = 0
    for c in range(len(cluster_major)):
        idx, = np.where(cluster_labels == c)
        for label in Y.loc[idx].values:
            miss = (label != cluster_major.loc[c].values)
            missclf_labels += np.sum(miss)
    hamming_dist = missclf_labels / Y.shape[0]
    hamming_loss = missclf_labels / (Y.shape[0] * Y.shape[1])
    return hamming_dist, hamming_loss



K_range = range(2,51)
iterations = 50

hamming_dist = []
hamming_loss = []
for iter in range(iterations):
    print('Iteration:' , iter+1)
    optimalK = Optimal_K(K_range, mfcc_data.iloc[:, :-4], iter)
    clusterer = KMeans(n_clusters=optimalK, random_state=iter)
    cluster_labels = clusterer.fit_predict(mfcc_data.iloc[:, :-4])
    cluster_major = Majority_Labels(optimalK, cluster_labels, mfcc_data.iloc[:, -4:-1])
    cur_dist, cur_loss = hamming_loss_dist(cluster_major, cluster_labels, mfcc_data.iloc[:, -4:-1])
    hamming_dist.append(cur_dist)
    hamming_loss.append(cur_loss)
    print('Hamming Distance:', round(cur_dist, 4) , 'Hamming Loss:' , round(cur_loss, 4))

Iteration: 1

The optimal K is: 4
Hamming Distance: 0.6653 Hamming Loss: 0.2218
Iteration: 2

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 3

The optimal K is: 4
Hamming Distance: 0.7358 Hamming Loss: 0.2453
Iteration: 4

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 5

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 6

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 7

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 8

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 9

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 10

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 11

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 12

The optimal K is: 4
Hamming Distance: 0.6673 Hamming Loss: 0.2224
Iteration: 13

The optimal K is: 4
Ha

In [31]:
data = {
        "Average Hamming Distance": [round(np.mean(hamming_dist), 4)],
        "Std Devation (Hamming Distance)": [round(np.std(hamming_dist), 4)]
    }
df_hamming_distance = pd.DataFrame(data)
df_hamming_distance

Unnamed: 0,Average Hamming Distance,Std Devation (Hamming Distance)
0,0.6712,0.031


In [32]:
data = {
        "Average Hamming Loss": [round(np.mean(hamming_loss), 4)],
        "Std Devation (Hamming Loss)": [round(np.std(hamming_loss), 4)]
    }
df_hamming_loss = pd.DataFrame(data)
df_hamming_loss

Unnamed: 0,Average Hamming Loss,Std Devation (Hamming Loss)
0,0.2237,0.0103


In [33]:
data = {
        "Average Hamming Score": [round(1 - np.mean(hamming_loss), 4)],
        "Std Devation (Hamming Score)": [round(np.std(hamming_loss), 4)]
    }


df_hamming_score =  pd.DataFrame(data)
df_hamming_score

Unnamed: 0,Average Hamming Score,Std Devation (Hamming Score)
0,0.7763,0.0103
