In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV,GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import hamming_loss, accuracy_score
from imblearn.over_sampling import SMOTE
from collections import OrderedDict
from sklearn.preprocessing import Normalizer
import warnings
warnings.filterwarnings('ignore')

In [116]:
data_path = "../data/Frogs_MFCCs.csv"
df= pd.read_csv(data_path)
df=df.drop(columns=["RecordID"])

In [117]:
df_y = df[['Family','Genus','Species']]
df_x = df.drop(columns=['Family','Genus','Species'])

In [118]:
labels = ['Family','Genus', 'Species']
df_y = df[cols_to_transform].astype('category')
df_y = df_y[cols_to_transform].apply(lambda x: x.cat.codes)

In [119]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, train_size = 0.7, test_size = 0.3)

In [120]:
y_train1 = y_train['Family']
y_train2 = y_train['Genus']
y_train3 = y_train['Species']

#### ii) Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers.

In [None]:
C_range = np.logspace(-3, 6, 10) 
gamma_range = np.arange(0.1, 2.1, 0.1) 
param_grid = dict(gamma=gamma_range, C=C_range)

rbf1 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
rbf2 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
rbf3 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
rbf1.fit(x_train, y_train1)
rbf2.fit(x_train, y_train2)
rbf3.fit(x_train, y_train3)

In [None]:
print(f"Optimal values for first classifier with rbf/gaussian kernel is {rbf1.best_params_} with score={rbf1.best_score_}")
print(f"Optimal values for second classifier with rbf/gaussian kernel is {rbf2.best_params_} with score={rbf2.best_score_}")
print(f"Optimal values for third classifier with rbf/gaussian kernel is {rbf2.best_params_} with score={rbf2.best_score_}")

In [60]:
#family
svc1 = SVC(C=rbf1.best_params_['C'], gamma=rbf1.best_params_['gamma'])
svc1.fit(x_train,y_train1)
y_pred1 = svc1.predict(x_test)

#Species
svc2 = SVC(C=rbf2.best_params_['C'], gamma=rbf2.best_params_['gamma'])
svc2.fit(x_train,y_train2)
y_pred2 = svc2.predict(x_test)

#Genus
svc3 = SVC(C=rbf3.best_params_['C'], gamma=rbf2.best_params_['gamma'])
svc3.fit(x_train,y_train3)
y_pred3 = svc3.predict(x_test)

pred_df = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',y_pred3)])

rbf_pred = pd.DataFrame.from_dict(pred_df)

In [66]:
def hamming_loss(ytest, ypred):
    hamm_loss = 0
    for i in range(0, len(ytest)):
        losscore = 0
        for j in range(0, 3):
            if ytest.iloc[i, j] != ypred.iloc[i, j]:
                losscore += 1
            losscore = losscore / 3
            hamm_loss += losscore
    hamm_loss /=  (len(ytest))
    return (hamm_loss)

def exact_match(ytest, ypred):
    exactmatch = 0
    for i in range(len(ypred)):
        if (ytest.iloc[i, 0] - ypred.iloc[i, 0] == 0
                and ytest.iloc[i, 1] - ypred.iloc[i, 1] == 0
                and ytest.iloc[i, 2] - ypred.iloc[i, 2] == 0):
            exactmatch += 1
    exactmatch /=(len(ypred))
    return (exactmatch)

In [67]:
#Evaluating the classifier with gaussian kernel
rbf_svm_hamm_loss = hamming_loss(y_test, rbf_pred)
print('The Hamming loss for rbf/gaussian kernel svm: ', rbf_svm_hamm_loss)

rbf_svm_exact_score = exact_match(y_test, rbf_pred)
print('The exact match for rbf/gaussian kernel svm: ', rbf_svm_exact_score)

The Hamming loss for rbf/gaussian kernel svm:  0.012643027464704151
The exact match for rbf/gaussian kernel svm:  0.9861046780917091


#### iii) Repeat 6(b)ii with L1-penalized SVMs.4 Remember to normalize the attributes.

In [70]:
C_range = np.logspace(-3, 6, 10) 
param_grid = dict(C=C_range)

rbf1 = RandomizedSearchCV(LinearSVC(penalty='l1', dual=False), param_distributions =param_grid, cv=10)
rbf2 = RandomizedSearchCV(LinearSVC(penalty='l1', dual=False), param_distributions =param_grid, cv=10)
rbf3 = RandomizedSearchCV(LinearSVC(penalty='l1', dual=False), param_distributions =param_grid, cv=10)
rbf1.fit(x_train, y_train1)
rbf2.fit(x_train, y_train2)
rbf3.fit(x_train, y_train3)



















Optimal values for first classifier with rbf/gaussian kernel is {'C': 10.0} with score=0.9336800624822494
Optimal values for second classifier with rbf/gaussian kernel is {'C': 100.0} with score=0.9499613430527944
Optimal values for third classifier with rbf/gaussian kernel is {'C': 100.0} with score=0.9499613430527944




In [71]:
print(f"Optimal values for first classifier with rbf/gaussian kernel is {rbf1.best_params_} with score={rbf1.best_score_}")
print(f"Optimal values for second classifier with rbf/gaussian kernel is {rbf2.best_params_} with score={rbf2.best_score_}")
print(f"Optimal values for third classifier with rbf/gaussian kernel is {rbf2.best_params_} with score={rbf2.best_score_}")

Optimal values for first classifier with rbf/gaussian kernel is {'C': 10.0} with score=0.9336800624822494
Optimal values for second classifier with rbf/gaussian kernel is {'C': 100.0} with score=0.9499613430527944
Optimal values for third classifier with rbf/gaussian kernel is {'C': 100.0} with score=0.9499613430527944


In [72]:
#family
svc1 = LinearSVC(penalty='l1', dual=False,C=rbf1.best_params_['C'] )
svc1.fit(x_train,y_train1)
y_pred1 = svc1.predict(x_test)

#Species
svc2 = LinearSVC(penalty='l1', dual=False,C=rbf1.best_params_['C'] )
svc2.fit(x_train,y_train2)
y_pred2 = svc2.predict(x_test)

#Genus
svc3 = LinearSVC(penalty='l1', dual=False,C=rbf1.best_params_['C'] )
svc3.fit(x_train,y_train3)
y_pred3 = svc3.predict(x_test)

pred_df = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',y_pred3)])

rbf_pred = pd.DataFrame.from_dict(pred_df)



In [74]:
#Evaluating the classifier with gaussian kernel
rbf_svm_hamm_loss = hamming_loss(y_test, rbf_pred)
print('The Hamming loss for rbf/gaussian kernel svm with L1 penalty: ', rbf_svm_hamm_loss)

rbf_svm_exact_score = exact_match(y_test, rbf_pred)
print('The exact match for rbf/gaussian kernel svm with L1 penalty: ', rbf_svm_exact_score)

The Hamming loss for rbf/gaussian kernel svm with L1 penalty:  0.06620005832604266
The exact match for rbf/gaussian kernel svm with L1 penalty:  0.9152385363594256


#### iv) Repeat 6(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [101]:
transformer = Normalizer().fit(x_train)
x_train_norm=transformer.transform(x_train)
x_test_norm=transformer.transform(x_test)

In [102]:
x_train_smote1, y_train__smote1 = SMOTE().fit_resample(x_train_norm, y_train1)
x_train_smote2, y_train__smote2 = SMOTE().fit_resample(x_train_norm, y_train2)
x_train_smote3, y_train__smote3 = SMOTE().fit_resample(x_train_norm, y_train3)

In [107]:
C_range = np.logspace(-3, 6, 10) 
param_grid = dict(C=C_range)

l1_model= LinearSVC(penalty='l1', dual=False)
rbf1 = GridSearchCV(l1_model, param_grid, cv=10)
rbf1.fit(x_train_smote1, y_train__smote1)

GridSearchCV(cv=10, estimator=LinearSVC(dual=False, penalty='l1'),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04,
       1.e+05, 1.e+06])})

In [None]:
rbf2 = GridSearchCV(l1_model, param_grid, cv=10)
rbf2.fit(x_train_smote2, y_train__smote2)

In [None]:
rbf3 = GridSearchCV(l1_model, param_grid, cv=10)
rbf3.fit(x_train_smote3, y_train__smote3)

In [None]:
#Evaluating the classifier with gaussian kernel
rbf_svm_hamm_loss = hamming_loss(y_test, rbf_pred)
print('The Hamming loss for smote rbf/gaussian kernel svm with L1 penalty:  ',rbf_svm_hamm_loss)

rbf_svm_exact_score = exact_match(y_test, rbf_pred)
print('The exact match for smote rbf/gaussian kernel svm with L1 penalty: ',rbf_svm_exact_score)

The smote is computationally expensive and out of all the models gaussian kernel/rbf is better compared to others and next comes L1 penalty.