
### Rubina Iman Kabir 


In [2]:
# Packages
import pandas as pd
import scipy.stats as st
from sklearn.cluster import KMeans as kmeans
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tools.eval_measures import mse
import numpy as np

from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score,auc, accuracy_score,hamming_loss,calinski_harabaz_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,GridSearchCV

from sklearn.decomposition import PCA

from pandas.plotting import scatter_matrix
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC ,LinearSVC



# 1. Multi-class and Multi-Label Classification

##### (a) Select Training and TestData

In [5]:
data = pd.read_csv('Frogs_MFCCs.csv')
data.drop('RecordID',inplace=True,axis = 1)
Xdata = data.iloc[:,0:22]


train = data.sample(frac = 0.7)
train.reset_index(inplace=True)
train.drop('index',inplace=True,axis = 1)
Xtrain = train.iloc[:,0:22]
Ytrain = train.iloc[:,22:]
Ytrain.shape

test = data.loc[~data.index.isin(train.index)]
test.reset_index(inplace=True)
test.drop('index',inplace=True, axis = 1)
Xtest = test.iloc[:,0:22]
Ytest = test.iloc[:,22:]

##### (b) (i) Exact match and Hamming Score/Loss

Exact Match: most strict method that measures the percentage of samples that have ALL their labels classified correctly. (optimal is 1.0%) Disadvantage to this method is that this measure doesn't distinguish between completely incorrect and partially correct.

Hamming Loss: the fraction of the wrong labels to the total number of labels. (optimal is 0)

##### (b) (ii) Train SVM

In [5]:
gamma_range = np.logspace(0, 3, 10)
C_range = np.logspace(-1, 3, 10)
b_df = pd.DataFrame(columns = ['Gamma','C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    classes = np.unique(Ytrain[label])
    ytrain = Ytrain[label]
    for gamma in gamma_range:
        for c in C_range:        
            folds = KFold(10,shuffle=True).split(Xtrain,ytrain)
            cv = 0
            for tr,val in folds:
                svm = SVC(decision_function_shape = 'ovr',gamma=gamma,C=c)
                svm.fit(Xtrain.loc[tr],ytrain.loc[tr])
                Ypred = svm.predict(Xtrain.loc[val])
                hamming = hamming_loss(Ypred,ytrain.loc[val])
                cv = cv + hamming
            
            svm = SVC(decision_function_shape = 'ovr',gamma=gamma,C=c)
            svm.fit(Xtrain,ytrain)
            Ypred = svm.predict(Xtest)
            hamming = hamming_loss(Ypred,Ytest[label])
            b_df.loc[r,:] = [gamma,c,label,cv/10, hamming] 

            r = r + 1

In [6]:
range_df = pd.DataFrame(columns=['Gamma','C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    mask = (b_df['Label'] == label) 
    mins = b_df[mask].sort_values(by = 'CV Error')
    mins.reset_index(inplace=True)
    range_df.loc[r,:] = mins.iloc[0,:].values.tolist()[1:]
    r = r + 1
range_df

Unnamed: 0,Gamma,C,Label,CV Error,Test Error
0,2.15443,5.99484,Family,0.00635276,0.00370542
1,2.15443,129.155,Genus,0.0081432,0.00787402
2,2.15443,16.681,Species,0.00893449,0.00880037


##### (b) iii L1-penalized SVM

In [13]:
C_range = np.logspace(-2, 3, 8)
iii_df = pd.DataFrame(columns = ['C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    classes = np.unique(Ytrain[label])
    ytrain = Ytrain[label]
    for c in C_range:        
        folds = KFold(10,shuffle=True).split(Xtrain,ytrain)
        cv = 0
        for tr,val in folds:
            linear_svm = LinearSVC(penalty = 'l1',C=c, max_iter = 1e5, dual = False)
            linear_svm.fit(Xtrain.loc[tr],ytrain.loc[tr])
            Ypred = linear_svm.predict(Xtrain.loc[val])
            hamming = hamming_loss(Ypred,ytrain.loc[val])
            cv = cv + hamming
        
        linear_svm = LinearSVC(penalty = 'l1',C=c, max_iter = 1e5,dual = False)
        linear_svm.fit(Xtrain,ytrain)
        Ypred = linear_svm.predict(Xtest)
        hamming = hamming_loss(Ypred,Ytest[label])
        iii_df.loc[r,:] = [c,label,cv/10, hamming] 
        r = r + 1








In [14]:
range_df = pd.DataFrame(columns=['C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    mask = (iii_df['Label'] == label) 
    mins = iii_df[mask].sort_values(by = 'CV Error')
    mins.reset_index(inplace=True)
    range_df.loc[r,:] = mins.iloc[0,:].values.tolist()[1:]
    r = r + 1
range_df



Unnamed: 0,C,Label,CV Error,Test Error
0,7.19686,Family,0.0639382,0.0833719
1,1000.0,Genus,0.047853,0.0759611
2,193.07,Species,0.0403141,0.0565076


##### (b) iv using SMOTE

In [4]:
C_range = np.logspace(-4, 3, 8)
iv_df = pd.DataFrame(columns = ['C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    ytrain = Ytrain[label]
    for c in C_range:        
        folds = KFold(10,shuffle=True).split(Xtrain,ytrain)
        cv = 0
        for tr,val in folds:
            X_sm,Y_sm = SMOTE().fit_resample(Xtrain.loc[tr],ytrain.loc[tr])
            linear_svm = LinearSVC(penalty = 'l1',C=c,max_iter = 1e5,dual=False)
            linear_svm.fit(X_sm,Y_sm)
            Ypred = linear_svm.predict(Xtrain.loc[val])
            hamming = hamming_loss(Ypred,ytrain.loc[val])
            cv = cv + hamming
            
        X_sm,Y_sm = SMOTE().fit_resample(Xtrain,ytrain)
        linear_svm = LinearSVC(penalty = 'l1',C=c,max_iter = 1e5,dual=False)
        linear_svm.fit(X_sm,Y_sm)
        Ypred = linear_svm.predict(Xtest)
        hamming = hamming_loss(Ypred,Ytest[label])
        iv_df.loc[r,:] = [c,label,cv/10, hamming] 
        r = r + 1







In [5]:
range_df = pd.DataFrame(columns=['C','Label','CV Error', 'Test Error'])
r = 0
for label in Ytrain.columns:
    mask = (iv_df['Label'] == label) 
    mins = iv_df[mask].sort_values(by = 'CV Error')
    mins.reset_index(inplace=True)
    range_df.loc[r,:] = mins.iloc[0,:].values.tolist()[1:]
    r = r + 1
range_df



Unnamed: 0,C,Label,CV Error,Test Error
0,10,Family,0.0828012,0.0986568
1,1,Genus,0.0915361,0.0750347
2,10,Species,0.0470601,0.0592867


Best method for each label was using SVC with the Gaussian function (rbf).

    Family:
        Test Error:0.003
        
    Genus:
        Test Error: 0.007
        
    Species:
        Test Error: 0.008
        


## 2. K-Means

In [7]:
Xdata = data.iloc[:,0:22]
Ydata = data.iloc[:,22:]
monte_df = pd.DataFrame(index=range(0,50),columns = ['k','Hamming Distance'])


In [48]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( y_true[i] )
        set_pred = set( y_pred )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)

    return np.mean(acc_list)

def hamming_distance(y_true,y_pred):
    dist_sum = 0
    for i in range(y_true.shape[0]):
        true = y_true[i]
        pred = y_pred
        wrong = 0
        for k in range(3):
            if true[k] != pred[k]:
                wrong = wrong + 1
        dist_sum = dist_sum + (wrong / 3)
                
    return dist_sum/(y_true.shape[0])


In [53]:
K = range(2,51)
monte_carlo = []
monte_df = pd.DataFrame(index=range(2,51),columns=['Best k','Hamming Distance','Hamming Score','Hamming Loss'])
r = 0
for monte in K:
    k_ch = []
    for k in K:
        km = kmeans(n_clusters = k ,random_state = 5).fit(Xdata)
        
        ch = calinski_harabaz_score(Xdata, km.labels_)
        k_ch.append([ch,k])
        
    min_k = max(k_ch)
    best_k = min_k[1]
    # Store cluster labels with X data
    cluster_labels = pd.DataFrame(km.labels_,columns=['Cluster Label'])
    km_X = pd.concat([Xdata,cluster_labels],axis = 1)  
    
    #majority_df = pd.DataFrame(columns=['Family','Genus','Species'],index=range(0,best_k))
    score_avg = 0
    total_h_loss = 0
    h_dist = 0
    for cluster in range(0,best_k):
        triplet = []
        # Get data in cluster
        mask = (km_X['Cluster Label'] == cluster)
        data_cluster = km_X[mask]
        # Get index of data in cluster
        inds = data_cluster.index
        h_loss = 0
        # Get majority vote for each label in each cluster
        for label in Ytrain.columns:
            true = Ydata[label].loc[inds]
            data_cluster['True'] = true
            majority = data_cluster['True'].value_counts().argmax()
            triplet.append(majority)
            
            # Calculate hamming loss for each cluster on each label
            Ypred = [majority]*true.shape[0]
            hamming_los = hamming_loss(true,Ypred)
            h_loss = hamming_los + h_loss
            
        total_h_loss = total_h_loss + (h_loss / 3)
            
        Ytrue = Ydata.loc[inds]
        # calculate hamming score for cluster
        hamming_sco = hamming_score(Ytrue.values,triplet)
        score_avg = hamming_sco + score_avg
        
        # Calc hamming distance
        h_dist = h_dist + hamming_distance(Ytrue.values,triplet)
    
    # Aggregate hamming * from all clusters
    final_distance = h_dist/best_k
    final_score = score_avg/best_k
    final_loss = total_h_loss/best_k

    monte_carlo.append(final_distance)
    monte_df.loc[r] = [best_k,final_distance,final_score,final_loss]
    r = r + 1
print('Monte Carlo - Hamming Distance:\n')
print('Average: \t', np.median(monte_carlo))
print('Standard Deviation: \t', np.std(monte_carlo))
        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(va

Monte Carlo - Hamming Distance:

Average: 	 0.004457745220653921
Standard Deviation: 	 0.0


The best k for EACH run of the k-means algorithm was **k*=2** with a Hamming Score of 99.5%!


Monte Carlo - Hamming Distance:


        Average: 	 4.457745e-03

        Standard Deviation: 	 2.629051e-18
        
Based off these results, there was minimal difference between each monte-carlo run of the k-means algorithm.  
      

In [56]:
monte_df.dropna()

Unnamed: 0,Best k,Hamming Distance,Hamming Score,Hamming Loss
2,2.0,0.004458,0.99537,0.004458
3,2.0,0.004458,0.99537,0.004458
4,2.0,0.004458,0.99537,0.004458
5,2.0,0.004458,0.99537,0.004458
6,2.0,0.004458,0.99537,0.004458
7,2.0,0.004458,0.99537,0.004458
8,2.0,0.004458,0.99537,0.004458
9,2.0,0.004458,0.99537,0.004458
10,2.0,0.004458,0.99537,0.004458
11,2.0,0.004458,0.99537,0.004458
