<center><h1>Kan_Herun_HW7</h1></center>

Name: Herun Kan
<br>
Github Username: herunkan
<br>
USC ID: 7222919427

## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

Import packages

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import hamming_loss, accuracy_score, silhouette_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from scipy.stats import mode

### (a) Download the Anuran Calls (MFCCs) Data Set

In [44]:
data = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')


X = data.iloc[:, :-4]
y = data.iloc[:, -4:-1] #ditch recordid for our ys

label_encoders = {col: LabelEncoder() for col in y.columns}
for col in y.columns:
    y[col] = label_encoders[col].fit_transform(y[col])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


### (b) Train a classifier for each label

#### (i) Research

Exact Match Ratio: ignores partially correct (consider them incorrect) and extend the accuracy used in single label case for multi-label prediction. A disadvantage of this measure is that it does not distinguish between complete incorrect and partially correct which might be considered harsh.

Hamming Loss: reports how many times on average, the relevance of an example to a class label is incorrectly predicted. Hamming loss takes into account the prediction error (an incorrect label is predicted) and missing error (a relevant label not predicted), normalized over total number of classes and total number of examples.

#### (ii) Train a SVM for each of the labels

In [45]:
#very large and very small parameter values
C_values = [10**i for i in range(-3, 7)]
gamma_values = np.linspace(0.1, 2, 20) 

def filter_parameters(X_train, y_train, C_values, gamma_values, label, threshold=0.7):
    valid_params = []
    for C in C_values:
        for gamma in gamma_values:
            svm = SVC(C=C, gamma=gamma, kernel='rbf')
            svm.fit(X_train, y_train[label])
            train_acc = svm.score(X_train, y_train[label])
            
            if train_acc >= threshold:
                valid_params.append((C, gamma))
    
    return valid_params

models = {}
results = {}

for label in y.columns:
    valid_params = filter_parameters(X_train, y_train, C_values, gamma_values, label)
    if not valid_params:
        print(f"No valid parameter pairs found for {label} above the 70% threshold.")
        continue
    C_range = sorted(set([param[0] for param in valid_params]))
    gamma_range = sorted(set([param[1] for param in valid_params]))
    param_grid = {
        'C': C_range,
        'gamma': gamma_range
    }
    svc = SVC(kernel='rbf')
    grid_search = GridSearchCV(svc, param_grid, cv=10,n_jobs= -1, scoring='accuracy')
    grid_search.fit(X_train, y_train[label])

    best_model = grid_search.best_estimator_
    models[label] = best_model
    y_pred = best_model.predict(X_test)
    exact_match = accuracy_score(y_test[label], y_pred)
    hamming_score = 1 - hamming_loss(y_test[label], y_pred)
    
    results[label] = {
        "Best Parameters": grid_search.best_params_,
        "Exact Match (Accuracy)": exact_match,
        "Hamming Score": hamming_score
    }

for label, metrics in results.items():
    print(f"\nLabel: {label}")
    print(f"  Best Parameters: {metrics['Best Parameters']}")
    print(f"  Exact Match (Accuracy): {metrics['Exact Match (Accuracy)']:.4f}")
    print(f"  Hamming Score: {metrics['Hamming Score']:.4f}")

y_pred_combined = pd.DataFrame({
    label: models[label].predict(X_test) for label in y.columns
})
y_pred_combined = y_pred_combined.reindex_like(y_test)
exact_match_overall = (y_pred_combined.values == y_test.values).all(axis=1).mean()
hamming_loss_overall = 1 - (y_pred_combined != y_test).mean().mean()

print(f"\nOverall Exact Match (Accuracy): {exact_match_overall:.4f}")
print(f"Overall Hamming Loss: {hamming_loss_overall:.4f}")


Label: Family
  Best Parameters: {'C': 100, 'gamma': np.float64(1.8)}
  Exact Match (Accuracy): 0.9944
  Hamming Score: 0.9944

Label: Genus
  Best Parameters: {'C': 10, 'gamma': np.float64(2.0)}
  Exact Match (Accuracy): 0.9884
  Hamming Score: 0.9884

Label: Species
  Best Parameters: {'C': 10, 'gamma': np.float64(1.8)}
  Exact Match (Accuracy): 0.9889
  Hamming Score: 0.9889

Overall Exact Match (Accuracy): 0.0783
Overall Hamming Loss: 0.1229


#### (iii) Repeat 1(b)ii with L1-penalized SVMs

In [46]:
for label in y.columns:
    param_grid = {
        'C': C_values}
    svc = LinearSVC(penalty='l1', dual=False, max_iter=5000)
    grid_search = GridSearchCV(svc, param_grid, cv=10,n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train[label])
    best_model = grid_search.best_estimator_
    models[label] = best_model
    y_pred = best_model.predict(X_test)
    exact_match = accuracy_score(y_test[label], y_pred)
    hamming_score = 1 - hamming_loss(y_test[label], y_pred)
    results[label] = {
        "Best Parameters": grid_search.best_params_,
        "Exact Match (Accuracy)": exact_match,
        "Hamming Score": hamming_score
    }

for label, metrics in results.items():
    print(f"\nLabel: {label}")
    print(f"  Best Parameters: {metrics['Best Parameters']}")
    print(f"  Exact Match (Accuracy): {metrics['Exact Match (Accuracy)']:.4f}")
    print(f"  Hamming Score: {metrics['Hamming Score']:.4f}")

#Calculate overall Exact Match and Hamming Score for all labels combined
y_pred_combined = pd.DataFrame({
    label: models[label].predict(X_test) for label in y.columns})
y_pred_combined = y_pred_combined.reindex_like(y_test)
exact_match_overall = (y_pred_combined.values == y_test.values).all(axis=1).mean()
hamming_loss_overall = 1 - (y_pred_combined != y_test).mean().mean()

print(f"\nOverall Exact Match (Accuracy): {exact_match_overall:.4f}")
print(f"Overall Hamming Loss: {hamming_loss_overall:.4f}")




Label: Family
  Best Parameters: {'C': 10}
  Exact Match (Accuracy): 0.9416
  Hamming Score: 0.9416

Label: Genus
  Best Parameters: {'C': 1000}
  Exact Match (Accuracy): 0.9504
  Hamming Score: 0.9504

Label: Species
  Best Parameters: {'C': 1000}
  Exact Match (Accuracy): 0.9625
  Hamming Score: 0.9625

Overall Exact Match (Accuracy): 0.0787
Overall Hamming Loss: 0.1252




#### (iv) Repeat 1(b)iii by using SMOTE or any other method for imbalance

In [47]:
# Initialize results dictionary to store metrics
results = {}
models = {}

# SMOTE: Apply to training data for each label
for label in y.columns:
    # Apply SMOTE to the training data (X_train, y_train)
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train[label])

    # Parameter grid for grid search
    param_grid = {
        'C': C_values
    }

    # Initialize the LinearSVC model with L1 penalty
    svc = LinearSVC(penalty='l1', dual=False, max_iter=5000)

    # Perform Grid Search with 10-fold cross-validation
    grid_search = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Get the best model
    best_model = grid_search.best_estimator_
    models[label] = best_model

    # Predict using the best model
    y_pred = best_model.predict(X_test)

    # Calculate Exact Match (Accuracy) and Hamming Score
    exact_match = accuracy_score(y_test[label], y_pred)
    hamming_score = 1 - hamming_loss(y_test[label], y_pred)

    # Store results for this label
    results[label] = {
        "Best Parameters": grid_search.best_params_,
        "Exact Match (Accuracy)": exact_match,
        "Hamming Score": hamming_score
    }

# Print results for each label
for label, metrics in results.items():
    print(f"\nLabel: {label}")
    print(f"  Best Parameters: {metrics['Best Parameters']}")
    print(f"  Exact Match (Accuracy): {metrics['Exact Match (Accuracy)']:.4f}")
    print(f"  Hamming Score: {metrics['Hamming Score']:.4f}")

# Calculate overall Exact Match and Hamming Score for all labels combined
y_pred_combined = pd.DataFrame({
    label: models[label].predict(X_test) for label in y.columns})
y_pred_combined = y_pred_combined.reindex_like(y_test)

# Calculate overall Exact Match (Accuracy)
exact_match_overall = (y_pred_combined.values == y_test.values).all(axis=1).mean()

# Calculate overall Hamming Loss
hamming_loss_overall = 1 - (y_pred_combined != y_test).mean().mean()

print(f"\nOverall Exact Match (Accuracy): {exact_match_overall:.4f}")
print(f"Overall Hamming Loss: {hamming_loss_overall:.4f}")




Label: Family
  Best Parameters: {'C': 10000}
  Exact Match (Accuracy): 0.9185
  Hamming Score: 0.9185

Label: Genus
  Best Parameters: {'C': 100}
  Exact Match (Accuracy): 0.9078
  Hamming Score: 0.9078

Label: Species
  Best Parameters: {'C': 10000}
  Exact Match (Accuracy): 0.9569
  Hamming Score: 0.9569

Overall Exact Match (Accuracy): 0.0723
Overall Hamming Loss: 0.1183




## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### (a) Use k-means clustering

In [48]:
silhouette_scores = []
k_range = range(2, 51)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    silhouette_scores.append(silhouette_avg)

    # Choose k with the highest Silhouette score
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters (k): {optimal_k}")

Optimal number of clusters (k): 4


### (b) Determine which family is the majority

In [49]:
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_lables = kmeans.fit_predict(X)

In [50]:
def Majority_Labels(optimalK, cluster_labels, Y):
    cluster_major = pd.DataFrame(columns=Y.columns)
    
    for c in range(optimalK):
        idx, = np.where(cluster_labels == c)
        cluster_samples = Y.iloc[idx, :]
        row = []
        for label in Y.columns:
            cur_major = cluster_samples.loc[:, label].value_counts().index[0]
            row.append(cur_major)
        cluster_major.loc[c] = row
    
    return cluster_major

cluster_major = Majority_Labels(optimal_k,cluster_lables, y)

### (c) Calculate the average Hamming distance, Hamming score, and Hamming loss

In [51]:
def hamming_metrics(cluster_major, cluster_labels, Y):
    total_mismatches = 0
    total_matches = 0
    total_samples = Y.shape[0]
    total_labels = Y.shape[1]

    for cluster in range(len(cluster_major)):
        indices = np.where(cluster_labels == cluster)[0]
        majority_triplet = cluster_major.loc[cluster].values
        true_labels = Y.iloc[indices].values
        mismatches = np.sum(true_labels != majority_triplet, axis=1)
        matches = total_labels - mismatches
        total_mismatches += np.sum(mismatches)
        total_matches += np.sum(matches)

    hamming_distance = total_mismatches / total_samples
    hammingloss = total_mismatches / (total_samples * total_labels)
    hamming_score = total_matches / (total_samples * total_labels)

    return hamming_distance, hammingloss, hamming_score

hamming_dist, hammingloss, hamming_score = hamming_metrics(cluster_major, cluster_lables, y)
print(f"Hamming Distance: {hamming_dist}")
print(f"Hamming Loss: {hammingloss}")
print(f"Hamming Score: {hamming_score}")

Hamming Distance: 0.66726893676164
Hamming Loss: 0.2224229789205467
Hamming Score: 0.7775770210794534


## 3. ISLR 12.6.2

![Part a](a.png)

![Part b](b.png)

![Part c](c&d.png)

![Part e](e.png)