In [None]:
pip freeze > requirements.txt

**IMPORTING LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

**DATA LOADING AND PREPROCESSING FUNCTION**

In [None]:
# Data loading and label-encoding helper
def load_data(filepath):
    """Read a CSV file and swap known taxonomy names for integer codes.

    Parameters
    ----------
    filepath : str or path-like
        Passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after three whole-frame ``replace`` passes
        (families, then genera, then species). Any cell equal to one of the
        mapped names is replaced by its code, whichever column it is in.

    Side effects
    ------------
    Prints the number of missing values per column.
    """
    # Codes are 1-based and follow the listed order of the names.
    family_names = ['Bufonidae', 'Dendrobatidae', 'Hylidae', 'Leptodactylidae']
    genus_names = [
        'Adenomera', 'Ameerega', 'Dendropsophus', 'Hypsiboas',
        'Leptodactylus', 'Osteocephalus', 'Rhinella', 'Scinax',
    ]
    species_names = [
        'AdenomeraAndre', 'AdenomeraHylaedactylus', 'Ameeregatrivittata',
        'HylaMinuta', 'HypsiboasCinerascens', 'HypsiboasCordobae',
        'LeptodactylusFuscus', 'OsteocephalusOophagus', 'Rhinellagranulosa',
        'ScinaxRuber',
    ]

    frame = pd.read_csv(filepath)

    # Apply the three encodings one after another, in the same order as before.
    for names in (family_names, genus_names, species_names):
        codes = {name: code for code, name in enumerate(names, start=1)}
        frame = frame.replace(codes)

    # Report missing values per column
    print("\nMissing values ")
    print(frame.isnull().sum())

    return frame


**PLOTTING FEATURE DISTRIBUTIONS AND BOX PLOTS FOR OUTLIER DETECTION**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def visualize_data(df):
    """Draw a histogram grid and then a box-plot grid for the numeric columns.

    Columns are picked with ``select_dtypes(include=[float, int])`` and laid
    out four per row. Each grid is its own figure, shown with ``plt.show()``.
    Nothing is drawn when the frame has no such columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    """
    numeric_columns = df.select_dtypes(include=[float, int]).columns
    num_features = len(numeric_columns)
    if num_features == 0:
        return

    panels_per_row = 4
    num_rows = int(np.ceil(num_features / panels_per_row))

    def _open_grid():
        # One figure with a num_rows x 4 grid, returned as a flat array of axes.
        _, grid = plt.subplots(num_rows, panels_per_row, figsize=(20, 5 * num_rows))
        return grid.flatten()

    def _close_grid(panels):
        # Hide the panels that have no column assigned, then render the figure.
        for spare in panels[num_features:]:
            spare.set_visible(False)
        plt.tight_layout()
        plt.show()

    # Figure 1: distribution of every numeric column
    panels = _open_grid()
    for panel, name in zip(panels, numeric_columns):
        sns.histplot(df[name], kde=True, ax=panel)
        panel.set_title(f'Distribution of {name}')
        panel.set_xlabel(name)
        panel.set_ylabel('Frequency')
    _close_grid(panels)

    # Figure 2: box plot of every numeric column for outlier inspection
    panels = _open_grid()
    for panel, name in zip(panels, numeric_columns):
        sns.boxplot(x=df[name], ax=panel)
        panel.set_title(f'Outliers in {name}')
    _close_grid(panels)

# Example usage:
# visualize_data(X1)  # Replace X1 with your DataFrame variable


**VISUALISING CORRELATION MATRIX**

In [None]:
# Heatmap of pairwise column correlations
def plot_correlation_matrix(df):
    """Show an annotated heatmap of ``df.corr()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    """
    pairwise_corr = df.corr()

    # Options passed to the heatmap: two-decimal annotations, square cells,
    # and a slightly shrunken colour bar.
    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        cbar_kws={"shrink": .8},
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(pairwise_corr, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.show()


**REMOVING OUTLIERS**

In [None]:
# Remove Outliers
def remove_outliers(df, columns=None, iqr_factor=1.5):
    """Drop rows that fall outside the IQR fences of the chosen columns.

    Columns are processed one after another. Each column's quartiles are
    computed on the rows that survived the previous columns, so the result
    depends on column order. Rows with a missing value in a processed column
    are dropped as well, because they fail both bound comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied and never modified.
    columns : str or iterable of str, optional
        Columns to filter on. Defaults to every column selected by
        ``select_dtypes(include=[float, int])``.
    iqr_factor : float, default 1.5
        Multiplier applied to the interquartile range to build the fences
        ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` that keeps the original index labels.
    """
    # Work on a copy so the caller's frame is untouched
    df_cleaned = df.copy()

    if columns is None:
        columns = df_cleaned.select_dtypes(include=[float, int]).columns
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character
        columns = [columns]

    for column in columns:
        values = df_cleaned[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_factor * iqr
        upper_bound = q3 + iqr_factor * iqr

        # between() is inclusive on both ends, matching >= lower and <= upper
        df_cleaned = df_cleaned[values.between(lower_bound, upper_bound)]

    return df_cleaned


**EXECUTION OF FUNCTIONS AND REMOVING FEATURES WITH HIGH CORRELATION**

In [None]:
# Cell 4: Main Execution Cell
# Loads the CSV, plots it, removes IQR outliers, drops near-duplicate columns
# and exposes the result as the NumPy array X.
filepath = 'Frogs_MFCCs.csv'  # Set the file path
df = load_data(filepath)        # Load the data
visualize_data(df)              # Visualize distributions and outliers

print("Data shape before removing outliers:", df.shape)
df_cleaned = remove_outliers(df)  # Remove outliers from the dataset
print("Data shape after removing outliers:", df_cleaned.shape)

plot_correlation_matrix(df_cleaned)  # Correlations before dropping any column

# Step 1: Calculate the correlation matrix
correlation_matrix = df_cleaned.corr()

# Step 2: Flag every column whose absolute correlation with an EARLIER column
# exceeds the threshold. Only the strictly lower triangle is inspected, so the
# first column of each highly correlated pair is kept.
threshold = 0.95
lower_triangle = np.tril(correlation_matrix.abs().to_numpy(), k=-1)
to_drop = set(correlation_matrix.columns[(lower_triangle > threshold).any(axis=1)])

# Step 3: Drop the identified columns from the DataFrame
df_cleaned_reduced = df_cleaned.drop(columns=list(to_drop))

# Display the original and reduced DataFrame shapes
print("Original DataFrame shape:", df_cleaned.shape)
print("Reduced DataFrame shape:", df_cleaned_reduced.shape)

df_cleaned = df_cleaned_reduced

plot_correlation_matrix(df_cleaned)  # Correlations after the reduction

# NOTE(review): every remaining column, including any integer-encoded label
# columns produced by load_data, ends up in X - confirm that is intended.
X = df_cleaned.values               # Convert cleaned DataFrame to NumPy array
print(type(X))                       # Print type of X


**Feature Engineering:**
Polynomial features transform existing features by raising them to different powers (e.g., squaring, cubing) or creating combinations of features to capture nonlinear relationships. By adding polynomial features, you can expand the feature space, potentially improving a model's ability to distinguish clusters that have complex boundaries.



In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd

# Always start from the reduced, outlier-free frame rather than from X.
# This cell rebinds X at the end, so reading X here would make a second run
# build polynomial features of already expanded features.
X_base = df_cleaned.values

# Step 1: Generate polynomial features up to degree 2
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_base)

# Create a placeholder target variable for RFE: the mean of each row.
# NOTE(review): this pseudo target is a linear function of the degree-1 terms
# contained in X_poly - confirm it yields a meaningful feature ranking.
y = np.mean(X_base, axis=1)

# Step 2: Using RFE with a Linear Regression model
model = LinearRegression()
selector = RFE(estimator=model, n_features_to_select=25)
selector = selector.fit(X_poly, y)

# Boolean mask of the features RFE kept
selected_indices = selector.support_

# Keep only the selected columns
X_selected = X_poly[:, selected_indices]

# Display the shape of the selected features
print("Shape of the selected features (top 25):", X_selected.shape)

# Label the selected columns with the polynomial terms they represent so that
# later plots that use X.columns show readable names instead of 0..24.
selected_names = poly.get_feature_names_out(df_cleaned.columns)[selected_indices]
df_X_poly = pd.DataFrame(X_selected, columns=selected_names)

# From here on X is a DataFrame of the selected polynomial features
X = df_X_poly
print("Selected Polynomial Features (first few rows):")
print(df_X_poly.head())


### Comparing K-Means Initialization Methods (k = 3)

This code compares the **inertia** (sum of squared distances to cluster centers) for K-Means clustering with `random` and `k-means++` initializations using **3 clusters** over **50 runs**, each run using a different random seed. Lower inertia values indicate more compact clusters.

The plot below shows the inertia obtained by each initialization method in each of the 50 runs, allowing a direct comparison of their effectiveness. The average inertia for each method is printed afterwards for additional insight.


In [None]:
# Compare K-Means inertia for 'random' vs 'k-means++' initialization.

# Define the number of clusters
k = 3

# Initialize lists to store inertia (sum of squared distances) values
inertia_random = []
inertia_kmeanspp = []

# One fit per method per run; the run index doubles as the random seed.
# n_init=1 is set explicitly so every point in the plot reflects a single
# initialization. With n_init left at its default, scikit-learn chooses the
# number of restarts itself (version dependent, and it may differ between the
# two init methods), which would blur the comparison.
n_runs = 50
for i in range(n_runs):
    # Random initialization
    kmeans_random = KMeans(n_clusters=k, init='random', n_init=1, random_state=i)
    kmeans_random.fit(X)
    inertia_random.append(kmeans_random.inertia_)

    # k-means++ initialization
    kmeans_kpp = KMeans(n_clusters=k, init='k-means++', n_init=1, random_state=i)
    kmeans_kpp.fit(X)
    inertia_kmeanspp.append(kmeans_kpp.inertia_)

# Plot the inertia values for each initialization method
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_runs + 1), inertia_random, marker='o', linestyle='-', label='Random Initialization')
plt.plot(range(1, n_runs + 1), inertia_kmeanspp, marker='s', linestyle='--', label='k-means++ Initialization')
plt.xlabel("Run")
plt.ylabel("Sum of Squared Distances (Inertia)")
plt.title("Inertia Comparison: Random vs k-means++ Initialization")
plt.legend()
plt.grid()
plt.show()

# Display average inertia for each method
print("Average inertia with Random Initialization:", np.mean(inertia_random))
print("Average inertia with k-means++ Initialization:", np.mean(inertia_kmeanspp))

### MFCC Feature Contributions to Cluster Separation

This analysis estimates which MFCC-derived features contribute most to cluster separation by calculating, for each feature, the variance of its K-Means cluster-center values across the clusters. The bar plot below shows this variance per feature; a higher variance means the cluster centers differ more along that feature, indicating a stronger contribution to defining distinct clusters.


In [None]:
pip install shap

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# X must be a DataFrame at this point: its column labels are reused below.

# Step 1: fit K-Means with three clusters and keep the label of every row
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)
clusters = kmeans.fit_predict(X)

# Step 2: one row per cluster center, one column per feature; the variance down
# each column measures how far apart the centers lie along that feature
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
feature_variances = cluster_centers.var(axis=0)

# Step 3: bar chart of the per-feature variance
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(X.columns, feature_variances)
ax.set_xlabel("MFCC Features")
ax.set_ylabel("Variance Across Clusters")
ax.set_title("MFCC Feature Contributions to Cluster Separation")
ax.tick_params(axis='x', labelrotation=90)
plt.show()


**PLOT FOR ELBOW METHOD**

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the inertia of each fit
cluster_range = range(1, 11)
inertia = []

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend ("elbow") of the curve suggests a suitable k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_range, inertia, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances (SSD)')
ax.set_xticks(cluster_range)
ax.grid()
plt.show()

**SILHOUETTE SCORE FOR EVALUATION:**

In [None]:
# Average silhouette score for every candidate cluster count from 2 to 10
range_n_clusters = range(2, 11)
silhouette_avg = []

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    cluster_labels = kmeans.labels_

    # One score per cluster count, stored in the same order as range_n_clusters
    score = silhouette_score(X, cluster_labels)
    silhouette_avg.append(score)

In [None]:
# Silhouette score as a function of the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_n_clusters, silhouette_avg, marker='o')
ax.set_title('Silhouette Score vs Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Average Silhouette Score')
ax.set_xticks(range_n_clusters)
ax.grid()
plt.show()



In [None]:
# List every silhouette score and remember the cluster count with the highest one.
# m holds the best score seen so far and starts below any score that can win.
m = -1
for n_clusters, score in zip(range_n_clusters, silhouette_avg):
    print(f'Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}')
    if score > m:
        # Strict comparison: the first cluster count wins a tie
        m, k = score, n_clusters
print(f'Number of clusters for maximum Silhouette Score: {k}')

**DAVIES BOULDIN INDEX**

In [None]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index for every candidate cluster count from 2 to 10
range_n_clusters = range(2, 11)
db_index = []
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X)

    # Calculate the Davies-Bouldin Index
    db_index.append(davies_bouldin_score(X, cluster_labels))

# Lower Davies-Bouldin values mean better separation, so pick the smallest.
# The earlier running comparison started at 2 and updated on "min < value",
# which tracked the largest index above 2 (or left k untouched) instead of the
# minimum, and it also shadowed the builtin min().
k = range_n_clusters[int(np.argmin(db_index))]
print(f'Number of clusters for minimum Davies-Bouldin Index: {k}')

In [None]:
# Davies-Bouldin index as a function of the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_n_clusters, db_index, marker='o')
ax.set_title('Davies-Bouldin Index vs Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Davies-Bouldin Index')
ax.set_xticks(range_n_clusters)
ax.grid()
plt.show()


In [None]:
# Print DBI for each number of clusters
for n_clusters, db in zip(range_n_clusters, db_index):
    print(f'Number of clusters: {n_clusters}, Davies-Bouldin Index: {db:.4f}')

# Derive the best cluster count from db_index itself rather than trusting
# whatever value k currently holds from another cell.
k = range_n_clusters[int(np.argmin(db_index))]
print(f'Number of clusters for minimum Davies-Bouldin Index: {k}')

**CALINSKI-HARABASZ INDEX**

In [None]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz index for every cluster count in range_n_clusters
# (range_n_clusters is defined in an earlier cell)
ch_index = []
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    cluster_labels = kmeans.labels_

    # One value per cluster count, in the same order as range_n_clusters
    ch_index.append(calinski_harabasz_score(X, cluster_labels))

In [None]:
# Calinski-Harabasz index as a function of the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_n_clusters, ch_index, marker='o')
ax.set_title('Calinski-Harabasz Index vs Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Calinski-Harabasz Index')
ax.set_xticks(range_n_clusters)
ax.grid()
plt.show()

In [None]:
# Print the Calinski-Harabasz index for each number of clusters
for n_clusters, ch in zip(range_n_clusters, ch_index):
    print(f'Number of clusters: {n_clusters}, Calinski-Harabasz Index: {ch:.4f}')

# Pick the cluster count with the highest index. No variable named "max" is
# used any more: it shadowed the builtin max() for the rest of the kernel session.
k = range_n_clusters[int(np.argmax(ch_index))]
print(f'Number of clusters for maximum Calinski-Harabasz Index: {k}')

**PCA**

In [None]:
from sklearn.preprocessing import StandardScaler

# --- PCA by eigen-decomposition of the covariance matrix ---

# Standardize every feature before computing covariances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Covariance between features (columns are the variables, hence rowvar=False)
cov_matrix = np.cov(X_scaled, rowvar=False)

# eigh is used because the covariance matrix is symmetric
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so the largest eigenvalue, and its eigenvector, comes first
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted, eigenvectors_sorted = eigenvalues[sorted_indices], eigenvectors[:, sorted_indices]

# Projection matrix made of the k leading eigenvectors
k = 2  # Number of dimensions to reduce to
W = eigenvectors_sorted[:, :k]

# Coordinates of every sample in the k-dimensional principal-component space
X_pca = X_scaled @ W

# --- K-Means on the projected data, one figure per cluster count ---
n_clusters_values = [2, 3]
for n_clusters in n_clusters_values:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X_pca)

    # Scatter of the two components, coloured by assigned cluster
    fig, ax = plt.subplots(figsize=(10, 6))
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)
    ax.set_title(f'PCA with K-means Clustering(K ={n_clusters})')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    fig.colorbar(scatter, ax=ax, label='Cluster Label')
    ax.grid()
    plt.show()



**Agglomerative Hierarchical Clustering or DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the 2-D PCA projection computed in the PCA cell
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust eps and min_samples as needed
cluster_labels = dbscan.fit_predict(X_pca)

# Plot the results
plt.figure(figsize=(10, 6))
# Colour every point by its DBSCAN label
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)
plt.title('DBSCAN Clustering Result')
# The plotted coordinates are the two PCA components, not raw standardized
# features, so the axes are labelled accordingly.
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(scatter, label='Cluster Label')
plt.grid()
plt.show()

# Count clusters and noise points; label -1 is excluded from the cluster count
# and counted as noise.
n_noise = int(np.sum(cluster_labels == -1))
n_clusters = len(set(cluster_labels)) - (1 if n_noise > 0 else 0)
print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')