In [40]:
# Imports and Setup
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

sns.set(style='whitegrid')


In [41]:
#  Feature Extractor Class
class FeatureExtractor:
    """Turn audio clips into fixed-length vectors of summary statistics.

    Each vector holds, in order: the per-coefficient mean and standard
    deviation of the MFCCs, of their deltas and of their second-order deltas
    (6 * n_mfcc values), followed by nine scalars -- mean/std of spectral
    centroid, bandwidth, roll-off and RMS, then the mean zero-crossing rate.
    With the default n_mfcc=13 that is 87 values per clip.
    """

    def __init__(self, sample_rate=44100, duration=7, n_mfcc=13):
        """Store the loading and MFCC settings.

        Args:
            sample_rate: Value passed to ``librosa.load`` as ``sr``.
            duration: Value passed to ``librosa.load`` as ``duration``.
            n_mfcc: Number of MFCC coefficients to compute per frame.
        """
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_mfcc = n_mfcc

    def extract_features(self, file_path):
        """Compute the feature vector for one audio file.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            A 1-D array of length ``6 * n_mfcc + 9``, or ``None`` when loading
            or feature computation raised (the error is printed, not raised,
            so one bad file does not abort a whole dataset scan).
        """
        try:
            y, sr = librosa.load(file_path, sr=self.sample_rate, duration=self.duration)

            # MFCC mean and std (one value per coefficient, taken over frames)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
            mfcc_mean = np.mean(mfcc, axis=1)
            mfcc_std = np.std(mfcc, axis=1)

            # Delta and delta-delta
            delta = librosa.feature.delta(mfcc)
            delta2 = librosa.feature.delta(mfcc, order=2)
            delta_mean = np.mean(delta, axis=1)
            delta_std = np.std(delta, axis=1)
            delta2_mean = np.mean(delta2, axis=1)
            delta2_std = np.std(delta2, axis=1)

            # Spectral features ([0] picks the single row each call returns)
            zcr = librosa.feature.zero_crossing_rate(y)[0]
            zcr_mean = np.mean(zcr)

            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
            rms = librosa.feature.rms(y=y)[0]

            spectral_features = [
                np.mean(centroid), np.std(centroid),
                np.mean(bandwidth), np.std(bandwidth),
                np.mean(rolloff), np.std(rolloff),
                np.mean(rms), np.std(rms),
                zcr_mean
            ]

            # Order here defines the column layout of the resulting matrix.
            features = np.concatenate([
                mfcc_mean, mfcc_std,
                delta_mean, delta_std,
                delta2_mean, delta2_std,
                spectral_features
            ])

            return features
        except Exception as e:
            # Deliberate best-effort: report and skip the file.
            print(f"Error in {file_path}: {e}")
            return None

    def load_dataset(self, ripe_dir, unripe_dir):
        """Extract features for every WAV file in the two class directories.

        Files in ``ripe_dir`` are labelled 1 and files in ``unripe_dir`` 0.
        Files whose extraction returns ``None`` are skipped.

        Args:
            ripe_dir: Directory holding the ripe recordings.
            unripe_dir: Directory holding the unripe recordings.

        Returns:
            ``(X, y)`` where ``X`` has one row per successfully processed file
            and ``y`` the matching integer labels. When no file is usable,
            ``X`` has shape ``(0, 6 * n_mfcc + 9)`` so it stays two-dimensional.
        """
        features = []
        labels = []

        for label, directory in zip([1, 0], [ripe_dir, unripe_dir]):
            # os.listdir returns names in arbitrary order; sorting makes the
            # row order (and everything derived from it) reproducible.
            for file in sorted(os.listdir(directory)):
                # Accept '.WAV' / '.Wav' as well as '.wav'.
                if not file.lower().endswith('.wav'):
                    continue
                path = os.path.join(directory, file)
                feat = self.extract_features(path)
                if feat is not None:
                    features.append(feat)
                    labels.append(label)

        if features:
            X = np.array(features)
        else:
            # Keep a 2-D shape so callers can still read X.shape[1].
            X = np.empty((0, 6 * self.n_mfcc + 9))
        y = np.array(labels, dtype=int)
        return X, y


In [42]:
# Load dataset and extract features
# Both paths are relative to the notebook's working directory. load_dataset
# labels files from ripe_dir as 1 and files from unripe_dir as 0.
ripe_dir = 'data/ripe'
unripe_dir = 'data/unripe'

# Default settings: sample_rate=44100, duration=7, n_mfcc=13.
extractor = FeatureExtractor()

print("Extracting features from audio files...")
X, y = extractor.load_dataset(ripe_dir, unripe_dir)

# Quick sanity check: one row per processed file, one label per row.
print(f"Extracted features shape: {X.shape}")
print(f"Labels shape: {y.shape}")


Extracting features from audio files...
Extracted features shape: (120, 87)
Labels shape: (120,)


In [43]:
# Save features to Excel
def save_to_excel(X, y, filename='features.xlsx', n_mfcc=13):
    """Label the feature matrix, write it to an Excel file and return it.

    Column names are generated as six groups of ``n_mfcc`` numbered columns
    (MFCC / delta / delta2, mean and std each) followed by nine named
    spectral columns, so ``X`` must have ``6 * n_mfcc + 9`` columns.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: Labels per row; 1 is written as 'Ripe' and 0 as 'Unripe'.
        filename: Destination path handed to ``DataFrame.to_excel``.
        n_mfcc: Number of MFCC coefficients used to build the column names.

    Returns:
        The DataFrame that was written, with a trailing 'Label' column.
    """
    stat_groups = (
        'mfcc_mean', 'mfcc_std',
        'delta_mean', 'delta_std',
        'delta2_mean', 'delta2_std',
    )
    # Group-major order: all coefficients of one statistic, then the next.
    header = [
        f'{group}_{idx}'
        for group in stat_groups
        for idx in range(1, n_mfcc + 1)
    ]
    header.extend([
        'spectral_centroid_mean', 'spectral_centroid_std',
        'spectral_bandwidth_mean', 'spectral_bandwidth_std',
        'spectral_rolloff_mean', 'spectral_rolloff_std',
        'rms_mean', 'rms_std',
        'zcr_mean',
    ])

    label_names = {1: 'Ripe', 0: 'Unripe'}
    df = pd.DataFrame(X, columns=header)
    df['Label'] = y
    df['Label'] = df['Label'].map(label_names)

    df.to_excel(filename, index=False)
    print(f"Saved features to {filename}")
    return df

# Writes features.xlsx (the default filename) and keeps the labelled frame;
# the analysis and plotting cells below all read this `df`.
df = save_to_excel(X, y)
# Last expression of the cell, so the first rows are displayed as a sanity check.
df.head()


Saved features to features.xlsx


Unnamed: 0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,...,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,spectral_bandwidth_std,spectral_rolloff_mean,spectral_rolloff_std,rms_mean,rms_std,zcr_mean,Label
0,-605.135803,140.14917,14.717386,23.677568,-12.223973,1.91674,32.19902,9.819526,16.056005,12.657329,...,1748.793542,428.25827,3247.279462,945.703509,3285.509707,1661.650658,0.005113,0.005681,0.024809,Ripe
1,-580.152161,141.289597,15.366449,19.747908,-19.63592,-5.756866,27.905113,8.587965,14.773101,10.756027,...,1879.611242,945.60404,3161.239424,981.170178,3470.309585,2142.671183,0.005648,0.006681,0.030282,Ripe
2,-592.334717,138.983185,12.908314,23.229313,-16.120331,-1.278966,32.677967,9.055169,14.148767,10.864845,...,1882.520049,939.662279,3217.154757,987.488092,3641.361066,2156.109327,0.005194,0.00591,0.029977,Ripe
3,-593.591919,130.21254,13.740931,22.424614,-13.90311,0.952517,30.962719,9.304797,15.681391,10.574866,...,1992.364378,520.087757,3406.646472,1093.984684,4085.844945,1771.076757,0.004932,0.006142,0.028617,Ripe
4,-585.973694,141.833908,15.133223,23.787643,-9.880121,5.861056,35.56221,12.621987,19.055359,14.209699,...,1434.593256,1025.239465,2704.024844,688.571022,2385.11471,2424.249235,0.006508,0.005459,0.023859,Ripe


In [44]:
# Perform t-tests between ripe and unripe
def perform_t_tests(df):
    """Run a two-sample t-test per feature between 'Ripe' and 'Unripe' rows.

    Uses ``ttest_ind(..., equal_var=False)``, i.e. the groups are not assumed
    to share a variance. One table row is printed per feature.

    Feature columns are chosen by name: everything except 'Label' and the
    derived 'Average_Feature' column that other cells add to the same frame.
    Selecting by position (``columns[:-1]``) breaks as soon as 'Label' is no
    longer the last column.

    NOTE(review): p-values are reported uncorrected; with many features some
    small p-values are expected by chance.

    Args:
        df: Frame with a 'Label' column holding 'Ripe' / 'Unripe' and one
            numeric column per feature.

    Returns:
        List of tuples ``(feature, mean_ripe, sd_ripe, mean_unripe,
        sd_unripe, p_value)`` in column order.
    """
    non_features = ('Label', 'Average_Feature')
    feature_cols = [col for col in df.columns if col not in non_features]
    ripe = df[df['Label'] == 'Ripe']
    unripe = df[df['Label'] == 'Unripe']

    results = []
    print(f"{'Feature':<30} {'Mean_Ripe':>10} {'SD_Ripe':>10} {'Mean_Unripe':>12} {'SD_Unripe':>10} {'p-value':>10}")
    print("-"*90)
    for col in feature_cols:
        ripe_vals = ripe[col]
        unripe_vals = unripe[col]
        _, p_val = ttest_ind(ripe_vals, unripe_vals, equal_var=False)

        # Compute each statistic once and reuse it for the table and the result.
        ripe_mean, ripe_sd = ripe_vals.mean(), ripe_vals.std()
        unripe_mean, unripe_sd = unripe_vals.mean(), unripe_vals.std()

        results.append((col, ripe_mean, ripe_sd, unripe_mean, unripe_sd, p_val))
        print(f"{col:<30} {ripe_mean:10.4f} {ripe_sd:10.4f} {unripe_mean:12.4f} {unripe_sd:10.4f} {p_val:10.4e}")
    return results

# Run the per-feature comparison; the returned list of tuples feeds the
# line-plot and DataFrame-conversion cells further down.
ttest_results = perform_t_tests(df)


Feature                         Mean_Ripe    SD_Ripe  Mean_Unripe  SD_Unripe    p-value
------------------------------------------------------------------------------------------
mfcc_mean_1                     -569.3826    14.6509    -554.0391    17.4934 8.4941e-07
mfcc_mean_2                      133.8049    10.5513     123.0261     9.3444 3.2716e-08
mfcc_mean_3                        6.5315     7.6195       5.9170     5.2485 6.0802e-01
mfcc_mean_4                       23.0160     3.9843      17.9612     6.1418 5.5317e-07
mfcc_mean_5                      -14.6308     3.1393     -18.2984     4.5404 1.2443e-06
mfcc_mean_6                       -6.0395     5.7310      -5.3857     4.1031 4.7401e-01
mfcc_mean_7                       31.4186     4.0849      28.0797     3.4344 3.9846e-06
mfcc_mean_8                        9.8888     2.5870       7.3057     2.3547 8.3391e-08
mfcc_mean_9                       10.2712     4.4648      12.3337     3.8141 7.5256e-03
mfcc_mean_10                 

In [46]:
# Repeats the call from the previous cell on the same df; the returned list
# simply replaces the earlier ttest_results.
ttest_results = perform_t_tests(df)


Feature                         Mean_Ripe    SD_Ripe  Mean_Unripe  SD_Unripe    p-value
------------------------------------------------------------------------------------------
mfcc_mean_1                     -569.3826    14.6509    -554.0391    17.4934 8.4941e-07
mfcc_mean_2                      133.8049    10.5513     123.0261     9.3444 3.2716e-08
mfcc_mean_3                        6.5315     7.6195       5.9170     5.2485 6.0802e-01
mfcc_mean_4                       23.0160     3.9843      17.9612     6.1418 5.5317e-07
mfcc_mean_5                      -14.6308     3.1393     -18.2984     4.5404 1.2443e-06
mfcc_mean_6                       -6.0395     5.7310      -5.3857     4.1031 4.7401e-01
mfcc_mean_7                       31.4186     4.0849      28.0797     3.4344 3.9846e-06
mfcc_mean_8                        9.8888     2.5870       7.3057     2.3547 8.3391e-08
mfcc_mean_9                       10.2712     4.4648      12.3337     3.8141 7.5256e-03
mfcc_mean_10                 

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

def perform_avg_feature_t_test_and_plot(df, output_path='average_feature_barplot.png'):
    """T-test the per-sample mean of all features between classes and plot it.

    Adds (or overwrites) an 'Average_Feature' column on ``df`` holding the
    row-wise mean of the feature columns, compares 'Ripe' and 'Unripe' rows
    with ``ttest_ind(..., equal_var=False)``, prints a one-row summary and
    saves a bar chart with standard-deviation error bars.

    Feature columns are selected by name, excluding 'Label' and
    'Average_Feature', so the function gives the same result when re-run on a
    frame that already carries the derived column.

    NOTE(review): the features are on very different scales, so the row mean
    is dominated by the largest-magnitude columns -- confirm this is intended.

    Args:
        df: Frame with a 'Label' column ('Ripe' / 'Unripe') and numeric
            feature columns. Modified in place.
        output_path: Where the bar chart PNG is written.

    Returns:
        Dict with keys 'Feature', 'Mean_Ripe', 'SD_Ripe', 'Mean_Unripe',
        'SD_Unripe' and 'p-value'.
    """
    non_features = ('Label', 'Average_Feature')
    feature_cols = [col for col in df.columns if col not in non_features]

    # Calculate average feature for each sample
    df['Average_Feature'] = df[feature_cols].mean(axis=1)

    # Split ripe and unripe
    ripe = df[df['Label'] == 'Ripe']['Average_Feature']
    unripe = df[df['Label'] == 'Unripe']['Average_Feature']

    # Perform t-test
    _, p_val = ttest_ind(ripe, unripe, equal_var=False)

    # Compute the statistics once; they feed the table, the plot and the result.
    summary = {
        'Feature': 'Average_Feature',
        'Mean_Ripe': ripe.mean(),
        'SD_Ripe': ripe.std(),
        'Mean_Unripe': unripe.mean(),
        'SD_Unripe': unripe.std(),
        'p-value': p_val
    }

    # Print summary table
    print(f"{'Average Feature':<20} {'Mean_Ripe':>10} {'SD_Ripe':>10} {'Mean_Unripe':>12} {'SD_Unripe':>10} {'p-value':>10}")
    print("-" * 75)
    print(f"{'Average_Feature':<20} {summary['Mean_Ripe']:10.4f} {summary['SD_Ripe']:10.4f} {summary['Mean_Unripe']:12.4f} {summary['SD_Unripe']:10.4f} {p_val:10.4e}")

    # Bar plot with error bars
    labels = ['Ripe', 'Unripe']
    means = [summary['Mean_Ripe'], summary['Mean_Unripe']]
    stds = [summary['SD_Ripe'], summary['SD_Unripe']]

    plt.figure(figsize=(6, 6))
    plt.bar(labels, means, yerr=stds, capsize=10, color=['green', 'orange'])
    plt.title('Average Feature Comparison')
    plt.ylabel('Mean Value')
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

    print(f"Bar graph saved to {output_path}")

    return summary

# Example usage:
# result = perform_avg_feature_t_test_and_plot(df)


In [36]:
import matplotlib.pyplot as plt

# Create and save a line plot of mean feature values from t-test results
def plot_ttest_linegraph(ttest_results, filename='lineplot_feature_means.png'):
    """Save a line chart of per-feature class means.

    Args:
        ttest_results: Sequence of tuples whose element 0 is the feature
            name, element 1 the ripe mean and element 3 the unripe mean.
        filename: Path of the PNG to write.
    """
    names, ripe_avgs, unripe_avgs = [], [], []
    for row in ttest_results:
        names.append(row[0])
        ripe_avgs.append(row[1])
        unripe_avgs.append(row[3])

    plt.figure(figsize=(14, 6))
    # One line per class, drawn in the same order and colours as the legend.
    for series, tag, shade in (
        (ripe_avgs, 'Ripe', 'green'),
        (unripe_avgs, 'Unripe', 'orange'),
    ):
        plt.plot(names, series, marker='o', label=tag, color=shade)

    plt.xticks(rotation=90)
    plt.ylabel('Mean Feature Value')
    plt.title('Mean Feature Comparison (Ripe vs Unripe)')
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

    print(f"Line plot saved to: {filename}")

# Use this after running perform_t_tests(df): it needs the ttest_results list
# and writes lineplot_feature_means.png (the default filename).
plot_ttest_linegraph(ttest_results)


Line plot saved to: lineplot_feature_means.png


In [47]:
import pandas as pd

# Convert results to DataFrame
# Column names follow the tuple layout returned by perform_t_tests:
# (feature, ripe mean, ripe SD, unripe mean, unripe SD, p-value).
ttest_df = pd.DataFrame(ttest_results, columns=[
    'Feature', 'Mean_Ripe', 'SD_Ripe', 'Mean_Unripe', 'SD_Unripe', 'p_value'
])


In [23]:
import matplotlib.pyplot as plt
import os

def chunk_features(features, chunk_size=5):
    """Cut a sequence into consecutive slices of at most ``chunk_size`` items.

    The last slice is shorter when the length is not a multiple of
    ``chunk_size``; an empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(features), chunk_size):
        stop = start + chunk_size
        pieces.append(features[start:stop])
    return pieces

def plot_bar_graph_chunks(ttest_df, chunk_size=5, output_dir='barplots'):
    """Save grouped bar charts of class means, ``chunk_size`` features each.

    Args:
        ttest_df: Frame with 'Feature', 'Mean_Ripe' and 'Mean_Unripe' columns.
        chunk_size: Number of features drawn per figure.
        output_dir: Directory for the PNGs; created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    feature_names = ttest_df['Feature'].tolist()
    bar_width = 0.35

    for part_no, names in enumerate(chunk_features(feature_names, chunk_size), start=1):
        rows = ttest_df[ttest_df['Feature'].isin(names)]
        positions = range(len(rows))

        # Offset the two bars of each feature to either side of its tick.
        ripe_x = [pos - bar_width/2 for pos in positions]
        unripe_x = [pos + bar_width/2 for pos in positions]

        plt.figure(figsize=(10, 6))
        plt.bar(ripe_x, rows['Mean_Ripe'], bar_width, label='Ripe', color='green')
        plt.bar(unripe_x, rows['Mean_Unripe'], bar_width, label='Unripe', color='orange')
        plt.xticks(positions, rows['Feature'], rotation=45)
        plt.ylabel('Mean Value')
        plt.title(f'T-Test Feature Comparison (Part {part_no})')
        plt.legend()
        plt.tight_layout()

        target = os.path.join(output_dir, f'barplot_part{part_no}.png')
        plt.savefig(target)
        plt.close()
        print(f"Saved: {target}")

# Uses the defaults: 5 features per figure, PNGs written under 'barplots'.
plot_bar_graph_chunks(ttest_df)


Saved: barplots\barplot_part1.png
Saved: barplots\barplot_part2.png
Saved: barplots\barplot_part3.png
Saved: barplots\barplot_part4.png
Saved: barplots\barplot_part5.png
Saved: barplots\barplot_part6.png
Saved: barplots\barplot_part7.png
Saved: barplots\barplot_part8.png
Saved: barplots\barplot_part9.png
Saved: barplots\barplot_part10.png
Saved: barplots\barplot_part11.png
Saved: barplots\barplot_part12.png
Saved: barplots\barplot_part13.png
Saved: barplots\barplot_part14.png
Saved: barplots\barplot_part15.png
Saved: barplots\barplot_part16.png
Saved: barplots\barplot_part17.png
Saved: barplots\barplot_part18.png


In [48]:
def perform_avg_feature_t_test(df):
    """T-test the per-sample mean of all features between the two classes.

    Adds (or overwrites) an 'Average_Feature' column on ``df`` holding the
    row-wise mean of the feature columns, then compares 'Ripe' and 'Unripe'
    rows with ``ttest_ind(..., equal_var=False)`` and prints a one-row table.

    Feature columns are selected by name, excluding 'Label' and
    'Average_Feature'. Selecting by position (``columns[:-1]``) would pull
    'Label' into the average on any call after the first, because the derived
    column is then the last one.

    NOTE(review): the features are on very different scales, so the row mean
    is dominated by the largest-magnitude columns -- confirm this is intended.

    Args:
        df: Frame with a 'Label' column ('Ripe' / 'Unripe') and numeric
            feature columns. Modified in place.

    Returns:
        Dict with keys 'Feature', 'Mean_Ripe', 'SD_Ripe', 'Mean_Unripe',
        'SD_Unripe' and 'p-value'.
    """
    non_features = ('Label', 'Average_Feature')
    feature_cols = [col for col in df.columns if col not in non_features]

    # Calculate average feature for each sample
    df['Average_Feature'] = df[feature_cols].mean(axis=1)

    ripe = df[df['Label'] == 'Ripe']['Average_Feature']
    unripe = df[df['Label'] == 'Unripe']['Average_Feature']

    _, p_val = ttest_ind(ripe, unripe, equal_var=False)

    # Compute the statistics once; they feed both the table and the result.
    summary = {
        'Feature': 'Average_Feature',
        'Mean_Ripe': ripe.mean(),
        'SD_Ripe': ripe.std(),
        'Mean_Unripe': unripe.mean(),
        'SD_Unripe': unripe.std(),
        'p-value': p_val
    }

    # Print summary table
    print(f"{'Average Feature':<20} {'Mean_Ripe':>10} {'SD_Ripe':>10} {'Mean_Unripe':>12} {'SD_Unripe':>10} {'p-value':>10}")
    print("-" * 75)
    print(f"{'Average_Feature':<20} {summary['Mean_Ripe']:10.4f} {summary['SD_Ripe']:10.4f} {summary['Mean_Unripe']:12.4f} {summary['SD_Unripe']:10.4f} {p_val:10.4e}")

    return summary

# Besides returning the summary dict, this call adds an 'Average_Feature'
# column to df, which the average-feature plot cells read.
result = perform_avg_feature_t_test(df)


Average Feature       Mean_Ripe    SD_Ripe  Mean_Unripe  SD_Unripe    p-value
---------------------------------------------------------------------------
Average_Feature        142.2513    13.7279     152.2020     9.9791 1.4622e-05


In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

def perform_avg_feature_t_test_and_plot(df, output_path='average_feature_barplot.png'):
    """T-test the per-sample mean of all features between classes and plot it.

    Adds (or overwrites) an 'Average_Feature' column on ``df`` holding the
    row-wise mean of the feature columns, compares 'Ripe' and 'Unripe' rows
    with ``ttest_ind(..., equal_var=False)``, prints a one-row summary and
    saves a bar chart with standard-deviation error bars.

    Feature columns are selected by name, excluding 'Label' and
    'Average_Feature', so the function gives the same result when re-run on a
    frame that already carries the derived column.

    NOTE(review): the features are on very different scales, so the row mean
    is dominated by the largest-magnitude columns -- confirm this is intended.

    Args:
        df: Frame with a 'Label' column ('Ripe' / 'Unripe') and numeric
            feature columns. Modified in place.
        output_path: Where the bar chart PNG is written.

    Returns:
        Dict with keys 'Feature', 'Mean_Ripe', 'SD_Ripe', 'Mean_Unripe',
        'SD_Unripe' and 'p-value'.
    """
    non_features = ('Label', 'Average_Feature')
    feature_cols = [col for col in df.columns if col not in non_features]

    # Calculate average feature for each sample
    df['Average_Feature'] = df[feature_cols].mean(axis=1)

    # Split ripe and unripe
    ripe = df[df['Label'] == 'Ripe']['Average_Feature']
    unripe = df[df['Label'] == 'Unripe']['Average_Feature']

    # Perform t-test
    _, p_val = ttest_ind(ripe, unripe, equal_var=False)

    # Compute the statistics once; they feed the table, the plot and the result.
    summary = {
        'Feature': 'Average_Feature',
        'Mean_Ripe': ripe.mean(),
        'SD_Ripe': ripe.std(),
        'Mean_Unripe': unripe.mean(),
        'SD_Unripe': unripe.std(),
        'p-value': p_val
    }

    # Print summary table
    print(f"{'Average Feature':<20} {'Mean_Ripe':>10} {'SD_Ripe':>10} {'Mean_Unripe':>12} {'SD_Unripe':>10} {'p-value':>10}")
    print("-" * 75)
    print(f"{'Average_Feature':<20} {summary['Mean_Ripe']:10.4f} {summary['SD_Ripe']:10.4f} {summary['Mean_Unripe']:12.4f} {summary['SD_Unripe']:10.4f} {p_val:10.4e}")

    # Bar plot with error bars
    labels = ['Ripe', 'Unripe']
    means = [summary['Mean_Ripe'], summary['Mean_Unripe']]
    stds = [summary['SD_Ripe'], summary['SD_Unripe']]

    plt.figure(figsize=(6, 6))
    plt.bar(labels, means, yerr=stds, capsize=10, color=['green', 'orange'])
    plt.title('Average Feature Comparison')
    plt.ylabel('Mean Value')
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

    print(f"Bar graph saved to {output_path}")

    return summary

# Example usage:
# result = perform_avg_feature_t_test_and_plot(df)


In [10]:
# Plot boxplots and save as PNG
def plot_boxplots(df, features_group, group_name, filename):
    """Save side-by-side box plots of the given features, split by 'Label'.

    Args:
        df: Frame containing a 'Label' column and the listed feature columns.
        features_group: Names of the columns to draw.
        group_name: Text used in the figure title and the log message.
        filename: Path of the PNG to write.
    """
    plt.figure(figsize=(14, 8))

    # Long format: one row per (sample, feature) pair, as seaborn expects.
    long_form = df.melt(
        id_vars='Label',
        value_vars=features_group,
        var_name='Feature',
        value_name='Value',
    )
    sns.boxplot(data=long_form, x='Feature', y='Value', hue='Label')

    plt.title(f'{group_name} Feature Distributions by Ripeness')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved {group_name} boxplot to {filename}")

# Cepstral group: MFCC means followed by MFCC standard deviations
n_mfcc = extractor.n_mfcc
mfcc_mean_features = [f'mfcc_mean_{k}' for k in range(1, n_mfcc + 1)]
mfcc_std_features = [f'mfcc_std_{k}' for k in range(1, n_mfcc + 1)]
cepstral_features = [*mfcc_mean_features, *mfcc_std_features]

# Delta group: standard deviations of the MFCC deltas
delta_std_features = [f'delta_std_{k}' for k in range(1, n_mfcc + 1)]

# Spectral group: centroid / bandwidth / roll-off mean and std, then RMS and ZCR
spectral_features = [
    f'spectral_{measure}_{stat}'
    for measure in ('centroid', 'bandwidth', 'rolloff')
    for stat in ('mean', 'std')
] + ['rms_mean', 'rms_std', 'zcr_mean']

# One figure per group
plot_boxplots(df, cepstral_features, "Cepstral (MFCC Mean & Std)", "cepstral_features.png")
plot_boxplots(df, delta_std_features, "Delta MFCC Std Dev", "delta_std_features.png")
plot_boxplots(df, spectral_features, "Spectral Features", "spectral_features.png")


Saved Cepstral (MFCC Mean & Std) boxplot to cepstral_features.png
Saved Delta MFCC Std Dev boxplot to delta_std_features.png
Saved Spectral Features boxplot to spectral_features.png


In [19]:

def chunk_features(features, chunk_size=5):
    """Cut a sequence into consecutive slices of at most ``chunk_size`` items.

    The last slice is shorter when the length is not a multiple of
    ``chunk_size``; an empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(features), chunk_size):
        stop = start + chunk_size
        pieces.append(features[start:stop])
    return pieces

def plot_bar_graph_chunks(df, features_group, group_name, base_filename, chunk_size=5):
    """Save bar charts of per-label feature means, ``chunk_size`` features each.

    NOTE(review): another cell in this notebook defines a function with the
    same name but a different signature (ttest_df, chunk_size, output_dir);
    whichever cell ran last is the one in effect.

    Args:
        df: Frame containing a 'Label' column and the listed feature columns.
        features_group: Names of the feature columns to plot.
        group_name: Text used at the start of each figure title.
        base_filename: Filename prefix; '_part<N>.png' is appended per figure.
        chunk_size: Number of features drawn per figure.
    """
    for part_no, names in enumerate(chunk_features(features_group, chunk_size), start=1):
        # Mean per label, transposed so features become the bar groups.
        per_feature = df.groupby('Label')[names].mean().T
        per_feature.columns.name = None  # keeps the legend free of an axis name

        plt.figure(figsize=(10, 6))
        per_feature.plot(kind='bar', ax=plt.gca())
        plt.title(f'{group_name} - Features {names[0]} to {names[-1]}')
        plt.xlabel('Feature')
        plt.ylabel('Mean Value')
        plt.xticks(rotation=45)
        plt.legend(title='Ripeness')
        plt.tight_layout()

        # Each chunk gets its own numbered file.
        target = f"{base_filename}_part{part_no}.png"
        plt.savefig(target)
        plt.close()
        print(f"Saved bar graph: {target}")


In [13]:
# Plot dot plots and save as PNG
def plot_dotplots(df, features_group, group_name, filename):
    """Save jittered dot (strip) plots of the given features, split by 'Label'.

    Args:
        df: Frame containing a 'Label' column and the listed feature columns.
        features_group: Names of the columns to draw.
        group_name: Text used in the figure title and the log message.
        filename: Path of the PNG to write.
    """
    plt.figure(figsize=(14, 8))

    # Long format: one row per (sample, feature) pair, as seaborn expects.
    long_form = df.melt(
        id_vars='Label',
        value_vars=features_group,
        var_name='Feature',
        value_name='Value',
    )
    sns.stripplot(
        data=long_form, x='Feature', y='Value', hue='Label',
        dodge=True, jitter=0.25, alpha=0.7,
    )

    plt.title(f'{group_name} Feature Distributions by Ripeness (Dots)')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved {group_name} dot plot to {filename}")

# Cepstral group: MFCC means followed by MFCC standard deviations
n_mfcc = extractor.n_mfcc
mfcc_mean_features = [f'mfcc_mean_{k}' for k in range(1, n_mfcc + 1)]
mfcc_std_features = [f'mfcc_std_{k}' for k in range(1, n_mfcc + 1)]
cepstral_features = [*mfcc_mean_features, *mfcc_std_features]

# Delta group: standard deviations of the MFCC deltas
delta_std_features = [f'delta_std_{k}' for k in range(1, n_mfcc + 1)]

# Spectral group: centroid / bandwidth / roll-off mean and std, then RMS and ZCR
spectral_features = [
    f'spectral_{measure}_{stat}'
    for measure in ('centroid', 'bandwidth', 'rolloff')
    for stat in ('mean', 'std')
] + ['rms_mean', 'rms_std', 'zcr_mean']

# One figure per group
plot_dotplots(df, cepstral_features, "Cepstral (MFCC Mean & Std)", "cepstral_features_dots.png")
plot_dotplots(df, delta_std_features, "Delta MFCC Std Dev", "delta_std_features_dots.png")
plot_dotplots(df, spectral_features, "Spectral Features", "spectral_features_dots.png")

Saved Cepstral (MFCC Mean & Std) dot plot to cepstral_features_dots.png
Saved Delta MFCC Std Dev dot plot to delta_std_features_dots.png
Saved Spectral Features dot plot to spectral_features_dots.png


In [14]:
def plot_avg_feature_dots(df, filename="average_feature_dots.png"):
    """Save a jittered dot plot of 'Average_Feature' for each 'Label' value.

    Args:
        df: Frame with 'Label' and 'Average_Feature' columns.
        filename: Path of the PNG to write.
    """
    plt.figure(figsize=(6, 6))
    sns.stripplot(x="Label", y="Average_Feature", data=df, alpha=0.7, jitter=0.25)

    # Titles and axis captions
    plt.title("Average Feature Value by Ripeness (Dot Plot)")
    plt.ylabel("Average Feature Value")
    plt.xlabel("Ripeness Label")

    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved average feature dot plot to {filename}")

# Call after `perform_avg_feature_t_test(df)`: that function creates the
# 'Average_Feature' column this plot reads.
plot_avg_feature_dots(df)

Saved average feature dot plot to average_feature_dots.png


In [15]:
import matplotlib.pyplot as plt

def plot_custom_dot_comparison(df, filename="custom_avg_feature_plot.png"):
    """Save a scatter plot of 'Average_Feature' against sample number per class.

    Samples are numbered from 1 within each class, so ripe and unripe points
    share the same x positions.

    Args:
        df: Frame with 'Label' and 'Average_Feature' columns.
        filename: Path of the PNG to write.
    """
    # Extract both series first so a missing column fails before any figure opens.
    per_class = [
        (tag, shade, df[df['Label'] == tag]['Average_Feature'].reset_index(drop=True))
        for tag, shade in (('Ripe', 'green'), ('Unripe', 'red'))
    ]

    plt.figure(figsize=(6, 6))
    for tag, shade, values in per_class:
        # Index was reset to 0..n-1; shift by one for 1-based sample numbers.
        plt.scatter(values.index + 1, values, color=shade, label=tag)

    plt.title('Ripe vs Unripe')
    plt.xlabel('No of Samples')
    plt.ylabel('Cepstral-based statistical features')
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved custom scatter plot to {filename}")

# Call this after computing 'Average_Feature' in your dataframe; the function
# reads that column and writes custom_avg_feature_plot.png (the default filename).
plot_custom_dot_comparison(df)


Saved custom scatter plot to custom_avg_feature_plot.png
