In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import itertools

In [2]:
# The project root is taken to be the parent of the current working directory;
# everything this notebook reads or writes lives in its 'export' sub-folder.
root_dir = os.path.split(os.getcwd())[0]
export_dir = os.path.join(root_dir, 'export')

# Load the per-measurement-file metadata table.
df = pd.read_csv(
    os.path.join(export_dir, 'measurement_files_metadata.csv')
)

In [None]:
# Check the class distribution (imbalance)
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 11, 'font.family': 'sans-serif'})

# Fixed class -> colour mapping - using the same colors as in the next figure
class_colors = {'good': '#2ecc71', 'bad': '#e74c3c'}

# value_counts() already returns the counts in descending order
class_counts = df['class'].value_counts()
machine_counts = df['machine'].value_counts()

# Build the palettes in the order the bars are actually drawn. Deriving the
# colours from df['class'].unique() (order of first appearance) can swap the
# colours whenever the first row of the file is not the majority class.
# Unknown labels fall back to a neutral blue instead of raising KeyError.
colors = [class_colors.get(cls, '#3498db') for cls in class_counts.index]
machine_colors = sns.color_palette("mako", n_colors=len(machine_counts))


def _plot_count_bars(ax, counts, palette, title, xlabel):
    """Draw one bar chart of ``counts`` on ``ax`` and write each count above its bar."""
    sns.barplot(x=counts.index, y=counts.values, hue=counts.index,
                palette=palette, legend=False, ax=ax)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Anchor the label at the bar top; va='bottom' keeps it above the bar
    # regardless of the y-axis scale.
    for position, count in enumerate(counts.values):
        ax.text(position, count, str(count), ha='center', va='bottom', fontsize=10)
    return ax


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), facecolor='#f9f9f9')
_plot_count_bars(ax1, class_counts, colors, 'Class Distribution', 'Class')
_plot_count_bars(ax2, machine_counts, machine_colors, 'Machine Distribution', 'Machine')

fig.tight_layout(pad=3.0)
# Save the figure to the export directory
fig.savefig(os.path.join(export_dir, 'distribution_plots.png'), dpi=300, bbox_inches='tight')
plt.show()

# Display percentage distribution for better understanding
print("Class Distribution (%):")
class_pct = class_counts.div(class_counts.sum()).mul(100).round(2)
for cls, pct in class_pct.items():
    print(f"  {cls}: {pct}%")

print("\nMachine Distribution (%):")
machine_pct = machine_counts.div(machine_counts.sum()).mul(100).round(2)
for machine, pct in machine_pct.items():
    print(f"  {machine}: {pct}%")


In [None]:
def plot_operation_distribution(df, machines=None, export_dir=None, xlim=80, ax=None, show=True):
    """
    Plot the class distribution per operation as a stacked horizontal bar chart.

    Parameters:
    -----------
    df : pandas DataFrame
        The input dataframe; the columns 'machine', 'operation' and 'class' are read
    machines : str or list, optional
        If specified, only plot data for these machines. None, an empty string
        or an empty sequence selects all machines.
    export_dir : str, optional
        Not used inside this function; kept so existing calls that pass it
        continue to work
    xlim : int or float
        Upper limit of the x axis (number of samples)
    ax : matplotlib axis, optional
        Axis to plot on. A new figure is created when omitted
    show : bool
        Whether to show the plot (only applies when the function created the figure)

    Returns:
    --------
    The matplotlib axis that was drawn on
    """
    # Normalise `machines` to a list. Testing `if machines:` directly raises
    # for numpy arrays, so the empty case is handled explicitly.
    if machines is None or (isinstance(machines, str) and not machines):
        selected = []
    elif isinstance(machines, str) or not hasattr(machines, '__iter__'):
        selected = [machines]
    else:
        selected = list(machines)

    if selected:
        plot_df = df[df['machine'].isin(selected)].copy()
        # map(str, ...) so non-string machine ids do not break the title
        title_suffix = f' for {", ".join(map(str, selected))}'
    else:
        plot_df = df.copy()
        title_suffix = ''

    # Create a figure if no axis is provided
    if ax is None:
        fig, ax = plt.subplots(figsize=(14, 8), facecolor='#f9f9f9')
        standalone = True
    else:
        standalone = False

    # Nothing to tabulate: leave an explanatory panel instead of failing
    if plot_df.empty:
        ax.set_title(f'Label Distribution per Operation{title_suffix}', fontsize=16, fontweight='bold')
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center',
                transform=ax.transAxes, color='#7f8c8d', fontweight='bold')
        if standalone and show:
            plt.tight_layout()
            plt.show()
        return ax

    # Create pivot table (operations x classes, plus a 'Total' margin row/column)
    pivot_df = pd.crosstab(plot_df['operation'], plot_df['class'], margins=True, margins_name='Total')

    # Sort operations by the first number in their name, descending.
    # expand=False yields an Index in row order; with the default expand=True
    # the result is a DataFrame on a fresh RangeIndex, and assigning that as a
    # column aligns on labels and produces only NaN, so nothing gets sorted.
    op_ids = pd.to_numeric(
        pivot_df.index.astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )
    # Rows without a number (including 'Total') are NaN and sort to the end
    order = np.argsort(-np.asarray(op_ids, dtype=float), kind='stable')
    pivot_df = pivot_df.iloc[order]

    # Keep the 'Total' margin row as the very last row
    if 'Total' in pivot_df.index:
        pivot_df = pd.concat([pivot_df.drop(index='Total'), pivot_df.loc[['Total']]])

    # Prepare data for plotting (margins removed)
    pivot_for_plot = pivot_df.drop(columns='Total').drop(index='Total')

    # Define custom colors; classes without a fixed colour are drawn in blue
    class_colors = {'good': '#2ecc71', 'bad': '#e74c3c', 'Total': '#7f8c8d'}
    colors = [class_colors.get(col, '#3498db') for col in pivot_for_plot.columns]

    # Create plot
    pivot_for_plot.plot(kind='barh', stacked=True, color=colors, ax=ax, width=0.7)

    # Create custom legend: one patch per class, in the order the bars are stacked
    handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in colors]
    ax.legend(handles, list(pivot_for_plot.columns), title='Label', title_fontsize=12)

    ax.set_title(f'Label Distribution per Operation{title_suffix}', fontsize=16, fontweight='bold')
    ax.set_xlabel('Number of samples', fontsize=12)
    ax.set_ylabel('Operation ID', fontsize=12)
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    ax.set_xlim(0, xlim)

    # Add labels: the count inside each segment and the row total right of the bar
    for y_pos, (operation, row) in enumerate(pivot_for_plot.iterrows()):
        cumulative = 0
        for col in pivot_for_plot.columns:
            count = row[col]
            if count > 0:
                ax.text(cumulative + count / 2, y_pos, f"{int(count)}",
                        ha='center', va='center',
                        color='white', fontweight='bold')
            cumulative += count

        total = pivot_df.loc[operation, 'Total']
        # Offset of 5 is in data units (samples) to clear the end of the bar
        ax.text(cumulative + 5, y_pos, f"Total: {int(total)}",
                ha='left', va='center',
                color='#7f8c8d', fontweight='bold')

    if standalone and show:
        plt.tight_layout()
        plt.show()

    return ax

# One panel per machine plus one overall panel, laid out two per row.
# The grid is sized from the data: a fixed 2x2 grid only fits exactly three
# machines (a fourth would share its axis with the overall plot, a fifth
# would raise IndexError).
PER_MACHINE_XLIM = 80   # x-axis limit (samples) for a single-machine panel
OVERALL_XLIM = 180      # x-axis limit (samples) for the all-machines panel

# dropna(): a missing machine id is not a machine to filter on
machines = df['machine'].dropna().unique()
n_panels = len(machines) + 1
n_cols = 2
n_rows = -(-n_panels // n_cols)  # ceiling division

fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 8 * n_rows),
                        facecolor='#f9f9f9', squeeze=False)
axs = axs.flatten()

# Plot for each machine separately
for i, machine in enumerate(machines):
    plot_operation_distribution(df, machines=machine, xlim=PER_MACHINE_XLIM, ax=axs[i], show=False)

# Plot overall distribution on the first axis after the machine panels
overall_ax = axs[len(machines)]
plot_operation_distribution(df, xlim=OVERALL_XLIM, ax=overall_ax, show=False)
overall_ax.set_title('Overall Label Distribution per Operation', fontsize=16, fontweight='bold')

# Hide any grid cell left over when the panel count is odd
for unused_ax in axs[n_panels:]:
    unused_ax.set_visible(False)

fig.tight_layout(pad=3.0)

# Save the combined figure
if export_dir:
    fig.savefig(os.path.join(export_dir, 'operation_class_distribution_all.png'),
                dpi=300, bbox_inches='tight', pad_inches=0.2)

plt.show()
