In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import itertools

In [None]:
# Project layout: the notebook is expected to run from a sub-folder of the
# project, so the project root is the parent of the current working directory.
root_dir = os.path.split(os.getcwd())[0]

# All exported artefacts (the metadata table read here, figures written by
# later cells) live in <root>/export.
export_dir = os.path.join(root_dir, 'export')

# Per-file metadata table; later cells read its 'class', 'machine' and
# 'operation' columns.
metadata_csv_path = os.path.join(export_dir, 'measurement_files_metadata.csv')
df = pd.read_csv(metadata_csv_path)

In [None]:
# Check the class distribution (imbalance)
# Style / font settings are applied before the axes are created so that both
# panels pick them up.
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 11, 'font.family': 'sans-serif'})

# Fixed colour per class label - the same colours are used in the next figure.
class_colors = {'good': '#2ecc71', 'bad': '#e74c3c'}
# Colour for any class label that has no entry in class_colors.
fallback_color = '#3498db'


def _plot_count_bars(ax, counts, palette, title, xlabel):
    """Draw one annotated bar per entry of ``counts`` on ``ax``.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    counts : pandas.Series
        Number of rows per category; bars are drawn in the order of
        ``counts.index``.
    palette : dict
        Maps every label in ``counts.index`` to a colour. A dict (rather than
        a list) ties each colour to its label instead of to a bar position.
    title, xlabel : str
        Panel title and x-axis label.
    """
    bar_order = list(counts.index)
    sns.barplot(
        x=counts.index, y=counts.values, hue=counts.index,
        order=bar_order, hue_order=bar_order,
        palette=palette, legend=False, ax=ax,
    )
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Add count labels on top of bars (bar k sits at x == k).
    for position, count in enumerate(counts.values):
        ax.text(position, count + 0.1, str(count), ha='center', fontsize=10)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), facecolor='#f9f9f9')

# Plot class distribution
class_counts = df['class'].value_counts().sort_values(ascending=False)
# Build the colours from the plotted order (class_counts.index), not from
# df['class'].unique(): the latter is in order of first appearance and would
# swap the colours whenever the first row is not of the majority class.
class_palette = {cls: class_colors.get(cls, fallback_color) for cls in class_counts.index}
colors = list(class_palette.values())
_plot_count_bars(ax1, class_counts, class_palette, 'Class Distribution', 'Class')

# Plot machine distribution
machine_counts = df['machine'].value_counts().sort_values(ascending=False)
# One colour per plotted machine (value_counts drops missing values, so size
# the palette from the counts rather than from unique()).
machine_colors = sns.color_palette("mako", n_colors=len(machine_counts))
machine_palette = dict(zip(machine_counts.index, machine_colors))
_plot_count_bars(ax2, machine_counts, machine_palette, 'Machine Distribution', 'Machine')

fig.tight_layout(pad=3.0)
# Save the figure to the export directory
fig.savefig(os.path.join(export_dir, 'distribution_plots.png'), dpi=300, bbox_inches='tight')
plt.show()

# Display percentage distribution for better understanding
print("Class Distribution (%):")
class_pct = df['class'].value_counts(normalize=True).mul(100).round(2).sort_values(ascending=False)
for cls, pct in class_pct.items():
    print(f"  {cls}: {pct}%")

print("\nMachine Distribution (%):")
machine_pct = df['machine'].value_counts(normalize=True).mul(100).round(2).sort_values(ascending=False)
for machine, pct in machine_pct.items():
    print(f"  {machine}: {pct}%")


In [None]:
# Stacked horizontal bar chart: number of samples per class for each operation.

# Count rows per (operation, class). margins=True adds a 'Total' row and a
# 'Total' column holding the sums.
pivot_df = pd.crosstab(df['operation'], df['class'], margins=True, margins_name='Total')

# Sort the operations by the number embedded in their ID (e.g. 'op10' after
# 'op2'), in descending order, and keep the 'Total' row last.
# The digits are extracted from an index-aligned Series: calling
# .str.extract() on the Index itself returns a frame with a fresh RangeIndex,
# which does not line up with the operation labels and silently yields NaN
# sort keys when assigned as a column.
operation_labels = pivot_df.index.drop('Total')
operation_ids = (
    operation_labels.to_series()
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)
# Stable sort: operations with equal (or missing) numbers keep crosstab order.
ordered_operations = operation_ids.sort_values(ascending=False, kind='stable').index
pivot_df = pivot_df.loc[[*ordered_operations, 'Total']]

# Prepare data for plotting: per-class counts only, without the margin sums.
pivot_for_plot = pivot_df.drop(index='Total', columns='Total')

# Define custom colors for categories (green, red, and gray for Total);
# any other class label falls back to blue.
class_colors = {'good': '#2ecc71', 'bad': '#e74c3c', 'Total': '#7f8c8d'}
colors = [class_colors.get(col, '#3498db') for col in pivot_for_plot.columns]

# Draw on an explicitly created axes. Without ax=..., DataFrame.plot opens its
# own figure, which would leave an extra empty figure behind and drop the
# facecolor set here.
fig, ax = plt.subplots(figsize=(14, 8), facecolor='#f9f9f9')
pivot_for_plot.plot(kind='barh', stacked=True, color=colors, width=0.7, ax=ax)

# Custom legend: one swatch per plotted class, in stacking order, using
# exactly the colours passed to the bars.
labels = list(pivot_for_plot.columns)
handles = [plt.Rectangle((0, 0), 1, 1, color=swatch) for swatch in colors]
ax.legend(handles, labels, title='Label', title_fontsize=12)

ax.set_title('Label Distribution per Operation', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of samples', fontsize=12)
ax.set_ylabel('Operation ID', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Keep the original 0-180 range as a minimum, but widen it when an operation
# has more samples so that neither the bar nor its "Total" label is clipped.
longest_bar = pivot_for_plot.sum(axis=1).max()
ax.set_xlim(0, max(180, longest_bar * 1.15))

# Add count labels inside the bar segments and the total next to each bar.
# barh draws row k of the frame at y == k.
for y_pos, (operation, row) in enumerate(pivot_for_plot.iterrows()):
    cumulative = 0
    for col in pivot_for_plot.columns:
        count = row[col]
        if count > 0:  # Only add labels for non-zero values
            # Position label in the middle of the segment
            ax.text(cumulative + count / 2, y_pos, f"{int(count)}",
                    ha='center', va='center',
                    color='white', fontweight='bold')
        cumulative += count

    # Add total count just past the end of the bar (gray text)
    total = pivot_df.loc[operation, 'Total']
    ax.text(cumulative + 5, y_pos, f"Total: {int(total)}",
            ha='left', va='center',
            color='#7f8c8d', fontweight='bold')

fig.tight_layout()
fig.savefig(os.path.join(export_dir, 'operation_class_barchart.png'), dpi=300, bbox_inches='tight', pad_inches=0.2)
plt.show()
