# Data augmentation with SMOTE-ENN

In [None]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter
import matplotlib.pyplot as plt

# Names of the target (class indicator) columns used by the pipeline below.
TARGET_COLUMNS = [
    'OK',
    'oestrus',
    'lameness',
    'mastitis',
    'other_disease',
]

def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    dataset = pd.read_csv(file_path)
    return dataset

def check_missing_values(df):
    """Print the count of missing values per column, then the row and column counts."""
    print("Nombre de valeurs manquantes par colonne :")
    missing_per_column = df.isna().sum()
    print(missing_per_column)

    dimensions = df.shape
    print(f"✅ Nombre de lignes : {dimensions[0]}")
    print(f"✅ Nombre de colonnes : {dimensions[1]}")

def preprocess_data(df, target_columns):
    """Clean the dataset and derive a single class label per row.

    Rows with a missing target are dropped, only int64/float64 feature
    columns are kept, and rows with any remaining missing value are dropped.
    A ``label`` column is then added holding, for each row, the name of the
    target column with the largest value.

    Returns:
        A ``(df_cleaned, X, y)`` tuple: the cleaned frame (features, targets
        and ``label``), the feature frame, and the ``label`` series.
    """
    rows_with_targets = df.dropna(subset=target_columns).copy()

    # Keep only the numeric (int64/float64) non-target columns as features.
    numeric_features = (
        rows_with_targets
        .drop(columns=target_columns)
        .select_dtypes(include=['int64', 'float64'])
    )
    kept_columns = numeric_features.columns.tolist() + target_columns
    df_cleaned = rows_with_targets[kept_columns].dropna()

    # The label is the name of the target column holding the row maximum.
    df_cleaned['label'] = df_cleaned[target_columns].idxmax(axis=1)

    y = df_cleaned['label']
    X = df_cleaned.drop(columns=target_columns + ['label'])
    return df_cleaned, X, y

def apply_smote_enn(X, y, k_neighbors=2, n_neighbors=3,
                    sampling_strategy='minority', random_state=42):
    """Resample ``X``/``y`` with SMOTE over-sampling followed by ENN cleaning.

    The class counts of ``y`` are printed before and after resampling.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Class labels passed to ``fit_resample``.
        k_neighbors: ``k_neighbors`` used by the SMOTE step.
        n_neighbors: ``n_neighbors`` used by the ENN step.
        sampling_strategy: Over-sampling strategy applied by the SMOTE step.
        random_state: Seed applied to the SMOTE step for reproducible output.

    Returns:
        The ``(X_res, y_res)`` pair returned by ``SMOTEENN.fit_resample``.
    """
    print("Avant SMOTE-ENN :", Counter(y))

    # SMOTEENN forwards its own sampling_strategy / random_state only to the
    # SMOTE it creates by default. A SMOTE passed in explicitly is used as
    # configured, so both settings have to be set on that instance.
    smote = SMOTE(
        sampling_strategy=sampling_strategy,
        k_neighbors=k_neighbors,
        random_state=random_state,
    )
    smote_enn = SMOTEENN(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
        smote=smote,
        enn=EditedNearestNeighbours(n_neighbors=n_neighbors),
        n_jobs=1
    )
    X_res, y_res = smote_enn.fit_resample(X, y)
    print("Après SMOTE-ENN :", Counter(y_res))
    return X_res, y_res

def create_final_dataframe(X_res, y_res, target_columns, X_columns):
    """Build a frame of the resampled features plus one 0/1 column per target.

    Each target column is 1 where ``y_res`` equals that column's name and 0
    elsewhere.
    """
    balanced = pd.DataFrame(X_res, columns=X_columns)
    for class_name in target_columns:
        is_class = y_res == class_name
        balanced[class_name] = is_class.astype(int)
    return balanced

def save_dataframe(df, output_path):
    """Write ``df`` to ``output_path`` as CSV (without the index) and print a confirmation."""
    df.to_csv(output_path, index=False)
    confirmation = f"✅ Dataset multiclasse équilibré sauvegardé dans '{output_path}'"
    print(confirmation)

def print_class_distribution(df, target_columns, stage=""):
    """Print, for every target column, how often each of its values occurs.

    ``stage`` is appended to the header line to identify the report.
    """
    header = f"\nDistribution des 0 et 1 par classe {stage}:"
    print(header)
    for target in target_columns:
        value_frequencies = df[target].value_counts().to_dict()
        print(f"{target}: {value_frequencies}")

def check_zero_targets(df, target_columns):
    """Print the rows in which every target column equals 0, and how many there are."""
    # A row qualifies when none of its target values differs from 0.
    has_nonzero_target = (df[target_columns] != 0).any(axis=1)
    rows_without_class = df[~has_nonzero_target]

    print("🔍 Lignes où toutes les cibles sont à 0 :")
    print(rows_without_class)
    print(f"Nombre de lignes concernées : {len(rows_without_class)}")

def plot_class_distribution(before_df, after_df, target_columns):
    """Show, per target column, the counts of 0 and 1 before and after SMOTE-ENN.

    One subplot is drawn for each entry of ``target_columns``; in each, the
    "before" and "after" bars are placed side by side around the 0 and 1
    ticks. The figure is displayed with ``plt.show()``.

    Args:
        before_df: Frame holding the target columns before resampling.
        after_df: Frame holding the target columns after resampling.
        target_columns: Names of the target columns to plot.
    """
    bar_width = 0.4
    ticks = [0, 1]

    # squeeze=False always returns a 2-D array of Axes, so a single target
    # column is handled the same way as several.
    fig, axes = plt.subplots(len(target_columns), 1, figsize=(10, 8), squeeze=False)

    for ax, col in zip(axes[:, 0], target_columns):
        before_vals = before_df[col].value_counts().to_dict()
        after_vals = after_df[col].value_counts().to_dict()

        # With align='edge', a negative width draws the bar to the left of
        # the tick and a positive width to the right, so the two series sit
        # next to each other instead of overlapping.
        ax.bar(ticks, [before_vals.get(0, 0), before_vals.get(1, 0)],
               width=-bar_width, label="Avant SMOTE-ENN", align='edge')
        ax.bar(ticks, [after_vals.get(0, 0), after_vals.get(1, 0)],
               width=bar_width, label="Après SMOTE-ENN", align='edge')

        ax.set_title(f'Distribution des 0 et 1 pour {col}')
        ax.set_xticks(ticks)
        ax.set_xticklabels(['0', '1'])
        ax.legend()

    fig.tight_layout()
    plt.show()

def main(input_path="32features_1hour_shift_222222222.csv",
         output_path="balanced_dataset_multiclass.csv",
         target_columns=None):
    """Run the SMOTE-ENN pipeline on one CSV file.

    The dataset is loaded, cleaned, resampled, written back to disk, and
    summarised (class distributions, all-zero target rows, plot, shapes).

    Args:
        input_path: CSV file to read.
        output_path: CSV file the resampled dataset is written to.
        target_columns: Target column names; defaults to ``TARGET_COLUMNS``.
    """
    if target_columns is None:
        target_columns = TARGET_COLUMNS

    # Load the raw data and report missing values.
    df = load_data(input_path)
    check_missing_values(df)

    # Clean the data and derive the class label.
    df_cleaned, X, y = preprocess_data(df, target_columns)

    # Resample, then rebuild the one-hot target columns and save.
    X_res, y_res = apply_smote_enn(X, y)
    final_df = create_final_dataframe(X_res, y_res, target_columns, X.columns)
    save_dataframe(final_df, output_path)

    # Report the class distributions before and after resampling.
    print_class_distribution(df_cleaned, target_columns, "AVANT SMOTE-ENN")
    print_class_distribution(final_df, target_columns, "APRÈS SMOTE-ENN")

    # List rows that ended up with no target set.
    check_zero_targets(final_df, target_columns)

    plot_class_distribution(df_cleaned, final_df, target_columns)

    # Report dataset shapes before and after resampling.
    print("✅ AVANT SMOTE-ENN :")
    print("Nombre de lignes :", df_cleaned.shape[0])
    print("Nombre de colonnes :", df_cleaned.shape[1])
    print("\n✅ APRÈS SMOTE-ENN :")
    print("Nombre de lignes :", final_df.shape[0])
    print("Nombre de colonnes :", final_df.shape[1])

# Run the pipeline only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()


In [1]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter
import matplotlib.pyplot as plt
import os

def load_data(file_path):
    """Read a CSV file into a DataFrame with lower-cased column names.

    Returns:
        The loaded DataFrame, or ``None`` (after printing an error message)
        when the file does not exist.
    """
    try:
        dataset = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # Lower-case the headers so later lookups can use lower-case names.
    dataset.columns = dataset.columns.str.lower()
    return dataset

def check_missing_values(df):
    """Print the number of missing values in each column, followed by the frame's shape."""
    print("Nombre de valeurs manquantes par colonne :")
    nan_counts = df.isna().sum()
    print(nan_counts)

    shape = df.shape
    print(f"✅ Nombre de lignes : {shape[0]}")
    print(f"✅ Nombre de colonnes : {shape[1]}")

def preprocess_data(df, target_columns):
    """Clean the dataset and derive a single class label per row.

    Target names are matched in lower case against ``df.columns``. Requested
    targets that are absent are reported and ignored; if none is present the
    function gives up. Rows with a missing target are dropped, only
    int64/float64 feature columns are kept, and rows with any remaining
    missing value are dropped. A ``label`` column is then added holding, for
    each row, the name of the target column with the largest value.

    Args:
        df: Input frame.
        target_columns: Names of the target columns (any letter case).

    Returns:
        A ``(df_cleaned, X, y)`` tuple: the cleaned frame (features, present
        targets and ``label``), the feature frame, and the ``label`` series.
        ``(None, None, None)`` when no target column is present in ``df``.
    """
    requested_targets = [col.lower() for col in target_columns]
    present_targets = [col for col in requested_targets if col in df.columns]
    if not present_targets:
        print("Error: None of the target columns found in the dataset.")
        return None, None, None

    # Make partially missing targets visible instead of dropping them silently.
    missing_targets = [col for col in requested_targets if col not in df.columns]
    if missing_targets:
        print(f"Warning: target columns not found in the dataset: {missing_targets}")

    df_cleaned = df.dropna(subset=present_targets).copy()

    # Keep only the numeric (int64/float64) non-target columns as features.
    X_full = df_cleaned.drop(columns=present_targets)
    X_full = X_full.select_dtypes(include=['int64', 'float64'])

    # Restrict the frame to the selected features and targets, then drop any
    # row that still contains a missing value.
    df_cleaned = df_cleaned[X_full.columns.tolist() + present_targets]
    df_cleaned = df_cleaned.dropna()

    # The label is the name of the target column holding the row maximum.
    df_cleaned['label'] = df_cleaned[present_targets].idxmax(axis=1)

    # Features are everything except the targets and the derived label.
    X = df_cleaned.drop(columns=present_targets + ['label'])
    y = df_cleaned['label']

    return df_cleaned, X, y

def apply_smote_enn(X, y, k_neighbors=2, n_neighbors=3,
                    sampling_strategy='all', random_state=42):
    """Resample ``X``/``y`` with SMOTE over-sampling followed by ENN cleaning.

    The class counts of ``y`` are printed before and after resampling.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Class labels passed to ``fit_resample``.
        k_neighbors: ``k_neighbors`` used by the SMOTE step.
        n_neighbors: ``n_neighbors`` used by the ENN step.
        sampling_strategy: Over-sampling strategy applied by the SMOTE step;
            the default ``'all'`` is used for multi-class balancing.
        random_state: Seed applied to the SMOTE step for reproducible output.

    Returns:
        The ``(X_res, y_res)`` pair returned by ``SMOTEENN.fit_resample``.
    """
    print("Avant SMOTE-ENN :", Counter(y))

    # SMOTEENN forwards its own sampling_strategy / random_state only to the
    # SMOTE it creates by default. A SMOTE passed in explicitly is used as
    # configured, so both settings have to be set on that instance.
    smote = SMOTE(
        sampling_strategy=sampling_strategy,
        k_neighbors=k_neighbors,
        random_state=random_state,
    )
    smote_enn = SMOTEENN(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
        smote=smote,
        enn=EditedNearestNeighbours(n_neighbors=n_neighbors),
        n_jobs=1
    )
    X_res, y_res = smote_enn.fit_resample(X, y)
    print("Après SMOTE-ENN :", Counter(y_res))
    return X_res, y_res

def create_final_dataframe(X_res, y_res, target_columns, X_columns):
    """Build a frame of the resampled features plus one 0/1 column per target.

    Target names are lower-cased; each resulting column is 1 where ``y_res``
    equals that lower-cased name and 0 elsewhere.
    """
    balanced = pd.DataFrame(X_res, columns=X_columns)
    for target in target_columns:
        class_name = target.lower()
        is_class = y_res == class_name
        balanced[class_name] = is_class.astype(int)
    return balanced

def save_dataframe(df, output_path):
    """Write ``df`` to ``output_path`` as CSV (index omitted), then print a confirmation line."""
    df.to_csv(output_path, index=False)
    message = f"✅ Dataset multiclasse équilibré sauvegardé dans '{output_path}'"
    print(message)

def print_class_distribution(df, target_columns, stage=""):
    """Print, for every target, how often each value occurs in its column.

    Columns are looked up by the lower-cased target name, while the name is
    printed as given. ``stage`` is appended to the header line.
    """
    header = f"\nDistribution des 0 et 1 par classe {stage}:"
    print(header)
    for target in target_columns:
        value_frequencies = df[target.lower()].value_counts().to_dict()
        print(f"{target}: {value_frequencies}")

def check_zero_targets(df, target_columns):
    """Print the rows whose (lower-cased) target columns sum to 0, and how many there are."""
    lowered_targets = [name.lower() for name in target_columns]

    # Per-row total over the target columns; a total of 0 marks the row.
    row_totals = df[lowered_targets].sum(axis=1)
    zero_rows = df.loc[row_totals == 0]

    print("🔍 Lignes où toutes les cibles sont à 0 :")
    print(zero_rows)
    print(f"Nombre de lignes concernées : {len(zero_rows)}")

def plot_class_distribution(before_df, after_df, target_columns, file_name):
    """Plot the 0/1 counts of each target before and after SMOTE-ENN and save the figure.

    One subplot is drawn per target, with the "before" and "after" bars side
    by side. The figure is written next to ``file_name`` as
    ``<file_name without extension>_distribution.png`` and then closed.

    Args:
        before_df: Frame holding the (lower-cased) target columns before resampling.
        after_df: Frame holding the (lower-cased) target columns after resampling.
        target_columns: Target names; columns are looked up in lower case.
        file_name: Path used for the figure title and to derive the PNG path.
    """
    labels = ['0', '1']
    width = 0.35
    # Plain Python positions: this code has no numpy import, so the bar
    # offsets are computed without it.
    positions = list(range(len(labels)))
    before_x = [pos - width / 2 for pos in positions]
    after_x = [pos + width / 2 for pos in positions]

    # squeeze=False always returns a 2-D array of Axes, so a single target
    # needs no special case.
    fig, axes = plt.subplots(
        len(target_columns), 1,
        figsize=(10, len(target_columns) * 2),
        sharex=True,
        squeeze=False,
    )

    for ax, col in zip(axes[:, 0], target_columns):
        col_lower = col.lower()
        before_vals = before_df[col_lower].value_counts().to_dict()
        after_vals = after_df[col_lower].value_counts().to_dict()

        before_data = [before_vals.get(0, 0), before_vals.get(1, 0)]
        after_data = [after_vals.get(0, 0), after_vals.get(1, 0)]

        ax.bar(before_x, before_data, width, label="Avant SMOTE-ENN")
        ax.bar(after_x, after_data, width, label="Après SMOTE-ENN")

        ax.set_title(f'Distribution des 0 et 1 pour {col}', fontsize=12)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels)
        ax.legend(fontsize=8)
        ax.set_ylabel('Nombre de Lignes')

    fig.suptitle(f"Distribution des classes pour {os.path.basename(file_name)}", y=1.02)
    fig.tight_layout()
    plot_output_path = f"{os.path.splitext(file_name)[0]}_distribution.png"
    fig.savefig(plot_output_path, bbox_inches='tight')
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"✅ Plot saved to {plot_output_path}")

def process_single_dataset(file_info):
    """Run the whole SMOTE-ENN pipeline for one dataset description.

    Args:
        file_info: Mapping with the keys ``'input_path'`` (CSV to read),
            ``'output_path'`` (CSV to write) and ``'labels'`` (target
            column names).

    Returns:
        None. Processing stops early (after printing a message) when the
        file cannot be loaded, no target column is present, or SMOTE-ENN
        raises ``ValueError``.
    """
    input_path = file_info['input_path']
    output_path = file_info['output_path']
    target_columns = file_info['labels']

    print(f"\n{'='*50}\nProcessing {os.path.basename(input_path)}\n{'='*50}")

    df = load_data(input_path)
    if df is None:
        return

    # Check missing values
    check_missing_values(df)

    # Preprocess data
    df_cleaned, X, y = preprocess_data(df, target_columns)
    if X is None:
        print("Skipping due to missing key columns.")
        return

    # preprocess_data keeps only the targets that exist in the dataset, so
    # anything that reads df_cleaned must be limited to those; indexing it
    # with an absent label would raise KeyError.
    present_targets = [col for col in target_columns if col.lower() in df_cleaned.columns]

    # Apply SMOTE-ENN
    try:
        X_res, y_res = apply_smote_enn(X, y)
    except ValueError as e:
        print(f"Error applying SMOTE-ENN: {e}. This may happen if a class has too few samples.")
        return

    # Create final DataFrame (one column per requested label)
    final_df = create_final_dataframe(X_res, y_res, target_columns, X.columns)

    # Save final DataFrame
    save_dataframe(final_df, output_path)

    # Print class distributions
    print_class_distribution(df_cleaned, present_targets, "AVANT SMOTE-ENN")
    print_class_distribution(final_df, target_columns, "APRÈS SMOTE-ENN")

    # Check for rows with all zero targets
    check_zero_targets(final_df, target_columns)

    # Plot class distributions for the targets available on both sides
    plot_class_distribution(df_cleaned, final_df, present_targets, output_path)

    # Print dataset shapes
    print("✅ AVANT SMOTE-ENN :")
    print("Nombre de lignes :", df_cleaned.shape[0])
    print("Nombre de colonnes :", df_cleaned.shape[1])
    print("\n✅ APRÈS SMOTE-ENN :")
    print("Nombre de lignes :", final_df.shape[0])
    print("Nombre de colonnes :", final_df.shape[1])


if __name__ == "__main__":
    # ⚠️ IMPORTANT: update these entries with the correct paths and labels for
    # your datasets. Each entry is (input CSV, output CSV, target labels).
    dataset_specs = [
        (
            r'C:\Users\lamia\Desktop\datasets\Dataset1_shift_selected22.csv',
            r'C:\Users\lamia\Desktop\datasets\Dataset1_balanced.csv',
            ['OK', 'oestrus', 'calving', 'mastitis', 'lameness', 'other_disease'],
        ),
        (
            r'C:\Users\lamia\Desktop\datasets\Dataset2_shift_selected22.csv',
            r'C:\Users\lamia\Desktop\datasets\Dataset2_balanced.csv',
            ['OK', 'oestrus', 'mastitis', 'lameness', 'other_disease'],
        ),
        (
            r'C:\Users\lamia\Desktop\datasets\Dataset3_shift_selected22.csv',
            r'C:\Users\lamia\Desktop\datasets\Dataset3_balanced.csv',
            ['OK', 'oestrus'],
        ),
        (
            r'C:\Users\lamia\Desktop\datasets\Dataset4_shift_selected22.csv',
            r'C:\Users\lamia\Desktop\datasets\Dataset4_balanced.csv',
            ['OK', 'oestrus', 'calving', 'mastitis', 'lameness', 'other_disease'],
        ),
    ]

    # Turn each spec into the mapping that process_single_dataset reads.
    datasets_to_process = [
        {'input_path': source, 'output_path': destination, 'labels': labels}
        for source, destination, labels in dataset_specs
    ]

    for dataset in datasets_to_process:
        process_single_dataset(dataset)

ModuleNotFoundError: No module named 'pandas'