In [None]:

# openpyxl-3.1.5
# nbformat>=4.2.0
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import re
from typing import Tuple

from sklearn.feature_selection import mutual_info_classif

from A_data_cleaner_class import DataCleaner

# **Load data**

In [None]:
# Load data
# keep_default_na=False switches off pandas' built-in list of NA markers, so
# only the tokens listed in na_values below are read as missing.
# NOTE(review): the path is an absolute location on a mapped Z: drive; the
# notebook only runs where that drive is mounted.
df = pd.read_excel(
    r"Z:\2025_ml_marie\TD0002697 - Liste des consommateurs électrique - ALLDIV no null.xlsx",
    keep_default_na=False,
    na_values=['vides', '[E]', 'empty', '']
    )
print(f"Loaded dataset: {df.shape[0]} rows × {df.shape[1]} columns")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

# **Initialise the data cleaner**

In [None]:

# Wrap the raw frame in the project's DataCleaner; every later step goes
# through this `cleaner` object.
cleaner = DataCleaner(df)
cleaner.get_summary()

# `target` is used below as a column label (cleaner.df[target]).
# `numerics` is presumably the list of columns to convert to numeric in
# Step 8 — confirm in A_data_cleaner_class.
target = cleaner.target
numerics = cleaner.numeric_cols

In [None]:
def analyse_and_print_columns_compact(df):
    """Print a compact per-column profile of a DataFrame.

    For every column, in column order, prints the column label underlined
    with dashes, its dtype, the number of null values, the number of distinct
    non-null values and the (up to) five most frequent values with counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    None
        The report is written to stdout.
    """
    for column in df.columns:
        series = df[column]
        # Column labels are not guaranteed to be strings (Excel headers can be
        # numbers or dates), so size the underline from the rendered label
        # instead of calling len() on the raw label.
        label = str(column)
        print(f"\n{label}")
        print("-" * len(label))
        print(f"Type: {series.dtype}")
        print(f"Nulls: {series.isnull().sum()}")
        print(f"Unique: {series.nunique()}")
        print("Top values:")
        print(series.value_counts().head().to_string())

# Example usage: profile every column of the cleaner's current frame.
analyse_and_print_columns_compact(cleaner.df)

# **Step 1: Cleaning up column names**
Remove unprintable characters from column names and strip them

In [None]:
# DataCleaner helper; presumably reports the column names that contain
# unprintable characters — confirm in A_data_cleaner_class.
cleaner.find_unprintable_columns()

print("Cleaning column names...")
cleaner.standardise_column_names()
# NOTE(review): this message is printed unconditionally; nothing in this cell
# checks the outcome of standardise_column_names().
print("✓ All column names are clean!")

# **Step 2: Drop columns that are not of interest**

In [None]:
# Report the column count before and after DataCleaner.drop_columns().
# It is called without arguments, so the set of dropped columns is decided
# inside the class.
print(f"Initial column count: {len(cleaner.df.columns)}")
cleaner.drop_columns()
print(f"Final column count: {len(cleaner.df.columns)}")

# **Step 3: Clean target - Allocated division**

In [None]:
# Show the target's value distribution (missing values included) before and
# after DataCleaner.clean_target(), so the effect of the cleaning is visible.
print(f"Target column before cleaning:")
print(cleaner.df[target].value_counts(dropna=False))

# NOTE(review): the meaning of only_later=False is defined in DataCleaner — confirm.
cleaner.clean_target(only_later=False)

print(f"\nTarget column after cleaning:")
print(cleaner.df[target].value_counts(dropna=False))

# **Step 4: Trim whitespace**
Remove leading/trailing whitespace from string columns.

In [None]:
# Ask the cleaner for a report of values with whitespace issues; trimming is
# only run when that report is non-empty.
whitespace_df = cleaner.find_whitespace_in_values()

if len(whitespace_df) > 0:
    print("Columns with whitespace issues:")
    # `display` is the rich-output helper IPython injects into notebook kernels.
    display(whitespace_df)
    
    # Trim whitespace from all string columns
    cleaner.trim_whitespace()
else:
    print("✓ No whitespace issues found!")

# **Step 5: Standardise NA values**
Transform NA variations into NA. Examples:
- `N/A`
- `not available`

In [None]:
# Run the cleaner's NA standardisation step. The return value is kept for
# inspection; what it contains is defined in DataCleaner.NA_values().
no_different_na = cleaner.NA_values()

# **Step 6: Standardise case**
Fix case-insensitive duplicates (e.g., 'LATER' vs 'later')

In [None]:
# Report of columns whose values differ only by letter case. The report has a
# 'column' field, which is used below to pick the columns to fix.
case_dups_df = cleaner.find_case_insensitive_duplicates()

if len(case_dups_df) > 0:
    print("Columns with case-insensitive duplicates:")
    display(case_dups_df)
    
    # Standardize case for affected columns
    columns_to_standardise = case_dups_df['column'].tolist()
    cleaner.standardise_case(columns_to_standardise)
else:
    print("✓ No case-insensitive duplicates found!")

# **Step 7: Find and fix fuzzy duplicates**
Identify potential typos and similar strings

In [None]:
# Look for near-identical values per column. `threshold` and `min_length` are
# passed straight to DataCleaner (presumably a similarity score cut-off and a
# minimum string length — confirm in the class).
fuzzy_issues = cleaner.find_fuzzy_duplicates(threshold=85, min_length=3)

# Each issue is a dict with the column name under 'column' and a list of
# groups of similar values under 'fuzzy_groups'. This cell only reports them;
# nothing is changed here.
if fuzzy_issues:
    print(f"Found fuzzy duplicates in {len(fuzzy_issues)} columns:\n")
    for issue in fuzzy_issues:
        print(f"Column: {issue['column']}")
        for i, group in enumerate(issue['fuzzy_groups'], 1):
            print(f"  Group {i}: {group}")
        print()
else:
    print("✓ No fuzzy duplicates found!")

In [None]:
# Usage example: apply a manual mapping to one column.
# Column: ELECTRICAL LOAD TYPE
# The mapping presumably goes from the variant spelling (key) to the value
# that should replace it (value) — confirm in standardise_fuzzy_values().
load_type_mapping = {
    'ON OFF VALVE':'ON-OFF VALVE'
}
cleaner.standardise_fuzzy_values('ELECTRICAL LOAD TYPE', load_type_mapping)

# **Step 8: Type conversion**
Convert columns to appropriate data types (numeric, boolean)

In [None]:
print("Current data types:")
# NOTE(review): get_summary() is called as a bare statement elsewhere in this
# notebook; if it prints its own report and returns None, this line will also
# print "None" — confirm.
print(cleaner.get_summary())

# `numerics` was read from cleaner.numeric_cols when the cleaner was created;
# the conversion is skipped when it is empty.
if numerics:
    print("\nConverting to numeric:")
    cleaner.convert_to_numeric()

cleaner.get_summary()

# **Review Data Quality After Cleaning**

In [None]:
# Fully duplicated rows (possible duplicates come from dropping CL LINE)
dup_count = cleaner.df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")

# How many columns there are of each dtype
print("\nData type distribution:")
print(cleaner.df.dtypes.value_counts())

# Null count per column, restricted to columns that have at least one null,
# largest first
missing = (
    cleaner.df.isnull().sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print(f"\nColumns with missing values: {len(missing)}")
if not missing.empty:
    print(missing)

# **Step 9: Handle Unexpected and Missing Values**

### **9.4 Clean unexpected values**

In [None]:
# Missing-value summary before the unexpected-value clean-up; the result is
# kept in `summ` for inspection.
summ = cleaner.get_missing_summary()

In [None]:
# Find null-like values that aren't actually null (?, empty, etc.).
# The result is only stored here; this cell does not replace anything itself.
null_like = cleaner.find_null_like_values()

### **9.5 Swap NaN with `LATER`**

In [None]:
# Missing-value summary taken just before NaN values are replaced below.
# NOTE(review): `final_missing` is reassigned again in the last review cell.
final_missing = cleaner.get_missing_summary()

In [None]:
# Get non-numeric columns
# NOTE(review): this cell goes through `cleaner.current_df` while the other
# cells use `cleaner.df` — confirm both refer to the same frame.
non_numeric_cols = cleaner.current_df.select_dtypes(exclude=['number']).columns

# Apply transformation: missing values in the non-numeric columns become the
# string "LATER"; numeric columns are left untouched.
# fillna(np.nan) presumably normalises None/pd.NA to NaN before replace() —
# TODO confirm it is needed.
cleaner.current_df[non_numeric_cols] = cleaner.current_df[non_numeric_cols].fillna(np.nan).replace(np.nan, "LATER")

In [None]:
# Remaining null count per column, shown as the cell output.
cleaner.current_df.isna().sum()

In [None]:
# Cleaner summary after the NaN replacement above.
cleaner.get_summary()

### **9.6 Clean textual columns**
Transform the following values into "LATER": '-', 'empty', 'à'

In [None]:
# Run the cleaner's textual-column clean-up and keep its return value.
# NOTE(review): the meaning of only_later=False is defined in DataCleaner — confirm.
clean_text = cleaner.textual_columns(only_later=False)

# **Review data quality**

In [None]:
# Check for duplicates
dup_count = cleaner.df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")  ## possible duplicates come from dropping CL LINE

cleaner.get_summary()

# Missing-value summary after all cleaning steps (overwrites the earlier value).
final_missing = cleaner.get_missing_summary()

# **Prepare dataset for mutual information**

In [None]:
def plot_categorical_distribution(df, column_name, title=None, color='#636EFA',
                                  top_n=None, sort_by='count', save_path=None):
    """
    Generate a bar chart showing the distribution of a categorical column using Plotly.

    Missing values are not counted (``value_counts`` leaves them out).

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the data
    column_name : str
        The name of the categorical column to visualize
    title : str, optional
        Title for the chart (defaults to 'Distribution of {column_name}')
    color : str, optional
        Color for the bars
    top_n : int, optional
        Display only the top N categories (by count); the remaining categories
        are summed into a single 'Other' bar when their total is positive
    sort_by : str, optional
        How to sort categories: 'count' (descending), 'alphabetical', or 'none'.
        Any other value behaves like 'none'.
    save_path : str, optional
        Path to save the figure (if None, nothing is written). The file is
        written with ``fig.write_image``.

    Returns:
    --------
    fig : plotly.graph_objects.Figure
        The plotly figure object. The figure is returned, not shown.
    """
    # Set default title if not provided
    if title is None:
        title = f'Distribution of {column_name}'

    # Categories ordered by descending frequency
    value_counts = df[column_name].value_counts()

    if top_n is not None and len(value_counts) > top_n:
        # Split strictly by position. Plain `value_counts[top_n:]` is a
        # label-based slice when the index is float-valued, which gives a
        # wrong 'Other' total (or a KeyError) for float columns.
        n_keep = max(top_n, 0)
        top_values = value_counts.iloc[:n_keep]
        other_count = value_counts.iloc[n_keep:].sum()
    else:
        top_values = value_counts
        other_count = 0

    if other_count > 0:
        data = pd.DataFrame({
            'Category': list(top_values.index) + ['Other'],
            'Count': list(top_values.values) + [other_count]
        })
    else:
        data = pd.DataFrame({
            'Category': top_values.index,
            'Count': top_values.values
        })

    # Sort the data ('none' keeps the value_counts order)
    if sort_by == 'count':
        data = data.sort_values('Count', ascending=False)
    elif sort_by == 'alphabetical':
        try:
            data = data.sort_values('Category')
        except TypeError:
            # Mixed label types (e.g. numeric categories plus the 'Other'
            # string) cannot be compared directly; order them by their text.
            data = data.sort_values('Category', key=lambda labels: labels.astype(str))

    # Create the figure
    fig = px.bar(
        data, x='Category', y='Count',
        title=title,
        color_discrete_sequence=[color]
    )

    # Improve layout
    fig.update_layout(
        xaxis_title=column_name,
        yaxis_title='Count',
        bargap=0.2,
        template='plotly_white'
    )

    # Add value labels on top of bars
    fig.update_traces(texttemplate='%{y}', textposition='outside')

    # Rotate x-axis labels if there are many categories
    if len(data) > 5:
        fig.update_layout(xaxis_tickangle=-45)

    # Save if path is provided
    if save_path:
        fig.write_image(save_path)
        print(f"Figure saved to {save_path}")

    return fig

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

def _encode_feature_for_mi(series):
    """Encode one feature column as a numeric, NaN-free series for MI estimation.

    Returns a tuple ``(encoded, is_discrete)``.
    """
    dtype = series.dtype
    is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not is_number:
        # Categorical path: missing values become their own '_MISSING_' level.
        # Go through object dtype first, because fillna() on a Categorical
        # raises when the placeholder is not already one of its categories.
        labels = series.astype(object).where(series.notna(), '_MISSING_')
        return labels.astype('category').cat.codes, True

    # Numeric path: -999 is the sentinel for missing values.
    filled = series.fillna(-999)
    if pd.api.types.is_integer_dtype(filled.dtype):
        # Plain numpy dtype so nullable Int64 etc. convert cleanly downstream.
        return filled.astype('int64'), True
    filled = filled.astype('float64')
    # Floats with few distinct values are treated as discrete.
    return filled, filled.nunique() < 20


def analyse_mutual_information(df, target, top_n=20, show_plot=True, random_state=42):
    """
    Analyse mutual information between features and target variable.

    The 'DESCRIPTION' and 'ADDITIONAL REQUIREMENTS' columns are dropped when
    present. Non-numeric and boolean features are converted to category codes
    (missing values become a '_MISSING_' level); numeric features have missing
    values replaced by -999. Integer features, and float features with fewer
    than 20 distinct values, are passed to scikit-learn as discrete.

    Parameters:
    -----------
    df : DataFrame
        DataFrame containing the data (not modified)
    target : str
        Name of the target column
    top_n : int, default=20
        Number of top features to return
    show_plot : bool, default=True
        Whether to display the plotly visualization
    random_state : int, default=42
        Seed forwarded to ``mutual_info_classif``

    Returns:
    --------
    tuple : (X_final, mi_df, top_features)
        - X_final: DataFrame with top features and target (original, un-encoded values)
        - mi_df: DataFrame with all MI scores and metadata, sorted by descending score
        - top_features: List of top N feature names
    """
    df_clean = df.copy()
    df_clean = df_clean.drop(columns=['DESCRIPTION', 'ADDITIONAL REQUIREMENTS'], errors='ignore')

    y = df_clean[target]
    X = df_clean.drop(columns=[target])

    # Prepare data and track which features are discrete/categorical
    X_prepared = X.copy()
    discrete_mask = []

    print("Preparing columns for mutual information...")
    for col in X_prepared.columns:
        encoded, is_discrete = _encode_feature_for_mi(X_prepared[col])
        X_prepared[col] = encoded
        discrete_mask.append(is_discrete)

    print(f"Discrete features: {sum(discrete_mask)}/{len(discrete_mask)}")

    # Encode target if it is not numeric
    y_encoded = y.copy()
    if not pd.api.types.is_numeric_dtype(y.dtype):
        y_categorical = y.astype('category')
        y_encoded = y_categorical.cat.codes
        # Get category mapping (code -> original label)
        cat_mapping = dict(enumerate(y_categorical.cat.categories))
        print(f"\nTarget classes: {cat_mapping}")

    # Calculate mutual information with discrete_features parameter
    print("\nCalculating mutual information...")
    mi_scores = mutual_info_classif(
        X_prepared,
        y_encoded,
        discrete_features=discrete_mask,
        random_state=random_state
    )

    # Create results DataFrame; 'dtype' reports the dtype before encoding
    mi_df = pd.DataFrame({
        'feature': X_prepared.columns,
        'mutual_information': mi_scores,
        'dtype': [str(X[col].dtype) for col in X.columns],
        'is_discrete': discrete_mask
    }).sort_values('mutual_information', ascending=False)

    print("\n" + "="*60)
    print("MUTUAL INFORMATION SCORES")
    print("="*60)
    print(mi_df.to_string(index=False))
    print("="*60)

    # Interactive Plotly visualization
    if show_plot:
        fig = px.bar(
            mi_df,
            x='mutual_information',
            y='feature',
            orientation='h',
            title=f'Feature Importance: Mutual Information with {target}',
            labels={'mutual_information': 'Mutual Information Score', 'feature': 'Feature'},
            hover_data=['dtype', 'is_discrete'],
            color='mutual_information',
            color_continuous_scale='Viridis',
            height=max(400, len(mi_df) * 25)
        )

        fig.update_layout(
            yaxis={'categoryorder': 'total ascending'},
            showlegend=False,
            hovermode='closest'
        )

        fig.show()

    # Select top N features (mi_df is already sorted by descending score)
    top_rows = mi_df.head(top_n)
    top_features = top_rows['feature'].tolist()

    print(f"\nTop {top_n} features:")
    # Walk the top rows directly instead of re-filtering mi_df per feature.
    for i, row in enumerate(top_rows.itertuples(index=False), 1):
        feature_type = 'discrete' if row.is_discrete else 'continuous'
        print(f"  {i}. {row.feature} ({row.dtype}, {feature_type}): {row.mutual_information:.4f}")

    # Create final dataset with top features
    X_final = df_clean[top_features + [target]]
    print(f"\nFinal dataset shape: {X_final.shape}")

    return X_final, mi_df, top_features

# Bar chart of value counts for every column of the cleaned frame.
for col in cleaner.df.columns:
    fig = plot_categorical_distribution(cleaner.df, col)
    fig.show()
    
# One-line call with defaults (top_n=20, show_plot=True); keeps the reduced
# frame, the score table and the list of top feature names.
X_final, mi_df, top_features = analyse_mutual_information(cleaner.df, target)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

def create_correlation_heatmap(df, title='Full Correlation Matrix (Numerical & Encoded Categorical Variables)',
                               width=1000, height=900, show_plot=True):
    """
    Create a correlation heatmap for a dataframe with both numerical and categorical columns.

    Every column that is not numeric or boolean is ordinal-encoded (missing
    values become a 'missing' level) before ``DataFrame.corr()`` is computed.
    The codes follow the sorted order of the labels, so correlations involving
    encoded columns depend on that ordering.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe to analyse (not modified)
    title : str, optional
        Title for the heatmap (default: 'Full Correlation Matrix...')
    width : int, optional
        Width of the figure in pixels (default: 1000)
    height : int, optional
        Height of the figure in pixels (default: 900)
    show_plot : bool, optional
        Whether to display the plot (default: True)

    Returns:
    --------
    fig : plotly.graph_objects.Figure
        The correlation heatmap figure
    correlation_all : pandas.DataFrame
        The full correlation matrix (labelled with the full column names)
    """
    # Work on a copy so the caller's frame keeps its original values
    df_encoded = df.copy()

    # Numeric columns are counted for the report only. Everything that is not
    # numeric/boolean must be encoded, including pandas 'string' columns,
    # otherwise corr() fails on non-numeric data in pandas >= 2.
    numerical_cols = df_encoded.select_dtypes(include=['number']).columns
    categorical_cols = [
        col for col in df_encoded.columns
        if not pd.api.types.is_numeric_dtype(df_encoded[col].dtype)
    ]

    print(f"Encoding {len(categorical_cols)} categorical columns")

    # Encode categorical columns
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    for col in categorical_cols:
        values = df_encoded[col]
        # Go through object dtype: fillna() on a Categorical rejects a
        # placeholder that is not already a category.
        labels = values.astype(object).where(values.notna(), 'missing')
        # OrdinalEncoder refuses columns mixing strings and numbers, so
        # encode the text form of every label.
        labels = labels.astype(str).to_numpy(dtype=object)
        df_encoded[col] = encoder.fit_transform(labels.reshape(-1, 1)).ravel()

    # Calculate correlation matrix
    correlation_all = df_encoded.corr()

    # Create shortened labels for better visualization
    def create_short_label(text):
        words = str(text).split()
        if len(words) <= 3:  # Keep names with up to 3 words unchanged
            return text
        return f"{words[0]} {words[1]} {words[2]} ... {words[-1]}"

    short_labels = {col: create_short_label(col) for col in correlation_all.columns}
    correlation_short = correlation_all.rename(columns=short_labels, index=short_labels)

    # Create heatmap. px.imshow binds the trace to a shared coloraxis, so the
    # diverging scale and its midpoint have to be set here; trace-level
    # colorscale/zmid are ignored in that case.
    fig = px.imshow(
        correlation_short,
        aspect='auto',
        title=title,
        color_continuous_scale='RdBu_r',
        color_continuous_midpoint=0
    )

    # Per-cell [row name, column name] pairs with the full names for the hover
    row_names = list(correlation_all.index)
    column_names = list(correlation_all.columns)
    custom_data = [[[row, col] for col in column_names] for row in row_names]

    # Add text and hover information
    fig.update_traces(
        text=correlation_all.round(2).values,
        texttemplate='%{text}',
        textfont=dict(size=10, color='black'),
        customdata=custom_data,
        hovertemplate='Row: %{customdata[0]}<br>Column: %{customdata[1]}<br>Correlation: %{z:.2f}'
    )

    # Update layout
    fig.update_layout(
        width=width,
        height=height,
        margin=dict(l=50, r=50, t=100, b=50)
    )

    # Print dataframe composition
    print(f"\nDataframe composition:")
    print(f"- Total columns: {len(df.columns)}")
    print(f"- Numerical columns: {len(numerical_cols)}")
    print(f"- Categorical columns: {len(categorical_cols)}")

    if show_plot:
        fig.show()

    return fig, correlation_all


# Usage example: heatmap of the cleaned frame with default title and size;
# keeps both the figure and the correlation matrix.
fig, corr_matrix = create_correlation_heatmap(cleaner.df)

In [None]:
# Extract codes with only_ecs=True; per the message printed below, the
# ECS CODE column is extracted and dropped from cleaner.df.
ecs_extracted = cleaner.extract_codes(only_ecs=True)
print(f"Columns after extracting and dropping ECS CODE: {cleaner.df.columns}")


# Re-run mutual information and the correlation heatmap on the updated frame
# (overwrites the results of the earlier runs).
X_final, mi_df, top_features = analyse_mutual_information(cleaner.df, target)
fig, corr_matrix = create_correlation_heatmap(cleaner.df)


In [None]:
# Same extraction with only_ecs=False; per the message printed below, all
# code columns are extracted and dropped.
# NOTE(review): this runs after the only_ecs=True cell has already changed
# cleaner.df — confirm extract_codes() can be called twice.
all_codes_extracted = cleaner.extract_codes(only_ecs=False)
print(f"Columns after extracting and dropping all codes: {cleaner.df.columns}")

# Re-run both analyses on the frame with all codes extracted.
X_final, mi_df, top_features = analyse_mutual_information(cleaner.df, target)
fig, corr_matrix = create_correlation_heatmap(cleaner.df)

In [None]:
# Class balance of the target column after all cleaning steps.
fig = plot_categorical_distribution(cleaner.df, cleaner.target)
fig.show()

In [None]:
# Value-count bar chart for every column of the final frame. This is the same
# loop as the one run before the code extraction.
for col in cleaner.df.columns:
    fig = plot_categorical_distribution(cleaner.df, col)
    fig.show()