In [None]:
! pip install mlxtend --upgrade

In [None]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so printed/displayed DataFrames are not truncated.
pd.set_option("display.max_columns", None)
# NOTE(review): with max_rows=None, displaying a large frame emits every row.
pd.set_option("display.max_rows", None)


In [None]:
# Load the dataset from a path relative to the notebook's working directory.
data = pd.read_csv("data/translated_new_binned_DropNaN.csv")
# Quick sanity check: list the available columns, then preview the first rows.
print(f"data.columns = {list(data.columns)}")
print()
print(data.head())

In [None]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
from scipy import sparse

# Data preprocessing (same as before)
def categorize_age(age):
    """Map a numeric age onto a coarse age-group label.

    Returns "Young" for ages below 30, "Middle-aged" for ages in [30, 60)
    and "Senior" for anything else (a NaN also lands here because both
    comparisons evaluate to False).
    """
    if age < 30:
        return "Young"
    if age < 60:
        return "Middle-aged"
    return "Senior"

def categorize_bmi(bmi):
    """Map a numeric BMI onto a weight-class label.

    Cut points: below 18.5 -> "Underweight", [18.5, 25) -> "Normal",
    [25, 30) -> "Overweight", everything else -> "Obese" (a NaN also lands
    here because every comparison evaluates to False).
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal"
    if bmi < 30:
        return "Overweight"
    return "Obese"

def categorize_bsa(bsa):
    """Map a numeric BSA value onto a size label.

    Cut points: below 1.5 -> "Small", [1.5, 1.8) -> "Medium", everything
    else -> "Large" (a NaN also lands here because both comparisons
    evaluate to False).
    """
    if bsa < 1.5:
        return "Small"
    if bsa < 1.8:
        return "Medium"
    return "Large"

# Derive the categorical versions of the continuous patient attributes
data["Age_Category"] = data["Age"].apply(categorize_age)
data["BMI_Category"] = data["BMI"].apply(categorize_bmi)
data["BSA_Category"] = data["BSA"].apply(categorize_bsa)

# Columns that are turned into one-hot "items" for the Apriori algorithm
columns_to_encode = [
    "Age_Category",
    "Sex",
    "Smoke",
    "BMI_Category",
    "BSA_Category",
    "occupation_category",
    # "A_20th_quantile_binned",
    # "A_25th_quantile_binned",
    # "A_50th_quantile_binned",
    # "A_75th_quantile_binned",
    # "A_max_binned",
    # "A_mean_binned",
    # "A_median_binned",
    # "A_min_binned",
    # "A_std_binned",
    "M_20th_quantile_binned",
    "M_25th_quantile_binned",
    "M_50th_quantile_binned",
    "M_75th_quantile_binned",
    "M_max_binned",
    "M_mean_binned",
    "M_median_binned",
    "M_min_binned",
    "M_std_binned",
]

# Pass `columns=` explicitly: without it get_dummies only encodes object /
# string / category columns, so a numeric or boolean column in the list would
# be passed through un-encoded and the prefix list would no longer line up
# with the columns being encoded.
encoded_data = pd.get_dummies(
    data[columns_to_encode],
    columns=columns_to_encode,
    prefix=columns_to_encode,
)

# Apriori expects boolean indicators; older pandas versions emit 0/1 integers
encoded_data = encoded_data.astype(bool)

In [None]:
def run_apriori(data, min_support, min_confidence, min_lift=1):
    """Mine association rules from a one-hot encoded DataFrame.

    Parameters:
    data: boolean (one-hot) DataFrame handed to mlxtend's ``apriori``
    min_support: minimum support for an itemset to count as frequent
    min_confidence: minimum confidence for a rule to be generated
    min_lift: rules with a lift below this value are dropped

    Returns:
    DataFrame of rules sorted by descending lift. When no itemset reaches
    ``min_support`` an empty DataFrame with the core rule columns is
    returned instead of raising, so threshold sweeps can keep going.
    """
    frequent_itemsets = apriori(
        data, min_support=min_support, use_colnames=True, low_memory=True
    )

    if frequent_itemsets.empty:
        # association_rules() rejects an empty itemset frame with a
        # ValueError; report "no rules" through an empty, correctly typed
        # frame so callers can still index/sort the usual columns.
        empty_columns = {
            name: pd.Series(dtype=object)
            for name in ("antecedents", "consequents")
        }
        empty_columns.update(
            {
                name: pd.Series(dtype=float)
                for name in (
                    "antecedent support",
                    "consequent support",
                    "support",
                    "confidence",
                    "lift",
                    "leverage",
                    "conviction",
                )
            }
        )
        return pd.DataFrame(empty_columns)

    rules = association_rules(
        frequent_itemsets, metric="confidence", min_threshold=min_confidence
    )
    rules = rules[rules["lift"] >= min_lift]
    return rules.sort_values("lift", ascending=False)


def experiment_with_thresholds(data, support_thresholds, confidence_thresholds):
    """Run run_apriori for every (support, confidence) combination.

    Returns a list with one dict per combination, in support-major order,
    holding the two thresholds, the number of rules found ("num_rules") and
    the first five rules ("top_rules"; an empty DataFrame when none exist).
    """

    def summarize(support, confidence):
        # One grid point: mine the rules and keep only the headline numbers.
        rules = run_apriori(data, support, confidence)
        top_rules = pd.DataFrame() if len(rules) == 0 else rules.head(5)
        return {
            "min_support": support,
            "min_confidence": confidence,
            "num_rules": len(rules),
            "top_rules": top_rules,
        }

    return [
        summarize(support, confidence)
        for support in support_thresholds
        for confidence in confidence_thresholds
    ]


def run_apriori_in_batches(
    data, support_thresholds, confidence_thresholds, batch_size=1000
):
    """Run the threshold experiment separately on consecutive row batches.

    Parameters:
    data: one-hot encoded DataFrame; sliced positionally into batches
    support_thresholds: support values forwarded to experiment_with_thresholds
    confidence_thresholds: confidence values forwarded likewise
    batch_size: number of rows per batch (the last batch may be smaller)

    Returns:
    Flat list of the result dicts from every batch. Each dict additionally
    carries a 1-based "batch" key so entries that share the same thresholds
    can be told apart; every batch is mined on its own rows only.
    """
    results = []
    batch_starts = range(0, len(data), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = data.iloc[start : start + batch_size]
        batch_results = experiment_with_thresholds(
            batch, support_thresholds, confidence_thresholds
        )
        # Tag each entry with its origin; without this the combined list
        # contains indistinguishable duplicates of every threshold pair.
        for entry in batch_results:
            entry["batch"] = batch_number
        results.extend(batch_results)
        print(f"Processed batch {batch_number}")
    return results

In [None]:
# Grid of minimum-support and minimum-confidence values to sweep over
support_thresholds = [0.05, 0.1, 0.15, 0.2, 0.25]
confidence_thresholds = [0.5, 0.7, 0.75, 0.9]

# # Alternative (disabled): run the sweep once over the whole dataset
# experiment_results = experiment_with_thresholds(
#     encoded_data, support_thresholds, confidence_thresholds
# )

# Run the sweep batch by batch. Each batch is mined independently, so the
# result list holds one entry per (batch, support, confidence) combination.
batch_size = 5000  # Adjust this value based on your available memory
experiment_results = run_apriori_in_batches(
    encoded_data, support_thresholds, confidence_thresholds, batch_size
)

experiment_results_df = pd.DataFrame(experiment_results)

# Optionally, save the DataFrame to a CSV file
# NOTE(review): the "top_rules" column holds nested DataFrames; presumably they
# end up in the CSV as their text representation - confirm this is intended.
experiment_results_df.to_csv("apriori_results.csv", index=False)

In [None]:
# Report every experiment and collect its headline numbers for the summary
summary_data = []
for result in experiment_results:
    print(
        f"\nMin Support: {result['min_support']}, Min Confidence: {result['min_confidence']}"
    )
    print(f"Number of rules generated: {result['num_rules']}")

    if result["top_rules"].empty:
        print("No rules generated with these thresholds.")
    else:
        print("Top 5 rules:")
        print(
            result["top_rules"][
                ["antecedents", "consequents", "support", "confidence", "lift"]
            ]
        )

    # Keep only the scalar fields; the rule tables themselves stay out of the summary
    summary_data.append(
        {key: result[key] for key in ("min_support", "min_confidence", "num_rules")}
    )

summary_df = pd.DataFrame(summary_data)

In [None]:
# Visualize number of rules for different thresholds
plt.figure(figsize=(10, 6))
for confidence in confidence_thresholds:
    # Deliberately NOT named `data`: that name is the patient DataFrame loaded
    # at the top of the notebook, and overwriting it here would break the
    # preprocessing cells when they are re-run.
    confidence_rows = summary_df[summary_df["min_confidence"] == confidence]
    plt.plot(
        confidence_rows["min_support"],
        confidence_rows["num_rules"],
        marker="o",
        label=f"Confidence = {confidence}",
    )

plt.xlabel("Minimum Support")
plt.ylabel("Number of Rules")
plt.title("Number of Rules vs. Minimum Support for Different Confidence Levels")
plt.legend()
plt.show()

# Print the summary DataFrame
print(summary_df)

# Optionally, save the DataFrame to a CSV file
summary_df.to_csv("apriori_results_summary.csv", index=False)

print("Summary DataFrame has been created and saved to 'apriori_results_summary.csv'")

In [None]:
! pip install seaborn --upgrade

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

def analyze_rules(rules_df, min_lift=1, min_confidence=0.5, min_support=0.05):
    """
    Analyze and find the best rules based on multiple metrics.

    Parameters:
    rules_df: DataFrame containing association rules; must provide the
        'lift', 'confidence' and 'support' columns, plus 'antecedent support'
        and 'consequent support' when 'conviction' / 'leverage' are missing
    min_lift: Minimum lift threshold
    min_confidence: Minimum confidence threshold
    min_support: Minimum support threshold

    Returns:
    dict with 'filtered_rules' (all rules passing the thresholds, with
    'conviction' and 'leverage' columns guaranteed to exist) and the top-10
    views 'best_by_lift', 'best_by_confidence' and 'best_by_support'.
    The input DataFrame is never modified.
    """
    passes_thresholds = (
        (rules_df['lift'] >= min_lift)
        & (rules_df['confidence'] >= min_confidence)
        & (rules_df['support'] >= min_support)
    )
    # Work on an explicit copy: columns are added below, and assigning to a
    # filtered slice triggers SettingWithCopyWarning on pandas versions
    # without copy-on-write.
    filtered_rules = rules_df[passes_thresholds].copy()

    # Add additional metrics if they don't exist
    if 'conviction' not in filtered_rules.columns:
        # Conviction is unbounded for rules that never fail (confidence == 1)
        filtered_rules['conviction'] = np.where(
            filtered_rules['confidence'] == 1,
            np.inf,
            (1 - filtered_rules['consequent support']) / (1 - filtered_rules['confidence'])
        )

    if 'leverage' not in filtered_rules.columns:
        filtered_rules['leverage'] = filtered_rules['support'] - (
            filtered_rules['antecedent support'] * filtered_rules['consequent support']
        )

    # Rank only after the derived columns exist so that the top-10 views carry
    # the same columns as 'filtered_rules'.
    best_by_lift = filtered_rules.nlargest(10, 'lift')
    best_by_confidence = filtered_rules.nlargest(10, 'confidence')
    best_by_support = filtered_rules.nlargest(10, 'support')

    return {
        'filtered_rules': filtered_rules,
        'best_by_lift': best_by_lift,
        'best_by_confidence': best_by_confidence,
        'best_by_support': best_by_support
    }

def visualize_rule_metrics(rules_df):
    """
    Plot the rule metrics to help spot the most interesting rules.

    Draws a support-vs-confidence scatter plot in which lift drives both the
    colour and the marker size, followed by one histogram each for support,
    confidence and lift.
    """
    support = rules_df['support']
    confidence = rules_df['confidence']
    lift = rules_df['lift']

    # Figure 1: support vs confidence, lift shown through colour and size
    plt.figure(figsize=(12, 8))
    points = plt.scatter(
        support,
        confidence,
        c=lift,
        s=lift*50,
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(points, label='Lift')
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Support vs Confidence (color and size indicate Lift)')
    plt.show()

    # Figure 2: one histogram per metric, side by side
    panels = (
        ('support', 'Distribution of Support'),
        ('confidence', 'Distribution of Confidence'),
        ('lift', 'Distribution of Lift'),
    )
    _, axes = plt.subplots(1, 3, figsize=(15, 5))
    for axis, (column, title) in zip(axes, panels):
        sns.histplot(rules_df[column], bins=30, ax=axis)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()

def find_best_rules(rules_df, target_consequent=None, min_metrics=None):
    """
    Find the best rules based on specific criteria.

    Parameters:
    rules_df: DataFrame containing association rules (needs the 'support',
        'confidence', 'lift' and 'consequents' columns); it is not modified
    target_consequent: Optional specific consequent to look for; only rules
        whose consequents contain this item are kept
    min_metrics: Dictionary mapping a metric column name to its minimum
        value (defaults to support >= 0.05, confidence >= 0.7, lift >= 1.5)

    Returns:
    The (up to) ten highest-scoring rules, with an added 'score' column.
    The score is 0.4 * lift + 0.4 * confidence + 0.2 * support, each metric
    first divided by its maximum among the rules that passed the filters.
    """
    if min_metrics is None:
        min_metrics = {
            'support': 0.05,
            'confidence': 0.7,
            'lift': 1.5
        }

    # Boolean filtering already yields new frames, so the input needs no
    # up-front copy.
    filtered_rules = rules_df

    # Filter by metrics
    for metric, threshold in min_metrics.items():
        filtered_rules = filtered_rules[filtered_rules[metric] >= threshold]

    # Filter by target consequent if specified
    if target_consequent:
        filtered_rules = filtered_rules[
            filtered_rules['consequents'].apply(lambda x: target_consequent in x)
        ]

    # Calculate a composite score (normalized weighted sum). assign() builds a
    # new frame instead of writing into a filtered slice, which avoids
    # SettingWithCopyWarning and guarantees rules_df is left untouched.
    scored_rules = filtered_rules.assign(
        score=(
            (filtered_rules['lift'] / filtered_rules['lift'].max()) * 0.4 +
            (filtered_rules['confidence'] / filtered_rules['confidence'].max()) * 0.4 +
            (filtered_rules['support'] / filtered_rules['support'].max()) * 0.2
        )
    )

    return scored_rules.nlargest(10, 'score')

def print_rule_summary(rules_dict):
    """
    Print the rules stored under 'best_by_lift' and 'best_by_confidence'.

    Every rule is rendered as an "If ... then ..." line followed by its lift,
    confidence and support; the metric a section is ranked by is listed first.
    """
    sections = (
        (
            "\n=== Best Rules by Lift ===",
            'best_by_lift',
            (('Lift', 'lift'), ('Confidence', 'confidence'), ('Support', 'support')),
        ),
        (
            "\n=== Best Rules by Confidence ===",
            'best_by_confidence',
            (('Confidence', 'confidence'), ('Lift', 'lift'), ('Support', 'support')),
        ),
    )

    for heading, key, metrics in sections:
        print(heading)
        for _, rule in rules_dict[key].iterrows():
            print(f"\nIf {list(rule['antecedents'])} then {list(rule['consequents'])}")
            for label, column in metrics:
                print(f"{label}: {rule[column]:.2f}")

In [None]:
# Mine rules once over the full encoded dataset (not per batch)
rules = run_apriori(encoded_data, min_support=0.1, min_confidence=0.5)

# Filter the mined rules with stricter thresholds and rank them per metric
analysis_results = analyze_rules(rules, min_lift=1.2, min_confidence=0.6, min_support=0.1)

# Visualize the rules (all mined rules, not only the filtered subset)
visualize_rule_metrics(rules)

# The bare string below is a no-op expression that serves as a reading guide
# for the metrics; it is not displayed because it is not the cell's last line.
"""
High lift: Shows strong association between items
High confidence: Shows high reliability of the rule
High support: Shows the rule applies to a significant portion of your dataset
Balanced metrics: Good performance across all metrics
"""

# Score the rules with the composite metric and keep the ten best
# NOTE(review): `best_rules` is not printed or used again in this cell - the
# summary below reports `analysis_results` instead; confirm this is intended.
best_rules = find_best_rules(
    rules,
    min_metrics={'support': 0.1, 'confidence': 0.7, 'lift': 1.5}
)

# Print the top rules by lift and by confidence from the threshold analysis
print_rule_summary(analysis_results)

# Key Observations:

1. Strongest Rule Pattern:
```
If ['M_mean_binned_M_mean_Q2', 'M_20th_quantile_binned_M_20th_quantile_Q3'] 
then ['M_75th_quantile_binned_M_75th_quantile_Q2', 'Sex_F', 'M_25th_quantile_binned_M_25th_quantile_Q3']
```
- Lift: 8.60 (the consequent is 8.6 times more likely when the antecedent holds than it is across all patients)
- Confidence: 100% (occurs every time the antecedent is present)
- Support: 10% (appears in 10% of all cases)

2. Common Patterns:
- Almost all strong rules involve:
  - Female patients (Sex_F)
  - Middle quartile measurements (Q2, Q3)
  - Multiple statistical measures (mean, percentiles)

3. Quality Metrics:
- Lift values are consistently high (8.60)
- Confidence levels are excellent (90-100%)
- Support is stable at 10%

4. Notable Relationships:
- Mean values in Q2 (middle range) strongly predict:
  - 75th percentile values in Q2
  - 20th percentile values in Q3
  - Female sex (`Sex_F`)

# Best Rules Selection:
1. Most Reliable Rule:
```
If ['M_mean_binned_M_mean_Q2', 'M_std_binned_M_std_Q1', 'M_25th_quantile_binned_M_25th_quantile_Q3'] 
then ['M_75th_quantile_binned_M_75th_quantile_Q2', 'Sex_F', 'M_20th_quantile_binned_M_20th_quantile_Q3']
```
- Perfect confidence (1.0)
- High lift (8.60)
- Good support (0.10)

2. Most Practical Rule (Simpler Antecedent):
```
If ['M_mean_binned_M_mean_Q2', 'M_20th_quantile_binned_M_20th_quantile_Q3'] 
then ['M_75th_quantile_binned_M_75th_quantile_Q2', 'Sex_F', 'M_25th_quantile_binned_M_25th_quantile_Q3']
```
- Same high metrics but fewer conditions to check

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from collections import defaultdict

class PatientClusterAnalyzer:
    """Cluster patients by the association rules they satisfy.

    Workflow: build a 0/1 patient-by-rule matrix (``create_rule_features``),
    standardise it and run KMeans on it (``cluster_patients``), then
    summarise, plot and print the resulting clusters.

    Attributes:
        data: one-hot encoded patient DataFrame; after clustering it is
            replaced by a copy that carries an extra 'Cluster' column
        rules: DataFrame of rules with 'antecedents' / 'consequents' columns
            whose items are column names of ``data``
        clusters: array of cluster labels, or None before clustering
        cluster_profiles: dict of per-cluster summaries, or None
        rule_features: rule-match matrix of the last clustering run, or None
    """

    def __init__(self, data, rules):
        self.data = data
        self.rules = rules
        self.clusters = None
        self.cluster_profiles = None
        # Cached by cluster_patients() so the plots need not rebuild it
        self.rule_features = None

    def create_rule_features(self):
        """Build one 0/1 column per rule marking the patients that satisfy it.

        A patient satisfies a rule when every antecedent and every consequent
        column of that rule is true. Columns are named ``rule_<index>`` after
        the index of ``self.rules``; rows follow ``self.data.index``.
        """
        rule_matches = {}

        for idx, rule in self.rules.iterrows():
            # "All antecedents and all consequents hold" is a single all()
            # over the combined column list.
            rule_cols = list(rule['antecedents']) + list(rule['consequents'])
            rule_matches[f'rule_{idx}'] = self.data[rule_cols].all(axis=1).astype(int)

        # Build the frame in one go to avoid fragmenting it column by column
        return pd.DataFrame(rule_matches, index=self.data.index)

    def cluster_patients(self, n_clusters=None):
        """Cluster patients based on rule satisfaction patterns.

        Parameters:
            n_clusters: number of KMeans clusters; when None it is chosen
                with ``find_optimal_clusters``

        Returns:
            Array with one cluster label per patient (also kept in
            ``self.clusters``).

        Raises:
            ValueError: if there are no rules to build features from.
        """
        rule_features = self.create_rule_features()
        if rule_features.shape[1] == 0:
            raise ValueError(
                "No association rules available: cannot build rule features to cluster on"
            )

        # Standardize features
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(rule_features)

        # Find optimal number of clusters if not specified
        if n_clusters is None:
            n_clusters = self.find_optimal_clusters(scaled_features)
            print(f"Optimal number of clusters determined: {n_clusters}")

        # Perform clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.clusters = kmeans.fit_predict(scaled_features)

        # Attach the labels to a copy so the caller's DataFrame stays untouched
        self.data = self.data.copy()
        self.data['Cluster'] = self.clusters

        self.rule_features = rule_features
        self.cluster_profiles = self.create_cluster_profiles(rule_features)

        return self.clusters

    def find_optimal_clusters(self, features, max_clusters=10):
        """Pick the number of clusters with the highest silhouette score.

        Tries k = 2 .. max_clusters, capped at ``len(features) - 1`` because
        the silhouette score needs fewer clusters than samples.

        Raises:
            ValueError: if there are fewer than three samples.
        """
        largest_k = min(max_clusters, len(features) - 1)
        if largest_k < 2:
            raise ValueError(
                "Need at least 3 samples to choose the number of clusters by silhouette score"
            )

        silhouette_scores = []
        print("Finding optimal number of clusters...")

        for k in range(2, largest_k + 1):
            kmeans = KMeans(n_clusters=k, random_state=42)
            cluster_labels = kmeans.fit_predict(features)
            silhouette_avg = silhouette_score(features, cluster_labels)
            silhouette_scores.append(silhouette_avg)
            print(f"Clusters: {k}, Silhouette Score: {silhouette_avg:.3f}")

        # Position 0 of the score list corresponds to k = 2
        optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
        return optimal_clusters

    def create_cluster_profiles(self, rule_features):
        """Summarise every cluster found in ``self.clusters``.

        Returns a dict keyed by cluster label. Each profile holds the cluster
        'size', its 'percentage' of all patients, the 'top_rules' (rules
        satisfied by more than half of the cluster, most frequent first),
        the 'mean_rule_satisfaction' and the 'common_characteristics'.
        """
        profiles = defaultdict(dict)

        labels = np.asarray(self.clusters)
        cluster_sizes = pd.Series(labels).value_counts()
        total_patients = len(self.data)

        # Iterate over the labels that actually occur rather than assuming
        # they form the contiguous range 0..k-1.
        for cluster in sorted(int(label) for label in cluster_sizes.index):
            cluster_mask = labels == cluster
            # The cluster label itself is not a patient characteristic
            cluster_data = self.data[cluster_mask].drop(
                columns=['Cluster'], errors='ignore'
            )

            # Share of the cluster's patients satisfying each rule
            rule_satisfaction = rule_features[cluster_mask].mean()
            frequent_rules = rule_satisfaction[rule_satisfaction > 0.5]

            profiles[cluster] = {
                'size': cluster_sizes.loc[cluster],
                'percentage': (cluster_sizes.loc[cluster] / total_patients) * 100,
                # Ranked so that slicing the first entries yields the real top rules
                'top_rules': frequent_rules.sort_values(
                    ascending=False, kind='mergesort'
                ).index.tolist(),
                'mean_rule_satisfaction': rule_satisfaction.mean(),
                'common_characteristics': self._get_common_characteristics(cluster_data)
            }

        return profiles

    def _get_common_characteristics(self, cluster_data, threshold=0.6):
        """Return {column: value} for columns dominated by a single value.

        Only bool and int64 columns are inspected; a column is reported when
        its most frequent value covers more than ``threshold`` of the rows.
        """
        characteristics = {}
        candidate_cols = cluster_data.select_dtypes(include=['bool', 'int64']).columns

        for col in candidate_cols:
            # value_counts() sorts by frequency, so position 0 is the mode
            value_counts = cluster_data[col].value_counts(normalize=True)
            if len(value_counts) > 0 and value_counts.iloc[0] > threshold:
                characteristics[col] = value_counts.index[0]

        return characteristics

    def visualize_clusters(self):
        """Plot cluster sizes, mean rule satisfaction and a rule heatmap.

        Raises:
            ValueError: if ``cluster_patients()`` has not been run yet.
        """
        if self.clusters is None:
            raise ValueError("Must run cluster_patients() first")

        cluster_ids = sorted(self.cluster_profiles)
        labels = np.asarray(self.clusters)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Plot cluster sizes
        cluster_sizes = pd.Series(labels).value_counts().sort_index()
        cluster_sizes.plot(kind='bar', ax=ax1)
        ax1.set_title('Cluster Sizes')
        ax1.set_xlabel('Cluster')
        ax1.set_ylabel('Number of Patients')

        # Plot rule satisfaction
        rule_satisfaction = pd.Series({
            i: self.cluster_profiles[i]['mean_rule_satisfaction']
            for i in cluster_ids
        }).sort_index()

        rule_satisfaction.plot(kind='bar', ax=ax2)
        ax2.set_title('Average Rule Satisfaction by Cluster')
        ax2.set_xlabel('Cluster')
        ax2.set_ylabel('Mean Rule Satisfaction')

        plt.tight_layout()
        plt.show()

        # Additional visualization: Rule satisfaction heatmap.
        # Reuse the matrix computed during clustering when available instead
        # of re-evaluating every rule against every patient.
        plt.figure(figsize=(12, 6))
        rule_features = getattr(self, 'rule_features', None)
        if rule_features is None:
            rule_features = self.create_rule_features()
        cluster_rule_satisfaction = pd.DataFrame({
            f'Cluster {i}': rule_features[labels == i].mean()
            for i in cluster_ids
        })

        plt.imshow(cluster_rule_satisfaction.T, aspect='auto', cmap='YlOrRd')
        plt.colorbar(label='Rule Satisfaction Rate')
        plt.xlabel('Rules')
        plt.ylabel('Clusters')
        plt.title('Rule Satisfaction Patterns by Cluster')
        plt.show()

    def print_cluster_insights(self):
        """Print size, common characteristics and top rules of each cluster."""
        print("\n=== Cluster Analysis Results ===")
        print(f"Total number of clusters: {len(self.cluster_profiles)}")
        print(f"Total number of patients: {len(self.data)}\n")

        for cluster_id, profile in self.cluster_profiles.items():
            print(f"\nCluster {cluster_id}:")
            print(f"Size: {profile['size']} patients ({profile['percentage']:.1f}%)")

            print("\nCommon Characteristics:")
            chars = profile['common_characteristics']
            if chars:
                for char, value in chars.items():
                    print(f"- {char}: {value}")
            else:
                print("- No strong characteristic patterns found")

            print("\nTop Rules Satisfied:")
            if profile['top_rules']:
                for rule in profile['top_rules'][:5]:  # Show top 5 rules
                    print(f"- {rule}")
            else:
                print("- No strong rule patterns found")

            print(f"\nMean Rule Satisfaction: {profile['mean_rule_satisfaction']:.2f}")
            print("-" * 50)

def cluster_patients_from_rules(data, rules, n_clusters=None):
    """Cluster patients by association rules, then plot and print the outcome.

    Builds a PatientClusterAnalyzer for ``data`` and ``rules``, runs the
    clustering with ``n_clusters``, shows the visualizations, prints the
    cluster insights and returns the analyzer.
    """
    print("Starting patient clustering analysis...")
    analyzer = PatientClusterAnalyzer(data, rules)
    analyzer.cluster_patients(n_clusters)

    # Reporting stages, each announced before it runs
    reporting_steps = (
        ("\nGenerating visualizations...", analyzer.visualize_clusters),
        ("\nGenerating cluster insights...", analyzer.print_cluster_insights),
    )
    for banner, step in reporting_steps:
        print(banner)
        step()

    return analyzer

In [None]:
# Cluster the one-hot encoded patients using the rules mined earlier in the
# notebook. n_clusters is left at its default (None), so the analyzer selects
# the number of clusters by silhouette score.
analyzer = cluster_patients_from_rules(encoded_data, rules)

In [None]:
# ! pip install networkx --upgrade

In [None]:
# import matplotlib.colors as mcolors
# import matplotlib.pyplot as plt
# import networkx as nx
# import numpy as np
# from matplotlib import collections as mc
# from mlxtend.preprocessing import TransactionEncoder
# from networkx.drawing.nx_agraph import graphviz_layout


# def print_detailed_rules(rules, top_n=10):
#     print(f"\nTop {top_n} rules by lift:")
#     for i, (index, rule) in enumerate(rules.iterrows(), 1):
#         if i > top_n:
#             break
#         antecedents = ", ".join(list(rule["antecedents"]))
#         consequents = ", ".join(list(rule["consequents"]))
#         print(f"{i}. {antecedents} -> {consequents}")
#         print(f"   Support: {rule['support']:.4f}")
#         print(f"   Confidence: {rule['confidence']:.4f}")
#         print(f"   Lift: {rule['lift']:.4f}")
#         print(f"   Conviction: {rule['conviction']:.4f}")
#         print(f"   Leverage: {rule['leverage']:.4f}")
#         print(f"   Support Count: {int(rule['support'] * len(data))}")
#         print()


# def visualize_top_rules(
#     rules, top_n=10, figure_size=(20, 12), node_size_base=1000, font_size=8
# ):
#     G = nx.DiGraph()
#     for i, (index, rule) in enumerate(rules.iterrows()):
#         if i >= top_n:
#             break
#         for antecedent in rule["antecedents"]:
#             for consequent in rule["consequents"]:
#                 G.add_edge(
#                     antecedent,
#                     consequent,
#                     weight=rule["lift"],
#                     support=rule["support"],
#                     confidence=rule["confidence"],
#                 )

#     pos = nx.spring_layout(G, k=2, iterations=50)

#     degrees = dict(G.degree())
#     node_sizes = [node_size_base * (1 + degrees[node]) for node in G.nodes()]

#     edge_weights = [G[u][v]["weight"] for u, v in G.edges()]
#     max_weight, min_weight = max(edge_weights), min(edge_weights)
#     norm = mcolors.Normalize(vmin=min_weight, vmax=max_weight)
#     edge_colors = plt.cm.viridis(norm(edge_weights))

#     # Handle the case where all weights are the same
#     if max_weight == min_weight:
#         edge_widths = [1 for _ in edge_weights]
#     else:
#         edge_widths = [
#             1 + 2 * (weight - min_weight) / (max_weight - min_weight)
#             for weight in edge_weights
#         ]

#     fig, ax = plt.subplots(figsize=figure_size)

#     # Draw edges with curved arrows
#     curved_edges = [
#         ((x1, y1), (x2, y2))
#         for (x1, y1), (x2, y2) in (np.array([pos[u], pos[v]]) for u, v in G.edges())
#     ]
#     edge_collection = mc.LineCollection(
#         curved_edges, colors=edge_colors, linewidths=edge_widths, alpha=0.7, zorder=1
#     )
#     ax.add_collection(edge_collection)

#     # Draw arrow heads
#     for (u, v), color, width in zip(G.edges(), edge_colors, edge_widths):
#         x1, y1 = pos[u]
#         x2, y2 = pos[v]
#         dx, dy = x2 - x1, y2 - y1
#         ax.arrow(
#             x1,
#             y1,
#             dx * 0.8,
#             dy * 0.8,
#             color=color,
#             width=width * 0.001,
#             head_width=width * 0.005,
#             head_length=width * 0.01,
#             alpha=0.7,
#             length_includes_head=True,
#             zorder=2,
#         )

#         # Add edge labels
#         edge_label = f"L:{G[u][v]['weight']:.2f}\nS:{G[u][v]['support']:.2f}\nC:{G[u][v]['confidence']:.2f}"
#         ax.annotate(
#             edge_label,
#             xy=(x1 + dx * 0.4, y1 + dy * 0.4),
#             xytext=(3, 3),
#             textcoords="offset points",
#             fontsize=font_size - 2,
#             bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.7),
#             zorder=5,
#         )

#     # Draw nodes
#     for node, (x, y) in pos.items():
#         ax.scatter(
#             x,
#             y,
#             s=node_sizes[list(G.nodes()).index(node)],
#             c="lightblue",
#             alpha=0.8,
#             zorder=3,
#         )

#     # Draw node labels with adjusted positions
#     label_pos = {node: (coord[0], coord[1] + 0.02) for node, coord in pos.items()}
#     for node, (x, y) in label_pos.items():
#         ax.text(
#             x,
#             y,
#             node,
#             fontsize=font_size,
#             fontweight="bold",
#             ha="center",
#             va="center",
#             zorder=4,
#         )

#     # Add a colorbar for edge weights
#     sm = plt.cm.ScalarMappable(cmap=plt.cm.viridis, norm=norm)
#     sm.set_array([])
#     cbar = fig.colorbar(
#         sm, ax=ax, label="Lift", orientation="vertical", fraction=0.046, pad=0.04
#     )

#     ax.set_title("Top Rules Network", fontsize=16)
#     ax.axis("off")
#     plt.tight_layout()
#     plt.show()

#     # Print detailed rule information
#     print("\nDetailed Rule Information:")
#     for i, (index, rule) in enumerate(rules.iterrows()):
#         if i >= top_n:
#             break
#         print(f"\nRule {i+1}:")
#         print(f"Antecedents: {', '.join(rule['antecedents'])}")
#         print(f"Consequents: {', '.join(rule['consequents'])}")
#         print(f"Support: {rule['support']:.4f}")
#         print(f"Confidence: {rule['confidence']:.4f}")
#         print(f"Lift: {rule['lift']:.4f}")


# def filter_rules_by_item(rules, item, in_antecedents=True, in_consequents=True):
#     filtered_rules = rules[
#         (rules["antecedents"].apply(lambda x: item in x) if in_antecedents else True)
#         | (rules["consequents"].apply(lambda x: item in x) if in_consequents else True)
#     ]
#     return filtered_rules


# # After running the Apriori algorithm
# best_rules = run_apriori(encoded_data, min_support=0.1, min_confidence=0.5)

# print_detailed_rules(best_rules)

# # Example of filtering rules
# asthma_related_rules = filter_rules_by_item(best_rules, "BSA_Category_Large")
# print("\nRules related to large BSA:")
# print_detailed_rules(asthma_related_rules)

# # Visualize top rules
# visualize_top_rules(
#     best_rules, top_n=10, figure_size=(20, 10), node_size_base=100, font_size=10
# )

# # Print summary statistics
# print("\nSummary Statistics:")
# print(f"Total number of rules: {len(best_rules)}")
# print(f"Average lift: {best_rules['lift'].mean():.4f}")
# print(f"Average confidence: {best_rules['confidence'].mean():.4f}")
# print(f"Average support: {best_rules['support'].mean():.4f}")