In [None]:
#Association Goals: Identify common traits of diabetics

In [None]:
!pip install mlxtend

In [None]:
import pandas as pd
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Load the cleaned diabetes dataset.
# The path is a generic placeholder: replace it with the location of your local copy.
df = pd.read_csv(filepath_or_buffer="cleaned_diabetes.csv")
df

In [None]:
#Binning: Convert numerical variables into categorical bins as Apriori works with categorical data

def pregnancies_categories(preg):
    """Translate a pregnancy count into a descriptive label.

    Returns 'No Pregnancy' for 0, 'Low Pregnancies' for 1-2,
    'Moderate Pregnancies' for 3-5 and 'High Pregnancies' for every
    other value.
    """
    if preg == 0:
        return 'No Pregnancy'

    # (inclusive lower bound, inclusive upper bound, label)
    bounded_labels = (
        (1, 2, 'Low Pregnancies'),
        (3, 5, 'Moderate Pregnancies'),
    )
    for lowest, highest, label in bounded_labels:
        if lowest <= preg <= highest:
            return label

    return 'High Pregnancies'

def glucose_categories(glucose):
    """Label a glucose reading.

    Below 140 is 'Normal Glucose', from 140 up to (but excluding) 200 is
    'High Glucose', and everything else is 'Very High Glucose'.
    """
    if glucose < 140:
        return 'Normal Glucose'
    if glucose < 200:
        return 'High Glucose'
    return 'Very High Glucose'

def bp_categories(bp):
    """Label a blood-pressure reading.

    Below 80 is 'Normal BP', from 80 up to (but excluding) 90 is
    'High-Normal BP', and everything else is 'High BP'.
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((80, 'Normal BP'), (90, 'High-Normal BP')):
        if bp < upper_bound:
            return label
    return 'High BP'

def skinthickness_categories(skin):
    """Label a skin-thickness measurement.

    Bins (upper bounds inclusive): up to 10 'Very Low', up to 20 'Low',
    up to 30 'Medium', up to 40 'High', anything larger 'Very High'.

    The bins are contiguous, so a fractional value such as 10.5 or 20.5
    lands in the neighbouring bin rather than falling through to
    'Very High'. Whole-number inputs are labelled exactly as before.
    """
    if skin <= 10:
        return 'Very Low'
    elif skin <= 20:
        return 'Low'
    elif skin <= 30:
        return 'Medium'
    elif skin <= 40:
        return 'High'
    else:
        return 'Very High'

def insulin_categories(insulin):
    """Label an insulin measurement.

    Bins (upper bounds inclusive): up to 30 'Low', up to 60 'Normal',
    up to 100 'Elevated', anything larger 'High'.

    The bins are contiguous, so a fractional value such as 30.5 or 60.5
    lands in the neighbouring bin rather than falling through to 'High'.
    Whole-number inputs are labelled exactly as before.
    """
    if insulin <= 30:
        return 'Low'
    elif insulin <= 60:
        return 'Normal'
    elif insulin <= 100:
        return 'Elevated'
    else:
        return 'High'

def bmi_categories(bmi):
    """Label a BMI value.

    Below 18.5 is 'Underweight', below 25 is 'Normal', below 30 is
    'Overweight', and everything else is 'Obese'.
    """
    return (
        'Underweight' if bmi < 18.5
        else 'Normal' if bmi < 25
        else 'Overweight' if bmi < 30
        else 'Obese'
    )

def dpf_categories(dpf):
    """Label a diabetes-pedigree-function value.

    Bins (upper bounds exclusive): below 0.25 'Low Genetic Risk',
    below 0.50 'Moderate Genetic Risk', below 1.0 'High Genetic Risk',
    anything else 'Very High Genetic Risk'.

    The bins are contiguous. With two-decimal bin edges, a value such as
    0.245 or 0.495 would match no bin and fall through to
    'Very High Genetic Risk'; here it lands in the bin it numerically
    belongs to. Values with at most two decimals are labelled exactly as
    before.
    """
    if dpf < 0.25:
        return 'Low Genetic Risk'
    elif dpf < 0.50:
        return 'Moderate Genetic Risk'
    elif dpf < 1.0:
        return 'High Genetic Risk'
    else:
        return 'Very High Genetic Risk'

def age_categories(age):
    """Label an age in years.

    Bins (upper bounds exclusive): below 35 'Young Adult', below 50
    'Middle-aged', below 65 'Senior', anything else 'Elderly'.

    The bins are contiguous, so a fractional age such as 34.5 lands in
    the bin it numerically belongs to rather than falling through to
    'Elderly'. Whole-number ages from 18 upward are labelled exactly as
    before.

    NOTE(review): the youngest bin no longer has a lower bound of 18, so
    an age below 18 is labelled 'Young Adult' rather than 'Elderly'. If
    minors can occur in the data, give them a dedicated label.
    """
    if age < 35:
        return 'Young Adult'
    elif age < 50:
        return 'Middle-aged'
    elif age < 65:
        return 'Senior'
    else:
        return 'Elderly'

# Derived column -> (source column, binning function).
# Insertion order is the order in which the new columns are appended to df.
level_columns = {
    'Pregnancies_Level': ('Pregnancies', pregnancies_categories),
    'Glucose_level': ('Glucose', glucose_categories),
    'BP_Level': ('BloodPressure', bp_categories),
    'ST_Level': ('SkinThickness', skinthickness_categories),
    'Insulin_Level': ('Insulin', insulin_categories),
    'Obesity_Level': ('BMI', bmi_categories),
    'DPF_Level': ('DiabetesPedigreeFunction', dpf_categories),
    'Age_Level': ('Age', age_categories),
}

# Apply each binning function element-wise and store the labels on df.
for level_column, (source_column, to_label) in level_columns.items():
    df[level_column] = df[source_column].apply(to_label)
df

In [None]:
# Inspect the column dtypes now that the *_Level label columns have been added.
df.dtypes

In [None]:
# Turn the 0/1 Outcome flag into a boolean column (True where Outcome equals 1).
# .eq(1) is used instead of .map({0: False, 1: True}) because it is safe to
# re-run: True compares equal to 1, so an already-converted column is left
# unchanged. Values other than 1 become False rather than NaN, so the column
# is always boolean.
df['Outcome'] = df['Outcome'].eq(1)
df[['Outcome']].head()

In [None]:
# Re-check the dtypes after the Outcome column was remapped to True/False.
df.dtypes

In [None]:
# Keep only the rows whose Outcome flag is True (the diabetic patients).
df_diabetic = df.loc[df['Outcome'].eq(True)]
df_diabetic

In [None]:
# Mean of the Outcome column; for a True/False (or 0/1) column this is the
# fraction of rows flagged as diabetic.
mean_outcome = df['Outcome'].mean()
mean_outcome

In [None]:
# Columns that take part in the association analysis: four binned features
# plus the Outcome flag.
association_inputs = df.loc[
    :,
    ['BP_Level', 'Obesity_Level', 'DPF_Level', 'Age_Level', 'Outcome'],
]
association_inputs

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim. Re-running it
# rebuilds the same selection, so it can be deleted.
association_inputs = df[['BP_Level','Obesity_Level','DPF_Level','Age_Level','Outcome']]
association_inputs

In [None]:
# One-hot encode the selected columns and cast every resulting column to bool,
# giving a True/False table with one column per item.
df_encoded = pd.get_dummies(association_inputs).astype(bool)

# Mine frequent itemsets with a minimum support of 0.01, reported by column name.
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.01,
    use_colnames=True,
)
frequent_itemsets.head(10)

In [None]:
# Derive association rules from the frequent itemsets, keeping the rules whose
# confidence reaches the 0.70 threshold.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.70,
)

# Show the resulting rule table
rules

In [None]:
# Keep the rules whose consequent set contains the item named 'Outcome'.
has_outcome_consequent = rules['consequents'].apply(
    lambda consequent_items: 'Outcome' in consequent_items
)
rules_filtered = rules[has_outcome_consequent]

rules_filtered

In [None]:
# Report how many rules survived the filter.
print(f"Total number of filtered rules: {len(rules_filtered)}")

# Do not truncate long cell contents (the item sets) when the table is displayed.
pd.set_option('display.max_colwidth', None)

# Highest-confidence rules first.
rules_filtered = rules_filtered.sort_values(by='confidence', ascending=False)
rules_filtered

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded metrics for three rules ("based on previous observations").
# NOTE(review): these numbers are literals, not read from rules_filtered, so
# they do not update when the rule-mining cells are re-run.
data = {
    'Rule': ['Rule 76: Normal BP, Obese, Senior', 
             'Rule 88: Middle-aged, High Genetic Risk, Obese', 
             'Rule 91: High Genetic Risk, Obese, Senior'],
    'Antecedent Support': [0.026042, 0.041667, 0.019531],
    'Consequent Support': [0.348958, 0.348958, 0.348958],
    'Support (Rule Support)': [0.018229, 0.031250, 0.016927],
    'Confidence': [0.700000, 0.750000, 0.866667],
    'Lift': [2.005970, 2.149254, 2.483582]
}

# Tabulate the rules, highest confidence first.
rules_filtered_df = pd.DataFrame(data).sort_values(by=['Confidence'], ascending=[False])

# One group of bars per metric, one bar per rule inside each group.
metrics = ['Antecedent Support', 'Consequent Support', 'Support (Rule Support)', 'Confidence', 'Lift']
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))

# Horizontal shift and colour for the 1st, 2nd and 3rd ranked rule.
bar_shifts = (-width, 0, width)
bar_colours = ('tomato', 'royalblue', 'seagreen')

bar_groups = []
for rank, (shift, colour) in enumerate(zip(bar_shifts, bar_colours)):
    ranked_rule = rules_filtered_df.iloc[rank]
    bar_groups.append(
        ax.bar(
            x + shift,
            ranked_rule.iloc[1:].values,  # every column after 'Rule' is a metric
            width,
            label=ranked_rule.iloc[0],    # the 'Rule' text is the legend entry
            color=colour,
        )
    )

# Axis labels, title, tick labels and legend.
ax.set_xlabel('Metrics')
ax.set_ylabel('Values')
ax.set_title('Comparison of Top 3 Rules in Apriori Analysis (Sorted by Confidence)')
ax.set_xticks(x)
ax.set_xticklabels(metrics, rotation=20)
ax.legend()

# Write each bar's value just above it.
for group in bar_groups:
    for rect in group:
        bar_top = rect.get_height()
        ax.annotate(
            f'{bar_top:.2f}',
            xy=(rect.get_x() + rect.get_width() / 2, bar_top),
            xytext=(0, 3),  # 3 points above the bar
            textcoords="offset points",
            ha='center', va='bottom', fontsize=10, fontweight='bold',
        )

# Render the figure
plt.show()


In [None]:
# Association rules validation
# Boolean masks over df_diabetic, one per trait used by the three rules.
is_senior = df_diabetic["Age_Level"] == "Senior"
is_middle_aged = df_diabetic["Age_Level"] == "Middle-aged"
is_obese = df_diabetic["Obesity_Level"] == "Obese"
has_high_genetic_risk = df_diabetic["DPF_Level"] == "High Genetic Risk"
has_normal_bp = df_diabetic["BP_Level"] == "Normal BP"

# Diabetic rows that match each rule's antecedent traits.
senior_obese_high_risk = df_diabetic[is_senior & is_obese & has_high_genetic_risk]
middle_obese_high_risk = df_diabetic[is_middle_aged & is_obese & has_high_genetic_risk]
senior_obese_normal_bp = df_diabetic[is_senior & is_obese & has_normal_bp]

# Display name -> subset, in the order the rows appear in the result table.
validation_groups = {
    "Senior, Obese, High Genetic Risk": senior_obese_high_risk,
    "Middle-aged, Obese, High Genetic Risk": middle_obese_high_risk,
    "Senior, Obese, Normal BP": senior_obese_normal_bp,
}

# For every group: percentage of rows in the raised glucose and insulin bins.
validation_results_diabetic = pd.DataFrame({
    "Group": list(validation_groups),
    "High Glucose (%)": [
        subset["Glucose_level"].isin(["High Glucose", "Very High Glucose"]).mean() * 100
        for subset in validation_groups.values()
    ],
    "High Insulin (%)": [
        subset["Insulin_Level"].isin(["Elevated", "High"]).mean() * 100
        for subset in validation_groups.values()
    ],
})

# Plain-text rendering of the result table.
print("\nDiabetic Association Rule Validation:")
print(validation_results_diabetic.to_string(index=False))

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Directed graph: one node per item, one edge antecedent -> consequent.
G = nx.DiGraph()

# Add nodes and edges from the filtered rules.
for _, row in rules_filtered.iterrows():
    confidence = row['confidence']
    # Link to every consequent item rather than to list(...)[0]: the filter
    # only guarantees that 'Outcome' is *in* the consequent set, and the first
    # element of a frozenset is arbitrary.
    # sorted() gives a stable node insertion order, so the seeded layout below
    # is the same on every run.
    for consequent in sorted(row['consequents']):
        for antecedent in sorted(row['antecedents']):
            # Several rules can produce the same edge. Keep the highest
            # confidence instead of letting the last rule overwrite it.
            if G.has_edge(antecedent, consequent):
                strongest = max(G[antecedent][consequent]['weight'], confidence)
            else:
                strongest = confidence
            G.add_edge(antecedent, consequent, weight=strongest)  # Edge weight = confidence

# Draw the graph
plt.figure(figsize=(10, 6))
pos = nx.spring_layout(G, seed=42)  # Seeded layout for reproducible spacing
edges = G.edges(data=True)

# Draw nodes, labels and thin gray base edges
nx.draw(G, pos, with_labels=True, node_color="lightblue", edge_color="gray", node_size=2000, font_size=10)

# Overlay edges with thickness proportional to confidence. The edge list is
# passed as plain (u, v) pairs in the same order as the widths. node_size
# matches the drawn nodes so the arrow heads stop at the node border.
edge_pairs = [(u, v) for (u, v, d) in edges]
edge_widths = [d['weight'] * 5 for (u, v, d) in edges]  # Scale confidence for visualization
nx.draw_networkx_edges(G, pos, edgelist=edge_pairs, width=edge_widths, edge_color="blue", node_size=2000)

plt.title("Association Rules Network Graph")
plt.show()