# Mining - Association Rule Mining for Cardiovascular Comorbidity and Risk Factor Assessment

[description]

### Rules are assessed using the following criteria:

1. **Support:** Fraction of transactions containing the itemsets in both X and Y.
    * measures how frequently the combination appears in the data.
2. **Confidence:** Probability that transactions with X also include Y.
    * measures the reliability of the inference.
3. **Lift:** The ratio of observed support to that expected if X and Y were independent.
    * Lift > 1 implies a positive association â€” items occur together more than expected.
    * Lift = 1 implies independence.
    * Lift < 1 implies a negative association.

In [64]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules

In [65]:
# Load preprocessed dataset
df = pd.read_csv(r'C:\Users\Admin\Documents\dataset_association_mining\heart_disease_preprocessed.csv')

print(df.head())

   HeartDisease_No  HeartDisease_Yes  BMI_cat_Underweight  BMI_cat_Normal  \
0             True             False                 True           False   
1             True             False                False            True   
2             True             False                False           False   
3             True             False                False            True   
4             True             False                False            True   

   BMI_cat_Overweight  BMI_cat_Obese  Smoking_No  Smoking_Yes  \
0               False          False       False         True   
1               False          False        True        False   
2                True          False       False         True   
3               False          False        True        False   
4               False          False        True        False   

   AlcoholDrinking_No  AlcoholDrinking_Yes  ...  GenHealth_Very good  \
0                True                False  ...                 True   
1 

In [66]:
# Convert True/False to 1/0 for Apriori
df_for_mining = df.astype(int)

print(f"Dataset shape: {df_for_mining.shape}")
print(df_for_mining.head())


Dataset shape: (319795, 61)
   HeartDisease_No  HeartDisease_Yes  BMI_cat_Underweight  BMI_cat_Normal  \
0                1                 0                    1               0   
1                1                 0                    0               1   
2                1                 0                    0               0   
3                1                 0                    0               1   
4                1                 0                    0               1   

   BMI_cat_Overweight  BMI_cat_Obese  Smoking_No  Smoking_Yes  \
0                   0              0           0            1   
1                   0              0           1            0   
2                   1              0           0            1   
3                   0              0           1            0   
4                   0              0           1            0   

   AlcoholDrinking_No  AlcoholDrinking_Yes  ...  GenHealth_Very good  \
0                   1                    0  ..

In [67]:
# Generate frequent itemsets using Apriori
freq_itemsets = fpgrowth(df_for_mining, min_support=0.1, use_colnames=True, max_len=3)

print(f"\nNumber of frequent itemsets: {len(freq_itemsets)}")
print(freq_itemsets.head(20))




Number of frequent itemsets: 2051
     support                   itemsets
0   0.963167         (KidneyDisease_No)
1   0.962260                (Stroke_No)
2   0.931903       (AlcoholDrinking_No)
3   0.914405          (HeartDisease_No)
4   0.861130           (DiffWalking_No)
5   0.775362     (PhysicalActivity_Yes)
6   0.766779               (Race_White)
7   0.608943      (SleepTime_cat_Short)
8   0.524727               (Sex_Female)
9   0.412477              (Smoking_Yes)
10  0.356034      (GenHealth_Very good)
11  0.161278    (MentalHealth_cat_High)
12  0.134061               (Asthma_Yes)
13  0.127588             (Diabetic_Yes)
14  0.106274   (PhysicalHealth_cat_Low)
15  0.906756            (SkinCancer_No)
16  0.865939                (Asthma_No)
17  0.843206              (Diabetic_No)
18  0.708545  (PhysicalHealth_cat_None)
19  0.642290    (MentalHealth_cat_None)


In [68]:
# Filter frequent itemsets - remove trivial 1-itemsets 
freq_itemsets [
    freq_itemsets['itemsets'].apply(lambda x: len(x) > 1)
].sort_values(by='support', ascending=False).head(20)

Unnamed: 0,support,itemsets
35,0.93009,"(Stroke_No, KidneyDisease_No)"
36,0.896237,"(AlcoholDrinking_No, KidneyDisease_No)"
37,0.89578,"(AlcoholDrinking_No, Stroke_No)"
39,0.890389,"(HeartDisease_No, Stroke_No)"
40,0.888375,"(HeartDisease_No, KidneyDisease_No)"
693,0.876743,"(SkinCancer_No, KidneyDisease_No)"
696,0.875201,"(SkinCancer_No, Stroke_No)"
42,0.86653,"(HeartDisease_No, KidneyDisease_No, Stroke_No)"
38,0.864641,"(AlcoholDrinking_No, Stroke_No, KidneyDisease_No)"
41,0.849876,"(AlcoholDrinking_No, HeartDisease_No)"


In [71]:
# Generate association rules 

rules = association_rules(freq_itemsets, metric='confidence', min_threshold=0.7)

rules = rules.sort_values(by='lift', ascending=False)
rules.head(30)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
3943,(GenHealth_Excellent),"(Diabetic_No, PhysicalHealth_cat_None)",0.209015,0.617105,0.17692,0.846444,1.371637,1.0,0.047935,2.49352,0.34254,0.272519,0.59896,0.566568
3976,(GenHealth_Excellent),"(PhysicalActivity_Yes, PhysicalHealth_cat_None)",0.209015,0.578361,0.164746,0.788202,1.36282,1.0,0.04386,1.99076,0.336577,0.264597,0.497679,0.536526
3937,(GenHealth_Excellent),"(DiffWalking_No, PhysicalHealth_cat_None)",0.209015,0.657856,0.182073,0.871099,1.324149,1.0,0.044571,2.654318,0.309485,0.265878,0.623255,0.573933
3940,(GenHealth_Excellent),"(HeartDisease_No, PhysicalHealth_cat_None)",0.209015,0.663447,0.181466,0.868197,1.308615,1.0,0.042796,2.553449,0.298152,0.262616,0.608373,0.570859
4022,(GenHealth_Excellent),"(Diabetic_No, MentalHealth_cat_None)",0.209015,0.540643,0.147069,0.703629,1.301467,1.0,0.034067,1.549941,0.292846,0.244062,0.354814,0.487828
3949,(GenHealth_Excellent),"(Asthma_No, PhysicalHealth_cat_None)",0.209015,0.633018,0.171144,0.818812,1.293504,1.0,0.038834,2.025416,0.286866,0.2551,0.506274,0.544587
4027,"(MentalHealth_cat_None, GenHealth_Excellent)",(PhysicalHealth_cat_None),0.154824,0.708545,0.141578,0.914445,1.290596,1.0,0.031878,3.406644,0.266411,0.196149,0.706456,0.55713
4016,(GenHealth_Excellent),"(MentalHealth_cat_None, DiffWalking_No)",0.209015,0.566372,0.151447,0.724574,1.279325,1.0,0.033067,1.574392,0.276033,0.242727,0.364834,0.495986
3952,(GenHealth_Excellent),"(SkinCancer_No, PhysicalHealth_cat_None)",0.209015,0.646045,0.17231,0.824392,1.276059,1.0,0.037277,2.015595,0.273504,0.252377,0.503869,0.545554
3931,(GenHealth_Excellent),"(KidneyDisease_No, PhysicalHealth_cat_None)",0.209015,0.691543,0.183918,0.879926,1.272409,1.0,0.039375,2.568886,0.270662,0.256639,0.610726,0.572939


In [None]:
# Keep only strong rules with lift >= 1.2
strong_rules = rules[rules['lift'] >= 1.2]

# Display top 10 strong rules
strong_rules.head(10)


In [None]:
# Check rules that highlight risk factors for heart disease

risk_rules = rules[
    rules['consequents'].astype(str).str.contains('Yes')
]

risk_rules.head(10)

In [None]:
df_for_mining[['Race_White','Race_Black','Race_Asian','Race_Other']].sum()


In [None]:
race_risk_rules = risk_rules[
    risk_rules['antecedents'].astype(str).str.contains('Obesity_Yes|Smoking_Yes')
].sort_values(by='lift', ascending=False)

race_risk_rules.head(10)


### Analysis / Visualisations of Rule Metrics

### Save Rules in CSV file format