# Practical 3

In [None]:
''' 
Aim:
    Run Apriori algorithm to find frequent item sets and association rules on 2 real datasets and use
    appropriate evaluation measures to compute correctness of obtained patterns
        a) Use minimum support as 50% and minimum confidence as 75%
        b) Use minimum support as 60% and minimum confidence as 60 %
'''

' \nAim:\n    Run Apriori algorithm to find frequent item sets and association rules on 2 real datasets and use\n    appropriate evaluation measures to compute correctness of obtained patterns\n        a) Use minimum support as 50% and minimum confidence as 75%\n        b) Use minimum support as 60% and minimum confidence as 60 %\n'

## Dataset 1: Titanic Dataset

In [72]:
import kagglehub

# Download Titanic dataset
path_titanic = kagglehub.dataset_download("heptapod/titanic")
print("Titanic dataset downloaded at:", path_titanic)

Downloading from https://www.kaggle.com/api/v1/datasets/download/heptapod/titanic?dataset_version_number=1...


100%|██████████| 10.8k/10.8k [00:00<00:00, 7.16MB/s]

Extracting files...
Titanic dataset downloaded at: /Users/kimsan/.cache/kagglehub/datasets/heptapod/titanic/versions/1





In [95]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Dataset 1: Groceries
df = pd.read_csv('../dataset/titanic.csv')

print("df:")
print(df.head())


df:
   Passengerid   Age     Fare  Sex  SibSp  zero  zero.1  zero.2  zero.3  \
0            1  22.0   7.2500    0      1     0       0       0       0   
1            2  38.0  71.2833    1      1     0       0       0       0   
2            3  26.0   7.9250    1      0     0       0       0       0   
3            4  35.0  53.1000    1      1     0       0       0       0   
4            5  35.0   8.0500    0      0     0       0       0       0   

   zero.4  ...  zero.12  zero.13  zero.14  Pclass  zero.15  zero.16  Embarked  \
0       0  ...        0        0        0       3        0        0       2.0   
1       0  ...        0        0        0       1        0        0       0.0   
2       0  ...        0        0        0       3        0        0       2.0   
3       0  ...        0        0        0       1        0        0       2.0   
4       0  ...        0        0        0       3        0        0       2.0   

   zero.17  zero.18  Survived  
0        0        0       

In [97]:
# Keep only useful categorical attributes
cat_cols = ['Survived','Pclass','Sex','SibSp','Parch','Embarked']
df = df[cat_cols].dropna()

In [None]:
# Convert numeric to string so TE can treat as items
df = df.astype(str)

In [80]:
# Convert each row of features into a transaction list
transactions_titanic = df.values.tolist()

te = TransactionEncoder()
te_data = te.fit(transactions_titanic).transform(transactions_titanic)
basket_titanic = pd.DataFrame(te_data, columns=te.columns_)

print("\nBasket (Titanic):")
print(basket_titanic.head())


Basket (Titanic):
      0    0.0   0.17   0.33   0.42   0.67   0.75   0.83   0.92      1  ...  \
0  True  False  False  False  False  False  False  False  False   True  ...   
1  True   True  False  False  False  False  False  False  False   True  ...   
2  True  False  False  False  False  False  False  False  False   True  ...   
3  True  False  False  False  False  False  False  False  False   True  ...   
4  True  False  False  False  False  False  False  False  False  False  ...   

     991    992    993    994    995    996    997    998    999    nan  
0  False  False  False  False  False  False  False  False  False  False  
1  False  False  False  False  False  False  False  False  False  False  
2  False  False  False  False  False  False  False  False  False  False  
3  False  False  False  False  False  False  False  False  False  False  
4  False  False  False  False  False  False  False  False  False  False  

[5 rows x 1655 columns]


In [81]:
# ---- A) Support 50%, Confidence 75% ----
freq50_t = apriori(basket_titanic, min_support=0.50, use_colnames=True)
print("\nFrequent Itemsets ≥50% Support (Titanic):")
print(freq50_t)

if freq50_t.empty:
    print("\nNo frequent itemsets at 50% support (Titanic).")
else:
    rules50_t = association_rules(freq50_t, metric="confidence", min_threshold=0.75)
    print("\nAssociation Rules ≥75% confidence (Titanic):")
    print(rules50_t[['antecedents','consequents','support','confidence','lift']])





Frequent Itemsets ≥50% Support (Titanic):
    support  itemsets
0  1.000000       (0)
1  0.627196       (1)
2  0.699007     (2.0)
3  0.549274       (3)
4  0.627196    (0, 1)
5  0.699007  (0, 2.0)
6  0.549274    (0, 3)

Association Rules ≥75% confidence (Titanic):
  antecedents consequents   support  confidence  lift
0         (1)         (0)  0.627196         1.0   1.0
1       (2.0)         (0)  0.699007         1.0   1.0
2         (3)         (0)  0.549274         1.0   1.0


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [82]:
# ---- B) Support 60%, Confidence 60% ----
freq60_t = apriori(basket_titanic, min_support=0.60, use_colnames=True)
print("\nFrequent Itemsets ≥60% Support (Titanic):")
print(freq60_t)

if freq60_t.empty:
    print("\nNo frequent itemsets at 60% support (Titanic).")
else:
    rules60_t = association_rules(freq60_t, metric="confidence", min_threshold=0.60)
    print("\nAssociation Rules ≥60% confidence (Titanic):")
    print(rules60_t[['antecedents','consequents','support','confidence','lift']])


Frequent Itemsets ≥60% Support (Titanic):
    support  itemsets
0  1.000000       (0)
1  0.627196       (1)
2  0.699007     (2.0)
3  0.627196    (0, 1)
4  0.699007  (0, 2.0)

Association Rules ≥60% confidence (Titanic):
  antecedents consequents   support  confidence  lift
0         (0)         (1)  0.627196    0.627196   1.0
1         (1)         (0)  0.627196    1.000000   1.0
2         (0)       (2.0)  0.699007    0.699007   1.0
3       (2.0)         (0)  0.699007    1.000000   1.0


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


## Dataset 2: Mushroom Dataset

In [48]:
path = kagglehub.dataset_download("uciml/mushroom-classification")
print("Path to dataset files:", path)

Path to dataset files: /Users/kimsan/.cache/kagglehub/datasets/uciml/mushroom-classification/versions/1


In [53]:
df1 = pd.read_csv('../dataset/mushrooms.csv')
print("df1: ")
print(df1.head())

df1: 
  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1 

In [64]:
# Convert each row to transactions
transactions = df1.astype(str).values.tolist()

In [65]:
# Convert each row to transactions
transactions = df1.astype(str).values.tolist()

In [69]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)
basket_mush = pd.DataFrame(te_data, columns=te.columns_)

In [70]:
from mlxtend.frequent_patterns import apriori, association_rules

# A) support 50%, confidence 75%
freq_50 = apriori(basket_mush, min_support=0.50, use_colnames=True)
rules_50 = association_rules(freq_50, metric="confidence", min_threshold=0.75)

print("Itemsets ≥50% support:")
print(freq_50)

print("Rules ≥75% confidence:")
print(rules_50[['antecedents','consequents','support','confidence','lift']])

Itemsets ≥50% support:
       support                     itemsets
0     0.947070                          (b)
1     0.855244                          (c)
2     0.964549                          (e)
3     1.000000                          (f)
4     0.520926                          (g)
...        ...                          ...
1314  0.611521     (n, e, f, s, t, o, w, p)
1315  0.505416  (n, e, f, b, s, o, w, c, p)
1316  0.516987  (n, e, f, b, s, t, w, c, p)
1317  0.580010  (n, e, f, b, s, o, t, w, p)
1318  0.511078  (n, e, f, s, t, o, w, c, p)

[1319 rows x 2 columns]
Rules ≥75% confidence:
      antecedents            consequents   support  confidence      lift
0             (b)                    (c)  0.802314    0.847154  0.990541
1             (c)                    (b)  0.802314    0.938112  0.990541
2             (e)                    (b)  0.911620    0.945125  0.997946
3             (b)                    (e)  0.911620    0.962568  0.997946
4             (f)                   

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [71]:
# B) support 60%, confidence 60%
freq_60 = apriori(basket_mush, min_support=0.60, use_colnames=True)
rules_60 = association_rules(freq_60, metric="confidence", min_threshold=0.60)

print("Itemsets ≥60% support:")
print(freq_60)

print("Rules ≥60% confidence:")
print(rules_60[['antecedents','consequents','support','confidence','lift']])

Itemsets ≥60% support:
      support                  itemsets
0    0.947070                       (b)
1    0.855244                       (c)
2    0.964549                       (e)
3    1.000000                       (f)
4    0.646972                       (k)
..        ...                       ...
694  0.613737  (n, e, f, b, o, w, c, p)
695  0.608075  (n, e, f, b, s, o, w, p)
696  0.603644  (n, e, f, b, t, o, w, p)
697  0.650665  (n, e, f, b, s, t, w, p)
698  0.611521  (n, e, f, s, t, o, w, p)

[699 rows x 2 columns]
Rules ≥60% confidence:
      antecedents            consequents   support  confidence      lift
0             (b)                    (c)  0.802314    0.847154  0.990541
1             (c)                    (b)  0.802314    0.938112  0.990541
2             (e)                    (b)  0.911620    0.945125  0.997946
3             (b)                    (e)  0.911620    0.962568  0.997946
4             (f)                    (b)  0.947070    0.947070  1.000000
...         

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
