In [1]:
import pandas as pd
import itertools
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Encode every cell as 1 when it is exactly "Yes" and 0 otherwise.
# A vectorized comparison is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 and calls a Python lambda once per cell.
df = pd.read_csv('KaggleCovidDataset.csv').eq("Yes").astype('int64')
df

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,1,1,1,1,1,0,0,0,0,1,...,1,1,0,1,0,1,1,0,0,1
1,1,1,1,1,0,1,1,1,0,0,...,1,0,0,0,1,1,0,0,0,1
2,1,1,1,1,1,1,1,1,0,1,...,1,1,1,0,0,0,0,0,0,1
3,1,1,1,0,0,1,0,0,1,1,...,0,0,1,0,1,1,0,0,0,1
4,1,1,1,1,1,0,1,1,1,1,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5429,1,1,0,1,1,1,1,0,0,0,...,1,1,0,0,0,0,0,0,0,1
5430,1,1,1,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
5431,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5432,1,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def aprioriBAD(df, min_support, use_colnames):
    """Mine frequent itemsets from a one-hot encoded table with Apriori.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction, one column per item. A cell marks the item
        as present when it is non-zero (1 / True) and absent when it is
        0 / False. The frame is not modified.
    min_support : float
        Minimum fraction of rows that must contain an itemset for it to be
        reported.
    use_colnames : bool
        If true, itemsets contain the column labels of ``df``; otherwise
        they contain the integer column positions.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` and ``itemsets``; each itemset is a frozenset
        and each row is one frequent itemset.

    Notes
    -----
    Item labels must be mutually orderable, because candidates are built
    from sorted tuples. Rows are addressed by position, so ``df`` may carry
    any index.
    """
    result_columns = ['support', 'itemsets']
    n_rows = df.shape[0]
    if n_rows == 0:
        # No transactions: nothing can reach any support threshold, and the
        # support ratio would be undefined.
        return pd.DataFrame(columns=result_columns)

    labels = list(df.columns) if use_colnames else list(range(df.shape[1]))
    position = {label: idx for idx, label in enumerate(labels)}

    # Boolean presence matrix, built once; all support counts read from it.
    present = df.to_numpy() != 0

    def join_and_prune(previous, size):
        """Build size-`size` candidates from frequent (size-1)-tuples.

        Two sorted tuples are joined when they agree on everything but their
        last item. A candidate is kept only if each of its (size-1)-subsets
        is itself frequent.
        """
        ordered = sorted(previous)
        known = set(ordered)
        candidates = []
        for pos, first in enumerate(ordered):
            for second in ordered[pos + 1:]:
                if first[:-1] != second[:-1]:
                    # Tuples are sorted, so no later tuple shares this prefix.
                    break
                candidate = first + (second[-1],)
                if all(sub in known
                       for sub in itertools.combinations(candidate, size - 1)):
                    candidates.append(candidate)
        return candidates

    # Frequent 1-itemsets, keyed by 1-tuples so they can be sorted and joined.
    column_support = present.mean(axis=0)
    current = {
        (label,): float(support)
        for label, support in zip(labels, column_support)
        if support >= min_support
    }
    all_itemsets = {frozenset(items): support for items, support in current.items()}

    # Grow itemsets level by level until a level yields nothing frequent.
    k = 2
    while current:
        next_level = {}
        for candidate in join_and_prune(current, k):
            cols = [position[item] for item in candidate]
            support = float(present[:, cols].all(axis=1).mean())
            if support >= min_support:
                next_level[candidate] = support
        all_itemsets.update(
            (frozenset(items), support) for items, support in next_level.items()
        )
        current = next_level
        k += 1

    return pd.DataFrame(
        [(support, itemset) for itemset, support in all_itemsets.items()],
        columns=result_columns,
    )

def association_rulesBAD(df, min_confidence):
    """Derive association rules from a table of frequent itemsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``itemsets`` column (iterables of items) and a
        ``support`` column, row-aligned.
    min_confidence : float
        A rule is kept when its confidence is at least this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``antecedents``, ``consequents`` (frozensets) and
        ``confidence``, where confidence is the support of the whole itemset
        divided by the support of the antecedent.

    Raises
    ------
    KeyError
        If an antecedent is not itself listed in ``df['itemsets']``.
    """
    support_of = dict(zip(map(frozenset, df['itemsets']), df['support']))

    def candidate_rules():
        # Every non-empty proper subset of an itemset is tried as antecedent;
        # the remaining items form the consequent.
        for whole, whole_support in support_of.items():
            for size in range(1, len(whole)):
                for combo in itertools.combinations(whole, size):
                    antecedent = frozenset(combo)
                    consequent = whole - antecedent
                    yield antecedent, consequent, whole_support / support_of[antecedent]

    kept = [rule for rule in candidate_rules() if rule[2] >= min_confidence]
    return pd.DataFrame(kept, columns=['antecedents', 'consequents', 'confidence'])

In [16]:
# Mine frequent itemsets at a minimum support of 0.2 with both the hand-written
# and the mlxtend implementation, keeping column names as item labels.
aBadDF = aprioriBAD(df, min_support=0.2, use_colnames=True)
aDF = apriori(df, min_support=0.2, use_colnames=True)



In [17]:
# Itemsets reported by mlxtend's apriori that the hand-written version did not
# produce; an empty set means nothing was missed.
bad = set(aBadDF['itemsets'])
good = set(aDF['itemsets'])
whatsnotthere = good - bad
whatsnotthere

set()

In [18]:
# Rules from the hand-written implementation whose confidence is at least 0.6.
association_rulesBAD(aBadDF, min_confidence=0.6)

Unnamed: 0,antecedents,consequents,confidence
0,(Visited Public Exposed Places),(COVID-19),0.852128
1,(Headache),(Fever),0.771930
2,(Abroad travel),(Dry Cough),0.940840
3,(Asthma),(Sore throat),0.766508
4,(Abroad travel),(Fever),0.844553
...,...,...,...
2894,"(Breathing Problem, COVID-19, Dry Cough, Sore ...",(Fever),0.809211
2895,"(Breathing Problem, COVID-19, Dry Cough, Fever...",(Sore throat),0.880668
2896,"(Breathing Problem, COVID-19, Sore throat, Fev...",(Dry Cough),0.884185
2897,"(Breathing Problem, Dry Cough, Sore throat, Fe...",(COVID-19),1.000000


In [19]:
# Reference rules from mlxtend at the same 0.6 confidence threshold, restricted
# to the three columns the hand-written implementation also returns.
association_rules(aDF, metric='confidence', min_threshold=0.6)[['antecedents', 'consequents', 'confidence']]

Unnamed: 0,antecedents,consequents,confidence
0,(Breathing Problem),(Fever),0.812431
1,(Fever),(Breathing Problem),0.688275
2,(Dry Cough),(Breathing Problem),0.704667
3,(Breathing Problem),(Dry Cough),0.838398
4,(Breathing Problem),(Sore throat),0.823204
...,...,...,...
2894,"(COVID-19, Visited Public Exposed Places, Cont...","(Dry Cough, Fever, Sore throat)",0.722700
2895,"(Dry Cough, Visited Public Exposed Places, Con...","(Sore throat, COVID-19, Fever)",0.803532
2896,"(Sore throat, Visited Public Exposed Places, C...","(Dry Cough, COVID-19, Fever)",0.840000
2897,"(Fever, Visited Public Exposed Places, Contact...","(Dry Cough, COVID-19, Sore throat)",0.822289
