In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [2]:
df = pd.read_csv('KaggleCovidDataset.csv')
df

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,No,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,...,Yes,No,No,No,Yes,Yes,No,No,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,Yes,Yes,Yes,No,No,No,No,No,No,Yes
3,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,...,No,No,Yes,No,Yes,Yes,No,No,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,No,Yes,No,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5429,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,No,...,Yes,Yes,No,No,No,No,No,No,No,Yes
5430,Yes,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,...,Yes,No,No,No,No,No,No,No,No,Yes
5431,Yes,Yes,Yes,No,No,No,No,No,Yes,No,...,No,No,No,No,No,No,No,No,No,No
5432,Yes,Yes,Yes,No,Yes,No,No,Yes,Yes,No,...,No,No,No,No,No,No,No,No,No,No


In [3]:
# Check for bad data
df.isna().sum(), df.isnull().sum()

(Breathing Problem                          0
 Fever                                      0
 Dry Cough                                  0
 Sore throat                                0
 Running Nose                               0
 Asthma                                     0
 Chronic Lung Disease                       0
 Headache                                   0
 Heart Disease                              0
 Diabetes                                   0
 Hyper Tension                              0
 Fatigue                                    0
 Gastrointestinal                           0
 Abroad travel                              0
 Contact with COVID Patient                 0
 Attended Large Gathering                   0
 Visited Public Exposed Places              0
 Family working in Public Exposed Places    0
 Wearing Masks                              0
 Sanitization from Market                   0
 COVID-19                                   0
 dtype: int64,
 Breathing Problem 

In [4]:
# Label with boolean numnbers
# The below works in latest Notebooks, but not in the lab. :skull:
# df = df.map(lambda x : 1 if x == "Yes" else 0)
# df
df = df.applymap(lambda x : 1 if x == "Yes" else 0)
df

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,1,1,1,1,1,0,0,0,0,1,...,1,1,0,1,0,1,1,0,0,1
1,1,1,1,1,0,1,1,1,0,0,...,1,0,0,0,1,1,0,0,0,1
2,1,1,1,1,1,1,1,1,0,1,...,1,1,1,0,0,0,0,0,0,1
3,1,1,1,0,0,1,0,0,1,1,...,0,0,1,0,1,1,0,0,0,1
4,1,1,1,1,1,0,1,1,1,1,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5429,1,1,0,1,1,1,1,0,0,0,...,1,1,0,0,0,0,0,0,0,1
5430,1,1,1,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
5431,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5432,1,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


## Apriori Algorithm application

In [5]:
# Find the itemsets and their supports
aDF = apriori(df=df, min_support = 0.2, use_colnames = True, verbose = 1)
aDF



Processing 348 combinations | Sampling itemset size 65


Unnamed: 0,support,itemsets
0,0.666176,(Breathing Problem)
1,0.786345,(Fever)
2,0.792602,(Dry Cough)
3,0.727457,(Sore throat)
4,0.543246,(Running Nose)
...,...,...
775,0.203717,"(Dry Cough, Fever, Abroad travel, Breathing Pr..."
776,0.248620,"(Dry Cough, Fever, Contact with COVID Patient,..."
777,0.223408,"(Dry Cough, Fever, Attended Large Gathering, B..."
778,0.226353,"(Dry Cough, Fever, Visited Public Exposed Plac..."


In [6]:
# Find the association by applying the confidence metric
df_ar = association_rules(df=aDF, metric = "confidence", min_threshold = 0.6)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Fever),(Breathing Problem),0.786345,0.666176,0.541222,0.688275,1.033173,0.017378,1.070894,0.150281
1,(Breathing Problem),(Fever),0.666176,0.786345,0.541222,0.812431,1.033173,0.017378,1.139073,0.096183
2,(Dry Cough),(Breathing Problem),0.792602,0.666176,0.558520,0.704667,1.057779,0.030508,1.130330,0.263372
3,(Breathing Problem),(Dry Cough),0.666176,0.792602,0.558520,0.838398,1.057779,0.030508,1.283385,0.163628
4,(Breathing Problem),(Sore throat),0.666176,0.727457,0.548399,0.823204,1.131620,0.063785,1.541573,0.348420
...,...,...,...,...,...,...,...,...,...,...
2894,"(Dry Cough, Visited Public Exposed Places, Con...","(Fever, COVID-19, Sore throat)",0.250092,0.596430,0.200957,0.803532,1.347236,0.051795,2.054127,0.343695
2895,"(Fever, Visited Public Exposed Places, Contact...","(Dry Cough, COVID-19, Sore throat)",0.244387,0.593301,0.200957,0.822289,1.385955,0.055962,2.288541,0.368543
2896,"(COVID-19, Visited Public Exposed Places, Cont...","(Dry Cough, Fever, Sore throat)",0.278064,0.526132,0.200957,0.722700,1.373611,0.054659,1.708866,0.376753
2897,"(Sore throat, Visited Public Exposed Places, C...","(Dry Cough, Fever, COVID-19)",0.239234,0.611152,0.200957,0.840000,1.374453,0.054748,2.430300,0.358110


In [7]:
df_ar["antecedents"] = df_ar["antecedents"].apply(lambda x: ', '.join(list(x))).astype("unicode")
df_ar["consequents"] = df_ar["consequents"].apply(lambda x: ', '.join(list(x))).astype("unicode")
df_ar.sort_values('confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1755,"Dry Cough, Running Nose, Attended Large Gathering",COVID-19,0.223408,0.806588,0.223408,1.000000,1.239790,0.043210,inf,0.249052
1853,"Abroad travel, Visited Public Exposed Places, ...",COVID-19,0.211078,0.806588,0.211078,1.000000,1.239790,0.040825,inf,0.245160
1564,"Fever, Attended Large Gathering, Sore throat",COVID-19,0.334008,0.806588,0.334008,1.000000,1.239790,0.064601,inf,0.290412
1765,"Dry Cough, Abroad travel, Headache",COVID-19,0.222304,0.806588,0.222304,1.000000,1.239790,0.042996,inf,0.248699
1829,"Running Nose, Abroad travel, Sore throat",COVID-19,0.220096,0.806588,0.220096,1.000000,1.239790,0.042569,inf,0.247994
...,...,...,...,...,...,...,...,...,...,...
2002,"Fever, Asthma","Dry Cough, Breathing Problem, COVID-19",0.378911,0.530180,0.227641,0.600777,1.133156,0.026750,1.176835,0.189198
697,"COVID-19, Contact with COVID Patient",Attended Large Gathering,0.475156,0.461907,0.285425,0.600697,1.300473,0.065947,1.347582,0.440225
2892,"Fever, Sore throat, COVID-19, Visited Public E...","Dry Cough, Contact with COVID Patient",0.334560,0.423629,0.200957,0.600660,1.417892,0.059228,1.443309,0.442906
289,Chronic Lung Disease,"Dry Cough, Fever",0.472028,0.644461,0.283401,0.600390,0.931616,-0.020803,0.889715,-0.122060


In [8]:
df_confidence = df_ar.loc[df_ar['confidence'] == 1]
df_confidence[['antecedents', 'consequents', 'confidence']]

Unnamed: 0,antecedents,consequents,confidence
85,Abroad travel,COVID-19,1.0
254,"Abroad travel, Breathing Problem",COVID-19,1.0
266,"Attended Large Gathering, Breathing Problem",COVID-19,1.0
433,"Fever, Abroad travel",COVID-19,1.0
443,"Fever, Attended Large Gathering",COVID-19,1.0
...,...,...,...
2715,"Dry Cough, Attended Large Gathering, Sore thro...",COVID-19,1.0
2724,"Dry Cough, Sore throat, Visited Public Exposed...",COVID-19,1.0
2784,"Dry Cough, Fever, Abroad travel, Breathing Pro...",COVID-19,1.0
2831,"Dry Cough, Fever, Attended Large Gathering, Br...",COVID-19,1.0


## FPGrowth Algorithm application

In [9]:
fpgDF = fpgrowth(df=df, min_support = 0.2, use_colnames = True, verbose = 1)
df_ar = association_rules(df=fpgDF, metric = "confidence", min_threshold = 0.6)
df_ar["antecedents"] = df_ar["antecedents"].apply(lambda x: ', '.join(list(x))).astype("unicode")
df_ar["consequents"] = df_ar["consequents"].apply(lambda x: ', '.join(list(x))).astype("unicode")
df_ar.sort_values('confidence', ascending=False)
df_confidence = df_ar.loc[df_ar['confidence'] == 1]
df_confidence[['antecedents', 'consequents', 'confidence']]

19 itemset(s) from tree conditioned on items ()
0 itemset(s) from tree conditioned on items (COVID-19)
1 itemset(s) from tree conditioned on items (Dry Cough)
2 itemset(s) from tree conditioned on items (Fever)
0 itemset(s) from tree conditioned on items (Fever, COVID-19)
1 itemset(s) from tree conditioned on items (Fever, Dry Cough)
3 itemset(s) from tree conditioned on items (Sore throat)
0 itemset(s) from tree conditioned on items (Sore throat, COVID-19)
1 itemset(s) from tree conditioned on items (Sore throat, Fever)
2 itemset(s) from tree conditioned on items (Sore throat, Dry Cough)
0 itemset(s) from tree conditioned on items (Sore throat, Dry Cough, COVID-19)
1 itemset(s) from tree conditioned on items (Sore throat, Dry Cough, Fever)
4 itemset(s) from tree conditioned on items (Breathing Problem)
0 itemset(s) from tree conditioned on items (Breathing Problem, COVID-19)
1 itemset(s) from tree conditioned on items (Breathing Problem, Dry Cough)
2 itemset(s) from tr



Unnamed: 0,antecedents,consequents,confidence
967,"Dry Cough, Visited Public Exposed Places, Cont...",COVID-19,1.0
971,"Fever, Visited Public Exposed Places, Contact ...",COVID-19,1.0
979,"Dry Cough, Fever, Visited Public Exposed Place...",COVID-19,1.0
991,"Visited Public Exposed Places, Sore throat, Co...",COVID-19,1.0
1006,"Fever, Sore throat, Visited Public Exposed Pla...",COVID-19,1.0
...,...,...,...
2786,"Abroad travel, Contact with COVID Patient",COVID-19,1.0
2790,"Dry Cough, Abroad travel, Contact with COVID P...",COVID-19,1.0
2793,"Fever, Abroad travel, Contact with COVID Patient",COVID-19,1.0
2799,"Dry Cough, Fever, Abroad travel, Contact with ...",COVID-19,1.0
