In [1]:
import pandas as pd
import numpy as np

n = 1000
p_male = 0.5
p_female = 1 - p_male
p_ad = 0.3

og_table = pd.DataFrame(columns=['rule', 'fr(X)', 'fr(X, C)'], 
                     data=np.array(
                         [[r'smoking $\rightarrow$ AD', 300, 125],
                          [r'stress $\rightarrow$ AD', 500, 150],
                          [r'higheducation $\rightarrow$ $\neg$ AD', 500, 400],
                          [r'tea $\rightarrow$ $\neg$ AD', 342, 240],
                          [r'turmeric $\rightarrow$ $\neg$ AD', 2, 2],
                          [r'female $\rightarrow$ $\neg$ AD', 500, 352],
                          [r'female, stress $\rightarrow$ AD', 260, 100],
                          [r'berries, apples $\rightarrow$ AD', 120, 32],
                          [r'smoking, tea $\rightarrow$ AD', 240, 100],
                          [r'smoking, higheducation $\rightarrow$ AD', 80, 32],
                          [r'stress, smoking $\rightarrow$ AD', 200, 100],
                          [r'female, higheducation $\rightarrow$ $\neg$ AD', 251, 203]]))
og_table

Unnamed: 0,rule,fr(X),"fr(X, C)"
0,smoking $\rightarrow$ AD,300,125
1,stress $\rightarrow$ AD,500,150
2,higheducation $\rightarrow$ $\neg$ AD,500,400
3,tea $\rightarrow$ $\neg$ AD,342,240
4,turmeric $\rightarrow$ $\neg$ AD,2,2
5,female $\rightarrow$ $\neg$ AD,500,352
6,"female, stress $\rightarrow$ AD",260,100
7,"berries, apples $\rightarrow$ AD",120,32
8,"smoking, tea $\rightarrow$ AD",240,100
9,"smoking, higheducation $\rightarrow$ AD",80,32


In [2]:
# a) Leverage values
table = og_table.copy()

def leverage(p_ab, p_a, p_b):
    return p_ab - p_a * p_b

for r in table.index:
    p_x = int(table.loc[r, r'fr(X)']) / n
    if r'$\neg$' in table.loc[r, 'rule']:
        p_c = 1 - p_ad
    else:
        p_c = p_ad
    p_xc = int(table.loc[r, r'fr(X, C)']) / n
    table.at[r, r'leverage $\delta$'] = leverage(p_xc, p_x, p_c)

# prune out rows with non-positive statistical dependence
table = table[table[r'leverage $\delta$'] > 0]

table

Unnamed: 0,rule,fr(X),"fr(X, C)",leverage $\delta$
0,smoking $\rightarrow$ AD,300,125,0.035
2,higheducation $\rightarrow$ $\neg$ AD,500,400,0.05
3,tea $\rightarrow$ $\neg$ AD,342,240,0.0006
4,turmeric $\rightarrow$ $\neg$ AD,2,2,0.0006
5,female $\rightarrow$ $\neg$ AD,500,352,0.002
6,"female, stress $\rightarrow$ AD",260,100,0.022
8,"smoking, tea $\rightarrow$ AD",240,100,0.028
9,"smoking, higheducation $\rightarrow$ AD",80,32,0.008
10,"stress, smoking $\rightarrow$ AD",200,100,0.04
11,"female, higheducation $\rightarrow$ $\neg$ AD",251,203,0.0273


In [3]:
# b)

def MI(p_xc, p_x, p_c):
    a = p_xc
    b = p_x - p_xc  # p_x = p_xc + p_x~c <=> p_x~c = p_x - p_xc
    c = p_c - p_xc  # p_c = p_xc + p_~xc <=> p_~xc = p_c - p_xc
    d = 1 - (a+b+c)  # p_xc + p_x~c + p_~xc + p_~x~c = 1
    # print(a+b+c+d)
    num = a**a * b**b * c**c * d**d
    denom = p_x**p_x * (1-p_x)**(1-p_x) * p_c**p_c * (1-p_c)**(1-p_c)
    return np.log2(num/denom)


for r in table.index:
    p_x = int(table.loc[r, r'fr(X)']) / n
    if r'$\neg$' in table.loc[r, 'rule']:
        p_c = 1 - p_ad
    else:
        p_c = p_ad
    p_xc = int(table.loc[r, r'fr(X, C)']) / n
    table.at[r, r'$n \cdot MI$'] = np.round(n * MI(p_xc, p_x, p_c), 2)
    
table = table[table[r'$n \cdot MI$'] >= 1.5]

table

Unnamed: 0,rule,fr(X),"fr(X, C)",leverage $\delta$,$n \cdot MI$
0,smoking $\rightarrow$ AD,300,125,0.035,19.44
2,higheducation $\rightarrow$ $\neg$ AD,500,400,0.05,34.85
6,"female, stress $\rightarrow$ AD",260,100,0.022,8.4
8,"smoking, tea $\rightarrow$ AD",240,100,0.028,14.2
9,"smoking, higheducation $\rightarrow$ AD",80,32,0.008,2.85
10,"stress, smoking $\rightarrow$ AD",200,100,0.04,32.27
11,"female, higheducation $\rightarrow$ $\neg$ AD",251,203,0.0273,14.46


In [4]:
# c)

def MI_C(p_x, p_xq, p_xc, p_xqc):
    a = p_x
    b = p_xqc
    c = p_xq - p_xqc
    d = p_xc - p_xqc
    e = 1 - (b+c+d)
    num = a**a * b**b * c**c * d**d * e**e
    denom = p_xq**p_xq * (p_x-p_xq)**(p_x-p_xq) * p_xc**p_xc * (p_x-p_xc)**(p_x-p_xc)
    return np.log2(num/denom)

Y_table = og_table[0:6]
for r in Y_table.index:
    Y_table.at[r, 'rule'] = Y_table.at[r, 'rule'].split()[0]
Y_table = Y_table.set_index('rule')
for r in table.index:
    # Only need to look at the rules where X is a set of two
    if r > 5:
        
        # Get proper subset conditional probabilities
        rule = table.loc[r, 'rule'].split(', ')
        y1 = rule[0]
        y2 = rule[1].split()[0]
        p_y1, p_y1c = int(Y_table.loc[y1, 'fr(X)']) / n, int(Y_table.loc[y1, 'fr(X, C)']) / n
        p_y2, p_y2c = int(Y_table.loc[y2, 'fr(X)']) / n, int(Y_table.loc[y2, 'fr(X, C)']) / n
        p_c_cond_y1 = p_y1c / p_y1
        p_c_cond_y2 = p_y2c / p_y2
        
        p_x = int(table.loc[r, r'fr(X)']) / n
        p_xc = int(table.loc[r, r'fr(X, C)']) / n
        if r'$\neg$' in table.loc[r, 'rule']:
            p_c = 1 - p_ad
        else:
            p_c = p_ad
        p_c_cond_x = p_xc / p_c
        
        # Compute n * MI_Cs for both attributes of the two attribute set X
        mic_y1, mic_y2 = n * MI_C(p_y1, p_x, p_y1c, p_xc), n * MI_C(p_y2, p_x, p_y2c, p_xc)
        
        # Prune out rules if any proper subset Y's P(C=c|Y) >= P(C=c|X) or any of the
        # previously computed n * MI_C < 0.5.
        if (p_c_cond_y1 >= p_c_cond_x) or (p_c_cond_y2 >= p_c_cond_x) or (mic_y1 < 0.5) or (mic_y2 < 0.5):
            table = table.drop(r, axis=0)

table

Unnamed: 0,rule,fr(X),"fr(X, C)",leverage $\delta$,$n \cdot MI$
0,smoking $\rightarrow$ AD,300,125,0.035,19.44
2,higheducation $\rightarrow$ $\neg$ AD,500,400,0.05,34.85
