In [10]:
%matplotlib inline

import math
import subprocess
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

In [11]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type='s', 
                 min_nbr_items=1, min_sup=2, min_conf=2):
    # apriori
    # -t# {m: maximal, c: closed, s: frequent, r: association rules}
    # -m# minimum number of items per item set/association rule
    # -s# minimum support of an item set, positive: percentage, negative: absolute
    # -c# minimum confidence rule percentage
    # -b# line delimiter (,)
    # The default additional information output format for rules is " (%X, %C)"
    # %X relative body set support as a percentage
    # %C rule confidence as a percentage
    # %L lift

    if target_type == 'r':
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, 
                    '-s%s' % min_sup, '-c%s' % min_conf, '-v (%X, %C, %L)', 
                    fileinput, fileoutput]
    else:
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, 
                           '-m%s' % min_nbr_items, '-s%s' % min_sup, fileinput, fileoutput]

    ret = subprocess.call(call_cmd,  stdout=open('apriori_stdout.txt', 'w'), 
                          stderr=open('apriori_stderr.txt', 'w'))
    return ret

In [12]:
# def read_rules(filename):
#     data = open(filename, 'r')
#     rules = list()
#     for row in data:
#         fileds = row.rstrip('\n').replace('\r','').split(' <- ')
#         cons = fileds[0]
#         other = fileds[1].split(' (')
#         ant = other[0].split(' ')
#         other2 = other[1].split(', ')
#         sup = float(other2[0])
#         conf = float(other2[1])
#         lift = float(other2[2].replace(')', ''))
#         rule = {
#             'ant': ant,
#             'cons': cons,
#             'sup': sup,
#             'conf': conf,
#             'lift': lift
#         }
#         rules.append(rule)
#     data.close()
#     return rules

In [13]:
def read_rules(filename):
    data = open(filename, 'r')
    rules = list()
    alltxt = data.read()
    alltxt = alltxt.replace('\n','').replace('\r','')
    alltxt = alltxt .replace(')',')*')
    alltxt = alltxt[:-1]
    
    
    for row in alltxt.split('*'):
        print (row)
        fileds = row.rstrip('\n').replace('\r','').split(' <- ')
        cons = fileds[0]
        other = fileds[1].split(' (')
        ant = other[0].split(' ')
        other2 = other[1].split(', ')
        sup = float(other2[0])
        conf = float(other2[1])
        lift = float(other2[2].replace(')', ''))
        rule = {
            'ant': ant,
            'cons': cons,
            'sup': sup,
            'conf': conf,
            'lift': lift
        }
        rules.append(rule)
    data.close()
    return rules

In [14]:
df = pd.read_csv("diabetes_preprocessed.csv")

In [15]:
df.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome
0,0.627,50,148.0,72.0,33.6,6,1
1,0.351,31,85.0,66.0,26.6,1,0
2,0.672,32,183.0,64.0,23.3,8,1
3,0.167,21,89.0,66.0,28.1,1,0
4,0.201,30,116.0,74.0,25.6,5,0


In [16]:
df.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome
0,0.627,50,148.0,72.0,33.6,6,1
1,0.351,31,85.0,66.0,26.6,1,0
2,0.672,32,183.0,64.0,23.3,8,1
3,0.167,21,89.0,66.0,28.1,1,0
4,0.201,30,116.0,74.0,25.6,5,0


In [17]:
df.describe()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome
count,697.0,697.0,697.0,697.0,697.0,697.0,697.0
mean,0.443534,32.008608,121.176471,72.005739,32.449641,3.727403,0.381636
std,0.286457,10.362811,29.999169,11.487248,6.723821,3.237755,0.486137
min,0.078,21.0,44.0,24.0,18.2,0.0,0.0
25%,0.239,24.0,99.0,64.0,27.6,1.0,0.0
50%,0.362,28.0,116.0,72.0,32.3,3.0,0.0
75%,0.593,39.0,140.0,78.0,36.5,6.0,1.0
max,2.42,70.0,199.0,114.0,67.1,17.0,1.0


In [18]:
df['AgeGroup'] = pd.cut(df['Age'], bins=range(20, 90, 10), right=False, labels=range(20,80,10))

In [19]:
df['GlucoseGroup'] = pd.cut(df['GlucoseFill'], bins=range(40, 210, 10), right=False, labels=range(40, 200, 10))

In [20]:
df['BloodPressureGroup'] = pd.cut(df['BloodPressureFill'], bins=range(20, 121, 10), right=False, labels=range(20, 120, 10))

In [21]:
range(20, 80, 10)

range(20, 80, 10)

In [22]:
df['BMIGroup'] = pd.cut(df['BMIFill'], bins=range(15, 80, 10), right=False, labels=range(15, 75, 10))

In [23]:
df['DiabetesPedigreeFunctionGroup'] = pd.cut(df['DiabetesPedigreeFunction'], bins=[0.0,0.5,1.0,1.5,2.0,2.5], right=False, labels=[0.0,0.5,1.0,1.5,2.0])

In [24]:
df['PregnanciesGroup'] = pd.cut(df['Pregnancies'], bins=range(0, 17, 3), right=False, labels=range(2, 17, 3))

For PregnanciesGroup, we give a max number in the range as it's label, others we give  the min value in the range as the label

In [25]:
df.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,0.627,50,148.0,72.0,33.6,6,1,50,140,70,25,0.5,8
1,0.351,31,85.0,66.0,26.6,1,0,30,80,60,25,0.0,2
2,0.672,32,183.0,64.0,23.3,8,1,30,180,60,15,0.5,8
3,0.167,21,89.0,66.0,28.1,1,0,20,80,60,25,0.0,2
4,0.201,30,116.0,74.0,25.6,5,0,30,110,70,25,0.0,5


In [26]:
df.drop(['DiabetesPedigreeFunction','Age', 'GlucoseFill','BloodPressureFill','BMIFill','Pregnancies'], axis=1, inplace=True)
df.head()

Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,1,50,140,70,25,0.5,8
1,0,30,80,60,25,0.0,2
2,1,30,180,60,15,0.5,8
3,0,20,80,60,25,0.0,2
4,0,30,110,70,25,0.0,5


In [27]:
df1 = df
#D = diabetic and N = Non-diabetic
df1['Outcome'] = df['Outcome'].astype(str) + '_D'
df1['AgeGroup'] = df['AgeGroup'].astype(str) + '_A'
df1['GlucoseGroup'] = df['GlucoseGroup'].astype(str) + '_G'
df1['BloodPressureGroup'] = df['BloodPressureGroup'].astype(str) + '_B'
df1['BMIGroup'] = df['BMIGroup'].astype(str) + '_BMI'
df1['DiabetesPedigreeFunctionGroup'] = df['DiabetesPedigreeFunctionGroup'].astype(str) + '_F'
df1['PregnanciesGroup'] = df['PregnanciesGroup'].astype(str) + '_P'

In [28]:
df1.head()

Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,1_D,50_A,140_G,70_B,25_BMI,0.5_F,8.0_P
1,0_D,30_A,80_G,60_B,25_BMI,0.0_F,2.0_P
2,1_D,30_A,180_G,60_B,15_BMI,0.5_F,8.0_P
3,0_D,20_A,80_G,60_B,25_BMI,0.0_F,2.0_P
4,0_D,30_A,110_G,70_B,25_BMI,0.0_F,5.0_P


In [29]:
df1.to_csv('PIMA_for_patterns.csv', header=False)

In [30]:
delimiter=','
target_type='c'
min_nbr_items=3
min_sup=20
#min_conf=2

ret_val = call_apriori('PIMA_for_patterns.csv', 'PIMA_freq_patterns_target_type_c.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [31]:
delimiter=','
target_type='m'
min_nbr_items=3
min_sup=20
#min_conf=2

ret_val = call_apriori('PIMA_for_patterns.csv', 'PIMA_freq_patterns_target_type_m.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [30]:
delimiter=','
target_type='s'
min_nbr_items=3
min_sup=10
#min_conf=2

ret_val = call_apriori('PIMA_for_patterns.csv', 'PIMA_freq_patterns.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [31]:
delimiter=','
target_type='r'
min_nbr_items=3
min_sup=10
min_conf=25

ret_val = call_apriori('PIMA_for_patterns.csv', 'PIMA_rules.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [32]:
rules = read_rules('PIMA_rules.txt')
print(len(rules))
for r in rules:
     print (r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf'])

15_BMI <- 90_G 0_D (10.9039, 30.2632, 226.811)
90_G <- 15_BMI 0_D (12.3386, 26.7442, 219.302)
90_G <- 15_BMI 0.0_F (10.4735, 26.0274, 213.425)
5.0_P <- 90_G 0_D (10.9039, 27.6316, 104.104)
90_G <- 60_B 2.0_P 20_A 0_D (10.7604, 26.6667, 218.667)
60_B <- 90_G 0_D (10.9039, 39.4737, 148.72)
0.5_F <- 90_G 0_D (10.9039, 25, 82.9762)
70_B <- 90_G 0_D (10.9039, 32.8947, 87.8453)
2.0_P <- 90_G 0_D (10.9039, 55.2632, 120.37)
20_A <- 90_G 0_D (10.9039, 75, 137.929)
25_BMI <- 90_G 0_D (10.9039, 47.3684, 86.8837)
0.0_F <- 90_G 0_D (10.9039, 75, 113.889)
5.0_P <- 15_BMI 0.0_F (10.4735, 27.3973, 103.221)
15_BMI <- 60_B 2.0_P 20_A 0_D (10.7604, 28, 209.849)
15_BMI <- 60_B 2.0_P 0_D (11.6212, 27.1605, 203.558)
15_BMI <- 60_B 20_A 0_D 0.0_F (11.3343, 30.3797, 227.685)
15_BMI <- 60_B 20_A 0_D (14.7776, 26.2136, 196.461)
15_BMI <- 60_B 20_A 0.0_F (12.1951, 28.2353, 211.613)
15_BMI <- 60_B 0_D 0.0_F (14.2037, 29.2929, 219.539)
60_B <- 15_BMI 0_D (12.3386, 38.3721, 144.569)
15_BMI <- 60_B 0_D (18.7948, 25.

In [33]:
rulse_cons_D = list()
for r in rules:
    if r['cons'].endswith('_D'):
        rulse_cons_D.append(r)

In [34]:
print (len(rulse_cons_D))

77


In [35]:
sorted_rules_cons_D = sorted(rulse_cons_D, key=lambda r: r['conf'], reverse=True)

In [36]:
for r in sorted_rules_cons_D[:3]:
    print (r['ant'],'-->', r['cons'], ' lift', r['lift'], ' conf', r['conf'])

['15_BMI', '0.0_F'] --> 0_D  lift 150.64  conf 93.1507
['60_B', '20_A', '0.0_F'] --> 0_D  lift 150.302  conf 92.9412
['2.0_P', '20_A', '25_BMI', '0.0_F'] --> 0_D  lift 147.204  conf 91.0256


In [37]:
df1.values[0]

array(['1_D', '50_A', '140_G', '70_B', '25_BMI', '0.5_F', '8.0_P'], dtype=object)

In [38]:
patient_test = df1.values[10]

In [39]:
#for r in rules:
for r in sorted_rules_cons_D[:10]:
    if (set(r['ant']) < set(patient_test))&(r['cons'].endswith('_D')):
        print (r['ant'], '-->', r['cons'])

In [40]:
len(sorted_rules_cons_D)

77

In [41]:
minNumOfRulesPerLine=2
minNumOfRules2Consider=0
numberOfPatientRows=0


In [42]:
s =set()

In [43]:
for pv in df1.values:
    for i in range(1,102,100):
        mincnt=0
        for r in sorted_rules_cons_D[:i]:
            if (set(r['ant']) < set(pv))&(r['cons'].endswith('_D')):
                mincnt+=1
                if mincnt==minNumOfRulesPerLine:
                    minNumOfRules2Consider = max(minNumOfRules2Consider,i)
                    if ','.join(pv) not in s:
                        numberOfPatientRows+=1
                        s.add(','.join(pv))
                    break
    
    

In [44]:
len(s)

551

In [45]:
minNumOfRules2Consider

101

In [46]:
len(sorted_rules_cons_D)

77

In [47]:
numberOfPatientRows

551

In [48]:
df1.values.shape

(697, 7)

In [49]:
setOfSamples = dict()

In [50]:

for pv in df1.values:
    if(','.join(pv) in s):
        setOfSamples[','.join(pv)] = dict()
        for r in sorted_rules_cons_D[:102]:
            ln=len(set(r['ant']).intersection(set(pv)))
            if ln==0:
                continue
            if ln not in setOfSamples[','.join(pv)]:
                setOfSamples[','.join(pv)]=dict()
                setOfSamples[','.join(pv)][ln]=[]
            setOfSamples[','.join(pv)][ln].append(r)

            
        

In [51]:
setOfSamples

{'0_D,20_A,100_G,30_B,15_BMI,0.0_F,2.0_P': {1: [{'ant': ['70_B', '2.0_P'],
    'conf': 28.7129,
    'cons': '1_D',
    'lift': 75.2364,
    'sup': 14.4907},
   {'ant': ['0.5_F', '20_A'],
    'conf': 28.5714,
    'cons': '1_D',
    'lift': 74.8657,
    'sup': 16.0689},
   {'ant': ['60_B', '25_BMI', '0.0_F'],
    'conf': 28.3784,
    'cons': '1_D',
    'lift': 74.3599,
    'sup': 10.6169},
   {'ant': ['70_B', '20_A'],
    'conf': 25.2101,
    'cons': '1_D',
    'lift': 66.058,
    'sup': 17.0732}]},
 '0_D,20_A,100_G,40_B,15_BMI,0.0_F,2.0_P': {1: [{'ant': ['70_B', '2.0_P'],
    'conf': 28.7129,
    'cons': '1_D',
    'lift': 75.2364,
    'sup': 14.4907},
   {'ant': ['0.5_F', '20_A'],
    'conf': 28.5714,
    'cons': '1_D',
    'lift': 74.8657,
    'sup': 16.0689},
   {'ant': ['60_B', '25_BMI', '0.0_F'],
    'conf': 28.3784,
    'cons': '1_D',
    'lift': 74.3599,
    'sup': 10.6169},
   {'ant': ['70_B', '20_A'],
    'conf': 25.2101,
    'cons': '1_D',
    'lift': 66.058,
    'sup': 17.073

In [52]:
correctExamples = 0
for pv,rules in setOfSamples.items():
    pvoutcome = pv.split(',')[0].split('_')[0]
    vote0=0
    vote1=0
    for _,r in rules.items():
        for rule in r:
            if rule['cons'].replace('_D','')=='0':
                vote0+=1
            if rule['cons'].replace('_D','')=='1':
                vote1+=1
        if vote0>vote1:
            if pvoutcome=='0':
                correctExamples+=1
        else:
            if pvoutcome=='1':
                correctExamples+=1
        
            
        
        
            

In [53]:
correctExamples

236

In [54]:
len(setOfSamples)

551