In [33]:
%matplotlib inline

import math
import subprocess
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

In [34]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type='s', 
                 min_nbr_items=1, min_sup=2, min_conf=2):
    # apriori
    # -t# {m: maximal, c: closed, s: frequent, r: association rules}
    # -m# minimum number of items per item set/association rule
    # -s# minimum support of an item set, positive: percentage, negative: absolute
    # -c# minimum confidence rule percentage
    # -b# line delimiter (,)
    # The default additional information output format for rules is " (%X, %C)"
    # %X relative body set support as a percentage
    # %C rule confidence as a percentage
    # %L lift

    if target_type == 'r':
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, 
                    '-s%s' % min_sup, '-c%s' % min_conf, '-v (%X, %C, %L)', 
                    fileinput, fileoutput]
    else:
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, 
                           '-m%s' % min_nbr_items, '-s%s' % min_sup, fileinput, fileoutput]

    ret = subprocess.call(call_cmd,  stdout=open('apriori_stdout.txt', 'w'), 
                          stderr=open('apriori_stderr.txt', 'w'))
    return ret

In [35]:
# def read_rules(filename):
#     data = open(filename, 'r')
#     rules = list()
#     for row in data:
#         fileds = row.rstrip('\n').replace('\r','').split(' <- ')
#         cons = fileds[0]
#         other = fileds[1].split(' (')
#         ant = other[0].split(' ')
#         other2 = other[1].split(', ')
#         sup = float(other2[0])
#         conf = float(other2[1])
#         lift = float(other2[2].replace(')', ''))
#         rule = {
#             'ant': ant,
#             'cons': cons,
#             'sup': sup,
#             'conf': conf,
#             'lift': lift
#         }
#         rules.append(rule)
#     data.close()
#     return rules

In [36]:
def read_rules(filename):
    data = open(filename, 'r')
    rules = list()
    alltxt = data.read()
    alltxt = alltxt.replace('\n','').replace('\r','')
    alltxt = alltxt .replace(')',')*')
    alltxt = alltxt[:-1]
    
    
    for row in alltxt.split('*'):
        print (row)
        fileds = row.rstrip('\n').replace('\r','').split(' <- ')
        cons = fileds[0]
        other = fileds[1].split(' (')
        ant = other[0].split(' ')
        other2 = other[1].split(', ')
        sup = float(other2[0])
        conf = float(other2[1])
        lift = float(other2[2].replace(')', ''))
        rule = {
            'ant': ant,
            'cons': cons,
            'sup': sup,
            'conf': conf,
            'lift': lift
        }
        rules.append(rule)
    data.close()
    return rules

In [37]:
df_original = pd.read_csv("diabetes.csv")
df = pd.read_csv("diabetes_preprocessed.csv")

In [38]:
df['Insulin'] = df_original['Insulin']

In [39]:
df_train = df[df['Insulin']!=0]
df_test = df[df['Insulin']==0]

In [53]:
df_train.describe()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin
count,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0
mean,0.436136,31.376731,119.110803,71.351801,32.348199,3.634349,0.360111,154.462604
std,0.271299,9.62502,30.060659,11.050556,6.277827,3.169181,0.480699,119.825657
min,0.085,21.0,44.0,24.0,19.3,0.0,0.0,14.0
25%,0.249,23.0,98.0,64.0,27.6,1.0,0.0,76.0
50%,0.355,28.0,114.0,70.0,32.0,3.0,0.0,125.0
75%,0.58,38.0,136.0,78.0,36.6,5.0,1.0,190.0
max,1.893,66.0,199.0,114.0,53.2,14.0,1.0,846.0


In [41]:
df_test.shape

(336, 8)

In [42]:
df_test.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin
0,0.627,50,148.0,72.0,33.6,6,1,0
1,0.351,31,85.0,66.0,26.6,1,0,0
2,0.672,32,183.0,64.0,23.3,8,1,0
5,0.248,26,78.0,50.0,31.0,3,1,0
7,0.158,53,197.0,70.0,30.5,2,1,0


In [43]:
df_train.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin
3,0.167,21,89.0,66.0,28.1,1,0,94
4,0.201,30,116.0,74.0,25.6,5,0,168
6,0.134,29,115.0,70.0,35.3,10,0,88
8,0.232,54,125.0,96.0,34.3,8,1,543
13,0.484,32,100.0,74.5,30.0,7,1,846


In [44]:
df_train.describe()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin
count,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0
mean,0.436136,31.376731,119.110803,71.351801,32.348199,3.634349,0.360111,154.462604
std,0.271299,9.62502,30.060659,11.050556,6.277827,3.169181,0.480699,119.825657
min,0.085,21.0,44.0,24.0,19.3,0.0,0.0,14.0
25%,0.249,23.0,98.0,64.0,27.6,1.0,0.0,76.0
50%,0.355,28.0,114.0,70.0,32.0,3.0,0.0,125.0
75%,0.58,38.0,136.0,78.0,36.6,5.0,1.0,190.0
max,1.893,66.0,199.0,114.0,53.2,14.0,1.0,846.0


In [45]:
df_test.describe()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin
count,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0
mean,0.451482,32.6875,123.395833,72.708333,32.558631,3.827381,0.404762,0.0
std,0.302102,11.075344,29.818273,11.915175,7.18005,3.311666,0.491578,0.0
min,0.078,21.0,56.0,44.0,18.2,0.0,0.0,0.0
25%,0.2365,24.0,102.0,64.0,27.7,1.0,0.0,0.0
50%,0.3665,28.5,119.0,72.0,32.4,3.0,0.0,0.0
75%,0.6105,40.0,144.0,80.0,36.5,6.0,1.0,0.0
max,2.42,70.0,198.0,110.0,67.1,17.0,1.0,0.0


In [46]:
df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=range(20, 90, 10), right=False, labels=range(20,80,10))
df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=range(20, 90, 10), right=False, labels=range(20,80,10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [47]:
df_train['GlucoseGroup'] = pd.cut(df_train['GlucoseFill'], bins=range(40, 210, 10), right=False, labels=range(40, 200, 10))
df_test['GlucoseGroup'] = pd.cut(df_test['GlucoseFill'], bins=range(40, 210, 10), right=False, labels=range(40, 200, 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [48]:
df_train['BloodPressureGroup'] = pd.cut(df_train['BloodPressureFill'], bins=range(20, 121, 10), right=False, labels=range(20, 120, 10))
df_test['BloodPressureGroup'] = pd.cut(df_test['BloodPressureFill'], bins=range(20, 121, 10), right=False, labels=range(20, 120, 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [49]:
df_train['BMIGroup'] = pd.cut(df_train['BMIFill'], bins=range(15, 80, 10), right=False, labels=range(15, 75, 10))
df_test['BMIGroup'] = pd.cut(df_test['BMIFill'], bins=range(15, 80, 10), right=False, labels=range(15, 75, 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [50]:
df_train['DiabetesPedigreeFunctionGroup'] = pd.cut(df_train['DiabetesPedigreeFunction'], bins=[0.0,0.5,1.0,1.5,2.0,2.5], right=False, labels=[0.0,0.5,1.0,1.5,2.0])
df_test['DiabetesPedigreeFunctionGroup'] = pd.cut(df_test['DiabetesPedigreeFunction'], bins=[0.0,0.5,1.0,1.5,2.0,2.5], right=False, labels=[0.0,0.5,1.0,1.5,2.0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [51]:
df_train['PregnanciesGroup'] = pd.cut(df_train['Pregnancies'], bins=range(0, 17, 3), right=False, labels=range(2, 17, 3))
df_test['PregnanciesGroup'] = pd.cut(df_test['Pregnancies'], bins=range(0, 17, 3), right=False, labels=range(2, 17, 3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:
df_train['InsulinGroup'] = pd.cut(df_train['Insulin'], bins=range(0, 900, 10), right=False, labels=range(0, 890, 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


For PregnanciesGroup, we give a max number in the range as it's label, others we give  the min value in the range as the label

In [59]:
df_train.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup,InsulinGroup
3,0.167,21,89.0,66.0,28.1,1,0,94,20,80,60,25,0.0,2,90
4,0.201,30,116.0,74.0,25.6,5,0,168,30,110,70,25,0.0,5,160
6,0.134,29,115.0,70.0,35.3,10,0,88,20,110,70,35,0.0,11,80
8,0.232,54,125.0,96.0,34.3,8,1,543,50,120,90,25,0.0,8,540
13,0.484,32,100.0,74.5,30.0,7,1,846,30,100,70,25,0.0,8,840


In [60]:
df_test.head()

Unnamed: 0,DiabetesPedigreeFunction,Age,GlucoseFill,BloodPressureFill,BMIFill,Pregnancies,Outcome,Insulin,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,0.627,50,148.0,72.0,33.6,6,1,0,50,140,70,25,0.5,8
1,0.351,31,85.0,66.0,26.6,1,0,0,30,80,60,25,0.0,2
2,0.672,32,183.0,64.0,23.3,8,1,0,30,180,60,15,0.5,8
5,0.248,26,78.0,50.0,31.0,3,1,0,20,70,50,25,0.0,5
7,0.158,53,197.0,70.0,30.5,2,1,0,50,190,70,25,0.0,2


In [61]:
df_train.drop(['DiabetesPedigreeFunction','Age', 'GlucoseFill','BloodPressureFill','BMIFill','Pregnancies','Insulin'], axis=1, inplace=True)
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup,InsulinGroup
3,0,20,80,60,25,0.0,2,90
4,0,30,110,70,25,0.0,5,160
6,0,20,110,70,35,0.0,11,80
8,1,50,120,90,25,0.0,8,540
13,1,30,100,70,25,0.0,8,840


In [62]:
df_test.drop(['DiabetesPedigreeFunction','Age', 'GlucoseFill','BloodPressureFill','BMIFill','Pregnancies','Insulin'], axis=1, inplace=True)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,1,50,140,70,25,0.5,8
1,0,30,80,60,25,0.0,2
2,1,30,180,60,15,0.5,8
5,1,20,70,50,25,0.0,5
7,1,50,190,70,25,0.0,2


In [63]:
df1 = df_train
#D = diabetic and N = Non-diabetic
df1['Outcome'] = df_train['Outcome'].astype(str) + '_D'
df1['AgeGroup'] = df_train['AgeGroup'].astype(str) + '_A'
df1['GlucoseGroup'] = df_train['GlucoseGroup'].astype(str) + '_G'
df1['BloodPressureGroup'] = df_train['BloodPressureGroup'].astype(str) + '_B'
df1['BMIGroup'] = df_train['BMIGroup'].astype(str) + '_BMI'
df1['DiabetesPedigreeFunctionGroup'] = df_train['DiabetesPedigreeFunctionGroup'].astype(str) + '_F'
df1['PregnanciesGroup'] = df_train['PregnanciesGroup'].astype(str) + '_P'
df1['InsulinGroup'] = df_train['InsulinGroup'].astype(str) + '_I'

df2 = df_test
#D = diabetic and N = Non-diabetic
df2['Outcome'] = df_test['Outcome'].astype(str) + '_D'
df2['AgeGroup'] = df_test['AgeGroup'].astype(str) + '_A'
df2['GlucoseGroup'] = df_test['GlucoseGroup'].astype(str) + '_G'
df2['BloodPressureGroup'] = df_test['BloodPressureGroup'].astype(str) + '_B'
df2['BMIGroup'] = df_test['BMIGroup'].astype(str) + '_BMI'
df2['DiabetesPedigreeFunctionGroup'] = df_test['DiabetesPedigreeFunctionGroup'].astype(str) + '_F'
df2['PregnanciesGroup'] = df_test['PregnanciesGroup'].astype(str) + '_P'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [64]:
df1.head()

Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup,InsulinGroup
3,0_D,20_A,80_G,60_B,25_BMI,0.0_F,2_P,90_I
4,0_D,30_A,110_G,70_B,25_BMI,0.0_F,5_P,160_I
6,0_D,20_A,110_G,70_B,35_BMI,0.0_F,11_P,80_I
8,1_D,50_A,120_G,90_B,25_BMI,0.0_F,8_P,540_I
13,1_D,30_A,100_G,70_B,25_BMI,0.0_F,8_P,840_I


In [65]:
df2.head()

Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup
0,1_D,50_A,140_G,70_B,25_BMI,0.5_F,8.0_P
1,0_D,30_A,80_G,60_B,25_BMI,0.0_F,2.0_P
2,1_D,30_A,180_G,60_B,15_BMI,0.5_F,8.0_P
5,1_D,20_A,70_G,50_B,25_BMI,0.0_F,5.0_P
7,1_D,50_A,190_G,70_B,25_BMI,0.0_F,2.0_P


In [66]:
df1.to_csv('PIMA_for_missing_values_patterns.csv', header=False)

In [67]:
delimiter=','
target_type='c'
min_nbr_items=3
min_sup=10
#min_conf=2

ret_val = call_apriori('PIMA_for_missing_values_patterns.csv', 'PIMA_freq_patterns_target_type_c_missing_values.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [68]:
delimiter=','
target_type='m'
min_nbr_items=3
min_sup=10
#min_conf=2

ret_val = call_apriori('PIMA_for_missing_values_patterns.csv', 'PIMA_freq_patterns_target_type_m_missing_values.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [105]:
delimiter=','
target_type='s'
min_nbr_items=2
min_sup=10
#min_conf=2

ret_val = call_apriori('PIMA_for_missing_values_patterns.csv', 'PIMA_freq_patterns_missing_values.txt', 
                       delimiter, target_type, min_nbr_items, min_sup)

In [106]:
delimiter=','
target_type='r'
min_nbr_items=2
min_sup=5
min_conf=25

ret_val = call_apriori('PIMA_for_missing_values_patterns.csv', 'PIMA_rules_missing_values.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [107]:
rules = read_rules('PIMA_rules_missing_values.txt')
for r in rules[:3]:
    print (r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf'])

100_G <- 60_I (5.26316, 26.3158, 182.692)
30_A <- 60_I (5.26316, 26.3158, 114.458)
0.5_F <- 60_I (5.26316, 31.5789, 112.871)
35_BMI <- 60_I (5.26316, 31.5789, 106.542)
70_B <- 60_I (5.26316, 57.8947, 158.333)
2_P <- 60_I (5.26316, 57.8947, 125.15)
25_BMI <- 60_I (5.26316, 47.3684, 86.3636)
20_A <- 60_I (5.26316, 57.8947, 105.025)
0_D <- 60_I (5.26316, 78.9474, 123.377)
0.0_F <- 60_I (5.26316, 63.1579, 93.0612)
15_BMI <- 100_I (5.26316, 26.3158, 206.522)
8_P <- 100_I (5.26316, 26.3158, 169.643)
0.5_F <- 100_I (5.26316, 47.3684, 169.307)
5_P <- 100_I (5.26316, 26.3158, 91.3462)
35_BMI <- 100_I (5.26316, 31.5789, 106.542)
60_B <- 100_I (5.26316, 42.1053, 139.45)
1_D <- 100_I (5.26316, 42.1053, 116.923)
70_B <- 100_I (5.26316, 42.1053, 115.152)
2_P <- 100_I (5.26316, 36.8421, 79.6407)
25_BMI <- 100_I (5.26316, 42.1053, 76.7677)
20_A <- 100_I (5.26316, 52.6316, 95.4774)
0_D <- 100_I (5.26316, 57.8947, 90.4762)
0.0_F <- 100_I (5.26316, 52.6316, 77.551)
40_A <- 40_I (5.54017, 25, 150.417)
80_

110_G <- 5_P 70_B 25_BMI 0.0_F (5.54017, 25, 196.196)
110_G <- 5_P 70_B 0_D (6.92521, 28, 219.739)
110_G <- 5_P 70_B 0.0_F (8.86427, 25, 196.196)
70_B <- 110_G 5_P (5.26316, 52.6316, 143.939)
5_P <- 110_G 25_BMI 0.0_F (6.09418, 40.9091, 142.002)
25_BMI <- 110_G 5_P (5.26316, 52.6316, 95.9596)
5_P <- 110_G 25_BMI (7.20222, 38.4615, 133.506)
5_P <- 110_G 20_A 0_D (7.20222, 26.9231, 93.4541)
5_P <- 110_G 20_A 0.0_F (6.37119, 26.087, 90.5518)
20_A <- 110_G 5_P (5.26316, 42.1053, 76.3819)
5_P <- 110_G 20_A (8.03324, 27.5862, 95.756)
5_P <- 110_G 0_D 0.0_F (7.47922, 40.7407, 141.417)
0_D <- 110_G 5_P (5.26316, 73.6842, 115.152)
5_P <- 110_G 0_D (9.69529, 40, 138.846)
0.0_F <- 110_G 5_P (5.26316, 84.2105, 124.082)
5_P <- 110_G 0.0_F (9.9723, 44.4444, 154.274)
5_P <- 110_G (12.7424, 41.3043, 143.374)
110_G <- 35_BMI 2_P 20_A 0_D 0.0_F (5.54017, 25, 196.196)
35_BMI <- 110_G 2_P 20_A (5.26316, 31.5789, 106.542)
35_BMI <- 110_G 2_P 0_D (5.26316, 26.3158, 88.785)
35_BMI <- 110_G 2_P (6.09418, 27.2

In [108]:
rulse_cons_I = list()
for r in rules:
    if r['cons'].endswith('_I'):
        rulse_cons_I.append(r)

In [109]:
print (len(rulse_cons_I))

3


In [110]:
sorted_rules_cons_I = sorted(rulse_cons_I, key=lambda r: r['conf'], reverse=True)

In [112]:
for r in sorted_rules_cons_I[:100]:
    print (r['ant'],'-->', r['cons'], ' lift', r['lift'], ' conf', r['conf'])

['30_A', '25_BMI', '0_D', '0.0_F'] --> 70_I  lift 410.227  conf 27.2727
['30_A', '0_D', '0.0_F'] --> 70_I  lift 410.227  conf 27.2727
['35_BMI', '2_P', '20_A', '0_D', '0.0_F'] --> 50_I  lift 410.227  conf 25.0


In [113]:
df2.values[0]

array(['1_D', '50_A', '140_G', '70_B', '25_BMI', '0.5_F', '8.0_P'], dtype=object)

In [114]:
patient_test = df2.values[0]

In [115]:
#for r in rules:
for r in sorted_rules_cons_I[:500]:
    if (set(r['ant']) < set(patient_test))&(r['cons'].endswith('_I')):
        print (r['ant'], '-->', r['cons'])

In [116]:
len(sorted_rules_cons_I)

3

In [121]:
minNumOfRulesPerLine=1
minNumOfRules2Consider=0
numberOfPatientRows=0

In [122]:
s =set()

In [123]:
for pv in df2.values:
    for i in range(1,102,100):
        mincnt=0
        for r in sorted_rules_cons_I[:i]:
            if (set(r['ant']) < set(pv))&(r['cons'].endswith('_I')):
                mincnt+=1
                if mincnt==minNumOfRulesPerLine:
                    minNumOfRules2Consider = max(minNumOfRules2Consider,i)
                    if ','.join(pv) not in s:
                        numberOfPatientRows+=1
                        s.add(','.join(pv))
                    break

In [124]:
len(s)

23

In [125]:
minNumOfRules2Consider

101

In [129]:
len(sorted_rules_cons_I)

3

In [130]:
numberOfPatientRows

23

In [131]:
df1.values.shape

(361, 8)

In [132]:
setOfSamples = dict()

In [133]:
for pv in df2.values:
    if(','.join(pv) in s):
        setOfSamples[','.join(pv)] = dict()
        for r in sorted_rules_cons_I[:100]:
            ln=len(set(r['ant']).intersection(set(pv)))
            if ln==0:
                continue
            if ln not in setOfSamples[','.join(pv)]:
                setOfSamples[','.join(pv)]=dict()
                setOfSamples[','.join(pv)][ln]=[]
            setOfSamples[','.join(pv)][ln].append(r)

In [134]:
setOfSamples

{'0_D,30_A,100_G,70_B,25_BMI,0.0_F,8.0_P': {2: [{'ant': ['35_BMI',
     '2_P',
     '20_A',
     '0_D',
     '0.0_F'],
    'conf': 25.0,
    'cons': '50_I',
    'lift': 410.227,
    'sup': 5.54017}]},
 '0_D,30_A,100_G,80_B,35_BMI,0.0_F,5.0_P': {3: [{'ant': ['30_A',
     '25_BMI',
     '0_D',
     '0.0_F'],
    'conf': 27.2727,
    'cons': '70_I',
    'lift': 410.227,
    'sup': 6.09418},
   {'ant': ['30_A', '0_D', '0.0_F'],
    'conf': 27.2727,
    'cons': '70_I',
    'lift': 410.227,
    'sup': 9.14127},
   {'ant': ['35_BMI', '2_P', '20_A', '0_D', '0.0_F'],
    'conf': 25.0,
    'cons': '50_I',
    'lift': 410.227,
    'sup': 5.54017}]},
 '0_D,30_A,110_G,70_B,15_BMI,0.0_F,8.0_P': {2: [{'ant': ['35_BMI',
     '2_P',
     '20_A',
     '0_D',
     '0.0_F'],
    'conf': 25.0,
    'cons': '50_I',
    'lift': 410.227,
    'sup': 5.54017}]},
 '0_D,30_A,110_G,70_B,25_BMI,0.0_F,8.0_P': {2: [{'ant': ['35_BMI',
     '2_P',
     '20_A',
     '0_D',
     '0.0_F'],
    'conf': 25.0,
    'cons': '50

In [135]:
import operator
with open('PIMA_missing3.txt', mode='w+', encoding='utf-8') as myfile:
    vote=dict()
    correctExamples = 0
    for pv,rules in setOfSamples.items():
        pvoutcome = pv.split(',')[0].split('_')[0]
        vote['0']=0
        vote['100']=0
        vote['200']=0
        vote['300']=0
        vote['400']=0
        vote['500']=0
        vote['600']=0
        vote['700']=0
        vote['800']=0
        for _,r in rules.items():
            for rule in r:
                if rule['cons'].replace('_I','')=='0':
                    vote['0']+=1
                if rule['cons'].replace('_I','')=='100':
                    vote['100']+=1
                if rule['cons'].replace('_I','')=='200':
                    vote['200']+=1
                if rule['cons'].replace('_I','')=='300':
                    vote['300']+=1
                if rule['cons'].replace('_I','')=='400':
                    vote['400']+=1
                if rule['cons'].replace('_I','')=='500':
                    vote['500']+=1
                if rule['cons'].replace('_I','')=='600':
                    vote['600']+=1
                if rule['cons'].replace('_I','')=='700':
                    vote['700']+=1
                if rule['cons'].replace('_I','')=='800':
                    vote['800']+=1
            p=','.join([pv,str(max(vote.items(), key=operator.itemgetter(1))[0])+'_I'])
            print(p)
            myfile.write(''.join([p,'\n']))

0_D,30_A,120_G,70_B,25_BMI,0.0_F,5.0_P,0_I
0_D,30_A,120_G,60_B,25_BMI,0.0_F,2.0_P,0_I
0_D,30_A,110_G,90_B,35_BMI,0.0_F,5.0_P,0_I
0_D,30_A,80_G,70_B,35_BMI,0.0_F,8.0_P,0_I
0_D,30_A,120_G,60_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,110_G,70_B,15_BMI,0.0_F,8.0_P,0_I
0_D,30_A,100_G,80_B,35_BMI,0.0_F,5.0_P,0_I
0_D,30_A,150_G,70_B,25_BMI,0.0_F,5.0_P,0_I
0_D,30_A,130_G,60_B,25_BMI,0.0_F,11.0_P,0_I
0_D,30_A,140_G,70_B,25_BMI,0.0_F,5.0_P,0_I
0_D,30_A,110_G,90_B,25_BMI,0.0_F,5.0_P,0_I
0_D,30_A,150_G,70_B,35_BMI,0.0_F,2.0_P,0_I
0_D,30_A,80_G,60_B,25_BMI,0.0_F,2.0_P,0_I
0_D,30_A,80_G,80_B,15_BMI,0.0_F,8.0_P,0_I
0_D,30_A,100_G,70_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,110_G,70_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,150_G,90_B,25_BMI,0.0_F,5.0_P,0_I
0_D,30_A,120_G,70_B,35_BMI,0.0_F,11.0_P,0_I
0_D,30_A,110_G,90_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,120_G,70_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,130_G,90_B,25_BMI,0.0_F,8.0_P,0_I
0_D,30_A,80_G,60_B,15_BMI,0.0_F,5.0_P,0_I
0_D,30_A,70_G,60_B,25_BMI,0.0_F,2.0_P,0_I


In [136]:
df = pd.read_csv("PIMA_missing3.txt" , header = None)
df.columns=['Outcome','AgeGroup','GlucoseGroup','BloodPressureGroup','BMIGroup','DiabetesPedigreeFunctionGroup','PregnanciesGroup','InsulinGroup']

In [137]:
df.head()

Unnamed: 0,Outcome,AgeGroup,GlucoseGroup,BloodPressureGroup,BMIGroup,DiabetesPedigreeFunctionGroup,PregnanciesGroup,InsulinGroup
0,0_D,30_A,120_G,70_B,25_BMI,0.0_F,5.0_P,0_I
1,0_D,30_A,120_G,60_B,25_BMI,0.0_F,2.0_P,0_I
2,0_D,30_A,110_G,90_B,35_BMI,0.0_F,5.0_P,0_I
3,0_D,30_A,80_G,70_B,35_BMI,0.0_F,8.0_P,0_I
4,0_D,30_A,120_G,60_B,25_BMI,0.0_F,8.0_P,0_I


In [62]:
# correctExamples

99

In [63]:
# len(setOfSamples)

102