This notebook calculates the information gain of each attribute to determine which attributes have the greatest impact on the ten-year CHD outcome. Although libraries likely exist that can already perform this calculation, the code is written out in full here to demonstrate the math and code behind it. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the previously-cleaned Framingham dataset from the working directory
# and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='framingham_cleaned.csv')
df.head(n=10)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,"[32, 42.0]",4.0,0,"[0.0, 0.0]",0.0,0,0,0,"[107.0, 206.0]","[83.5, 117.0]","[48.0, 75.0]","(25.41, 28.04]","(75.0, 83.0]","(72.0, 80.0]",no
1,0,"(42.0, 49.0]",2.0,0,"[0.0, 0.0]",0.0,0,0,0,"(234.0, 262.0]","(117.0, 128.0]","(75.0, 82.0]","(28.04, 56.8]","(83.0, 143.0]","(72.0, 80.0]",no
2,1,"(42.0, 49.0]",1.0,1,"(0.0, 20.0]",0.0,0,0,0,"(234.0, 262.0]","(117.0, 128.0]","(75.0, 82.0]","(23.08, 25.41]","(68.0, 75.0]","[40.0, 72.0]",no
3,0,"(56.0, 70]",3.0,1,"(20.0, 70.0]",0.0,0,1,0,"(206.0, 234.0]","(144.0, 295.0]","(89.88, 142.5]","(28.04, 56.8]","[44.0, 68.0]","(85.0, 394.0]",yes
4,0,"(42.0, 49.0]",3.0,1,"(20.0, 70.0]",0.0,0,0,0,"(262.0, 696.0]","(128.0, 144.0]","(82.0, 89.88]","(23.08, 25.41]","(83.0, 143.0]","(80.0, 85.0]",no
5,0,"(42.0, 49.0]",2.0,0,"[0.0, 0.0]",0.0,0,1,0,"(206.0, 234.0]","(144.0, 295.0]","(89.88, 142.5]","(28.04, 56.8]","(75.0, 83.0]","(85.0, 394.0]",no
6,0,"(56.0, 70]",1.0,0,"[0.0, 0.0]",0.0,0,0,0,"[107.0, 206.0]","(128.0, 144.0]","[48.0, 75.0]","(28.04, 56.8]","[44.0, 68.0]","(80.0, 85.0]",yes
7,0,"(42.0, 49.0]",2.0,1,"(0.0, 20.0]",0.0,0,0,0,"(262.0, 696.0]","[83.5, 117.0]","[48.0, 75.0]","[15.54, 23.08]","(75.0, 83.0]","(72.0, 80.0]",no
8,1,"(49.0, 56.0]",1.0,0,"[0.0, 0.0]",0.0,0,1,0,"(234.0, 262.0]","(128.0, 144.0]","(82.0, 89.88]","(25.41, 28.04]","(75.0, 83.0]","(72.0, 80.0]",no
9,1,"(42.0, 49.0]",1.0,1,"(20.0, 70.0]",0.0,0,1,0,"(206.0, 234.0]","(144.0, 295.0]","(89.88, 142.5]","(23.08, 25.41]","(83.0, 143.0]","(85.0, 394.0]",no


In [50]:
# Build a lookup of the distinct values observed in each attribute column.
# Only the first 15 columns are used (presumably every predictor, leaving out
# the TenYearCHD target -- confirm against the column order of the CSV).
columns = list(df.columns[:15])

# attribute name -> list of its unique values, in order of first appearance
column_values = {col: list(df[col].unique()) for col in columns}

print(column_values)

# Row count per age bin.
df['age'].value_counts()

{'male': [1, 0], 'age': ['[32, 42.0]', '(42.0, 49.0]', '(56.0, 70]', '(49.0, 56.0]'], 'education': [4.0, 2.0, 1.0, 3.0], 'currentSmoker': [0, 1], 'cigsPerDay': ['[0.0, 0.0]', '(0.0, 20.0]', '(20.0, 70.0]'], 'BPMeds': [0.0, 1.0], 'prevalentStroke': [0, 1], 'prevalentHyp': [0, 1], 'diabetes': [0, 1], 'totChol': ['[107.0, 206.0]', '(234.0, 262.0]', '(206.0, 234.0]', '(262.0, 696.0]'], 'sysBP': ['[83.5, 117.0]', '(117.0, 128.0]', '(144.0, 295.0]', '(128.0, 144.0]'], 'diaBP': ['[48.0, 75.0]', '(75.0, 82.0]', '(89.88, 142.5]', '(82.0, 89.88]'], 'BMI': ['(25.41, 28.04]', '(28.04, 56.8]', '(23.08, 25.41]', '[15.54, 23.08]'], 'heartRate': ['(75.0, 83.0]', '(83.0, 143.0]', '(68.0, 75.0]', '[44.0, 68.0]'], 'glucose': ['(72.0, 80.0]', '[40.0, 72.0]', '(85.0, 394.0]', '(80.0, 85.0]']}


(42.0, 49.0]    1115
[32, 42.0]      1100
(56.0, 70]      1049
(49.0, 56.0]     974
Name: age, dtype: int64

In [51]:
# The entropy of the target attribute, i.e. the baseline that each attribute's
# conditional entropy is subtracted from to obtain its information gain.
def information_gain_target(df):
    """Return the Shannon entropy (in bits) of the 'TenYearCHD' column of *df*.

    Despite the name (kept so existing callers continue to work), this is the
    entropy H(target), not an information gain:

        H = -sum(p_c * log2(p_c)) over every class c present in the column

    Parameters:
        df: DataFrame containing a 'TenYearCHD' column of class labels.

    Returns:
        The entropy as a float. 0.0 when the column is empty or holds a
        single class.

    Raises:
        KeyError: if *df* has no 'TenYearCHD' column.
    """
    attr = 'TenYearCHD'
    class_counts = df[attr].value_counts()

    # value_counts() leaves out missing values, so normalise by the number of
    # labelled rows to keep the probabilities summing to 1.
    total = class_counts.sum()
    if total == 0:
        return 0.0

    # Classes are never looked up by position or by label, so any number of
    # classes (including just one) is handled. Zero counts are dropped because
    # 0 * log2(0) is taken as 0 in the entropy definition.
    probabilities = class_counts[class_counts > 0] / total
    entropy = float(-(probabilities * np.log2(probabilities)).sum())

    # Avoid handing back -0.0 for a single-class column.
    return entropy if entropy > 0 else 0.0


In [52]:
# Entropy of a two-class split, computed from the raw count of each class.
def information_gain(p_count_yes, p_count_no):
    """Return the binary entropy (in bits) of a 'yes'/'no' split.

    Parameters:
        p_count_yes: number of 'yes' outcomes in the subset.
        p_count_no: number of 'no' outcomes in the subset.

    Returns:
        -(p_yes * log2(p_yes)) - (p_no * log2(p_no)), where each p is the
        class count divided by the combined count. Returns 0 when either
        count is zero, since such a subset is pure.
    """
    if p_count_yes != 0 and p_count_no != 0:
        total = p_count_yes + p_count_no
        share_yes = p_count_yes / total
        share_no = p_count_no / total
        return -(share_yes * np.log2(share_yes)) - (share_no * np.log2(share_no))

    # A pure (or empty) subset carries no uncertainty.
    return 0


In [53]:
# Information gain for each attribute in the dataset is calculated.
def information_gain_attributes(df, ig_CHD, attributes, attribute_values,
                                target='TenYearCHD'):
    """Return the information gain of each attribute with respect to *target*.

    For every attribute the weighted average of the per-value entropies (the
    conditional entropy of the target given the attribute) is subtracted from
    the baseline *ig_CHD*.

    Parameters:
        df: DataFrame holding the attribute columns and the target column.
        ig_CHD: baseline value that each attribute's weighted entropy is
            subtracted from (the entropy of the target column).
        attributes: iterable of attribute (column) names to evaluate.
        attribute_values: mapping of attribute name -> iterable of the values
            of that attribute to include.
        target: name of the outcome column, whose classes are expected to be
            labelled 'yes' and 'no'. Defaults to 'TenYearCHD'.

    Returns:
        dict mapping each name in *attributes* to its information gain. Only
        the attributes actually evaluated appear in the result.

    Raises:
        KeyError: if an attribute is missing from *df* or *attribute_values*.
    """
    results = {}

    for attribute in attributes:
        # One row per attribute value, one column per outcome label. Built
        # once per attribute; missing (value, outcome) pairs are filled with 0
        # so an absent 'yes' is handled the same way as an absent 'no'.
        outcome_counts = df.groupby([attribute, target]).size().unstack(fill_value=0)

        # Number of rows carrying each attribute value, used as the weights.
        value_totals = df[attribute].value_counts()
        n_rows = value_totals.sum()

        weighted_entropy = {}
        for att_value in attribute_values[attribute]:
            if att_value not in outcome_counts.index:
                # A value that never occurs has zero weight and adds nothing.
                continue

            counts = outcome_counts.loc[att_value]
            yes = counts.get('yes', 0)
            no = counts.get('no', 0)

            ig_val = information_gain(yes, no)
            weighted_entropy[att_value] = ig_val * value_totals[att_value] / n_rows

        results[attribute] = ig_CHD - sum(weighted_entropy.values())

    return results


# Information gain of every attribute; information_gain_target(df) supplies the
# baseline value that each attribute's weighted entropy is subtracted from.
igs = information_gain_attributes(
    df=df,
    ig_CHD=information_gain_target(df),
    attributes=columns,
    attribute_values=column_values,
)


In [59]:
# Rank the attributes from largest to smallest information gain. Sorting uses
# the unrounded gains; values are rounded to 5 decimal places afterwards.
ranked_gains = sorted(igs.items(), key=lambda pair: pair[1], reverse=True)
top_CHD_causes = {attribute: round(gain, 5) for attribute, gain in ranked_gains}

for attribute, gain in top_CHD_causes.items():
    print(attribute, gain)

age 0.03483
sysBP 0.02528
prevalentHyp 0.02135
diaBP 0.0142
male 0.00558
diabetes 0.00531
education 0.00525
totChol 0.00509
BMI 0.00471
BPMeds 0.00432
glucose 0.00276
cigsPerDay 0.00221
prevalentStroke 0.00202
heartRate 0.0004
currentSmoker 0.00027


<h4><center>Table 2. Information Gain of Attributes</center></h4>

| **Attribute**       	| **Information Gain** 	|
|:-----------------:	|:------------------:	|
| age             	| 0.03483          	|
| sysBP           	| 0.02528          	|
| prevalentHyp    	| 0.02135          	|
| diaBP           	| 0.0142           	|
| male            	| 0.00558          	|
| diabetes        	| 0.00531          	|
| education       	| 0.00525          	|
| totChol         	| 0.00509          	|
| BMI             	| 0.00471          	|
| BPMeds          	| 0.00432          	|
| glucose         	| 0.00276          	|
| cigsPerDay      	| 0.00221          	|
| prevalentStroke 	| 0.00202          	|
| heartRate       	| 0.0004           	|
| currentSmoker   	| 0.00027          	|

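From this information, it can be determined that, among the attributes examined, age is the most informative for predicting one's ten-year risk of CHD, followed by systolic blood pressure (sysBP) and prevalent hypertension (prevalentHyp). 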

In [55]:
# Compare age groups to see which has the highest prevalence of CHD, which is simply a matter of finding the
# fraction of 'yes' values in each bin.
age_brackets = column_values['age']

# One row per age bracket, one column per outcome label. Computed once rather
# than per bracket; missing (bracket, outcome) pairs are filled with 0 so an
# absent 'yes' is handled the same way as an absent 'no'.
chd_by_age = df.groupby(['age', 'TenYearCHD']).size().unstack(fill_value=0)

age_risks = {}

for bracket in age_brackets:
    outcome_counts = chd_by_age.loc[bracket]
    yes = outcome_counts.get('yes', 0)
    no = outcome_counts.get('no', 0)
    # Share of the bracket with a 'yes' outcome, rounded to 2 decimal places.
    age_risks[bracket] = round(yes / (yes + no), 2)

# Order the brackets from highest to lowest CHD prevalence.
age_risks = dict(sorted(age_risks.items(), key=lambda item: item[1], reverse=True))

age_risks

{'(56.0, 70]': 0.27,
 '(49.0, 56.0]': 0.18,
 '(42.0, 49.0]': 0.11,
 '[32, 42.0]': 0.06}

From this analysis, the impact of each health factor on ten-year CHD has been assessed. Individual attributes can be broken down further into the prevalence of CHD in each of their bins, as was done above for age.