# Weights of evidence, information value function

- Prints the information value of the variable before and after coarse classing.
- Prints the WOE-IV table before coarse classing.
- Plots the WOEs from the table before coarse classing.

Both functions - for continuous and for discrete variables - take 3 arguments, passed in the order (data, attribute, target):
- data - pandas DataFrame containing actual, clean data
- target - column in data which is target variable - must be entered with quotation marks
- attribute - column in data for which WOEs and IV is calculated - must be entered with quotation marks

Requires importing numpy, pandas and matplotlib.pyplot

Also, requires target variable to be binary, and take 0 (no event) and 1 (event) values.

Does not handle missing values (requires imputation).

In [14]:
#continuous variables
def woe_iv_cont(data, attribute, target):
    """Print WOE / IV diagnostics for a continuous attribute and plot its WOEs.

    The attribute is cut into at most ten quantile bins with ``pd.qcut``
    (duplicate bin edges are dropped).  For every bin the number of rows with
    ``target == 0`` ("NO event") and ``target == 1`` ("event") is counted, and
    the weight of evidence ``ln(NO event % / event %)`` and the information
    value ``(NO event % - event %) * WOE`` are derived from those counts.

    Coarse classing then assigns every bin to a WOE band (band edges at
    -3, -2, -1.5, -1, -0.7, -0.3, 0, 0.3, 0.7, 1, 1.5, 2, 3) and merges
    *consecutive* bins that fall into the same band.  Bins in the same band
    that are separated by a bin from another band are kept apart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    attribute : str
        Name of the continuous column to analyse.
    target : str
        Name of the binary target column (0 = no event, 1 = event).

    Returns
    -------
    None
        The information value before and after coarse classing and the
        WOE-IV table before coarse classing are printed; the WOEs of that
        table are drawn with ``plt.plot``.

    Raises
    ------
    ValueError
        If ``attribute`` contains missing values.
    """
    woe_edges = (-3, -2, -1.5, -1, -0.7, -0.3, 0, 0.3, 0.7, 1, 1.5, 2, 3)

    bins = pd.qcut(data[attribute], q=10, duplicates='drop')
    labels = np.asarray(bins, dtype=object)
    outcome = np.asarray(data[target])

    # codes[i] is the position of row i's bin in the sorted list of bins.
    codes, intervals = pd.factorize(labels, sort=True)
    if (codes < 0).any():
        raise ValueError(
            '{!r} contains missing values; impute them first'.format(attribute)
        )

    n_bins = len(intervals)
    no_event = np.bincount(codes[outcome == 0], minlength=n_bins).astype(float)
    event = np.bincount(codes[outcome == 1], minlength=n_bins).astype(float)
    # A bin that never saw one of the outcomes gets a pseudo-count of 0.5 so
    # that the log ratio below stays finite.
    no_event[no_event == 0] = 0.5
    event[event == 0] = 0.5

    data_woe = pd.DataFrame(
        {'interval': intervals, 'NO event': no_event, 'event': event}
    )
    data_woe['NO event %'] = data_woe['NO event'] / data_woe['NO event'].sum()
    data_woe['event %'] = data_woe['event'] / data_woe['event'].sum()
    data_woe['WOE'] = np.log(data_woe['NO event %'] / data_woe['event %'])
    data_woe['IV'] = (data_woe['NO event %'] - data_woe['event %']) * data_woe['WOE']
    print('Information value of {} is:'.format(attribute))
    print(data_woe['IV'].sum())

    # Band k holds WOEs in (edge[k-1], edge[k]]; anything above the last edge
    # (and NaN) lands in the final band.
    woe_band = np.searchsorted(woe_edges, data_woe['WOE'].to_numpy(), side='left')
    # Give every run of consecutive equal bands its own id, so only
    # neighbouring bins are merged.
    starts_run = np.ones(len(woe_band), dtype=bool)
    starts_run[1:] = woe_band[1:] != woe_band[:-1]
    run_id = np.cumsum(starts_run)

    coarse = data_woe[['NO event', 'event']].groupby(run_id).sum()
    coarse['NO event %'] = coarse['NO event'] / coarse['NO event'].sum()
    coarse['event %'] = coarse['event'] / coarse['event'].sum()
    coarse['WOE'] = np.log(coarse['NO event %'] / coarse['event %'])
    coarse['IV'] = (coarse['NO event %'] - coarse['event %']) * coarse['WOE']
    print('Information value of {} after coarse classing is:'.format(attribute))
    print(coarse['IV'].sum())
    print('\n')
    print(data_woe)
    plt.plot(data_woe['WOE'])

In [15]:
#discrete variables
def woe_iv_discr(data, attribute, target):
    """Print WOE / IV diagnostics for a discrete attribute and plot its WOEs.

    Every distinct value of the attribute is treated as one level.  For each
    level the number of rows with ``target == 0`` ("NO event") and
    ``target == 1`` ("event") is counted, and the weight of evidence
    ``ln(NO event % / event %)`` and the information value
    ``(NO event % - event %) * WOE`` are derived from those counts.

    Coarse classing then assigns every level to a WOE band (band edges at
    -3, -2, -1.5, -1, -0.7, -0.3, 0, 0.3, 0.7, 1, 1.5, 2, 3) and merges
    levels that are *consecutive in sorted level order* and fall into the
    same band.  Levels in the same band that are separated by a level from
    another band are kept apart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    attribute : str
        Name of the discrete column to analyse.
    target : str
        Name of the binary target column (0 = no event, 1 = event).

    Returns
    -------
    None
        The information value before and after coarse classing and the
        WOE-IV table before coarse classing are printed; the WOEs of that
        table are drawn with ``plt.plot``.

    Raises
    ------
    ValueError
        If ``attribute`` contains missing values.
    """
    woe_edges = (-3, -2, -1.5, -1, -0.7, -0.3, 0, 0.3, 0.7, 1, 1.5, 2, 3)

    labels = np.asarray(data[attribute], dtype=object)
    outcome = np.asarray(data[target])

    # codes[i] is the position of row i's level in the sorted list of levels.
    codes, levels = pd.factorize(labels, sort=True)
    if (codes < 0).any():
        raise ValueError(
            '{!r} contains missing values; impute them first'.format(attribute)
        )

    n_levels = len(levels)
    no_event = np.bincount(codes[outcome == 0], minlength=n_levels).astype(float)
    event = np.bincount(codes[outcome == 1], minlength=n_levels).astype(float)
    # A level that never saw one of the outcomes gets a pseudo-count of 0.5
    # so that the log ratio below stays finite.
    no_event[no_event == 0] = 0.5
    event[event == 0] = 0.5

    # The level column keeps the name 'interval' used by the printed table.
    data_woe = pd.DataFrame(
        {'interval': levels, 'NO event': no_event, 'event': event}
    )
    data_woe['NO event %'] = data_woe['NO event'] / data_woe['NO event'].sum()
    data_woe['event %'] = data_woe['event'] / data_woe['event'].sum()
    data_woe['WOE'] = np.log(data_woe['NO event %'] / data_woe['event %'])
    data_woe['IV'] = (data_woe['NO event %'] - data_woe['event %']) * data_woe['WOE']
    print('Information value of {} is:'.format(attribute))
    print(data_woe['IV'].sum())

    # Band k holds WOEs in (edge[k-1], edge[k]]; anything above the last edge
    # (and NaN) lands in the final band.
    woe_band = np.searchsorted(woe_edges, data_woe['WOE'].to_numpy(), side='left')
    # Give every run of consecutive equal bands its own id, so only
    # neighbouring levels are merged.
    starts_run = np.ones(len(woe_band), dtype=bool)
    starts_run[1:] = woe_band[1:] != woe_band[:-1]
    run_id = np.cumsum(starts_run)

    coarse = data_woe[['NO event', 'event']].groupby(run_id).sum()
    coarse['NO event %'] = coarse['NO event'] / coarse['NO event'].sum()
    coarse['event %'] = coarse['event'] / coarse['event'].sum()
    coarse['WOE'] = np.log(coarse['NO event %'] / coarse['event %'])
    coarse['IV'] = (coarse['NO event %'] - coarse['event %']) * coarse['WOE']
    print('Information value of {} after coarse classing is:'.format(attribute))
    print(coarse['IV'].sum())
    print('\n')
    print(data_woe)
    plt.plot(data_woe['WOE'])