In [136]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 27 21:35:59 2017

Visualize the SAS PROC VARCLUS cluster structure and show where the
LexisNexis (LN) variables fit into it.

@author: Jingmin
"""

import plotly.plotly as py
import plotly.graph_objs as go
import networkx as nx
import pandas as pd
import numpy as np

# Import cluster structure data.
# Both input files live in the same folder; keeping the folder in a single
# constant means the location only has to be edited in one place.
DATA_DIR = r'C:\Users\GlowingToilet\Google Drive\Projects\clus-network'

df = pd.read_csv(DATA_DIR + r'\CB_ALL.csv')

# Drop the columns that the rest of the notebook never reads.
df = df.drop(
    ['var_name', 'rsquared_ratio', 'Gini_Statistic', 'Information_Value_Ordering'],
    axis=1,
)

# Vectorised rounding (one call per column instead of one Python call per row).
df['Information_Value'] = df['Information_Value'].round(2)
df['rsquared'] = df['rsquared'].round(1)
df['rsquared_next'] = df['rsquared_next'].round(1)

# Keep only rows whose *rounded* information value is above 0.01, then switch
# to the short column names used from here on.
df = (
    df.loc[df['Information_Value'] > 0.01, :]
    .rename(columns=dict(Information_Value='iv', rsquared='rsq', rsquared_next='rsq_n'))
)

# Import VC2 to potentially re-assign LN vars

vc2 = pd.read_csv(DATA_DIR + r'\VC2.csv')

# Join key: the VC2 variable name with its first four characters removed.
# Missing names stay missing instead of raising.
vc2['var_name_o'] = vc2['var_name'].str[4:]
vc2['rsquared'] = vc2['rsquared'].round(1)
vc2['rsquared_next'] = vc2['rsquared_next'].round(1)
vc2 = (
    vc2.rename(columns=dict(rsquared='rsq', rsquared_next='rsq_n'))
    .drop(['var_name', 'rsquared_ratio'], axis=1)
)

# Left join keeps every row of df. Columns that exist in both frames receive
# pandas' default suffixes: _x for the CB_ALL side, _y for the VC2 side.
# compare() below reads those suffixed columns.
df = df.merge(vc2, how='left', on='var_name_o')

def compare(row):
    """Pick the primary and next-best cluster for one merged row.

    The row carries two candidate assignments: the ``*_x`` columns and the
    ``*_y`` columns. The ``*_x`` assignment is kept unless ``rsq_y`` is
    present and beats it:

    * ``rsq_y`` missing, or not above ``rsq_n_x``: nothing changes.
    * ``rsq_y`` above ``rsq_x``: ``cluster_y`` becomes the primary cluster
      and ``source`` is set to ``'fmcc'``. The next-best cluster is
      ``clus_next_y`` when ``rsq_n_y`` exceeds ``rsq_x``, otherwise the old
      primary (``cluster_x``).
    * ``rsq_y`` between ``rsq_n_x`` and ``rsq_x``: only the next-best cluster
      is replaced, by ``cluster_y``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``var_name_o``, ``iv``, ``source`` and the ``_x``/``_y``
        variants of ``cluster``, ``clus_next``, ``rsq`` and ``rsq_n``.

    Returns
    -------
    pandas.Series
        Keys ``var_name_o``, ``iv``, ``cluster``, ``rsq``, ``cluster_next``,
        ``rsq_n`` and ``source``.
    """
    # Start from the *_x assignment.
    best_cluster, best_rsq = row['cluster_x'], row['rsq_x']
    next_cluster, next_rsq = row['clus_next_x'], row['rsq_n_x']
    origin = row['source']

    alt_rsq = row['rsq_y']
    # The alternative only matters if it exists and beats the current runner-up.
    if not np.isnan(alt_rsq) and alt_rsq > row['rsq_n_x']:
        if alt_rsq > row['rsq_x']:
            # Alternative wins outright and becomes the primary cluster.
            best_cluster, best_rsq = row['cluster_y'], alt_rsq
            origin = 'fmcc'
            # Runner-up is the stronger of the alternative's own runner-up
            # and the displaced primary.
            if row['rsq_n_y'] > row['rsq_x']:
                next_cluster, next_rsq = row['clus_next_y'], row['rsq_n_y']
            else:
                next_cluster, next_rsq = row['cluster_x'], row['rsq_x']
        else:
            # Alternative only displaces the runner-up.
            next_cluster, next_rsq = row['cluster_y'], alt_rsq

    return pd.Series({
        'var_name_o': row['var_name_o'],
        'iv': row['iv'],
        'cluster': best_cluster,
        'rsq': best_rsq,
        'cluster_next': next_cluster,
        'rsq_n': next_rsq,
        'source': origin,
    })
            
# Run compare() on every row (axis=1). Each returned Series becomes one row of
# the new frame, which replaces the merged frame under the same name `df`.
df = df.apply(compare, axis=1)
# Optional debugging export of the re-assigned frame (left disabled):
# df.to_csv(r'C:\Users\GlowingToilet\Google Drive\Projects\clus-network\check.csv')
    
# Preview the first rows of the result.
df.head()

Unnamed: 0,cluster,cluster_next,iv,rsq,rsq_n,source,var_name_o
204,Clus07,Clus59,0.03,1.0,0.4,fmcc,POP5_029
205,Clus07,Clus59,0.03,1.0,0.4,fmcc,POP5_030
206,Clus07,Clus59,0.04,0.9,0.4,fmcc,POP5_026
207,Clus07,Clus59,0.04,0.9,0.4,fmcc,POP5_024
208,Clus07,Clus59,0.04,0.9,0.4,fmcc,ETC_037
209,Clus07,Clus59,0.04,0.9,0.4,fmcc,ETC_040
210,Clus07,Clus59,0.04,0.9,0.4,fmcc,POP5_028
211,Clus07,Clus59,0.04,0.9,0.4,fmcc,ETC_053
212,Clus07,Clus59,0.04,0.9,0.4,fmcc,ETC_039
213,Clus07,Clus59,0.04,0.9,0.4,fmcc,POP5_027


In [130]:
# Largest information value over all variables; the plotting cell divides by
# it to scale marker sizes.
max_iv = df['iv'].max()

# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,iv,rsq,rsq_n
count,965.0,965.0,965.0
mean,0.049627,0.663212,0.32601
std,0.050924,0.231368,0.199785
min,0.02,0.0,0.0
25%,0.02,0.5,0.2
50%,0.04,0.7,0.3
75%,0.05,0.8,0.4
max,0.44,1.0,0.8


In [144]:
# Empty graph
G = nx.Graph()

# Add nodes and edges
corr = 0.7
# An edge is drawn only when its R-squared exceeds corr squared.
min_rsq = corr ** 2

for row in df.itertuples():
    # Read fields by name. Positional unpacking depends on the column order of
    # `df`, which in turn depends on how pd.Series(dict(...)) orders its keys
    # (sorted on old pandas/Python, insertion order on current versions).
    clus = row.cluster
    clus_n = row.cluster_next
    var = row.var_name_o
    source = row.source

    # One node per cluster and one per variable. Re-adding a cluster node that
    # already exists updates its attributes, so a cluster's `source` ends up
    # being that of the last variable row processed for it.
    G.add_node(clus, type='clus', cluster=clus, source=source)
    G.add_node(var, type='var', cluster=clus, source=source, iv=row.iv, rsq=row.rsq)

    if row.rsq > min_rsq:
        G.add_edge(clus, var, rsq=row.rsq)
    # NOTE(review): if `clus_n` is never any variable's primary cluster, this
    # add_edge creates the node without 'type'/'cluster'/'source' attributes,
    # which the plotting cell reads -- confirm this cannot happen in the data.
    if row.rsq_n > min_rsq:
        G.add_edge(clus_n, var, rsq=row.rsq_n)

print('Done')

Done


In [141]:
# Generate layout
# spring_layout starts from random node positions. Seeding NumPy's global RNG
# (which networkx draws from when no explicit seed is given) makes the layout,
# and therefore the figure, reproducible across kernel restarts. This is used
# instead of a `seed=` keyword so the call also works on networkx versions
# that predate that parameter.
LAYOUT_SEED = 42
np.random.seed(LAYOUT_SEED)
pos = nx.spring_layout(G)
print('Done')

Done


In [150]:
# Customize the plot

# Edge trace: every edge contributes the segment (x0, y0) -> (x1, y1) followed
# by a None, which breaks the line so consecutive edges are not joined.
# NOTE(review): Plotly documents scatter `line.width` as a single number, so
# the per-edge list collected below likely has no per-segment effect; it is
# kept to preserve the existing trace structure -- confirm.
edge_trace = go.Scatter(
    x=[],
    y=[],
    line=go.Line(
        width=[],
        color='#CCC',
    ),
    hoverinfo='none',   # was None, which leaves hover at its default (enabled)
    mode='lines'        # was 'line', which is not a valid scatter mode
)

for node_a, node_b in G.edges():
    x0, y0 = pos[node_a]
    x1, y1 = pos[node_b]
    edge_trace['x'] += [x0, x1, None]
    edge_trace['y'] += [y0, y1, None]
    edge_trace['line']['width'].append(G[node_a][node_b]['rsq'])

# Node trace: one marker per graph node; colour, size, symbol and hover text
# are filled in per node below.
node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    marker=go.Marker(
        showscale=True,
        colorscale='Portland',
        color=[],
        size=[],
        symbol=[],
        colorbar=dict(
            thickness=15,
            title='Cluster',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=1)
    ),
    mode='markers',
    hoverinfo='text',
)

# Largest information value per cluster, computed once instead of scanning
# the whole frame again for every cluster node.
cluster_max_iv = df.groupby('cluster')['iv'].max()

for node in G.nodes():
    attrs = G.node[node]
    x, y = pos[node]
    node_trace['x'].append(x)
    node_trace['y'].append(y)

    from_fmcc = attrs['source'] == 'fmcc'

    if attrs['type'] == 'var':
        # Variable node: size grows with the variable's own information value.
        node_size = 5 + attrs['iv'] / max_iv * 15
        node_info = '({0}: {3}, iv: {1}, rsq: {2})'.format(
            node, attrs['iv'], attrs['rsq'], attrs['cluster'])
        node_symbol = 'circle' if from_fmcc else 'x'
    else:
        # Cluster node: size grows with the largest information value among
        # the variables assigned to it (NaN if no variable has it as primary).
        max_cluster_iv = cluster_max_iv.get(node, float('nan'))
        node_size = 10 + max_cluster_iv / max_iv * 25
        node_info = '{0} component, max iv: {1}'.format(node, max_cluster_iv)
        node_symbol = 'diamond' if from_fmcc else 'x'

    node_trace['marker']['size'].append(node_size)
    node_trace['marker']['symbol'].append(node_symbol)
    # Colour is the integer formed by the last two characters of the cluster
    # name (e.g. 'Clus07' -> 7).
    node_trace['marker']['color'].append(int(attrs['cluster'][-2:]))
    node_trace['text'].append(node_info)


print('Done')

Done


In [152]:
# Make the plot on Plotly

# Footnote explaining the marker symbols, anchored in paper coordinates near
# the bottom-left corner of the figure.
legend_note = dict(
    showarrow=False,
    text=('x: LN vars/clus; diamond: Cluster component'),
    xref="paper", yref="paper",
    x=0.005, y=-0.002,
)

# Both axes are purely positional, so grid, zero line and tick labels are off.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

layout = go.Layout(
    title='<br>Variable Cluster Structure for Medium FICO with LN Clusters (corr > {})'.format(corr),
    titlefont=dict(size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    annotations=[legend_note],
    xaxis=go.XAxis(**hidden_axis),
    yaxis=go.YAxis(**hidden_axis),
)

# Edges first, then nodes, so markers are drawn on top of the lines.
fig = go.Figure(data=go.Data([edge_trace, node_trace]), layout=layout)

py.iplot(fig, filename='varclus_ln_med_all')