In [50]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 27 21:35:59 2017

Visualize SAS Proc Varclus and where LexisNexis variables fit in
the existing cluster structure

@author: Jingmin
"""

import plotly.plotly as py
import plotly.graph_objs as go
import networkx as nx
import pandas as pd
import numpy as np

# Import cluster structure data
df = pd.read_csv(
   r'C:\Users\GlowingToilet\Google Drive\Projects\clus-network\CB_ALL.csv'
)

# Drop the columns that the rest of the notebook never reads.
df = df.drop(['var_name', 'rsquared_ratio', 'Gini_Statistic', 'Information_Value_Ordering'], axis=1)

# Round with the vectorized Series.round instead of a row-wise DataFrame.apply:
# same values, but no per-row Python call and it also works on an empty frame.
df['Information_Value'] = df['Information_Value'].round(2)
df['rsquared'] = df['rsquared'].round(1)
df['rsquared_next'] = df['rsquared_next'].round(1)

# The filter runs on the already-rounded information value.
df = df.loc[df['Information_Value'] > 0.01, :]

# Reassign instead of renaming in place on a filtered slice.
df = df.rename(columns=dict(Information_Value='iv', rsquared='rsq', rsquared_next='rsq_n'))

# Import VC2 to potentially re-assign LN vars

vc2 = pd.read_csv(
   r'C:\Users\GlowingToilet\Google Drive\Projects\clus-network\VC2.csv'
)

# Merge key: var_name without its first four characters
# (presumably a fixed-length prefix -- confirm against VC2.csv).
vc2['var_name_o'] = vc2['var_name'].str[4:]

# Vectorized rounding instead of a row-wise DataFrame.apply.
vc2['rsquared'] = vc2['rsquared'].round(1)
vc2['rsquared_next'] = vc2['rsquared_next'].round(1)

vc2 = vc2.rename(columns=dict(rsquared='rsq', rsquared_next='rsq_n'))
vc2 = vc2.drop(['var_name', 'rsquared_ratio'], axis=1)

# Left merge keeps every row of df; columns present in both frames get the
# default `_x` (df) / `_y` (vc2) suffixes, and unmatched rows get NaN in `_y`.
df = df.merge(vc2, how='left', on='var_name_o')

def compare(row):
    """Pick the primary and next-closest cluster for one merged row.

    `row` is a mapping-like record (e.g. a row of the merged frame) holding the
    original assignment in the `_x` columns (`cluster_x`, `rsq_x`,
    `clus_next_x`, `rsq_n_x`) and the VC2 assignment in the `_y` columns
    (`cluster_y`, `rsq_y`, `clus_next_y`, `rsq_n_y`).

    Rules:
    * no VC2 assignment (`cluster_y` is null): keep the original pair;
    * VC2 R-squared beats both original values: VC2 cluster becomes primary,
      and the next-closest is whichever is stronger of VC2's next cluster and
      the original primary;
    * VC2 R-squared beats only the original next-closest: VC2 cluster
      replaces the next-closest;
    * otherwise: keep the original pair.

    Returns:
        tuple: (cluster, rsq, cluster_next, rsq_next)
    """
    cluster, rsq = row['cluster_x'], row['rsq_x']
    cluster_n, rsq_n = row['clus_next_x'], row['rsq_n_x']

    # np.nan is a float constant, not a predicate; use a real null test.
    if pd.isnull(row['cluster_y']):
        return cluster, rsq, cluster_n, rsq_n

    if row['rsq_y'] > row['rsq_n_x']:
        if row['rsq_y'] > row['rsq_x']:
            cluster, rsq = row['cluster_y'], row['rsq_y']
            if row['rsq_n_y'] > row['rsq_x']:
                cluster_n, rsq_n = row['clus_next_y'], row['rsq_n_y']
            else:
                # The former primary cluster is demoted to next-closest.
                cluster_n, rsq_n = row['cluster_x'], row['rsq_x']
        else:
            cluster_n, rsq_n = row['cluster_y'], row['rsq_y']

    return cluster, rsq, cluster_n, rsq_n
            

# Preview the last five rows of the merged frame. `compare` is only defined
# above, not applied in this cell, so the `_x`/`_y` column pairs are shown
# exactly as the merge produced them.
df.tail(5)

Unnamed: 0,var_name_o,iv,cluster_x,rsq_x,rsq_n_x,clus_next_x,source,cluster_y,rsq_y,rsq_n_y,clus_next_y
960,purchaseactivitydollartotal,0.02,Clus85,0.9,0.1,Clus80,ln,Clus56,0.1,0.0,Clus10
961,inquirynonshortterm12month,0.09,Clus86,0.8,0.2,Clus80,ln,Clus08,0.1,0.1,Clus62
962,inquiryauto12month,0.08,Clus86,0.7,0.1,Clus80,ln,Clus08,0.1,0.1,Clus62
963,subjectwillingnessprimaryfac,0.07,Clus86,0.6,0.1,Clus80,ln,Clus08,0.0,0.0,Clus62
964,subjectwillingnessindex,0.12,Clus86,0.5,0.3,Clus81,ln,Clus62,0.1,0.1,Clus39


In [27]:
# Largest information value in the frame; used later to scale marker sizes.
max_iv = df['iv'].max()

# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,iv,rsq,rsq_n
count,965.0,965.0,965.0
mean,0.049627,0.662798,0.325699
std,0.050924,0.232376,0.200162
min,0.02,0.0,0.0
25%,0.02,0.5,0.2
50%,0.04,0.7,0.3
75%,0.05,0.8,0.4
max,0.44,1.0,0.8


In [28]:
# Empty graph
G = nx.Graph()

# A variable is linked to its next-closest cluster only above this R-squared.
MIN_NEXT_RSQ = 0.25

# Select the needed columns by name rather than by position. After the left
# merge with VC2 the overlapping columns carry `_x`/`_y` suffixes and the frame
# has more columns than a fixed-length tuple unpack can take, so fall back to
# the `_x` columns (the CB_ALL assignment) when the plain name is absent.
graph_cols = ['var_name_o', 'iv'] + [
    base if base in df.columns else base + '_x'
    for base in ('cluster', 'rsq', 'rsq_n', 'clus_next')
] + ['source']

# Add nodes and edges
for var, iv, clus, rsq, rsq_n, clus_n, source in df[graph_cols].itertuples(index=False, name=None):
    # Re-adding an existing node updates its attributes, so a cluster node ends
    # up with the `source` of the last row that mentioned it.
    G.add_node(clus, type='clus', cluster=clus, source=source)
    G.add_node(var, type='var', cluster=clus, source=source, iv=iv)
    G.add_node(clus_n, type='clus', cluster=clus_n, source=source)

    G.add_edge(clus, var, rsq=rsq)
    if rsq_n > MIN_NEXT_RSQ:
        G.add_edge(clus_n, var, rsq=rsq_n)

print('Done')

Done


In [29]:
# Generate layout
# spring_layout starts from random node positions. Seed NumPy's global RNG
# first so the layout (and therefore the figure) is reproducible across runs.
np.random.seed(42)
pos = nx.spring_layout(G)
print('Done')

Done


In [38]:
# Customize the plot
# One scatter trace draws every edge: each edge contributes its two endpoints
# followed by None, which breaks the line so consecutive edges are not joined.
edge_trace = go.Scatter(
    x=[],
    y=[],
    line=go.Line(
        # NOTE(review): Plotly documents scatter `line.width` as a single number;
        # a per-edge list is likely ignored -- confirm, or draw one trace per edge.
        width=[],
        color='#699',
    ),
    hoverinfo='none',   # the string 'none' disables hover; Python None does not
    mode='lines'        # valid scatter modes are 'lines', 'markers', 'text'
)

for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_trace['x'] += [x0, x1, None]
    edge_trace['y'] += [y0, y1, None]
    edge_trace['line']['width'].append(G[src][dst]['rsq'])
    
# One scatter trace for all nodes; per-node marker attributes are filled below.
node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    marker=go.Marker(
        showscale=True,
        colorscale='Portland',
        color=[],
        size=[],
        symbol=[],
        colorbar=dict(
            thickness=15,
            title='Cluster',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=1)
    ),
    mode='markers',
    hoverinfo='text',
)

# Node-attribute lookup: `G.node` where networkx still provides it, otherwise
# the `G.nodes` view (the `node` alias is absent in newer networkx releases).
node_attrs = G.node if hasattr(G, 'node') else G.nodes

for node in G.nodes():
    x, y = pos[node]
    attrs = node_attrs[node]
    node_trace['x'].append(x)
    node_trace['y'].append(y)

    # Variable nodes scale from 5 to 15 with their information value;
    # cluster nodes use a fixed, larger marker.
    if attrs['type'] == 'var':
        node_size = 5 + attrs['iv'] / max_iv * 10
    else:
        node_size = 20
    node_trace['marker']['size'].append(node_size)

    # The symbol depends only on the source, whatever the node type.
    symbol = 'circle' if attrs['source'] == 'fmcc' else 'x'
    node_trace['marker']['symbol'].append(symbol)

    # Colour by cluster number: take all trailing digits of the cluster name,
    # so names with more than two digits are not truncated.
    cluster_name = attrs['cluster']
    cluster_digits = cluster_name[len(cluster_name.rstrip('0123456789')):]
    node_trace['marker']['color'].append(int(cluster_digits))
    node_trace['text'].append(node)
print('Done')

Done


In [39]:
# Make the plot on Plotly
# Both axes are purely positional, so grid, zero line and tick labels are hidden.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

# Footer annotation linking back to the project repository.
repo_credit = dict(
    showarrow=False,
    text=("Github: <a href='https://github.com/jingmin1987/clus-network/'>"
          'https://github.com/jingmin1987/clus-network/ </a>'),
    xref="paper", yref="paper",
    x=0.005, y=-0.002,
)

layout = go.Layout(
    title='<br>Cluster Structure for Medium FICO',
    titlefont=dict(size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    annotations=[repo_credit],
    xaxis=go.XAxis(**hidden_axis),
    yaxis=go.YAxis(**hidden_axis),
)

# Edges first, nodes second, so markers are drawn on top of the lines.
fig = go.Figure(data=go.Data([edge_trace, node_trace]), layout=layout)

py.iplot(fig, filename='varclus_ln')