In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [19]:
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Set up Plotly's offline notebook mode before any figure is shown.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook - confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)

In [20]:
# Read the transactions table from disk; the path is kept in one named constant
# so it is easy to spot and change.
GROCERIES_CSV = 'data/groceries_all.csv'
df = pd.read_csv(GROCERIES_CSV)

In [21]:
# Column labels of the loaded frame; each column is treated as one feature below.
# Note this is the frame's column Index object (df.columns), not a plain Python list.
features = df.columns

In [22]:
# For every feature, record the distinct values observed in its column, in the order
# returned by value_counts() (most frequent value first).
# The column is built in a single constructor call: assigning cell by cell through
# response_list['responses'][feature] is chained indexing, which writes to a temporary
# object (and therefore silently does nothing) when pandas Copy-on-Write is active.
response_list = pd.DataFrame(
    {'responses': [list(df[feature].value_counts().index) for feature in features]},
    index=features,
)

In [23]:
# Total number of rows (observations) in the data set
observation_count = len(df)

In [24]:
# Empty one-column table, one row per feature, that will hold each feature's support
support_list = pd.DataFrame(index=features, columns=['support'])

In [25]:
# Support of a feature = column sum / total observation count.
# (Assumes each column is a 0/1 or boolean indicator, so the sum is the number of
# observations containing the item - TODO confirm against the data file.)
# The column sums are computed once and assigned as a whole column, aligned on the
# feature index. The previous per-feature loop re-summed the entire frame on every
# iteration and wrote through chained indexing (support_list['support'][feature] = ...),
# which does not update the frame when pandas Copy-on-Write is active.
support_list['support'] = df.sum() / observation_count

In [26]:
# Pair support: fraction of observations in which both features of a pair are present.
# Assumes df is a one-hot basket matrix where 1 (or True) marks presence - the same
# assumption the sum-based single-item support relies on. TODO confirm.
#
# Why not look up the "present" value as value_counts().index[1] per feature:
#   * a column with a single distinct value has no index[1] (IndexError);
#   * for an item present in more than half of the rows, index[1] is the *absent* value.
# Why not fill the matrix cell by cell: pair_support_matrix[x][y] = ... is chained
# indexing (a no-op under pandas Copy-on-Write) and needs one full-frame filter per pair.
#
# present.T.dot(present) counts, for every pair of columns, the rows where both are 1.
present = df.eq(1).astype(int)
pair_support_matrix = present.T.dot(present) / observation_count
pair_support_matrix.name = "Pair Support"

In [31]:
# Confidence (conditional probability) for each pair of features:
#   confidence_matrix[x][y] = pair_support(x, y) / support(x)
# i.e. column x holds the pair supports divided by the support of x.
# Done as one column-aligned division instead of a nested loop with chained-indexing
# assignment (confidence_matrix[x][y] = ...), which does not write back to the frame
# when pandas Copy-on-Write is active. astype(float) normalises object-dtype inputs.
confidence_matrix = pair_support_matrix.astype(float).div(
    support_list['support'].astype(float), axis='columns'
)
confidence_matrix.name = "Conditional Probability"

In [32]:
# Lift for each pair of features:
#   lift_matrix[x][y] = confidence_matrix[x][y] / support(y)   for x != y
#   lift_matrix[x][x] = 0                                      (self-pairs are excluded)
# Row y of the confidence matrix is divided by support(y) via broadcasting, then the
# diagonal is zeroed. This replaces a nested loop that wrote through chained indexing
# (lift_matrix[x][y] = ...), which does not update the frame under pandas Copy-on-Write.
lift_values = (
    confidence_matrix.to_numpy(dtype=float)
    / support_list['support'].to_numpy(dtype=float)[:, np.newaxis]
)
np.fill_diagonal(lift_values, 0.0)
lift_matrix = pd.DataFrame(lift_values, index=features, columns=features)
lift_matrix.name = "Lift"

In [33]:
def build_network_viz(matrix,threshold,layout,**kwargs):
    """Plot an interactive network of the feature pairs whose measure exceeds a threshold.

    Only the strictly lower triangle of ``matrix`` is used (the diagonal and upper
    triangle are masked out), so each unordered pair contributes at most one edge and,
    for a non-symmetric matrix, only one of its two directions is considered.

    Prints how many candidate edges the threshold removed and the ten strongest
    remaining pairs, then shows the figure. Nothing is returned.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Square pairwise-measure table with the same labels on rows and columns.
        If it carries a ``name`` attribute, that text is used in the figure title.
    threshold : number
        Pairs are kept only when their measure is strictly greater than this value.
    layout : callable
        A networkx layout function taking the graph and returning ``{node: (x, y)}``.
        Suggested layouts: nx.spring_layout, nx.fruchterman_reingold_layout,
        nx.circular_layout, nx.random_layout, nx.spectral_layout.
        Documentation about layout types here:
        https://networkx.github.io/documentation/stable/reference/drawing.html#module-networkx.drawing.layout
    **kwargs
        weighted : bool, default False
            When true, the layout is called with ``weight='measure'``. Layouts that
            do not accept a ``weight`` keyword will raise ``TypeError`` in that case.
    """
    weighted = kwargs.get('weighted', False)
    # The title uses an ad-hoc `.name` attribute; fall back to a generic label rather
    # than failing after all the work has been done.
    matrix_label = str(getattr(matrix, 'name', 'matrix'))

    # Long-format edge list (x, y, measure) from the strictly lower triangle.
    # Axis names are set explicitly instead of relying on the default
    # 'level_0'/'level_1' labels, and masked cells are dropped explicitly because
    # newer stack() implementations keep NaN rows.
    upper_and_diagonal = np.triu(np.ones(matrix.shape, dtype=bool))
    stack = (
        matrix.mask(upper_and_diagonal)
        .stack()
        .dropna()
        .astype(float)
        .rename_axis(index=['x', 'y'])
        .rename('measure')
        .reset_index()
    )
    orig_size = stack.shape[0]
    stack = stack[stack['measure']>threshold]
    thresh_size = stack.shape[0]
    print("Removed",orig_size-thresh_size,"edges from stack. Original edge count:",orig_size,"| New edge count:",thresh_size)
    print("Top 10 pairs:",stack.sort_values(by='measure',ascending=False).head(10))

    # Create a networkx graph from the list of pairs
    G=nx.from_pandas_edgelist(stack, 'x', 'y', ['measure'])

    # Generate position data for each node
    if weighted:
        pos=layout(G, weight='measure')
        weighted_label="weighted"
    else:
        pos=layout(G)
        weighted_label="unweighted"

    # Edge coordinates: every segment is (start, end, None); the None breaks the line
    # so all edges can be drawn by a single Scatter trace.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Draw edges
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Node coordinates, connection counts and hover text, gathered in one pass.
    # G.adjacency() yields (node, neighbours) pairs, so the node label is available
    # directly without rebuilding list(G.nodes()) on every iteration.
    node_x = []
    node_y = []
    node_adjacencies = []
    node_text = []
    for node, neighbours in G.adjacency():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        connection_count = len(neighbours)
        node_adjacencies.append(connection_count)
        node_text.append('Feature: '+str(node)+'<br>Connections: '+str(connection_count))

    # Draw nodes, coloured by their number of connections
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=node_adjacencies,
            size=10,
            colorbar=dict(
                thickness=15,
                # Nested title dict replaces the deprecated `titleside` attribute
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line=dict(width=2)))

    # Draw figure
    title_text = 'Feature Graph of '+matrix_label+'<br><i>(thresheld at '+str(threshold)+", "+weighted_label+")</i>"
    fig = go.Figure(data=[edge_trace,node_trace],
             layout=go.Layout(
                # Nested title dict replaces the deprecated `titlefont_size` attribute
                title=dict(text=title_text, font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=120),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
    # Show figure
    fig.show()

In [34]:
# Confidence graph: keep pairs whose measure is above 0.05 and pass the measure to the
# Fruchterman-Reingold layout as the edge weight.
build_network_viz(
    matrix=confidence_matrix,
    threshold=0.05,
    layout=nx.fruchterman_reingold_layout,
    weighted=True,
)

Removed 11460 edges from stack. Original edge count: 14196 | New edge count: 2736
Top 10 pairs:                            x                      y   measure
14022          shopping bags   sound storage medium         1
5557   fruit/vegetable juice              baby food         1
7723                 waffles              baby food         1
7847                cake bar              baby food         1
8098               chocolate              baby food         1
13958          shopping bags              baby food         1
7118             salty snack              baby food         1
1530              rolls/buns                tidbits  0.521739
8730                   candy  preservation products       0.5
9400            dish cleaner  preservation products       0.5


In [35]:
# Lift graph: keep only pairs whose lift is above 30, laid out with an unweighted
# spring layout.
build_network_viz(
    matrix=lift_matrix,
    threshold=30,
    layout=nx.spring_layout,
)

Removed 14162 edges from stack. Original edge count: 14196 | New edge count: 34
Top 10 pairs:                            x                       y  measure
5134            cocoa drinks   preservation products  223.523
4661               baby food       finished products  153.672
4733               baby food                   soups  146.791
9537        abrasive cleaner   preservation products    140.5
10912         baby cosmetics                   cream   126.09
12933   sound storage medium  frozen potato products  118.494
14073                   bags                 tidbits  106.902
3562   preservation products                  spices  96.4216
11947        kitchen utensil                    fish  84.7845
7847                cake bar               baby food  75.6538
