# Preparation

In [1]:
import os

import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as po

In [2]:
def autocomplete_product(stump):
    """ Autocomplete (Case Insensitive) Product Name and provide ID

    Prints one "<name> - <id>" line for every distinct product name in the
    notebook-level frame `raw` that contains `stump`. Ids are looked up in the
    notebook-level dict `name_to_id`. An empty stump matches every product.

    :stump -> fragment of the product name to search for
    """
    # lower-case the search term once instead of once per product
    needle = stump.lower()
    # missing names cannot be lower-cased, so they are skipped
    for product in raw.product_name.dropna().unique():
        if needle in product.lower():
            # print name and id
            print(f"{product} - {name_to_id[product]}")

In [3]:
def create_graph(df, name, plot=False, offline=True, notebook=False, nf=0.05, seed=None):
    """
    Create Graph of products. Node size ~ log of total sales. Edge width ~ shared sales adjusted for total sales.

    Relies on two notebook-level lookups: `id_to_name` (product id -> product name)
    and `prod_count` (product id -> number of order rows).

    :df -> Dataframe to generate edges from. Needs the columns "product_x",
    "product_y" and "weight"; any further column becomes an edge attribute
    :name -> file suffix for html page
    :plot -> unused; kept so that existing calls keep working
    :offline -> create html page with plot and display as new tab
    :notebook -> display in notebook
    :nf -> normalizing factor for path strength. Smaller values => thicker edges
    :seed -> optional seed handed to the layout algorithm so the same input gives
    the same picture. None (default) keeps the random layout
    """
    # create graph from edgelist
    G = nx.from_pandas_edgelist(df, source="product_x", target="product_y", edge_attr=True)
    # position nodes using Fruchterman-Reingold force-directed algorithm
    pos_ = nx.spring_layout(G, seed=seed)

    # generate plotly trace for every edge
    edge_trace = []
    for prod_1, prod_2, attrs in G.edges(data=True):
        weight = attrs['weight']
        # edges without a positive weight (including NaN) are not drawn
        if not weight > 0:
            continue

        x0, y0 = pos_[prod_1]
        x1, y1 = pos_[prod_2]
        # generate hovertext (only displays on starting point. Bug in Plotly https://community.plotly.com/t/plotly-hover-event-not-getting-triggered-for-al-data-points/387)
        text = f"{id_to_name[prod_2]}--{id_to_name[prod_1]}: {weight}"
        # calculate adjusted width: shared sales relative to the combined sales of both products
        width = weight/(nf*prod_count[prod_1]+nf*prod_count[prod_2])
        # one trace per edge, because line width is a per-trace setting
        edge_trace.append(go.Scatter(x = [x0, x1, None], y = [y0, y1, None],
                                     line      = dict(width=width, color='green'),
                                     hoverinfo = 'text',
                                     text      = [text],
                                     mode      = 'lines'))

    # collect node attributes in one pass and build the node trace once
    nodes = list(G.nodes())
    node_trace = go.Scatter(x = [pos_[node][0] for node in nodes],
                            y = [pos_[node][1] for node in nodes],
                            text      = [id_to_name[node] for node in nodes],
                            textposition = "top center",
                            textfont_size = 10,
                            mode      = 'markers+text',
                            hoverinfo = 'none',
                            marker    = dict(color=['cornflowerblue'] * len(nodes),
                                             # log scale compresses the range of sales counts
                                             size=[np.log(prod_count[node]) for node in nodes],
                                             line=None))
    # create layout with transparent background
    layout = go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'
    )
    # create figure: edges first, nodes last so that markers are drawn on top
    fig = go.Figure(data = [*edge_trace, node_trace], layout = layout)

    # cleanup layout
    fig.update_layout(showlegend = False)
    fig.update_xaxes(showticklabels = False)
    fig.update_yaxes(showticklabels = False)
    # display in specified way(s)
    if notebook:
        fig.show()
    if offline:
        # the html file is written into ./networks, which may not exist yet
        os.makedirs('networks', exist_ok=True)
        po.plot(fig, filename=f'networks/network_{name}.html')

In [4]:
def cl_department(department):
    """
    Create Cooccurrency List for Department

    Reads the notebook-level frame `raw` and counts, for every unordered pair of
    distinct products of the department, how often the pair shares an order.

    :department -> department for which to filter for
    :returns -> Dataframe with the columns "product_x", "product_y" (always
    product_x < product_y) and "weight" (number of shared order rows), sorted by
    ascending weight with a fresh 0..n-1 index
    """
    # filter to the department and rename without touching `raw`
    baskets = raw.loc[raw.department == department, ["order_id", "product_id"]]\
        .rename(columns={"order_id":"order", "product_id":"product"})
    # merge to self: one row per ordered pair of products within an order
    pairs = baskets.merge(baskets, on=['order'])
    # keep one orientation per pair before counting. This also drops the reflexive
    # pairs, and the group-by only sees the rows that end up in the result
    pairs = pairs[pairs['product_x'] < pairs['product_y']]
    # group by both products and count occurrences, rename, sort and reset index
    df_coocc = pairs.groupby(['product_x','product_y'], as_index=False).count()\
        .rename(columns={'order':'weight'})\
        .sort_values(by="weight", ignore_index=True)
    return df_coocc

In [5]:
def graph_product(product_id, top_n=25, filter_dict=None, offline=True, notebook=False, nf=0.05):
    """
    Draw Cooccurrency Graph for specified Product

    Reads the notebook-level frame `raw`, counts how often every other product
    shares an order with `product_id` and hands the strongest relationships to
    `create_graph`.

    :product_id -> center of graph. It is inserted into a query expression as-is,
    so it has to be a numeric id
    :filter_dict -> limit by condition passed as dict. To filter to county Alpine,
    pass filter_dict = {"county": "Alpine"}
    :top_n -> number of relationships to include (by absolute number of cooccurrencies)
    :offline -> create html page with plot and display as new tab
    :notebook -> display in notebook
    :nf -> normalizing factor for path strength. Smaller values => thicker edges
    """
    # every step below builds a new frame, so `raw` itself is never modified
    df = raw
    # filter rows that dont satisfy conditions if filters are passed
    if filter_dict:
        for column, wanted in filter_dict.items():
            df = df[df[column] == wanted]
    # keep and rename the two columns needed
    baskets = df[["order_id", "product_id"]]\
        .rename(columns={"order_id":"order", "product_id":"product"})
    # only orders that contain the product can contribute a pair, so restrict to
    # those before the self-merge instead of joining every order with itself
    hit_orders = baskets.query(f'product == {product_id}')["order"].unique()
    baskets = baskets[baskets["order"].isin(hit_orders)]
    # create cooccurrency list
    # steps:
    # merge to self and filter to pairs that involve the product
    # keep one orientation per pair (also drops reflexive relationships)
    # group by both products and count occurrences
    # rename columns
    # sort and reset index
    df_coocc = baskets.merge(baskets, on=['order'])\
            .query(f'product_x == {product_id} | product_y == {product_id}')\
            .query('product_x < product_y')\
            .groupby(['product_x','product_y'], as_index=False).count()\
            .rename(columns={'order':'weight'})\
            .sort_values(by="weight", ignore_index=True)
    # draw graph of the top_n strongest relationships (the frame is sorted ascending)
    create_graph(df_coocc.tail(top_n), product_id, offline=offline, notebook=notebook, nf=nf)

In [6]:
# Load the shopping-cart data that every function and cell below reads as `raw`.
# The path is relative to the notebook's working directory.
DATA_FILE = "shoppingcarts.parquet"
raw = pd.read_parquet(DATA_FILE)

In [7]:
# create lookup tables for id to name and vice versa
# Each table is built from de-duplicated (key, value) row pairs, so a key is always
# mapped to a value taken from one of its own rows. Zipping two independent
# .unique() arrays is only aligned while ids and names are strictly 1:1.
def _lookup(key, value):
    """Map every distinct raw[key] to raw[value] of the first row it occurs in."""
    pairs = raw[[key, value]].drop_duplicates(subset=key)
    return dict(zip(pairs[key], pairs[value]))

# for product
id_to_name = _lookup("product_id", "product_name")
name_to_id = _lookup("product_name", "product_id")
# for aisle
aisle_id_to_name = _lookup("aisle_id", "aisle")
aisle_name_to_id = _lookup("aisle", "aisle_id")
# for department
dep_id_to_name = _lookup("department_id", "department")
dep_name_to_id = _lookup("department", "department_id")

In [8]:
# Order counts per product, aisle, department and county.
# Each result is a Series indexed by the grouping column; its values are the
# number of non-null order_id entries in that group.
prod_count, aisle_count, dep_count, county_count = (
    raw.groupby(column)["order_id"].count()
    for column in ("product_id", "aisle_id", "department_id", "county")
)

# Live Demo

In [33]:
# Use this to find Product-Ids
# shows all products containing stub. Case insensitive
# An empty stub matches every product name and prints the whole catalogue,
# so search for a fragment of the name instead.
autocomplete_product("chili con carne")

Original Chili con Carne with Beans - 35621
Original W/Beans Chili Con Carne - 21307
Chunky W/Beans Chili Con Carne - 47726
Chili Con Carne No Bean - 42158
Original Chili Con Carne With Beans - 25075
Chili Con Carne With Beans - 24989
Chili Con Carne - 14132
Hot Chili Con Carne With Beans - 18866


In [30]:
# create product graph
# offline -> create html page with plot and display as new tab
# notebook -> display in notebook
# product_id has to be filled in for the cell to run; 14132 is "Chili Con Carne"
# in the lookup output above. Replace it with any id found via autocomplete_product.
graph_product(product_id=14132, top_n=50, filter_dict=None, nf=0.03)