## Imports

In [1]:
# tables (dataframes)
import pandas as pd
# numerics
import numpy as np

In [2]:
# plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as po

In [3]:
# graphs
import networkx as nx

## Function Definition

In [4]:
def autocomplete_product(stump):
    """ Autocomplete (Case Insensitive) Product Name and provide ID

    Prints one "<name> - <id>" line for every distinct product name that contains stump.
    Relies on the module-level `raw` dataframe and `name_to_id` lookup.

    :stump -> substring to search for in the product names
    """
    # lowercase the search term once instead of once per product
    needle = stump.lower()
    # for each product
    for x in raw.product_name.unique():
        # if it contains provided stump
        if needle in x.lower():
            # print name and id
            print(f"{x} - {name_to_id[x]}")

In [5]:
def create_graph(df, name, plot=False, offline=True, notebook=False):
    """
    Create Graph of products. Node size ~ log of total sales. Edge width ~ shared sales adjusted for total sales.

    Relies on the module-level lookups `id_to_name` (product id -> product name)
    and `prod_count` (product id -> number of order rows).

    :df -> Dataframe to generate edges from (columns product_x, product_y and weight)
    :name -> file suffix for html page
    :plot -> unused; kept so that existing callers keep working
    :offline -> create html page with plot and display as new tab
    :notebook -> display in notebook
    """
    # local import: only needed to prepare the output directory for the offline export
    import os

    # create graph from edgelist (all remaining columns become edge attributes)
    G = nx.from_pandas_edgelist(df, source="product_x", target="product_y", edge_attr=True)
    # position nodes using Fruchterman-Reingold force-directed algorithm
    pos_ = nx.spring_layout(G)

    # generate plotly trace for every edge
    edge_trace = []
    for prod_1, prod_2, attrs in G.edges(data=True):
        weight = attrs['weight']
        # skip edges without positive weight ("not >" also skips NaN weights)
        if not weight > 0:
            continue

        x0, y0 = pos_[prod_1]
        x1, y1 = pos_[prod_2]
        # generate hovertext (only displays on starting point. Bug in Plotly https://community.plotly.com/t/plotly-hover-event-not-getting-triggered-for-al-data-points/387)
        text = f"{id_to_name[prod_2]}--{id_to_name[prod_1]}: {weight}"
        # calculate adjusted width: shared sales relative to the combined sales of both products
        width = weight / (0.2 * prod_count[prod_1] + 0.2 * prod_count[prod_2])
        # generate trace and append to edge_trace list
        edge_trace.append(
            go.Scatter(x = [x0, x1, None], y = [y0, y1, None],
                       line      = dict(width=width, color='green'),
                       hoverinfo = 'text',
                       text      = [text],
                       mode      = 'lines')
        )

    # collect node attributes in one pass (avoids rebuilding tuples for every node)
    nodes = list(G.nodes())
    node_trace = go.Scatter(x = [pos_[node][0] for node in nodes],
                            y = [pos_[node][1] for node in nodes],
                            text      = [id_to_name[node] for node in nodes],
                            textposition = "top center",
                            textfont_size = 10,
                            mode      = 'markers+text',
                            hoverinfo = 'none',
                            marker    = dict(color=['cornflowerblue'] * len(nodes),
                                             # log scale: a product with a single order row gets size 0
                                             size=[np.log(prod_count[node]) for node in nodes],
                                             line=None))
    # create layout with transparent background
    layout = go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'
    )
    # create figure
    fig = go.Figure(layout = layout)

    # add edge traces first so that the nodes are drawn on top of them
    for trace in edge_trace:
        fig.add_trace(trace)

    # add node trace
    fig.add_trace(node_trace)

    # cleanup layout
    fig.update_layout(showlegend = False)
    fig.update_xaxes(showticklabels = False)
    fig.update_yaxes(showticklabels = False)
    # display in specified way(s)
    if notebook:
        fig.show()
    if offline:
        # make sure the target directory exists, otherwise the export fails after all the work
        os.makedirs('networks', exist_ok=True)
        po.plot(fig, filename=f'networks/network_{name}.html')

In [6]:
def cl_department(department, data=None):
    """
    Create co-occurrence list for a department

    Two products co-occur when they appear in the same order. The result has one
    row per unordered product pair (product_x < product_y) with the number of
    shared order rows in column weight, sorted ascending by weight.

    :department -> department for which to filter for
    :data -> optional dataframe with columns department, order_id and product_id;
             defaults to the module-level `raw` dataframe
    """
    source = raw if data is None else data
    # filter to the department and keep only the needed columns under short names
    pairs = source.loc[source["department"] == department, ["order_id", "product_id"]]\
        .rename(columns={"order_id": "order", "product_id": "product"})
    # merge to self: one row per ordered product pair within the same order
    merged = pairs.merge(pairs, on=['order'])
    # keep each unordered pair once; this also drops the reflexive relationships.
    # Filtering before grouping yields the same rows as filtering afterwards,
    # but only half of the pairs have to be grouped.
    merged = merged[merged["product_x"] < merged["product_y"]]
    # group by both products and count occurrences, rename, sort and reset index
    df_coocc = merged.groupby(['product_x', 'product_y'], as_index=False).count()\
        .rename(columns={'order': 'weight'})\
        .sort_values(by="weight", ignore_index=True)
    return df_coocc

In [7]:
def graph_product(product_id, top_n=25, offline=True, notebook=False):
    """
    Draw co-occurrence graph for specified product

    Reads the orders from the module-level `raw` dataframe and hands the
    strongest relationships to create_graph.

    :product_id -> center of graph
    :top_n -> number of relationships to include (by absolute number of co-occurrences)
    :offline -> create html page with plot and display as new tab
    :notebook -> display in notebook
    """
    # keep only the needed columns under short names
    df = raw[["order_id", "product_id"]]\
        .rename(columns={"order_id": "order", "product_id": "product"})
    # only orders that contain the product can contribute a relationship,
    # so restrict to them before the (quadratic) self merge
    relevant_orders = df.loc[df["product"] == product_id, "order"]
    df = df[df["order"].isin(relevant_orders)]
    # merge to self: one row per ordered product pair within the same order
    merged = df.merge(df, on=['order'])
    # filter to pairs that involve the product; boolean masks instead of a query
    # string so that non-numeric product ids work as well
    involves_product = (merged["product_x"] == product_id) | (merged["product_y"] == product_id)
    # keep each unordered pair once; this also drops the reflexive relationships
    unordered_pair = merged["product_x"] < merged["product_y"]
    # group by both products and count occurrences, rename, sort and reset index
    df_coocc = merged[involves_product & unordered_pair]\
            .groupby(['product_x', 'product_y'], as_index=False).count()\
            .rename(columns={'order': 'weight'})\
            .sort_values(by="weight", ignore_index=True)
    # draw graph of the top_n strongest relationships (list is sorted ascending)
    create_graph(df_coocc.tail(top_n), product_id, offline=offline, notebook=notebook)

## Preprocessing

In [8]:
# read raw data
# (the cells of this notebook access the columns order_id, product_id, product_name and department)
raw = pd.read_parquet("shoppingcarts.parquet")

In [9]:
# create lookup tables for id to name and vice versa
# Build both from the same de-duplicated (id, name) rows: zipping two independent
# unique() arrays misaligns every following pair as soon as two ids share a name
# (or one id appears with two names).
products = raw[["product_id", "product_name"]].drop_duplicates()
id_to_name = dict(zip(products["product_id"], products["product_name"]))
name_to_id = dict(zip(products["product_name"], products["product_id"]))

In [10]:
# count number of orders per product (rows with a non-null order_id per product_id)
# used by create_graph to scale node sizes and edge widths
prod_count = raw.groupby("product_id")["order_id"].count()

### Out of Context Wordcloud

In [None]:
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize

# tokenize the product names of the first 10000 rows
product_names = raw.head(10000)['product_name']
corpus = word_tokenize(' '.join(product_names))
# build the cloud from the re-joined tokens (fixed random_state, at most 3000 words)
wc = WordCloud(
    background_color='white',
    random_state=1,
    max_words=3000,
    width=1200,
    height=1200,
)
wc.generate(' '.join(corpus))

# render the cloud as an image
plt.figure(figsize=[10,10])
plt.imshow(wc, interpolation="bilinear")
plt.show()

### Sales By Department

In [None]:
# bar chart of the number of order rows per department, sorted ascending
px.bar(raw.groupby("department")["order_id"].count().sort_values())

### Graphs

### Top 5000 Product Pairs per Department (no need to run again)

In [None]:
import os

# make sure the output directory exists before writing into it
os.makedirs("cls", exist_ok=True)
# for every department
for department in raw.department.unique():
    # create co-occurrence list and store it as csv
    # (forward slash: portable across operating systems and avoids the invalid "\c" escape sequence)
    cl_department(department).to_csv(f"cls/cl_{department}.csv")

In [None]:
# for every department
for department in raw.department.unique():
    # read co-occurrence list and keep the 5000 strongest pairs (the list is sorted ascending by weight)
    # (forward slash: portable across operating systems and avoids the invalid "\c" escape sequence)
    df_coocc = pd.read_csv(f"cls/cl_{department}.csv", index_col="Unnamed: 0").tail(5000)
    # draw graph
    create_graph(df_coocc, name=department)

### Exciting part

In [11]:
# Use this to find product ids:
# prints every product whose name contains the given stump (case insensitive)
autocomplete_product("Stracciatella")

Lindor Stracciatella White Chocolate Truffles - 25268
Stracciatella Gelato - 38741


In [12]:
# create product graph around one product (id found via autocomplete_product)
# top_n -> number of strongest relationships to include
# offline -> create html page with plot and display as new tab
# notebook -> display in notebook
graph_product(product_id=38741, top_n=25, offline=True, notebook=False)