<a href="https://colab.research.google.com/github/khamzovich/visualization/blob/main/plotly_sankey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: mount Google Drive so that files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install chart-studio --quiet

[?25l[K     |█████                           | 10 kB 21.1 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 17.4 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 11.3 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 4.8 MB/s eta 0:00:01[K     |█████████████████████████▍      | 51 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████████████▌ | 61 kB 5.8 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 1.9 MB/s 
[?25h  Building wheel for retrying (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd
import numpy as np

import chart_studio
import chart_studio.plotly as py
import plotly.graph_objects as go

### Load Data

In [5]:
# Read the raw event log from the mounted Drive. The functions below rely on its
# 'user_id', 'event_timestamp' and 'name' columns.
# NOTE(review): absolute, user-specific path — point it at your own copy of the CSV.
events = pd.read_csv('/content/drive/MyDrive/CV_egabdrak/appendix/users_event_names.csv')
events.head(3)

Unnamed: 0,event_timestamp,user_id,name
0,1641408689791774,61ae53c84d1d22aef9160881,review_start
1,1641408720551338,61ae53c84d1d22aef9160881,review_finish
2,1641368139229908,288780102,recommendation_open


In [6]:
# Distinct event names in the log; any of them can serve as the starting step of a journey.
events.name.unique()

array(['review_start', 'review_finish', 'recommendation_open', 'faq_open',
       'alert_open', 'registration_success', 'share_open',
       'profile_completed', 'read_open', 'video_start', 'video_live_open',
       'video_tab_open', 'video_subscribe'], dtype=object)

### Sankey Diagram

[User journey (sankey) diagram](https://medium.com/multiplyai/user-journey-sankey-diagram-25bb1aa42484)

#### functions

1) Define a starting point and return the first n_steps for each user

In [7]:
def filter_starting_step(x, starting_step, n_steps):
  """
  Cut one user's ordered list of events down to the journey of interest.

  The journey begins at the first occurrence of "starting_step" in "x" and spans at most
  "n_steps" events (fewer when the list ends earlier). A ValueError is raised by
  list.index when "starting_step" does not occur in "x".
  """
  first = x.index(starting_step)
  last = first + n_steps

  return x[first:last]

In [8]:
def user_journey(events, starting_step, n_steps=5, events_per_step=10, id='user_id'):
  """
  Function used to map out the journey for each user starting from the defined "starting_step" and count
  how many identical journeys exist across users.

  Parameters
  ----------
  events : pd.DataFrame
      Event log with the columns 'event_timestamp', 'name' and the column named by "id".
  starting_step : str
      Event name every journey starts from (its first occurrence per user); users who never
      performed it are ignored.
  n_steps : int
      Maximum number of steps per journey, the starting step included.
  events_per_step : int
      Number of most frequent events kept per step; all other events of that step are
      relabelled "Other".
  id : str
      Name of the user identifier column.

  Returns
  -------
  pd.DataFrame
      One row per distinct journey: one column per step (labelled 0, 1, ...) holding
      "<step number>: <event>" strings, plus a 'count' column with the number of users that
      followed the journey. Journeys shorter than the longest one are padded with
      "<step number>: End". There are at most n_steps step columns; fewer when no user has
      a journey that long.

  Raises
  ------
  ValueError
      If n_steps is smaller than 1 or no user performed "starting_step".
  """
  if n_steps < 1:
    raise ValueError('n_steps must be at least 1, got {}'.format(n_steps))

  # sort events by time within each user
  events = events.sort_values([id, 'event_timestamp'])

  # find the users that have performed the starting_step
  valid_ids = events.loc[events['name'] == starting_step, id].unique()
  if len(valid_ids) == 0:
    raise ValueError('no user performed the starting step {!r}'.format(starting_step))

  # ordered list of events per user, trimmed to the journey that begins at starting_step
  journeys = (events[events[id].isin(valid_ids)]
              .groupby(id)['name']
              .agg(list)
              .apply(lambda x: filter_starting_step(x, starting_step=starting_step, n_steps=n_steps))
              )

  # one column per step; building the frame in one go avoids the slow row-wise apply(pd.Series)
  flow = pd.DataFrame(journeys.tolist(), index=journeys.index)

  # fill missing steps with "End" to denote no further step by user; this is filtered out later
  flow = flow.fillna('End')

  for step, col in enumerate(flow.columns, start=1):
    # add the step number as prefix to each step
    labelled = '{}: '.format(step) + flow[col].astype(str)

    # keep the "events_per_step" most frequent events of this step (plus the "End" marker) and
    # rename everything else to "Other", to avoid having too many nodes in the sankey diagram
    end_label = '{}: End'.format(step)
    other_label = '{}: Other'.format(step)
    ranked_events = [e for e in labelled.value_counts().index.tolist() if e != end_label]
    keep = set(ranked_events[:events_per_step]) | {end_label}

    # assign the result back instead of mutating a column selection in place: in-place
    # replace on flow[col] does not reach the parent frame under pandas Copy-on-Write
    flow[col] = labelled.where(labelled.isin(keep), other_label)

  # count the number of identical journeys; group on the step columns that actually exist,
  # which can be fewer than n_steps when every journey is shorter
  flow = (flow
          .groupby(list(flow.columns))
          .size()
          .reset_index(name='count')
          )

  return flow

3) Transform the journey DataFrame (built by `user_journey`, step 2 above) into counted source:target pairs

In [9]:
def sankey_df(events, starting_step, n_steps=5, events_per_step=10):
  """
  Function used to generate the dataframe needed to be passed to the sankey generation function.
  "source" and "target" column pairs denote links that will be shown in the sankey diagram.

  Parameters
  ----------
  events, starting_step, n_steps, events_per_step
      Passed unchanged to user_journey.

  Returns
  -------
  label_list : list of str
      Node labels ("<step number>: <event>"), step by step, in order of first appearance.
  colors_list : list of str
      One colour per label: 'grey' for the "Other" nodes, 'blue' for every other node.
  source_target_df : pd.DataFrame
      One row per link with the columns 'source', 'target', 'count', 'source_id' and
      'target_id' (the ids are positions in label_list). Links from or to an "End" node are
      left out. The frame is empty when the journeys have a single step.
  """
  # generate the user flow dataframe
  flow = user_journey(events, starting_step, n_steps, events_per_step)

  def _event_of(label):
    # "<step number>: <event>" -> "<event>"
    return label.split(': ', 1)[-1]

  # create the nodes labels list: every column but the trailing 'count' is one step
  cat_cols = flow.columns[:-1].tolist()
  label_list = []
  for cat_col in cat_cols:
    # unique() keeps the order of first appearance, so node order is stable between runs
    label_list.extend(flow[cat_col].unique().tolist())

  # create a list of colours for the nodes: 'grey' for "Other" nodes and 'blue' for the rest;
  # the event part is compared exactly so that an event merely containing "Other" stays blue
  colors_list = ['grey' if _event_of(label) == 'Other' else 'blue' for label in label_list]

  # transform flow df into source-target pairs, one frame per pair of consecutive steps
  pair_frames = []
  for source_col, target_col in zip(cat_cols[:-1], cat_cols[1:]):
    pairs = flow[[source_col, target_col, 'count']].copy()
    pairs.columns = ['source', 'target', 'count']
    pair_frames.append(pairs)

  if not pair_frames:
    # a single step has no links; return an empty frame with the usual columns
    no_links = pd.DataFrame({
        'source': pd.Series(dtype=object),
        'target': pd.Series(dtype=object),
        'count': pd.Series(dtype='int64'),
        'source_id': pd.Series(dtype='int64'),
        'target_id': pd.Series(dtype='int64'),
    })
    return label_list, colors_list, no_links

  # combine all steps once and sum up the users per link
  source_target_df = (pd.concat(pair_frames, ignore_index=True)
                      .groupby(['source', 'target'])['count']
                      .sum()
                      .reset_index()
                      )

  # add index for source-target pair (position of the label in label_list)
  label_ids = {}
  for position, label in enumerate(label_list):
    label_ids.setdefault(label, position)
  source_target_df['source_id'] = source_target_df['source'].map(label_ids).astype('int64')
  source_target_df['target_id'] = source_target_df['target'].map(label_ids).astype('int64')

  # filter out the end step; only the exact "<step number>: End" labels are dropped, so events
  # whose name merely contains "End" are kept
  end_labels = {label for label in label_list if _event_of(label) == 'End'}
  touches_end = (source_target_df['source'].isin(end_labels) |
                 source_target_df['target'].isin(end_labels))
  source_target_df = source_target_df[~touches_end]

  return label_list, colors_list, source_target_df

In [10]:
def plot_user_flow(events, starting_step, n_steps=5, events_per_step=10, title='Sankey Diagram'):
  """
  Build the sankey figure of the user journeys that start at "starting_step".

  The raw events are turned into node labels, node colours and source:target links by
  sankey_df; these are wrapped into a single sankey trace and returned as a plotly figure
  (the figure is not shown here, call .show() on the result).
  """
  label_list, colors_list, source_target_df = sankey_df(events, starting_step, n_steps, events_per_step)

  # nodes: one per label, coloured as decided by sankey_df
  nodes = dict(
      pad=20,
      thickness=20,
      color=colors_list,
      line=dict(color="black", width=0.5),
      label=label_list,
  )

  # links: node ids of both ends plus the number of users on the link
  links = dict(
      source=source_target_df['source_id'].tolist(),
      target=source_target_df['target_id'].tolist(),
      value=source_target_df['count'].astype(int).tolist(),
      hoverlabel=dict(bgcolor='#C2C4C7'),
  )

  trace = dict(type='sankey', node=nodes, link=links)

  # from 5 steps on, fix the window width so that steps are evenly spaced out;
  # below that, leave the width to plotly
  width = n_steps * 250 if n_steps >= 5 else None

  layout = dict(
      height=600,
      width=width,
      margin=dict(t=30, l=0, r=0, b=30),
      title=title,
      font=dict(size=10),
  )

  return go.Figure(data=[trace], layout=layout)

#### plot diagram

In [11]:
# Journeys that begin at 'registration_success': up to 5 steps per user, keeping the
# 15 most frequent events of each step (the remaining ones are grouped as "Other").
fig = plot_user_flow(events, 'registration_success', n_steps=5, events_per_step=15, title='Events')
fig.show()

#### upload diagrams

In [None]:
# Chart Studio credentials: 'USER_NAME' and 'API_KEY' are placeholders, replace them with your own.
# NOTE(review): do not commit a real API key with the notebook — consider reading it from an
# environment variable or prompting for it instead.
chart_studio.tools.set_credentials_file(username='USER_NAME', api_key='API_KEY')

In [None]:
# Upload the figure built above to Chart Studio; replace 'FILE_NAME' with the name to store it under.
py.plot(fig, filename = 'FILE_NAME', auto_open=True) # out: URL to diagram