In [1]:
import pandas as pd
# from retentioneering import datasets
from typing import Literal, Union, List, Optional, Iterable, get_args, Dict, Tuple
import numpy as np 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from abc import ABC, abstractmethod
import warnings

import datetime
import itertools

In [2]:
import sys

sys.path.append('D:\\SUSU\\diplom\\product\\insight_pulse')

for path in sys.path:
    print(path)

d:\SUSU\diplom\product\insight_pulse\features_beta\ux_research
d:\Prog\CondaProg\anaconda\python311.zip
d:\Prog\CondaProg\anaconda\DLLs
d:\Prog\CondaProg\anaconda\Lib
d:\Prog\CondaProg\anaconda

d:\Prog\CondaProg\anaconda\Lib\site-packages
d:\Prog\CondaProg\anaconda\Lib\site-packages\win32
d:\Prog\CondaProg\anaconda\Lib\site-packages\win32\lib
d:\Prog\CondaProg\anaconda\Lib\site-packages\Pythonwin
d:\Prog\CondaProg\anaconda\Lib\site-packages\IPython\extensions
C:\Users\george\.ipython
D:\SUSU\diplom\product\insight_pulse


In [3]:
from eventframing.eventframe import EventFrame
from eventframing.cols_schema import EventFrameColsSchema
from eventframing.event_type import EventType

from data_preprocessing.preprocessors_lib.add_start_end_events import AddStartEndEventsPreprocessor
from data_preprocessing.preprocessors_lib.split_sessions import SplitSessionsPreprocessor

from utils.time_units import TimeUnits
from utils.time_unit_period import TimeUnitPeriod

## Data Generation

In [4]:
def generate_user_events_data(num_users=1000, num_events=10, num_cities=10, num_os=2, num_rows=5000, seed=42):
    """
    Функция для генерации данных о событиях пользователей.
    """
    # Устанавливаем seed для генератора случайных чисел
    np.random.seed(seed)

    # Создаем список пользователей
    users = ['user' + str(i) for i in range(1, num_users+1)]

    # Создаем список событий
    events = ['event' + str(i) for i in range(1, num_events+1)]

    # Создаем список дат
    dates = [datetime.datetime.now() - datetime.timedelta(days=x) for x in range(180)]

    # Создаем список городов
    cities = ['city' + str(i) for i in range(1, num_cities+1)]

    # Создаем список операционных систем
    os = ['iOS', 'Android']

    statuses = ['status' + str(i) for i in range(1, num_os+1)]

    # Создаем DataFrame
    df = pd.DataFrame({
        'user_id': np.random.choice(users, num_rows),
        'event': np.random.choice(events, num_rows),
        'event_datetime': np.random.choice(dates, num_rows),
        'city': np.random.choice(cities, num_rows),
        'operation_system': np.random.choice(os, num_rows),
        'status': np.random.choice(statuses, num_rows)
    })

    return df

In [31]:
data = generate_user_events_data(num_rows=5000, num_users=5000)
cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
ef = EventFrame(data, cols_schema)
ef.to_dataframe().head()

EventFrameColsSchema(event_id=None, event_type=None, event_index=None, event_name=event, event_timestamp=event_datetime, user_id=user_id, session_id=None, cohort_group=None, custom_cols=[])


Unnamed: 0,user_id,event,event_datetime,city,operation_system,status,event_id,event_type,event_type_index
1916,user10,event1,2024-10-22 14:36:39.174220,city10,Android,status2,1916,raw,2
846,user10,event2,2024-12-09 14:36:39.174220,city10,Android,status1,846,raw,2
616,user10,event8,2025-03-29 14:36:39.173221,city5,iOS,status1,616,raw,2
3671,user1000,event6,2025-01-28 14:36:39.173221,city7,iOS,status2,3671,raw,2
1822,user1001,event10,2025-03-30 14:36:39.173221,city9,iOS,status2,1822,raw,2


## Implementation

In [None]:
class StepSankey:
    _path_end_event_name = 'ENDED'

    def __init__(self, ef: EventFrame):
        self.ef = ef
        self.cols_schema = ef.cols_schema
        self.weight_col = ''
        self.total_weight_col = ''
        self.step_weight_col = ''
        self.link_weight_col = ''
        self.nodes = None
        self.links = None

    def plot(self, data: Optional[EventFrame] = None, max_steps: int = 10, threshold: float = 0.05, weight_col: str = '',
            target_events: List[str] = None, title:str = '') -> None:
        step_matrix = self._fit(data=data, max_steps=max_steps, weight_col=weight_col)
        step_matrix, rare_events = self._threshold_events(step_matrix=step_matrix, threshold=threshold)

        self._plot(step_matrix, rare_events, max_steps=max_steps, title=title, target_events=target_events)

    def fit(self, data: Optional[EventFrame] = None, max_steps: int = 10, threshold: float = 0.05, 
            weight_col: str = '', events_to_keep: Optional[List[str]] = None) -> pd.DataFrame:
        data, _ = self._get_data_and_schema(data=data)
        prepared_data = self._prepare_data(data, max_steps, weight_col)
        nodes, rare_events = self._get_nodes(prepared_data, threshold, events_to_keep)
        links = self._get_links(prepared_data, nodes, rare_events)
        self.nodes = nodes
        self.links = links
        return nodes, links
    
    def _plot(self, data: Optional[EventFrame] = None, max_steps: int = 10, threshold: float = 0.05, 
            weight_col: str = '', events_to_keep: Optional[List[str]] = None, title: str = 'StepSankey'):
        nodes, links = self.fit(data, max_steps, threshold, weight_col, events_to_keep)

        fig = go.Figure(data=[go.Sankey(
            node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = nodes.step.astype(str) + nodes.event.astype(str),
            customdata = nodes.desc,
            hovertemplate = '%{customdata}',
            color = nodes.color
            ),
            link = dict(
            source = links.source_index,
            target = links.target_index,
            value = links[self.link_weight_col],
            hovertemplate='Step from %{source.label}<br />'+
            'to %{target.label}<br />made %{value} users',
        ))])

        fig.update_layout(title_text=title, font_size=10)
        fig.show()        
    
    def _get_data_and_schema(self, data: Optional[Union[EventFrame, pd.DataFrame]] = None, cols_schema: Optional[EventFrameColsSchema] = None) -> Tuple[pd.DataFrame, EventFrameColsSchema]:
        if data is None:
            print('data is None')
            data = self.ef.to_dataframe().copy()
            cols_schema = self.cols_schema
        else:
            if isinstance(data, EventFrame):
                cols_schema = data.cols_schema
                data = data.to_dataframe().copy()
            else:
                cols_schema = cols_schema
                data = data.copy()
        return data, cols_schema
    
    def _get_next_event(self, data: pd.DataFrame, weight_col: str, event_col: str) -> pd.DataFrame:
        grouped = data.groupby(weight_col)
        data['next_event'] = grouped[event_col].shift(-1)
        return data
    
    def _prepare_data(self, data: pd.DataFrame, max_steps: int, weight_col: str):
        data = data.copy()

        user_col = self.cols_schema.user_id
        event_col = self.cols_schema._event_name
        session_col = self.cols_schema.session_id
        dt_col = self.cols_schema.event_timestamp

        if weight_col != '':
            if weight_col not in data.columns:
                raise ValueError(f'Column {weight_col} is not in the EventFrame.')
        else:
            weight_col = session_col if session_col  is not None else user_col
        self.weight_col = weight_col
        self.total_weight_col = f'total_{weight_col}'
        self.step_weight_col = f'step_{weight_col}'
        self.link_weight_col = f'link_{weight_col}'

        # Add column with number of step in session or the whole path
        data = data.sort_values(by=[user_col, dt_col])
        data['step'] = data.groupby(weight_col)[dt_col].cumcount() + 1
        data = data[data['step'] <= max_steps]

        # Add path terminating event if user has less steps than max_steps
        data = data.pivot_table(index=weight_col, columns='step', values=event_col, aggfunc=lambda x: x)\
            .fillna(self._path_end_event_name).reset_index()
        
        # Unpivot table into original format but with terminating event 
        data = data.melt(id_vars=weight_col, var_name='step', value_name=event_col)
        # Add info about next event in path or session
        data = self._get_next_event(data, weight_col, event_col)
        return data
    
    def _get_nodes(self, prepared_data: pd.DataFrame, threshold: Union[float, int] = 0, events_to_keep: List[str] = None) -> pd.DataFrame:
        event_col = self.cols_schema._event_name
        if events_to_keep is None:
            events_to_keep = [self._path_end_event_name]
        else:
            events_to_keep.append(self._path_end_event_name)

        nodes = prepared_data.groupby(by=['step', event_col]).agg(**{
            self.step_weight_col: (self.weight_col, 'nunique')
        }).reset_index()

        total_weight_col_value = prepared_data.loc[prepared_data['step'] == 1, self.weight_col].nunique()
        nodes[self.total_weight_col] = total_weight_col_value
        nodes['pers_of_total'] = nodes[self.step_weight_col].divide(nodes[self.total_weight_col])

        threshold_metric = self.step_weight_col if isinstance(threshold, int) else 'pers_of_total'
        events_to_keep.extend(self._path_end_event_name)
        rare_events = nodes[(nodes[threshold_metric] < threshold) & (~nodes[event_col].isin(events_to_keep))]
        
        if not rare_events.empty:
            rare_events_idx = rare_events.index
            rare_events_replacers = rare_events.groupby('step').agg(**{
                'pers_of_total': ('pers_of_total', 'sum'),
                self.step_weight_col: (self.step_weight_col, 'sum'),
                'event': (event_col, lambda col: f'thresholded_{col.count()}')
            }).reset_index()
            rare_events_replacers[self.total_weight_col] = total_weight_col_value

            nodes = nodes.drop(index=rare_events_idx)
            nodes = pd.concat([
                nodes,
                rare_events_replacers,
            ], axis=0) 

            rare_events = pd.merge(
                rare_events.loc[:, ('step', event_col)],
                rare_events_replacers.loc[:, ('step', event_col)]\
                    .rename(columns={event_col: 'new_event_name'}),
                on='step'
            )

        all_events = nodes[event_col].unique()
        pallete = self._prepare_palette(all_events, event_col)

        nodes = pd.merge(
            nodes,
            pallete,
            on=event_col,
            how='inner'
        )

        nodes = nodes.sort_values(by=['step', self.step_weight_col], ascending=[True, True])
        nodes['index'] = list(range(nodes.shape[0]))
        nodes['desc'] = nodes.apply(lambda node: self._get_node_description(node, event_col), axis=1)        
        return nodes, rare_events
    
    def _get_node_description(self, node, event_col):
        desc = node[event_col] + ' ' + str(node[self.step_weight_col]) +\
            ' (' +  str(round(node['pers_of_total'] * 100, 1)) + '% of total)' 
        return desc
    
    def _replace_links_rare_events(self, links: pd.DataFrame, rare_events: pd.DataFrame) -> pd.DataFrame:
        if rare_events.empty:
            return links
        
        event_col = self.cols_schema.event_name
        replaced_links = pd.merge(
            links,
            rare_events,
            on=['step', event_col],
            how='left'  
        )

        replaced_links = pd.merge(
            replaced_links.assign(next_step=lambda x: x['step'] + 1),
            rare_events.rename(columns={event_col: 'next_event', 'step': 'next_step', 
                                        'new_event_name': 'new_next_event_name'}),
            on=['next_step', 'next_event'],
            how='left'
        )

        replaced_links[event_col] = replaced_links['new_event_name'].fillna(replaced_links[event_col])
        replaced_links['next_event'] = replaced_links['new_next_event_name'].fillna(replaced_links['next_event'])
        # replaced_links = replaced_links.drop(columns=['new_event_name', 'new_next_event_name'])
        return replaced_links
    
    def _get_links(self, prepared_data: pd.DataFrame, nodes: pd.DataFrame, rare_events: pd.DataFrame) -> pd.DataFrame:
        event_col = self.cols_schema._event_name

        links = prepared_data.groupby(by=['step', event_col, 'next_event']).agg(**{
            self.link_weight_col: (self.weight_col, 'nunique')
        }).reset_index()

        links = self._replace_links_rare_events(links, rare_events)

        links = pd.merge(
            links,
            nodes,
            on=['step', event_col],
            how='inner'
        )
        links['pers_of_step'] = links[self.link_weight_col].divide(links[self.step_weight_col])
        links = links.rename(columns={'index': 'source_index'})

        links = pd.merge(
            links.assign(next_step = lambda x: x['step'] + 1),
            nodes\
                .rename(columns={event_col: 'next_event', 'step': 'next_step'})\
                .loc[:, ('next_step', 'next_event', 'index')],
            on=['next_step', 'next_event'],
            how='left'
        )
        links = links.rename(columns={'index': 'target_index'})
        return links
    
    @staticmethod
    def _prepare_palette(all_events: list, event_col: str) -> list[tuple]:
        palette_hex = [
            "50BE97",  # Нежный зеленый
            "E4655C",  # Красный
            "FCC865",  # Яркий желтый
            "BFD6DE",  # Светло-голубой
            "3E5066",  # Темный синий
            "353A3E",  # Темный серый
            "E6E6E6",  # Светло-серый
            "6D4C41",  # Коричневый
            "FFD54F",  # Лимонно-желтый
            "4DB6AC"   # Бирюзовый
        ]
        # convert HEX to RGB
        palette = []
        for color in palette_hex:
            rgb_color = tuple(int(color[i : i + 2], 16) for i in (0, 2, 4))
            palette.append(rgb_color)

        # extend color palette if number of events more than default colors list
        complementary_palette = sns.color_palette("deep", len(all_events) - len(palette))
        if len(complementary_palette) > 0:
            colors = complementary_palette.as_hex()
            for c in colors:
                col = c[1:]
                palette.append(tuple(int(col[i : i + 2], 16) for i in (0, 2, 4)))

        # return palette, all_events

        palette = pd.DataFrame({event_col: all_events, 'color': palette})
        palette['color'] = 'rgb' + palette['color'].astype(str).str.replace(' ', '')
        return palette



In [78]:
ss = StepSankey(ef)

In [53]:
(ss._prepare_data(data, max_steps=3, weight_col='user_id').user_id.value_counts() != 3).sum()
prd = ss._prepare_data(data, max_steps=3, weight_col='user_id')
prd[(prd['step'] == 5) & (prd['event'] == 'event1')]
# ss.step_weight_col

Unnamed: 0,user_id,step,event,next_event


In [38]:
data['user_id'].nunique() * 3

9264

In [59]:
data.apply(lambda x: print(x, type(x)))

0        user861
1       user3773
2       user3093
3        user467
4       user4427
          ...   
4995    user2052
4996    user3399
4997    user3951
4998     user306
4999     user908
Name: user_id, Length: 5000, dtype: object <class 'pandas.core.series.Series'>
0        event1
1        event1
2        event1
3        event5
4       event10
         ...   
4995     event2
4996     event5
4997     event4
4998     event8
4999     event7
Name: event, Length: 5000, dtype: object <class 'pandas.core.series.Series'>
0      2025-02-05 14:36:39.173221
1      2024-11-28 14:36:39.174220
2      2024-12-29 14:36:39.174220
3      2025-03-26 14:36:39.173221
4      2024-12-06 14:36:39.174220
                  ...            
4995   2025-01-03 14:36:39.174220
4996   2024-12-17 14:36:39.174220
4997   2024-10-31 14:36:39.174220
4998   2025-04-07 14:36:39.173221
4999   2025-03-02 14:36:39.173221
Name: event_datetime, Length: 5000, dtype: datetime64[ns] <class 'pandas.core.series.Series'>
0       city4

user_id             None
event               None
event_datetime      None
city                None
operation_system    None
status              None
dtype: object

In [79]:
ss._plot(max_steps=10, threshold=5, events_to_keep=['event1', 'event2'])

data is None


In [72]:
ss.nodes

Unnamed: 0,step,event,step_user_id,total_user_id,pers_of_total,color,index,desc
22,1,event5,261,3088,0.084521,"rgb(53,58,62)",0,event5 261 (8.5% of total)
34,1,event8,285,3088,0.092293,"rgb(255,213,79)",1,event8 285 (9.2% of total)
14,1,event3,294,3088,0.095207,"rgb(191,214,222)",2,event3 294 (9.5% of total)
18,1,event4,303,3088,0.098122,"rgb(62,80,102)",3,event4 303 (9.8% of total)
26,1,event6,307,3088,0.099417,"rgb(230,230,230)",4,event6 307 (9.9% of total)
38,1,event9,315,3088,0.102008,"rgb(77,182,172)",5,event9 315 (10.2% of total)
9,1,event2,328,3088,0.106218,"rgb(252,200,101)",6,event2 328 (10.6% of total)
30,1,event7,328,3088,0.106218,"rgb(109,76,65)",7,event7 328 (10.6% of total)
0,1,event1,331,3088,0.107189,"rgb(80,190,151)",8,event1 331 (10.7% of total)
5,1,event10,336,3088,0.108808,"rgb(228,101,92)",9,event10 336 (10.9% of total)


In [26]:
n, l = ss.fit(max_steps=10)

data is None


In [27]:
n

Unnamed: 0,step,event,step_user_id,total_user_id,pers_of_total,index
1,1,event10,473,5000,0.0946,0
7,1,event7,475,5000,0.0950,1
0,1,event1,482,5000,0.0964,2
8,1,event8,491,5000,0.0982,3
9,1,event9,497,5000,0.0994,4
...,...,...,...,...,...,...
91,10,event10,501,5000,0.1002,95
95,10,event5,506,5000,0.1012,96
92,10,event2,510,5000,0.1020,97
99,10,event9,514,5000,0.1028,98


In [8]:
StepSankey(ef)._prepare_palette(data['event'].unique(), 'event')

Unnamed: 0,event,color
0,event5,"rgb(80,190,151)"
1,event4,"rgb(228,101,92)"
2,event7,"rgb(252,200,101)"
3,event9,"rgb(191,214,222)"
4,event10,"rgb(62,80,102)"
5,event3,"rgb(53,58,62)"
6,event2,"rgb(230,230,230)"
7,event1,"rgb(109,76,65)"
8,event6,"rgb(255,213,79)"
9,event8,"rgb(77,182,172)"


In [94]:
# data = generate_user_events_data(num_rows=50000, num_users=50)
# cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
# ef = EventFrame(data, cols_schema)

sm = StepSankey(ef)
pdt = sm._prepare_data(max_steps=5)
n, re = sm._get_nodes(pdt, threshold=10)
n
# sm.plot(max_steps=20, target_events=['event1', 'event2'],threshold=0.15)
# sm.plot(max_steps=10,  target_events=['event1', 'event2'], threshold=0.15, weight_col='user_id')


data is None


Unnamed: 0,step,event,step_user_id,total_user_id,pers_of_total,color,index
2,1,event2,37,470,0.078723,"rgb(252,200,101)",0
7,1,event7,37,470,0.078723,"rgb(109,76,65)",1
9,1,event9,41,470,0.087234,"rgb(77,182,172)",2
3,1,event3,44,470,0.093617,"rgb(191,214,222)",3
4,1,event4,44,470,0.093617,"rgb(62,80,102)",4
1,1,event10,50,470,0.106383,"rgb(228,101,92)",5
8,1,event8,52,470,0.110638,"rgb(255,213,79)",6
0,1,event1,54,470,0.114894,"rgb(80,190,151)",7
6,1,event6,54,470,0.114894,"rgb(230,230,230)",8
5,1,event5,57,470,0.121277,"rgb(53,58,62)",9


In [39]:
# re.groupby('step').transform(lambda x: f'thresholded_{x["event"].count()}')
re

Unnamed: 0,step,event,step_user_id,total_user_id,pers_of_total,new_event_name
0,2,event1,2,470,0.004255,thresholded_9
1,2,event10,4,470,0.008511,thresholded_9
2,2,event2,5,470,0.010638,thresholded_9
3,2,event3,6,470,0.012766,thresholded_9
4,2,event5,3,470,0.006383,thresholded_9
5,2,event6,4,470,0.008511,thresholded_9
6,2,event7,2,470,0.004255,thresholded_9
7,2,event8,1,470,0.002128,thresholded_9
8,2,event9,2,470,0.004255,thresholded_9
9,3,event9,1,470,0.002128,thresholded_1


In [30]:
re.groupby('step')['event']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002057FDDC050>

In [14]:
sm._get_nodes(pdt, threshold=10)

Unnamed: 0,step,event,step_user_id,total_user_id,pers_of_total,amount_of_events,color,index
2,1,event2,37,470,0.078723,,"rgb(252,200,101)",0
7,1,event7,37,470,0.078723,,"rgb(109,76,65)",1
9,1,event9,41,470,0.087234,,"rgb(77,182,172)",2
3,1,event3,44,470,0.093617,,"rgb(191,214,222)",3
4,1,event4,44,470,0.093617,,"rgb(62,80,102)",4
1,1,event10,50,470,0.106383,,"rgb(228,101,92)",5
8,1,event8,52,470,0.110638,,"rgb(255,213,79)",6
0,1,event1,54,470,0.114894,,"rgb(80,190,151)",7
6,1,event6,54,470,0.114894,,"rgb(230,230,230)",8
5,1,event5,57,470,0.121277,,"rgb(53,58,62)",9


In [95]:
l = sm._get_links(pdt, n, re)

In [92]:
# l['new_event_name'].fillna(l['event'])
l

Unnamed: 0,step,event,next_event,link_user_id,new_event_name,next_step,new_next_event_name,step_user_id,total_user_id,pers_of_total,color,source_index,pers_of_step,target_index
0,1,event1,ENDED,51,,2,,54,470,0.114894,"rgb(80,190,151)",7,0.944444,11.0
1,1,event1,event10,1,,2,thresholded_9,54,470,0.114894,"rgb(80,190,151)",7,0.018519,
2,1,event1,event2,1,,2,thresholded_9,54,470,0.114894,"rgb(80,190,151)",7,0.018519,
3,1,event1,event3,1,,2,thresholded_9,54,470,0.114894,"rgb(80,190,151)",7,0.018519,
4,1,event10,ENDED,48,,2,,50,470,0.106383,"rgb(228,101,92)",5,0.96,11.0
5,1,event10,event3,1,,2,thresholded_9,50,470,0.106383,"rgb(228,101,92)",5,0.02,
6,1,event10,event5,1,,2,thresholded_9,50,470,0.106383,"rgb(228,101,92)",5,0.02,
7,1,event2,ENDED,33,,2,,37,470,0.078723,"rgb(252,200,101)",0,0.891892,11.0
8,1,event2,event10,1,,2,thresholded_9,37,470,0.078723,"rgb(252,200,101)",0,0.027027,
9,1,event2,event2,1,,2,thresholded_9,37,470,0.078723,"rgb(252,200,101)",0,0.027027,


In [147]:
n[n['step'] == 1]['index']
n.assign(step = lambda x: x['step'] + 1)
n

Unnamed: 0,step,event,step_user_id,color,total_user_id,pers_of_total,index
35,1,event7,2,"rgb(62,80,102)",50,0.04,0
20,1,event4,3,"rgb(109,76,65)",50,0.06,1
10,1,event2,4,"rgb(77,182,172)",50,0.08,2
30,1,event6,4,"rgb(230,230,230)",50,0.08,3
45,1,event9,5,"rgb(255,213,79)",50,0.1,4
0,1,event1,6,"rgb(228,101,92)",50,0.12,5
25,1,event5,6,"rgb(53,58,62)",50,0.12,6
40,1,event8,6,"rgb(191,214,222)",50,0.12,7
5,1,event10,7,"rgb(80,190,151)",50,0.14,8
15,1,event3,7,"rgb(252,200,101)",50,0.14,9


In [148]:
n[(n['step'] == 19) & (n['event'] == 'event8')]

Unnamed: 0,step,event,step_user_id,color,total_user_id,pers_of_total,index


In [149]:
l

Unnamed: 0,step,event,next_event,link_user_id,step_user_id,color,total_user_id,pers_of_total,source_index,pers_of_step,target_index
0,2,event1,event10,1,6,"rgb(228,101,92)",50,0.12,5,0.166667,10
1,2,event1,event3,1,6,"rgb(228,101,92)",50,0.12,5,0.166667,17
2,2,event1,event4,2,6,"rgb(228,101,92)",50,0.12,5,0.333333,18
3,2,event1,event5,1,6,"rgb(228,101,92)",50,0.12,5,0.166667,19
4,2,event1,event6,1,6,"rgb(228,101,92)",50,0.12,5,0.166667,12
...,...,...,...,...,...,...,...,...,...,...,...
146,5,event8,event6,2,9,"rgb(191,214,222)",50,0.18,39,0.222222,49
147,5,event8,event9,3,9,"rgb(191,214,222)",50,0.18,39,0.333333,47
148,5,event9,event5,1,3,"rgb(255,213,79)",50,0.06,33,0.333333,46
149,5,event9,event6,1,3,"rgb(255,213,79)",50,0.06,33,0.333333,49


In [96]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = n.step.astype(str) + n.event.astype(str),
      color = n.color
    ),
    link = dict(
      source = l.source_index,
      target = l.target_index,
      value = l.link_user_id
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()