In [1]:
import pandas as pd
from datetime import datetime, timedelta
from explorer import EventExplorer

In [2]:
import plotly.graph_objects as go
import plotly.express as px
from ipywidgets import Output, AppLayout, Box, Dropdown, VBox
from functools import partial
import json
from dataclasses import dataclass, field
from typing import Union, List

In [3]:
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns', 1000)

In [4]:
df = pd.read_parquet('parsed_december_page_edits.parquet')

FileNotFoundError: [Errno 2] No such file or directory: 'parsed_december_page_edits.parquet'

In [None]:
df['event_name'] = df['event_entity'] + '_' + df['event_type']
df = df.loc[:, [
    'event_name', 
    'event_timestamp',
    'event_user_text_historical_escaped', 
    'event_user_is_bot_by_historical_string',
    'event_user_revision_count',
    'event_user_seconds_since_previous_revision',
    'page_id',
    'page_title_historical_escaped',
    'page_revision_count',
    'revision_text_bytes',
    'revision_is_identity_reverted',
    'revision_seconds_to_identity_revert',
    'revision_tags_string'
]]

In [8]:
df.to_parquet('december_df.parquet')

In [5]:
df = pd.read_parquet('../data/december_df.parquet')

In [6]:
print(f'{(df.memory_usage(deep=True) * 1e-9).sum().round(2)}GB')

1.44GB


In [6]:
ex = EventExplorer(df)

In [8]:
#ex.plot_event_volume()

In [9]:
df[
    (df['event_name']=='page_delete')
    & (df['event_timestamp'] > datetime(2022, 12, 9, 18))
    & (df['event_timestamp'] < datetime(2022, 12, 9, 19))
].value_counts('event_user_text_historical_escaped')

event_user_text_historical_escaped
Ponyo        1547
Cyrius          2
Liz             2
DanCherek       1
Fastily         1
Sdrqaz          1
dtype: int64

In [10]:
ex.entities

['event_name',
 'event_user_text_historical_escaped',
 'event_user_is_bot_by_historical_string',
 'page_title_historical_escaped',
 'revision_is_identity_reverted',
 'revision_tags_string']

In [11]:
#TODO: hmm, this is basically just the above for a particular user, maybe it doesn't need to be a different method?


ex.explore_entity(entity='event_user_text_historical_escaped', item='Marcocapelle')

FigureWidget({
    'data': [{'line': {'color': '#636EFA'},
              'marker': {'color': [#636EFA, #636EFA, #636EFA, #636EFA, #636EFA,
                                   #636EFA, #636EFA, #636EFA, #636EFA, #636EFA,
                                   #636EFA, #636EFA, #636EFA, #636EFA, #636EFA,
                                   #636EFA, #636EFA, #636EFA, #636EFA, #636EFA,
                                   #636EFA],
                         'size': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
                                  5, 5, 5, 5, 5]},
              'mode': 'lines+markers',
              'name': 'revision_create',
              'type': 'scatter',
              'uid': 'a520bc50-6bd0-4185-a6ca-1dab497c52d3',
              'x': array([datetime.datetime(2022, 12, 1, 0, 0),
                          datetime.datetime(2022, 12, 2, 0, 0),
                          datetime.datetime(2022, 12, 3, 0, 0),
                          datetime.datetime(2022, 12, 4, 0, 0),
             

In [12]:
# sessionize
# initial event (or start of session)
# final event (or end of session)
# include/collapse duplicate events (discussion [here](https://stackoverflow.com/questions/19463985/pandas-drop-consecutive-duplicates))
# ignore particular events

In [None]:
# We can build sessions initially, but we will also be making new sub-sessions after that


# 1. filter out anything we don't want, dedup if that's what we want
# 2. find first occurance of an event in a session
# 3. find next n events

In [7]:
first_event = 'page_create' # page_title_historical_escaped
last_event = None
n_events = 5
include_duplicates = True
session_length = timedelta(hours=1)
ignore_list = ['page_create-page']

entity = 'event_user_text_historical_escaped'

In [65]:
df = ex.internal_df.loc [:, :] # make a copy

In [53]:
df['event_name'].value_counts()

revision_create     3143510
page_create           54949
page_create-page      46993
page_move             13167
page_delete            9507
page_restore            306
page_merge               35
Name: event_name, dtype: int64

In [8]:
# initial filtering
df = df[df['event_user_is_bot_by_historical_string'].isna()] # remove bots
df = df.loc[df[ex.event_name_col].isin(set(ex.event_names) - set(ignore_list)), :] # remove ignored event types
#df = df.loc[] # dedup events


In [67]:
# build sessions

df['timediff'] = (
    df
    .sort_values([entity,  ex.event_time_col])
    .groupby(entity)[ex.event_time_col]
    .diff() 
)
df['_is_session_start'] = (df['timediff'].isna() | (df['timediff'] > session_length))
df['_is_session_end'] = df.sort_values([entity,  ex.event_time_col])[['_is_session_start']].shift(-1)
df['_session_id'] = df.sort_values([entity,  ex.event_time_col])['_is_session_start'].cumsum()

df.drop(columns=['timediff'], inplace=True)

df['_rank'] = df.groupby('_session_id')[ex.event_time_col].rank(method='first')

In [15]:
# df[df['_rank'] > 1000]['event_user_text_historical_escaped'].drop_duplicates()

In [16]:
#df[(df['event_user_text_historical_escaped'] == 'Marcocapelle') & (df['_session_id'] == 550073)]

In [68]:
first_event = 'page_create'

In [154]:
def make_sankey(
    df, 
    entity='event_user_text_historical_escaped',
    first_event=None,
    last_event=None,
    n_events=5,
    session_length=timedelta(hours=2),
    remove_bots=True,
    include_duplicates=True,
    ignore_list=[],
):  # TODO: turn session start and session end into events themselves
    
    # check params
    if first_event is not None and last_event is not None:
        raise ValueError('pass in either first or last event, not both')
    
    if first_event is None and last_event is None:
        raise ValueError('either first or last event is required')

    if first_event is not None:
        key_event = first_event
        shift_direction = 1
    
    if last_event is not None:
        raise(NotImplementedError)
        key_event = last_event
        shift_direction = -1


    # initial filtering
    if remove_bots:
        df = df[df['event_user_is_bot_by_historical_string'].isna()] # remove bots

    if not include_duplicates:
        raise NotImplementedError
    
    if len(ignore_list) > 0:
        df = df.loc[df[ex.event_name_col].isin(set(ex.event_names) - set(ignore_list)), :] # remove ignored event types


    # build sessions
    df['timediff'] = (
        df
        .sort_values([entity,  ex.event_time_col])
        .groupby(entity)[ex.event_time_col]
        .diff() 
    )
    df['_is_session_start'] = (df['timediff'].isna() | (df['timediff'] > session_length))
    df['_is_session_end'] = df.sort_values([entity,  ex.event_time_col])[['_is_session_start']].shift(-1)
    df['_session_id'] = df.sort_values([entity,  ex.event_time_col])['_is_session_start'].cumsum()
    df.drop(columns=['timediff'], inplace=True)
    df['_rank'] = df.groupby('_session_id')[ex.event_time_col].rank(method='first')
    

    # select events to include

    key_events = df.loc[df['event_name'] == key_event, ['_session_id', '_rank']].groupby('_session_id', as_index=False).min()
    key_events['_key_event'] = 1
    rnk_increment = pd.DataFrame({'rank_increment': range(n_events)})
    selected_events = key_events.merge(rnk_increment, how='cross')
    selected_events['_rank'] = selected_events['_rank'] + selected_events['rank_increment']
    selected_events.drop(columns=['_key_event'], inplace=True)
    events_to_plot = df.merge(selected_events, on=['_session_id', '_rank'], how='inner', suffixes=None)
    

    # format events for plotting

    final_df = events_to_plot[['_session_id', ex.event_name_col, 'rank_increment']]

    paired_events = pd.concat([
        final_df, 
        final_df
        .shift(-1 * shift_direction)
        .rename(columns={'_session_id': 'next_session_id', ex.event_name_col: f'next_{ex.event_name_col}', 'rank_increment': 'next_rank_increment'})
        ], axis=1
    )
    to_plot = (
        paired_events[paired_events['_session_id'] == paired_events['next_session_id']]
        [['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}', 'next_rank_increment']]
        .groupby(['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}'], as_index=False)
        .count()
        .rename(columns={'next_rank_increment': 'count'})
    )

    to_plot['source'] = to_plot['rank_increment'].astype(str) + '_' + to_plot['event_name']
    to_plot['target'] = (to_plot['rank_increment'] + 1).astype(str) + '_' + to_plot['next_event_name']


    nodes = set(to_plot['source']).union(to_plot['target'])
    node_ind_dict = {k:v for k, v in zip(nodes, range(len(nodes)))}
    to_plot_indices = to_plot[['source', 'target', 'count']].replace(node_ind_dict)


    # plot

    flow = go.Sankey(
        node=dict(
            pad = 15,
            thickness = 20,
            line = dict(color = 'blue', width = 0.5),
            label = list(node_ind_dict.keys()),
            color = 'purple'
        ),
        link=dict(
            source = to_plot_indices['source'].to_list(),  
            target = to_plot_indices['target'].to_list(),
            value = to_plot_indices['count'].to_list()
    ))

    fig = go.Figure()
    fig.add_trace(flow)
    fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)

    return fig



In [156]:
ex.entities

['event_name',
 'event_user_text_historical_escaped',
 'event_user_is_bot_by_historical_string',
 'page_title_historical_escaped',
 'revision_is_identity_reverted',
 'revision_tags_string']

In [157]:
fig = make_sankey(
    df, 
    entity='page_title_historical_escaped',
    first_event='page_create',
    # last_event=None,
    # n_events=5,
    # session_length=timedelta(hours=2),
    # remove_bots=True,
    # include_duplicates=True,
    # ignore_list=[],
)

fig.show()


Passing 'suffixes' as a <class 'NoneType'>, is not supported and may give unexpected results. Provide 'suffixes' as a tuple instead. In the future a 'TypeError' will be raised.



In [69]:
key_events = df.loc[df['event_name'] == first_event, ['_session_id', '_rank']].groupby('_session_id', as_index=False).min()
key_events['_key_event'] = 1


In [70]:
rnk_increment = pd.DataFrame({'rank_increment': range(n_events)})

In [71]:
selected_events = key_events.merge(rnk_increment, how='cross')

In [72]:
selected_events['_rank'] = selected_events['_rank'] + selected_events['rank_increment']

In [75]:
selected_events.drop(columns=['_key_event'], inplace=True)

In [73]:
df.columns

Index(['event_name', 'event_timestamp', 'event_user_text_historical_escaped',
       'event_user_is_bot_by_historical_string', 'event_user_revision_count',
       'event_user_seconds_since_previous_revision', 'page_id',
       'page_title_historical_escaped', 'page_revision_count',
       'revision_text_bytes', 'revision_is_identity_reverted',
       'revision_seconds_to_identity_revert', 'revision_tags_string', '_count',
       '_coarse_dates', '_fine_dates', '_is_session_start', '_is_session_end',
       '_session_id', '_rank'],
      dtype='object')

In [76]:
selected_events.columns

Index(['_session_id', '_rank', 'rank_increment'], dtype='object')

In [77]:
events_to_plot = df.merge(selected_events, on=['_session_id', '_rank'], how='inner', suffixes=None)

  events_to_plot = df.merge(selected_events, on=['_session_id', '_rank'], how='inner', suffixes=None)


In [143]:
final_df = events_to_plot[['_session_id', ex.event_name_col, 'rank_increment']]

paired_events = pd.concat([
    final_df, 
    final_df
    .shift(-1)
    .rename(columns={'_session_id': 'next_session_id', ex.event_name_col: f'next_{ex.event_name_col}', 'rank_increment': 'next_rank_increment'})
    ], axis=1
)
to_plot = (
    paired_events[paired_events['_session_id'] == paired_events['next_session_id']]
    [['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}', 'next_rank_increment']]
    .groupby(['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}'], as_index=False)
    .count()
    .rename(columns={'next_rank_increment': 'count'})
)

to_plot['source'] = to_plot['rank_increment'].astype(str) + '_' + to_plot['event_name']
to_plot['target'] = (to_plot['rank_increment'] + 1).astype(str) + '_' + to_plot['next_event_name']

In [144]:
nodes = set(to_plot['source']).union(to_plot['target'])
node_ind_dict = {k:v for k, v in zip(nodes, range(len(nodes)))}
to_plot_indices = to_plot[['source', 'target', 'count']].replace(node_ind_dict)

In [147]:
 # TODO: no page_merge?

In [149]:
flow = go.Sankey(
    node=dict(
        pad = 15,
        thickness = 20,
        line = dict(color = 'blue', width = 0.5),
        label = list(node_ind_dict.keys()),
        color = 'purple'
    ),
    link=dict(
        source = to_plot_indices['source'].to_list(),  
        target = to_plot_indices['target'].to_list(),
        value = to_plot_indices['count'].to_list()
))

fig = go.Figure()
fig.add_trace(flow)
fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

In [25]:
def _add_sessions(self, df, entity, session_length):
    session_id_col = '_session_id' # this could be more efficient for non-sessionization
    df.sort_values([entity,  self.event_time_col], inplace=True)
    df['timediff'] = (
        df
        .groupby(entity)[self.event_time_col]
        .diff()  # TODO: can I change the shift logic below to work more like this?
    )
    df['_is_session_start'] = (df['timediff'].isna() | (df['timediff'] > session_length))
    df.drop(columns=['timediff'], inplace=True)
    df['_is_session_end'] = df[['_is_session_start']].shift(-1)
    df[session_id_col] = df['_is_session_start'].cumsum()
    df['_rank'] = df.groupby(session_id_col)[self.event_time_col].rank(method='first')
    return df

In [26]:
df = _add_sessions(ex, df, entity, session_length)

In [21]:
df.drop(columns=['_is_session_start', '_is_session_end', '_session_id', '_rank'], inplace=True)

In [None]:
b

In [63]:
def _add_sessions(self, df, entity, session_length):
        session_id_col = '_session_id' # this could be more efficient for non-sessionization
        df.sort_values([entity,  self.event_time_col], inplace=True)
        df['timediff'] = (
            df
            .groupby(entity)[self.event_time_col]
            .diff()  # TODO: can I change the shift logic below to work more like this?
        )
        df['_is_session_start'] = (df['timediff'].isna() | (df['timediff'] > session_length))
        df.drop(columns=['timediff'], inplace=True)
        df['_is_session_end'] = df[['_is_session_start']].shift(-1)
        df[session_id_col] = df['_is_session_start'].cumsum()
        df['_rank'] = df.groupby(session_id_col)[self.event_time_col].rank(method='first')
        return df

In [74]:
df = pd.read_parquet('../data/movielens.parquet')

In [75]:
ex = EventExplorer(
    event_data=df,
    event_name_col='rating',
    event_time_col='timestamp',
    coarse_grain=timedelta(days=365),
    fine_grain='d',
)

In [76]:
ex.event_name_col = 'primary_genre'

In [77]:
entity='userId'
first_event='Drama'
last_event=None
n_events=5
session_length=timedelta(days=365*30)
ignore_list=[]


# check params
if first_event is not None and last_event is not None:
    raise ValueError('pass in either first or last event, not both')

if first_event is None and last_event is None:
    raise ValueError('either first or last event is required')

if first_event is not None:
    key_event = first_event
    shift_direction = 1

if last_event is not None:
    raise(NotImplementedError)
    key_event = last_event
    shift_direction = -1


# initial filtering
if not include_duplicates:
    raise NotImplementedError

if len(ignore_list) > 0:
    df = df.loc[df[ex.event_name_col].isin(set(ex.event_names) - set(ignore_list)), :] # remove ignored event types



In [78]:

# build sessions
df = _add_sessions(ex, df, entity, session_length)


In [79]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,primary_genre,_count,_coarse_dates,_fine_dates,_is_session_start,_is_session_end,_session_id,_rank
350481,1,5952,4.0,2006-05-17 12:14:13,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,Adventure,1,2005-12-23,2006-05-17,True,False,1,1.0
197179,1,2012,2.5,2006-05-17 12:14:28,Back to the Future Part III (1990),Adventure|Comedy|Sci-Fi|Western,Comedy,1,2005-12-23,2006-05-17,False,False,1,2.0
174189,1,2011,2.5,2006-05-17 12:14:39,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi,Sci-Fi,1,2005-12-23,2006-05-17,False,False,1,3.0
152299,1,1653,4.0,2006-05-17 12:14:57,Gattaca (1997),Drama|Sci-Fi|Thriller,Sci-Fi,1,2005-12-23,2006-05-17,False,False,1,4.0
135029,1,1250,4.0,2006-05-17 12:20:14,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,Drama,1,2005-12-23,2006-05-17,False,False,1,5.0


In [80]:

# select events to include
session_id_col = '_session_id' # TODO: consolidate this var
key_events = df.loc[df[ex.event_name_col] == key_event, [session_id_col, '_rank']].groupby(session_id_col, as_index=False).min()
key_events['_key_event'] = 1
rnk_increment = pd.DataFrame({'rank_increment': range(n_events)})
selected_events = key_events.merge(rnk_increment, how='cross')
selected_events['_rank'] = selected_events['_rank'] + selected_events['rank_increment']
selected_events.drop(columns=['_key_event'], inplace=True)
events_to_plot = df.merge(selected_events, on=[session_id_col, '_rank'], how='inner', suffixes=None)



  events_to_plot = df.merge(selected_events, on=[session_id_col, '_rank'], how='inner', suffixes=None)


In [81]:

# format events for plotting

final_df = events_to_plot[[session_id_col, ex.event_name_col, 'rank_increment']]

paired_events = pd.concat([
    final_df, 
    final_df
    .shift(-1 * shift_direction)
    .rename(columns={session_id_col: 'next_session_id', ex.event_name_col: f'next_{ex.event_name_col}', 'rank_increment': 'next_rank_increment'})
    ], axis=1
)
to_plot = (
    paired_events[paired_events[session_id_col] == paired_events[session_id_col]]
    [['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}', 'next_rank_increment']]
    .groupby(['rank_increment', ex.event_name_col, f'next_{ex.event_name_col}'], as_index=False)
    .count()
    .rename(columns={'next_rank_increment': 'count'})
)


In [87]:
to_plot.head()

Unnamed: 0,rank_increment,primary_genre,next_primary_genre,count
0,0,Drama,(no genres listed),44
1,0,Drama,Action,14927
2,0,Drama,Adventure,13501
3,0,Drama,Animation,1867
4,0,Drama,Children,3741


In [None]:
# TODO: starting to figure out "other" category, but it isn't quite there

In [86]:
groups = []
for g in to_plot.sort_values(['rank_increment', 'count']).groupby('rank_increment'):
    groups.append(g.loc[0:5, :])
    other_count = pd.DataFrameag.loc[5:, 'count'].sum()
    if other_count >0:
        pd.DataFrame({
            'rank_increment': g['rank_increment'].iloc[0],
            entity: ,
            f'next_{entity}': 'other'
        })
        groups.append(pd.DataFrameag.loc[5:, 'count'].sum())

AttributeError: 'DataFrameGroupBy' object has no attribute 'loc'

In [None]:

to_plot['source'] = to_plot['rank_increment'].astype(str) + '_' + to_plot[self.event_name_col]
to_plot['target'] = (to_plot['rank_increment'] + 1).astype(str) + '_' + to_plot[f'next_{self.event_name_col}']


nodes = set(to_plot['source']).union(to_plot['target'])
node_ind_dict = {k:v for k, v in zip(nodes, range(len(nodes)))}
to_plot_indices = to_plot[['source', 'target', 'count']].replace(node_ind_dict)


In [None]:


# plot

flow = go.Sankey(
    node=dict(
        pad = 15,
        thickness = 20,
        line = dict(color = 'blue', width = 0.5),
        label = list(node_ind_dict.keys()),
        color = 'purple'
    ),
    link=dict(
        source = to_plot_indices['source'].to_list(),  
        target = to_plot_indices['target'].to_list(),
        value = to_plot_indices['count'].to_list()
))

fig = go.Figure()
fig.add_trace(flow)
fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)

