In [1]:
import pandas as pd
from datetime import datetime, timedelta

from explorer import EventExplorer

# Event Data



Here, an "event dataset" is any dataset consisting of timestamped observations of different types of events.

Today we'll look at page edits on English-language Wikipedia.

In [2]:
# Read only the columns this walkthrough needs: the event type and timestamp,
# the acting user (plus the bot-flag column), and the affected page's id and title.
event_columns = [
    'event_name',
    'event_timestamp',
    'event_user_text_historical_escaped',
    'event_user_is_bot_by_historical_string',
    'page_id',
    'page_title_historical_escaped',
]
df = pd.read_parquet('../data/december_df.parquet', columns=event_columns)

In [3]:
# Preview the first ten rows of the loaded event data.
df.head(n=10)

Unnamed: 0,event_name,event_timestamp,event_user_text_historical_escaped,event_user_is_bot_by_historical_string,page_id,page_title_historical_escaped
4,revision_create,2022-12-01 00:00:00,Dale Arnett,,37511171,Battle_of_Brooklyn_(college_rivalry)
7,revision_create,2022-12-01 00:00:00,Pelmeen10,,7122751,Ginásio_Clube_Vilacondense
13,revision_create,2022-12-01 00:00:00,86.18.158.129,,483698,"Normanton,_Derby"
15,revision_create,2022-12-01 00:00:01,Stvbastian,,51928380,Chen_Yufei
17,revision_create,2022-12-01 00:00:01,BrickMaster02,,68737747,NFL_Slimetime
18,revision_create,2022-12-01 00:00:01,1980fast,,149425,The_Verve
22,revision_create,2022-12-01 00:00:02,BrickMaster02,,67665461,Eureka!_(2022_TV_series)
25,revision_create,2022-12-01 00:00:02,Citation bot,"name,group",68724157,Timeline_of_the_2020_Democratic_Party_presiden...
29,revision_create,2022-12-01 00:00:03,AnomieBOT,"name,group",18080870,Oklahoma_Department_of_Tourism_and_Recreation
30,revision_create,2022-12-01 00:00:03,73.240.29.237,,2201315,Skye_McCole_Bartusiak


In [4]:
# Tally how many rows there are for each event type.
event_names = df['event_name']
event_names.value_counts()

revision_create     3143510
page_create           54949
page_create-page      46993
page_move             13167
page_delete            9507
page_restore            306
page_merge               35
Name: event_name, dtype: int64

In [5]:
# Build the explorer over the event frame, naming the columns that hold
# the event type and the event timestamp.
explorer_columns = {
    'event_name_col': 'event_name',
    'event_time_col': 'event_timestamp',
}
wiki_edit_explorer = EventExplorer(event_data=df, **explorer_columns)

In [6]:
# Show the explorer's event-volume plot; as the cell's last expression, its
# return value is displayed (the saved output is an HBox wrapping a FigureWidget).
wiki_edit_explorer.plot_event_volume()

HBox(children=(FigureWidget({
    'data': [{'line': {'color': '#636EFA'},
              'marker': {'color': [#…

In [7]:
# Count page deletions per user inside a one-hour window on 9 December 2022.
window_start = datetime(2022, 12, 9, 18)
window_end = datetime(2022, 12, 9, 19)

is_page_delete = df['event_name'] == 'page_delete'
# Both bounds are exclusive: rows stamped exactly 18:00:00 or 19:00:00 are left out.
after_start = df['event_timestamp'] > window_start
before_end = df['event_timestamp'] < window_end

deletes_in_window = df[is_page_delete & after_start & before_end]
deletes_in_window.value_counts('event_user_text_historical_escaped')

event_user_text_historical_escaped
Ponyo        1547
Cyrius          2
Liz             2
DanCherek       1
Fastily         1
Sdrqaz          1
dtype: int64

In [10]:
# Sankey diagram keyed by user, starting from 'page_create' events.
# NOTE(review): presumably n_events caps the number of steps shown and
# session_length bounds how far apart they may be; confirm against EventExplorer.plot_sankey.
user_session_length = timedelta(hours=2)

fig, _ = wiki_edit_explorer.plot_sankey(
    first_event='page_create',
    entity='event_user_text_historical_escaped',
    session_length=user_session_length,
    n_events=4,
)

fig.show()


Passing 'suffixes' as a <class 'NoneType'>, is not supported and may give unexpected results. Provide 'suffixes' as a tuple instead. In the future a 'TypeError' will be raised.



In [11]:
# Sankey diagram keyed by page title, starting from 'page_create' events.
# Hack: the session length is deliberately longer than the time span of the data,
# so sessions are effectively ignored.
ignore_sessions_length = timedelta(days=60)

fig2, _ = wiki_edit_explorer.plot_sankey(
    entity='page_title_historical_escaped',
    first_event='page_create',
    session_length=ignore_sessions_length,
    n_events=4,
)
fig2.show()


Passing 'suffixes' as a <class 'NoneType'>, is not supported and may give unexpected results. Provide 'suffixes' as a tuple instead. In the future a 'TypeError' will be raised.

