In [1]:
import polars as pl

In [2]:
test = pl.read_ndjson( "/var/tmp/xgregor1/data/otto/test.jsonl" )

In [3]:
test = test.explode( "events" ).unnest( "events" )
test

session,aid,ts,type
i64,i64,i64,str
12899779,59625,1661724000278,"""clicks"""
12899780,1142000,1661724000378,"""clicks"""
12899780,582732,1661724058352,"""clicks"""
12899780,973453,1661724109199,"""clicks"""
12899780,736515,1661724136868,"""clicks"""
…,…,…,…
14571577,1141710,1662328774770,"""clicks"""
14571578,519105,1662328775009,"""clicks"""
14571579,739876,1662328775605,"""clicks"""
14571580,202353,1662328781067,"""clicks"""


In [4]:
test["type"].unique()

type
str
"""clicks"""
"""carts"""
"""orders"""


In [5]:
print( f'Number of test events: {test.shape[0]:_}' )

Number of test events: 6_928_123


In [6]:
print( f'Number of unique sessions: {test["session"].n_unique():_}' )
print( f'Number of unique sessions with not only views event: {test.filter( pl.col("type") != "clicks" )["session"].n_unique():_}' )

Number of unique sessions: 1_671_803
Number of unique sessions with not only views event: 245_737


In [7]:
train = pl.read_ndjson( "/var/tmp/xgregor1/data/otto/train.jsonl" )
train = train.explode( "events" ).unnest( "events" )

train

session,aid,ts,type
i64,i64,i64,str
0,1517085,1659304800025,"""clicks"""
0,1563459,1659304904511,"""clicks"""
0,1309446,1659367439426,"""clicks"""
0,16246,1659367719997,"""clicks"""
0,1781822,1659367871344,"""clicks"""
…,…,…,…
12899776,1737908,1661723987073,"""clicks"""
12899777,384045,1661723976974,"""clicks"""
12899777,384045,1661723986800,"""clicks"""
12899778,561560,1661723983611,"""clicks"""


In [8]:
print( f'Number of train events: {train.shape[0]:_}' )
print( f'Number of unique sessions: {train["session"].n_unique():_}' )
print( f'Number of unique sessions with not only views event: {train.filter( pl.col("type") != "clicks" )["session"].n_unique():_}' )

Number of train events: 216_716_096
Number of unique sessions: 12_899_779
Number of unique sessions with not only views event: 3_846_669


In [9]:
print( f'Number of events in sessions without only view events: {train.filter( pl.col("type") != "clicks" ).shape[0]:_}' )

Number of events in sessions without only view events: 21_995_142


In [10]:
import plotly.express as px
import plotly.graph_objects as go

In [11]:
# fig = px.histogram( train, x="ts", color="type", nbins=300 )
# fig.show() 

In [12]:
train.describe()

statistic,session,aid,ts,type
str,f64,f64,f64,str
"""count""",216716096.0,216716096.0,216716096.0,"""216716096"""
"""null_count""",0.0,0.0,0.0,"""0"""
"""mean""",4702800.0,928805.229105,1660500000000.0,
"""std""",3665000.0,536691.747782,696150000.0,
"""min""",0.0,0.0,1659300000000.0,"""carts"""
"""25%""",1456817.0,467421.0,1659900000000.0,
"""50%""",3915777.0,928109.0,1660500000000.0,
"""75%""",7468753.0,1394862.0,1661100000000.0,
"""max""",12899778.0,1855602.0,1661700000000.0,"""orders"""


In [13]:
from datetime import datetime

test_ts_min = datetime.fromtimestamp( test["ts"].min() / 1000 )
test_ts_max = datetime.fromtimestamp( test["ts"].max() / 1000 )

train_ts_min = datetime.fromtimestamp( train["ts"].min() / 1000 )
train_ts_max = datetime.fromtimestamp( train["ts"].max() / 1000 )

print( f'Train from {train_ts_min.date()} to {train_ts_max.date()}' )
print( f'Test from {test_ts_min.date()} to {test_ts_max.date()}' )


Train from 2022-08-01 to 2022-08-28
Test from 2022-08-29 to 2022-09-04


In [14]:
train.shape[0]

216716096

In [15]:
train = train.sort( pl.col("ts") )

In [16]:
last_50m = train.tail( 50_000_000 )

last_50m_ts_min = datetime.fromtimestamp( last_50m["ts"].min() / 1000 )
last_50m_ts_max = datetime.fromtimestamp( last_50m["ts"].max() / 1000 )

print( f'Last 50m events from {last_50m_ts_min.date()} to {last_50m_ts_max.date()}' )

Last 50m events from 2022-08-22 to 2022-08-28


In [17]:
last_50m.describe()

statistic,session,aid,ts,type
str,f64,f64,f64,str
"""count""",50000000.0,50000000.0,50000000.0,"""50000000"""
"""null_count""",0.0,0.0,0.0,"""0"""
"""mean""",6565300.0,927330.342202,1661500000000.0,
"""std""",4496500.0,537564.393936,170560000.0,
"""min""",0.0,0.0,1661200000000.0,"""carts"""
"""25%""",2148575.0,465128.0,1661300000000.0,
"""50%""",6213630.0,925754.0,1661500000000.0,
"""75%""",11406157.0,1394787.0,1661600000000.0,
"""max""",12899778.0,1855602.0,1661700000000.0,"""orders"""


In [18]:
train_counts = train.group_by( pl.col( "session" ) ).len()

fig = px.histogram( train_counts, x="len" )
# fig.show()

In [19]:
train_counts["len"].max()

500

In [20]:
last_50m_counts = last_50m.group_by( pl.col( "session" ) ).len()

fig = px.histogram( last_50m_counts, x="len" )
# fig.show()

In [21]:
print( f'Original len median: { train_counts["len"].median() }' )
print( f'Truncated len median: { last_50m_counts["len"].median() }' )

Original len median: 6.0
Truncated len median: 4.0


In [22]:
train_df = pl.read_ndjson( "/var/tmp/xgregor1/data/otto/train.jsonl" )

In [23]:
train_agg = train_df.explode( "events" )\
    .unnest( "events" )\
    .group_by( "session" )\
    .agg(
        min_ts=pl.col( "ts" ).min(), 
        max_ts=pl.col( "ts" ).max(), 
        len=pl.col( "ts" ).len(),
        unique_types=pl.col( "type" ).n_unique(),
        unique_values=pl.col( "type" ).unique()
    )

In [24]:
train_df = train_df.join( train_agg, on="session" )

In [25]:
train_df.describe()

statistic,session,events,min_ts,max_ts,len,unique_types,unique_values
str,f64,f64,f64,f64,f64,f64,f64
"""count""",12899779.0,12899779.0,12899779.0,12899779.0,12899779.0,12899779.0,12899779.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",6449889.0,,1660200000000.0,1660800000000.0,16.799985,1.421483,
"""std""",3723800.0,,691250000.0,688430000.0,33.57738,0.700292,
"""min""",0.0,,1659300000000.0,1659300000000.0,2.0,1.0,
"""25%""",3224945.0,,1659600000000.0,1660300000000.0,3.0,1.0,
"""50%""",6449889.0,,1660100000000.0,1660900000000.0,6.0,1.0,
"""75%""",9674834.0,,1660700000000.0,1661400000000.0,15.0,2.0,
"""max""",12899778.0,,1661700000000.0,1661700000000.0,500.0,3.0,


In [26]:
train_df.filter( (pl.col( "unique_types" ) == 1) & (pl.col( "unique_values" ).list.first() != "clicks") )

session,events,min_ts,max_ts,len,unique_types,unique_values
i64,list[struct[3]],i64,i64,u32,u32,list[str]


In [27]:
train_df = train_df.filter( pl.col( "unique_types" ) != 1 )
train_df

session,events,min_ts,max_ts,len,unique_types,unique_values
i64,list[struct[3]],i64,i64,u32,u32,list[str]
11192522,"[{1076558,1661166813784,""clicks""}, {1076558,1661166942845,""clicks""}, … {1076558,1661167630719,""orders""}]",1661166813784,1661167630719,14,3,"[""orders"", ""clicks"", ""carts""]"
1862909,"[{119591,1659459696430,""clicks""}, {119591,1659459714435,""carts""}, … {718252,1659608503382,""clicks""}]",1659459696430,1659608503382,9,3,"[""carts"", ""clicks"", ""orders""]"
3822160,"[{92706,1659684971015,""clicks""}, {92706,1659684991255,""carts""}, … {1219940,1661335638999,""clicks""}]",1659684971015,1661335638999,244,3,"[""orders"", ""clicks"", ""carts""]"
11202222,"[{1498161,1661168961662,""clicks""}, {1498161,1661168990294,""carts""}, … {1498161,1661172214578,""orders""}]",1661168961662,1661172214578,21,3,"[""orders"", ""carts"", ""clicks""]"
8441417,"[{1048234,1660492099612,""clicks""}, {854849,1660492130630,""carts""}, … {61751,1660916806868,""clicks""}]",1660492099612,1660916806868,19,2,"[""carts"", ""clicks""]"
…,…,…,…,…,…,…
390093,"[{611592,1659347469200,""clicks""}, {659153,1659347578765,""clicks""}, … {1802596,1661394428708,""clicks""}]",1659347469200,1661394428708,58,3,"[""orders"", ""clicks"", ""carts""]"
9079381,"[{1733620,1660593868474,""clicks""}, {538711,1660594075063,""clicks""}, … {770208,1660595124693,""clicks""}]",1660593868474,1660595124693,11,2,"[""clicks"", ""carts""]"
5498146,"[{654609,1659958270168,""clicks""}, {1450355,1659958630994,""clicks""}, … {802004,1659960132781,""clicks""}]",1659958270168,1659960132781,23,2,"[""carts"", ""clicks""]"
12677393,"[{1438759,1661677124218,""clicks""}, {603225,1661677188166,""clicks""}, … {1503306,1661677371208,""clicks""}]",1661677124218,1661677371208,6,2,"[""clicks"", ""carts""]"


In [28]:
f'{train_df["len"].sum():_}'

'143_170_192'

In [29]:
train_df_nv_e = train_df.explode("events").unnest("events")

In [30]:
ts_split = train_df_nv_e["ts"].quantile( 0.75 )

In [31]:
before = train_df.filter( pl.col( "max_ts" ) < ts_split )
after = train_df.filter( pl.col( "min_ts" ) > ts_split )
inbetween = train_df.filter( (pl.col( "max_ts" ) >= ts_split) & (pl.col( "min_ts" ) <= ts_split) )

In [32]:
f'{inbetween["len"].sum():_}'

'93_980_250'

In [33]:
f'{before["len"].sum():_}'

'41_208_960'

In [34]:
f'{after["len"].sum():_}'

'7_980_982'

In [35]:
after.explode( "events" ).unnest( "events" )["type"].unique_counts()

type
u32
6234309
1333761
412912


In [36]:
after_labels = after.explode( "events" ).unnest( "events" ).filter( pl.col( "type" ) == "carts" )
after_labels = after_labels.rename({
    "aid": "label",
    "ts": "label_ts"
})

after = after.join( after_labels[["session", "label", "label_ts"]], on="session" )

In [37]:
after

session,events,min_ts,max_ts,len,unique_types,unique_values,label,label_ts
i64,list[struct[3]],i64,i64,u32,u32,list[str],i64,i64
11192522,"[{1076558,1661166813784,""clicks""}, {1076558,1661166942845,""clicks""}, … {1076558,1661167630719,""orders""}]",1661166813784,1661167630719,14,3,"[""orders"", ""clicks"", ""carts""]",1076558,1661166976771
11192522,"[{1076558,1661166813784,""clicks""}, {1076558,1661166942845,""clicks""}, … {1076558,1661167630719,""orders""}]",1661166813784,1661167630719,14,3,"[""orders"", ""clicks"", ""carts""]",1076558,1661166992236
11202222,"[{1498161,1661168961662,""clicks""}, {1498161,1661168990294,""carts""}, … {1498161,1661172214578,""orders""}]",1661168961662,1661172214578,21,3,"[""orders"", ""carts"", ""clicks""]",1498161,1661168990294
11202222,"[{1498161,1661168961662,""clicks""}, {1498161,1661168990294,""carts""}, … {1498161,1661172214578,""orders""}]",1661168961662,1661172214578,21,3,"[""orders"", ""carts"", ""clicks""]",42283,1661169192920
11202222,"[{1498161,1661168961662,""clicks""}, {1498161,1661168990294,""carts""}, … {1498161,1661172214578,""orders""}]",1661168961662,1661172214578,21,3,"[""orders"", ""carts"", ""clicks""]",1551504,1661170293721
…,…,…,…,…,…,…,…,…
11737419,"[{277662,1661335134481,""clicks""}, {120108,1661335152529,""carts""}, … {1323678,1661335466924,""orders""}]",1661335134481,1661335466924,11,3,"[""carts"", ""clicks"", ""orders""]",120108,1661335152529
11737419,"[{277662,1661335134481,""clicks""}, {120108,1661335152529,""carts""}, … {1323678,1661335466924,""orders""}]",1661335134481,1661335466924,11,3,"[""carts"", ""clicks"", ""orders""]",1128670,1661335225084
11737419,"[{277662,1661335134481,""clicks""}, {120108,1661335152529,""carts""}, … {1323678,1661335466924,""orders""}]",1661335134481,1661335466924,11,3,"[""carts"", ""clicks"", ""orders""]",1323678,1661335282402
12677393,"[{1438759,1661677124218,""clicks""}, {603225,1661677188166,""clicks""}, … {1503306,1661677371208,""clicks""}]",1661677124218,1661677371208,6,2,"[""clicks"", ""carts""]",603225,1661677248022


In [38]:
after_final = after.drop(("min_ts", "max_ts", "len", "unique_types", "unique_values"))\
    .explode( "events" )\
    .unnest( "events" )\
    .filter( pl.col( "ts" ) < pl.col( "label_ts" ) )

In [40]:
after_final = after_final\
                .with_columns( after_final[["aid", "ts", "type"]].to_struct().alias( "events" ) )\
                .drop(( "aid", "ts", "type" ))

In [45]:
after_final = after_final.drop( "label_ts" )\
    .group_by(( "session", "label" ))\
    .agg(
        pl.col( "events" )
    )

after_final.shape

(1125910, 3)

In [47]:
after_final.write_parquet( "test.parquet" )

In [51]:
before = before.drop(( "min_ts", "max_ts", "len", "unique_types", "unique_values" ))

before.write_parquet( "train.parquet" )

In [52]:
tst = pl.read_parquet( "data/test.parquet" ) 

In [53]:
tst

session,label,events
i64,i64,list[struct[3]]
11896653,527209,"[{1682122,1661371400731,""clicks""}, {1316412,1661371595934,""clicks""}, … {1029386,1661374206607,""clicks""}]"
12027709,891174,"[{899805,1661432423664,""clicks""}, {717853,1661432466052,""clicks""}, … {891174,1661432601234,""clicks""}]"
11841521,596420,"[{382968,1661360371933,""clicks""}, {382968,1661360424662,""carts""}, … {596420,1661360807048,""clicks""}]"
11588604,1311630,"[{437273,1661275810432,""clicks""}, {437273,1661275831198,""carts""}, … {1311630,1661275901408,""clicks""}]"
12178219,554660,"[{549100,1661492854578,""clicks""}, {70139,1661530203549,""clicks""}, … {554660,1661680867761,""carts""}]"
…,…,…
12477564,1820709,"[{477345,1661601509917,""clicks""}, {583811,1661601546873,""clicks""}, … {1820709,1661627306039,""clicks""}]"
12508126,471216,"[{1290649,1661609098363,""clicks""}, {1216622,1661609251695,""clicks""}, … {471216,1661609930029,""clicks""}]"
11214067,1114123,"[{1086499,1661171641283,""clicks""}, {1114123,1661707048525,""clicks""}, … {1114123,1661707231518,""clicks""}]"
12673272,690067,"[{1574201,1661676239739,""clicks""}, {1574201,1661676504267,""clicks""}, {690067,1661676525482,""clicks""}]"
