### Reading the first parquet file

In [13]:
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [14]:
def get_first_parquet_from_path(path):
    for (dir_path, _, files) in os.walk(path):
        for f in files:
            if f.endswith(".parquet"):
                first_pq_path = os.path.join(dir_path, f)
                return first_pq_path

In [15]:
path = './train_parquet'
first_pq = get_first_parquet_from_path(path)
first_pq

'./train_parquet\\000000000_000100000.parquet'

In [16]:
first_ds = pq.read_table(first_pq)
first_ds.num_rows, first_ds.num_columns, first_ds.schema

(5227653,
 4,
 session: int64
 aid: int64
 ts: int64
 type: string
 -- schema metadata --
 pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 688)

In [17]:
parquet_file = pq.ParquetFile(first_pq)
ts=parquet_file.metadata.row_group(0)
for nm in range(ts.num_columns):
    print(ts.column(nm))

<pyarrow._parquet.ColumnChunkMetaData object at 0x000002E8E3CD3B00>
  file_offset: 1045713
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: session
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x000002E8E3CD3A60>
      has_min_max: True
      min: 0
      max: 99999
      null_count: 0
      distinct_count: 0
      num_values: 5227653
      physical_type: INT64
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE_DICTIONARY', 'PLAIN', 'RLE')
  has_dictionary_page: True
  dictionary_page_offset: 4
  data_page_offset: 400266
  total_compressed_size: 1045709
  total_uncompressed_size: 1510526
<pyarrow._parquet.ColumnChunkMetaData object at 0x000002E8E3CD3B00>
  file_offset: 19895452
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: aid
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x000002E88E8A8D60>
      has_min_max: Tru

### Reading and Writing Single Files

In [18]:
first_ds.to_pandas()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
5227648,99999,1544954,1660373630318,clicks
5227649,99999,1032408,1660373656430,clicks
5227650,99999,1544954,1660373678083,clicks
5227651,99999,554230,1660373715477,clicks


In [19]:
import numpy as np
import pandas as pd

df = first_ds.to_pandas()

df['ts'] = pd.to_datetime(df['ts'], unit='ms')
df['day_of_week'] = df['ts'].dt.dayofweek
df['hour_of_day'] = df['ts'].dt.hour
df['elapsed_time'] = df['ts'] - df.groupby('session')['ts'].transform('first')
df['elapsed_time'] = df['elapsed_time'].dt.total_seconds().astype(np.int64)

df

Unnamed: 0,session,aid,ts,type,day_of_week,hour_of_day,elapsed_time
0,0,1517085,2022-07-31 22:00:00.025,clicks,6,22,0
1,0,1563459,2022-07-31 22:01:44.511,clicks,6,22,104
2,0,1309446,2022-08-01 15:23:59.426,clicks,0,15,62639
3,0,16246,2022-08-01 15:28:39.997,clicks,0,15,62919
4,0,1781822,2022-08-01 15:31:11.344,clicks,0,15,63071
...,...,...,...,...,...,...,...
5227648,99999,1544954,2022-08-13 06:53:50.318,clicks,5,6,1046918
5227649,99999,1032408,2022-08-13 06:54:16.430,clicks,5,6,1046944
5227650,99999,1544954,2022-08-13 06:54:38.083,clicks,5,6,1046966
5227651,99999,554230,2022-08-13 06:55:15.477,clicks,5,6,1047003


In [20]:
from sklearn.preprocessing import LabelEncoder

df = first_ds.to_pandas()

df['ts'] = pd.to_datetime(df['ts'], unit='ms')

le = LabelEncoder()
df['aid'] = le.fit_transform(df['aid'])
df['type'] = le.fit_transform(df['type'])
df

Unnamed: 0,session,aid,ts,type
0,0,541856,2022-07-31 22:00:00.025,1
1,0,558586,2022-07-31 22:01:44.511,1
2,0,467564,2022-08-01 15:23:59.426,1
3,0,5802,2022-08-01 15:28:39.997,1
4,0,636874,2022-08-01 15:31:11.344,1
...,...,...,...,...
5227648,99999,551919,2022-08-13 06:53:50.318,1
5227649,99999,368435,2022-08-13 06:54:16.430,1
5227650,99999,551919,2022-08-13 06:54:38.083,1
5227651,99999,197149,2022-08-13 06:55:15.477,1


In [21]:
df = first_ds.to_pandas()

df['ts'] = pd.to_datetime(df['ts'], unit='ms')
df['day_of_week'] = df['ts'].dt.dayofweek
df['hour_of_day'] = df['ts'].dt.hour
df['elapsed_time'] = df['ts'] - df.groupby('session')['ts'].transform('first')
df['elapsed_time'] = df['elapsed_time'].dt.total_seconds().astype(np.int64)

session_df = df.groupby('session').agg({
    'type': ['count', lambda x: (x == 'click').sum(), lambda x: (x == 'cart').sum(), lambda x: (x == 'order').sum()],
    'elapsed_time': ['mean', 'min', 'max']
})

session_df.columns = ['_'.join(col) for col in session_df.columns]
session_df = session_df.reset_index()
session_df

Unnamed: 0,session,type_count,type_<lambda_0>,type_<lambda_1>,type_<lambda_2>,elapsed_time_mean,elapsed_time_min,elapsed_time_max
0,0,276,0,0,0,1.456662e+06,0,2380183
1,1,32,0,0,0,7.503567e+05,0,2410054
2,2,33,0,0,0,1.601513e+06,0,2409415
3,3,226,0,0,0,6.387480e+05,0,1804866
4,4,19,0,0,0,1.738969e+06,0,2281881
...,...,...,...,...,...,...,...,...
99995,99995,21,0,0,0,7.477689e+05,0,1743071
99996,99996,302,0,0,0,1.310787e+06,0,2300943
99997,99997,112,0,0,0,1.066683e+06,0,2367980
99998,99998,3,0,0,0,1.044053e+05,0,270174


In [22]:
df['ts'] = pd.to_datetime(df['ts'], unit='ms')

# Create a feature for the number of events in each session
event_count = df.groupby('session')['type'].count().reset_index()
event_count.columns = ['session', 'event_count']

# Create a feature for the length of each session
session_start = df.groupby('session')['ts'].min().reset_index()
session_start.columns = ['session', 'session_start']
session_end = df.groupby('session')['ts'].max().reset_index()
session_end.columns = ['session', 'session_end']
session_length = pd.merge(session_start, session_end, on='session')
session_length['session_length'] = (session_length['session_end'] - session_length['session_start']).dt.total_seconds()

# Create a feature for the time elapsed since the last event in each session
df['time_since_last_event'] = df.groupby('session')['ts'].diff().dt.total_seconds().fillna(0)

# Merge the new features with the original data
df = pd.merge(df, event_count, on='session')
df = pd.merge(df, session_length[['session', 'session_length']], on='session')
df

Unnamed: 0,session,aid,ts,type,day_of_week,hour_of_day,elapsed_time,time_since_last_event,event_count,session_length
0,0,1517085,2022-07-31 22:00:00.025,clicks,6,22,0,0.000,276,2380183.682
1,0,1563459,2022-07-31 22:01:44.511,clicks,6,22,104,104.486,276,2380183.682
2,0,1309446,2022-08-01 15:23:59.426,clicks,0,15,62639,62534.915,276,2380183.682
3,0,16246,2022-08-01 15:28:39.997,clicks,0,15,62919,280.571,276,2380183.682
4,0,1781822,2022-08-01 15:31:11.344,clicks,0,15,63071,151.347,276,2380183.682
...,...,...,...,...,...,...,...,...,...,...
5227648,99999,1544954,2022-08-13 06:53:50.318,clicks,5,6,1046918,157184.694,18,1047014.072
5227649,99999,1032408,2022-08-13 06:54:16.430,clicks,5,6,1046944,26.112,18,1047014.072
5227650,99999,1544954,2022-08-13 06:54:38.083,clicks,5,6,1046966,21.653,18,1047014.072
5227651,99999,554230,2022-08-13 06:55:15.477,clicks,5,6,1047003,37.394,18,1047014.072


In [23]:
from transformers4rec.dataset import SessionDataset
from transformers4rec.models import BERT4Rec
from transformers4rec.evaluation import Evaluator

# Load the training data into a pandas DataFrame
train_df = pd.read_json("./input/train.jsonl", lines=True)

# Create a SessionDataset object from the training data
train_dataset = SessionDataset(df=train_df, item_col='aid', session_col='session', time_col='ts')

# Load the test data into a pandas DataFrame
test_df = pd.read_json("./input/test.jsonl", lines=True)

# Create a SessionDataset object from the test data
test_dataset = SessionDataset(df=test_df, item_col='aid', session_col='session', time_col='ts')

# Initialize the BERT4Rec model
model = BERT4Rec(num_items=train_dataset.num_items, 
                 embedding_dim=128, 
                 hidden_size=256, 
                 num_layers=2)

# Train the model on the training data
model.fit(train_dataset, epochs=10, batch_size=32)

# Use the model to make predictions on the test data
predictions = model.predict(test_dataset)

# Evaluate the model's performance on the test data
evaluator = Evaluator(test_dataset, top_k=20)
recall = evaluator.recall(predictions)
print(f"Recall@20: {recall:.4f}")


ModuleNotFoundError: No module named 'transformers4rec'