In [1]:
from ranx import Qrels, Run, evaluate
import bm25s
import Stemmer
from datasets import load_from_disk
from datasets import Dataset, DatasetDict
import numpy as np
import json
import pandas as pd
from collections import defaultdict
from typing import List, Dict
import random
import os
from src.utils.project_dirs import get_hfdata_dir
from src.utils.project_dirs import processed_data_dir



In [22]:
def check_monotonicity(df, column='timestamp', group_by=None):
    """
    Report whether `column` is monotonically increasing globally and within groups.

    Rows are inspected in their current order (nothing is sorted here).
    "Monotonically increasing" follows pandas' `is_monotonic_increasing`, so equal
    consecutive values (ties) do not count as a violation.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str, optional): The column to check for monotonicity. Defaults to 'timestamp'.
        group_by (str, optional): The column to group by. If None (or otherwise falsy),
            only the global check is performed. Defaults to None.

    Returns:
        dict: A dictionary with plain Python values:
            - 'global_monotonic' (bool): True if the column is monotonically increasing
              over the whole frame.
            - 'first_violating_group': Label of the first group (in the order of the
              groupby result, i.e. sorted group keys by default) whose values are not
              monotonically increasing; None if there is no violation or no grouping.
            - 'grouped_monotonic' (bool or None): True if every group is monotonically
              increasing, False if at least one is not, None if no grouping is performed.
    """
    results = {
        'global_monotonic': bool(df[column].is_monotonic_increasing),
        'first_violating_group': None,
        'grouped_monotonic': None,
    }

    if not group_by:
        return results

    # Coerce to a real boolean mask: `apply` can hand back a non-boolean dtype
    # (e.g. for an empty frame), on which `~` would be bitwise or raise.
    group_monotonicity = (
        df.groupby(group_by, observed=False)[column]
        .apply(lambda values: values.is_monotonic_increasing)
        .astype(bool)
    )
    # bool(...) so callers get a Python bool, matching 'global_monotonic'.
    results['grouped_monotonic'] = bool(group_monotonicity.all())

    violating_groups = group_monotonicity[~group_monotonicity]
    if not violating_groups.empty:
        results['first_violating_group'] = violating_groups.index[0]

    return results

def find_first_violating_user(df, timestamp_column='timestamp', user_id_column='user_id'):
    """
    Return the first user whose timestamps are not monotonically increasing.

    Thin wrapper around `check_monotonicity`, grouping `df` by `user_id_column`
    and checking `timestamp_column` within each group.

    Args:
        df (pd.DataFrame): The input DataFrame.
        timestamp_column (str, optional): The name of the timestamp column. Defaults to 'timestamp'.
        user_id_column (str, optional): The name of the user ID column. Defaults to 'user_id'.

    Returns:
        The user ID (whatever type the user ID column holds) of the first group reported
        as violating by `check_monotonicity`, or None if every user's timestamps are
        monotonically increasing.
    """
    report = check_monotonicity(df, column=timestamp_column, group_by=user_id_column)
    return None if report['grouped_monotonic'] else report['first_violating_group']

In [18]:
# Load the processed 'ml1m' user-item interactions and check timestamp ordering.
dataset_name = 'ml1m'

df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
display(df_ui.shape, df_ui.head())
print(df_ui.dtypes)

# Only a cell's last expression is auto-displayed, so show the monotonicity
# report explicitly instead of computing it and throwing the result away.
display(check_monotonicity(df_ui, group_by='user_id'))

# Last expression: the first user whose timestamps go backwards (None if there is none).
find_first_violating_user(df_ui)


(1000209, 6)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
0,1,3186,4,2000-12-31 22:00:19,"Girl, Interrupted (1999)",Drama
1,1,1270,5,2000-12-31 22:00:55,Back to the Future (1985),"Comedy,Sci-Fi"
2,1,1721,4,2000-12-31 22:00:55,Titanic (1997),"Drama,Romance"
3,1,1022,5,2000-12-31 22:00:55,Cinderella (1950),"Animation,Children's,Musical"
4,1,2340,3,2000-12-31 22:01:43,Meet Joe Black (1998),Romance


user_id               int64
movie_id              int64
rating                int64
timestamp    datetime64[ns]
title                object
genre                object
dtype: object


20

In [19]:
# Timestamps of user 20, the ID reported by find_first_violating_user in the 'ml1m' cell above.
df_ui.loc[df_ui['user_id'] == 20, 'timestamp']

202723   2001-12-29 23:37:51
202724   2001-12-29 23:37:51
202725   2001-12-29 23:37:51
202726   2001-12-29 23:37:51
202727   2001-12-29 23:38:35
202728   2001-12-29 23:38:35
202729   2001-12-29 23:38:36
202730   2001-12-29 23:39:41
202731   2001-12-29 23:39:41
202732   2001-12-29 23:39:41
202733   2001-12-29 23:40:27
202734   2000-12-30 02:25:06
202735   2000-12-30 02:26:21
202736   2000-12-30 02:28:26
202737   2000-12-30 02:29:04
202738   2000-12-30 02:29:15
202739   2000-12-30 02:29:29
202740   2000-12-30 02:29:29
202741   2000-12-30 02:31:48
202742   2000-12-30 02:31:48
202743   2000-12-30 02:31:48
202744   2000-12-30 02:31:48
202745   2000-12-30 02:31:48
202746   2000-12-30 02:32:56
Name: timestamp, dtype: datetime64[ns]

In [9]:
# Load the processed 'ml100k' user-item interactions and check timestamp ordering.
dataset_name = 'ml100k'

df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
display(df_ui.shape, df_ui.head())

# A bare `df_ui.dtypes` in the middle of a cell is evaluated but never shown
# (only the last expression is auto-displayed), so print it explicitly.
print(df_ui.dtypes)

# Last expression: global and per-user monotonicity report for the default 'timestamp' column.
check_monotonicity(df_ui, group_by='user_id')


(100000, 6)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
0,1,168,5,1997-09-22 21:57:58,Monty Python and the Holy Grail (1974),Comedy
1,1,172,5,1997-09-22 21:57:58,"Empire Strikes Back, The (1980)","Action,Adventure,Drama,Romance,Sci-Fi,War"
2,1,165,5,1997-09-22 21:58:38,Jean de Florette (1986),Drama
3,1,156,4,1997-09-22 21:59:16,Reservoir Dogs (1992),"Crime,Thriller"
4,1,196,5,1997-09-22 22:01:17,Dead Poets Society (1989),Drama


{'global_monotonic': False,
 'first_violating_group': None,
 'grouped_monotonic': True}

In [24]:
# Load the processed 'beauty' user-item interactions and check review-time ordering.
dataset_name = 'beauty'

df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
display(df_ui.shape, df_ui.head())

# A bare `df_ui.dtypes` in the middle of a cell is evaluated but never shown
# (only the last expression is auto-displayed), so print it explicitly.
print(df_ui.dtypes)

# Last expression: global and per-reviewer monotonicity report for 'unixReviewTime'.
check_monotonicity(df_ui, group_by='reviewerID', column='unixReviewTime')


(198371, 5)

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title
0,A00414041RD0BXM6WK0GX,B007IY97U0,1405296000,3,63cm Long Zipper Beige+pink Wavy Cosplay Hair ...
1,A00414041RD0BXM6WK0GX,B00870XLDS,1405296000,2,MapofBeauty Long Wave Curly Hair Wig Full Wig ...
2,A00414041RD0BXM6WK0GX,B008MIRO88,1405296000,1,MapofBeauty Cosplay Costume Long Curly Hair Wi...
3,A00414041RD0BXM6WK0GX,B00BQYYMN0,1405296000,3,32&quot; 80cm Long Hair Heat Resistant Spiral ...
4,A00414041RD0BXM6WK0GX,B00GRTQBTM,1405296000,5,MapofBeauty 28&quot; 70cm Long Curly Hair Ends...


{'global_monotonic': False,
 'first_violating_group': None,
 'grouped_monotonic': True}

In [25]:
# Load the processed 'toys' user-item interactions and check review-time ordering.
dataset_name = 'toys'

df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
display(df_ui.shape, df_ui.head())

# A bare `df_ui.dtypes` in the middle of a cell is evaluated but never shown
# (only the last expression is auto-displayed), so print it explicitly.
print(df_ui.dtypes)

# Last expression: global and per-reviewer monotonicity report for 'unixReviewTime'.
check_monotonicity(df_ui, group_by='reviewerID', column='unixReviewTime')


(166757, 5)

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title
0,A012468118FTQAINEI0OQ,B001HA9JOA,1356220800,5,Chutes and Ladders Super Hero Squad
1,A012468118FTQAINEI0OQ,B00804BCO6,1356220800,5,LeapFrog Letter Factory Phonics
2,A012468118FTQAINEI0OQ,B00005BZM6,1360108800,5,Crayola Art Smock-
3,A012468118FTQAINEI0OQ,B002BY2BVE,1360108800,5,"Crayola Giant Fingerpaint Paper, (99-3405)"
4,A012468118FTQAINEI0OQ,B003S7EYZY,1360108800,5,Gund Philbin 13&quot; Bear


{'global_monotonic': False,
 'first_violating_group': None,
 'grouped_monotonic': True}

In [26]:
# Load the processed 'sports' user-item interactions and check review-time ordering.
dataset_name = 'sports'

df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
display(df_ui.shape, df_ui.head())

# A bare `df_ui.dtypes` in the middle of a cell is evaluated but never shown
# (only the last expression is auto-displayed), so print it explicitly.
print(df_ui.dtypes)

# Last expression: global and per-reviewer monotonicity report for 'unixReviewTime'.
check_monotonicity(df_ui, group_by='reviewerID', column='unixReviewTime')


(295091, 5)

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title
0,A00046902LP5YSDV0VVNF,B0010O748Q,1353628800,5,Magnesium Fire Starter
1,A00046902LP5YSDV0VVNF,B00178CS4K,1353628800,5,Survivor HK-106320 Outdoor Fixed Blade Knife 7...
2,A00046902LP5YSDV0VVNF,B001THHXA8,1353628800,5,Big Bohica Kukhri Machete
3,A00046902LP5YSDV0VVNF,B004U8CP88,1353628800,2,Ultralight Backpacking Canister Camp Stove wit...
4,A00046902LP5YSDV0VVNF,B007BO931U,1353628800,4,Tac Forece TF-710 Series Assisted Opening Fold...


{'global_monotonic': False,
 'first_violating_group': None,
 'grouped_monotonic': True}

In [68]:


def get_seqlen_stats(df, user_col='user_id', item_col='item_id', bottom_percentile=0.1, top_percentile=0.9):
    """
    Count interactions per user and print the mean and standard deviation of those counts.

    Args:
        df (pd.DataFrame): Interaction table with one row per user-item interaction.
        user_col (str, optional): Name of the user ID column. Defaults to 'user_id'.
        item_col (str, optional): Name of the item ID column. Currently unused.
        bottom_percentile (float, optional): Currently unused.
        top_percentile (float, optional): Currently unused.

    Returns:
        pd.Series: Number of rows per user (the result of `value_counts` on `user_col`).
    """
    # Rows are counted per user, so a user rating the same item twice counts twice.
    interactions_per_user = df[user_col].value_counts()
    mean_len = interactions_per_user.mean()
    std_len = interactions_per_user.std()
    print("mean: ", mean_len, "std: ", std_len)
    # Percentile cutoffs are deliberately not reported: many users can sit on the edge of
    # the histogram, so sequence-length cutoffs are better picked by hand from the counts.
    return interactions_per_user

# Hand-picked sequence-length cutoffs per dataset:
# (printed label, dataset name, low cutoff, high cutoff)
seqlen_cutoffs = [
    ('Amazon Beauty', 'beauty', 5, 14),
    ('Amazon Sports', 'sports', 5, 13),
    ('Amazon Toys', 'toys', 5, 13),
]

# For each dataset, print the share of users with at most `low_seqlen` interactions
# and the share with more than `high_seqlen` interactions.
for label, dataset_name, low_seqlen, high_seqlen in seqlen_cutoffs:
    df_ui = pd.read_json(str(processed_data_dir(dataset_name) / 'df_ui.json'), orient='records', lines=True)
    usc = get_seqlen_stats(df_ui, user_col='reviewerID', item_col='asin')
    n_users = len(usc)
    print(label, (usc <= low_seqlen).sum() / n_users, (usc > high_seqlen).sum() / n_users)


mean:  8.870500380092116 std:  8.160898111531392
Amazon Beauty 0.3210660465948218 0.09779546572463443
mean:  8.289771609967133 std:  6.0523199381244845
Amazon Sports 0.32598252661741156 0.09891282973284266
mean:  8.593064000824487 std:  8.489432709896912
Amazon Toys 0.3452025146861795 0.106925693084613


In [44]:
# Per-user interaction counts returned by get_seqlen_stats (last assigned in the cutoff cell above).
# NOTE(review): run top-to-bottom this holds the 'toys' counts, but the recorded output lists
# 35597 users, which matches 'sports' (295091 rows / mean 8.29) — the output looks stale from
# an out-of-order run; re-run before relying on it.
usc

reviewerID
A3OXHLG6DIBRW8    294
AN81JUYW2SL24     142
AOVTLYTHVDNUX     139
A8VI7KMUHI7ZH     115
A2XRMQA6PJ5ZJ8    103
                 ... 
AFLEIZY9LOV0N       3
AP6DZJX8D1JNB       3
A16AMRVH5VLUSX      3
AS141ADSV4RQF       3
AY9ZBOTKBA30N       3
Name: count, Length: 35597, dtype: int64

In [41]:
# Per-user interaction counts returned by get_seqlen_stats.
# NOTE(review): this cell repeats the preceding `usc` cell with identical recorded output
# (35597 users, which matches 'sports' rather than the last-loaded 'toys'); it is a candidate
# for removal once the notebook is re-run top-to-bottom.
usc

reviewerID
A3OXHLG6DIBRW8    294
AN81JUYW2SL24     142
AOVTLYTHVDNUX     139
A8VI7KMUHI7ZH     115
A2XRMQA6PJ5ZJ8    103
                 ... 
AFLEIZY9LOV0N       3
AP6DZJX8D1JNB       3
A16AMRVH5VLUSX      3
AS141ADSV4RQF       3
AY9ZBOTKBA30N       3
Name: count, Length: 35597, dtype: int64