In [1]:
import os

import pandas as pd
import pyrootutils
from IPython.display import display

# Register the parent of the current working directory as the project root.
# `pythonpath=True` asks pyrootutils to put that root on the import path, which
# is what lets the `src.*` imports in the following cell resolve.
# `os` must be imported in the cell's import block for this line to run on a
# fresh kernel.
pyrootutils.set_root(os.path.abspath(".."), pythonpath=True)


In [2]:
from src.utils.data import load_news, load_behaviors

In [3]:
def news_stats(variant):
    """Display distinct article / category / subcategory counts for a dataset variant.

    The news table is loaded from ``../data`` with ``drop_duplicates=False`` and
    its index is reset so that the aggregation can address the ``id`` column
    (presumably the former index - confirm against ``load_news``).

    The displayed table has one row per value of the ``split`` column, in order
    of first appearance, followed by a ``total`` row computed over the whole
    frame. Every cell is a ``nunique`` count, so the total row counts each
    distinct value once even if it occurs in several splits.

    Args:
        variant: Dataset variant forwarded to ``load_news`` (the notebook calls
            this with ``"small"`` and ``"large"``).

    Returns:
        None. The table is rendered with ``display``.
    """
    articles = load_news(
        variant=variant, drop_duplicates=False, data_dir="../data"
    ).reset_index()

    # Source column -> label used in the rendered table.
    labels = {
        'id': 'n_articles',
        'category': 'n_categories',
        'subcategory': 'n_subcategories',
    }
    # Same aggregation for every column: number of distinct values.
    how = dict.fromkeys(labels, 'nunique')

    per_split = articles.groupby("split", sort=False).agg(how)
    # Aggregating the whole frame yields a Series; turn it into a one-row frame.
    overall = articles.agg(how).to_frame(name="total").T

    display(pd.concat([per_split, overall]).rename(columns=labels))
    
# Per-split and total article/category/subcategory counts for the "small" variant.
news_stats("small")

Unnamed: 0,n_articles,n_categories,n_subcategories
train,51282,17,264
dev,42416,17,257
total,65238,18,270


In [4]:
# Per-split and total article/category/subcategory counts for the "large" variant.
news_stats("large")

Unnamed: 0,n_articles,n_categories,n_subcategories
train,101527,18,285
dev,72023,17,269
test,120959,18,290
total,130379,18,293


In [5]:
def impression_stats(variant):
    """Display per-split and overall statistics of the impression logs.

    Three helper columns are added to the frame returned by ``load_behaviors``:

    * ``history_length`` - ``len`` of each ``history`` entry,
    * ``n_impressions``  - ``len`` of each ``impressions`` entry,
    * ``n_clicks``       - number of items in ``impressions`` whose string ends
      with ``"-1"``.

    A row with no item ending in ``"-1"`` gets ``n_clicks == 0``; such rows are
    included in the ``total`` row like any other.

    The displayed table has one row per value of the ``split`` column (in order
    of first appearance) and a final ``total`` row aggregated over all rows.
    The ``impression_id`` column label is shown as ``logs``.

    Args:
        variant: Dataset variant forwarded to ``load_behaviors`` (the notebook
            calls this with ``"small"`` and ``"large"``).

    Returns:
        None. The table is rendered with ``display``.
    """
    def count_clicked(items):
        # An impression item counts as a click when it carries the "-1" suffix.
        return sum(1 for item in items if item.endswith("-1"))

    logs = load_behaviors(variant, data_dir="../data")
    logs["history_length"] = logs["history"].apply(len)
    logs["n_impressions"] = logs["impressions"].apply(len)
    logs["n_clicks"] = logs["impressions"].apply(count_clicked)

    how = {
        'impression_id': 'count',
        'user': 'nunique',
        'history_length': ['mean', 'median', 'min', 'max'],
        'n_impressions': ['sum', 'mean', 'median', 'min', 'max'],
        'n_clicks': ['sum', 'mean', 'median', 'min', 'max'],
    }

    per_split = logs.groupby("split", sort=False).agg(how)

    # Whole-frame aggregation comes back as (function x column); stacking gives
    # a Series keyed by (function, column), which becomes a single "total" row.
    overall = logs.agg(how).stack().to_frame(name="total").T
    # Flip the key order to (column, function) and align with the per-split
    # column layout so the two frames concatenate cleanly.
    overall.columns = overall.columns.swaplevel()
    overall = overall.reindex(columns=per_split.columns)

    display(
        pd.concat([per_split, overall]).rename(columns={
            'impression_id': 'logs',
        })
    )
    
# Per-split and total impression-log statistics for the "small" variant.
impression_stats("small")

Unnamed: 0_level_0,logs,user,history_length,history_length,history_length,history_length,n_impressions,n_impressions,n_impressions,n_impressions,n_impressions,n_clicks,n_clicks,n_clicks,n_clicks,n_clicks
Unnamed: 0_level_1,count,nunique,mean,median,min,max,sum,mean,median,min,max,sum,mean,median,min,max
train,156965.0,50000.0,32.539987,19.0,0.0,558.0,5843444.0,37.227688,24.0,2.0,299.0,236344.0,1.505711,1.0,1.0,35.0
dev,73152.0,50000.0,32.295959,19.0,0.0,444.0,2740998.0,37.469898,23.0,2.0,295.0,111383.0,1.522624,1.0,1.0,24.0
total,230117.0,94057.0,32.462413,19.0,0.0,558.0,8584442.0,37.304684,24.0,2.0,299.0,347727.0,1.511088,1.0,1.0,35.0


In [11]:
# Per-split and total impression-log statistics for the "large" variant.
impression_stats("large")

Unnamed: 0_level_0,logs,user,history_length,history_length,history_length,history_length,n_impressions,n_impressions,n_impressions,n_impressions,n_impressions,n_clicks,n_clicks,n_clicks,n_clicks,n_clicks
Unnamed: 0_level_1,count,nunique,mean,median,min,max,sum,mean,median,min,max,sum,mean,median,min,max
train,2232748.0,711222.0,32.977241,19.0,0.0,801.0,83507374.0,37.401164,25.0,2.0,300.0,3383656.0,1.515467,1.0,1.0,51.0
dev,376471.0,255990.0,32.62446,19.0,0.0,801.0,14085557.0,37.41472,23.0,2.0,299.0,574845.0,1.52693,1.0,1.0,39.0
test,2370727.0,702005.0,41.599264,25.0,0.0,1021.0,93115001.0,39.276982,25.0,1.0,300.0,0.0,0.0,0.0,0.0,0.0
total,4979946.0,876956.0,37.055127,21.0,0.0,1021.0,190707932.0,38.295181,25.0,1.0,300.0,3958501.0,0.794888,1.0,0.0,51.0
