In [None]:
%reload_ext autoreload
%autoreload 2

import polars as pl
from config import ACTIVITY_LEVELS, AGE_THRESHOLDS, TOP_N
from parse import get_site_dfs, load_dfs
from polars import col

# Load everything load_dfs() returns for the "../infodump" directory.
# The call yields six objects, unpacked in the order below.
(
    joinyears, df_users, df_users_registered,
    df_posts_all, df_comments_all, df_activity_all,
) = load_dfs("../infodump")

# Derive the per-site frames from the "*_all" frames. The first argument is
# the site selector; "all" presumably means "no per-site restriction"
# (TODO confirm against parse.get_site_dfs).
df_months, df_posts, df_comments, df_activity = get_site_dfs(
    "all",
    df_posts_all,
    df_comments_all,
    df_activity_all,
)

In [None]:
# One row per (month, userid) with the number of activity rows that user has
# in that month. The left join from df_months keeps months without any
# activity: they appear once with a null userid and count == 0, because
# count() only counts non-null values.
df_users_monthly = (
    df_months.join(
        df_activity.select(["userid", "month"]),
        how="left",
        on="month",
        coalesce=True,
    )
    .group_by("month", "userid", maintain_order=True)
    .agg(col("userid").count().alias("count"))
)

df_users_monthly

In [None]:
# For every month, count the users whose monthly activity count reaches each
# threshold in ACTIVITY_LEVELS; one output column per threshold, named after
# str(level).
#
# maintain_order=True keeps the months in the order they appear in
# df_users_monthly. Without it polars returns the groups in a
# non-deterministic order, so the table would change between runs.
df_users_monthly_activity = df_users_monthly.group_by(
    "month", maintain_order=True
).agg(
    col("count").filter(col("count") >= level).count().alias(str(level))
    for level in ACTIVITY_LEVELS
)

df_users_monthly_activity

In [None]:
# For every month, count the active users per join year; one output column per
# entry of joinyears, named after str(year).
df_users_monthly_by_joined = (
    df_users_monthly.join(
        df_users.select("userid", "joinyear"),
        on="userid",
        how="left",
        coalesce=True,
    )
    # The "month" column is dropped below, so row position is the only link
    # back to a month. maintain_order=True keeps the groups in the order the
    # months appear in df_users_monthly; a plain group_by returns them in a
    # non-deterministic order, which would scramble the series.
    .group_by("month", maintain_order=True)
    .agg(
        # Rows with a null joinyear (no matching user, or the null-userid
        # placeholder rows of empty months) match no year and count as 0.
        col("joinyear").filter(joinyear=year).count().alias(str(year))
        for year in joinyears
    )
    .drop("month")
)

df_users_monthly_by_joined

In [None]:
# For every month, count activity rows by account age at the time of the
# activity. Bucket i covers ages in [AGE_THRESHOLDS[i], AGE_THRESHOLDS[i + 1])
# whole days and becomes the output column str(i).
df_activity_by_age = (
    df_months.join(
        df_activity.select("userid", "datestamp", "month"),
        on="month",
        how="left",
        coalesce=True,
    )
    .join(
        df_users.select("userid", "joindate"),
        on="userid",
        how="left",
        coalesce=True,
    )
    # Age in whole days between the activity and the user's join date. It is
    # null when either side is missing, and such rows land in no bucket.
    .with_columns(age=(col("datestamp") - col("joindate")).dt.total_days())
    .with_columns(
        col("age")
        .is_between(AGE_THRESHOLDS[i], AGE_THRESHOLDS[i + 1], closed="left")
        .alias(str(i))
        for i in range(len(AGE_THRESHOLDS) - 1)
    )
    # The "month" column is dropped below, so row position is the only link
    # back to a month. maintain_order=True keeps the groups in the order the
    # months appear in the joined frame; a plain group_by returns them in a
    # non-deterministic order, which would scramble the series.
    .group_by("month", maintain_order=True)
    # Summing the boolean flags counts the rows that fall in each bucket.
    .agg(col(str(i)).sum() for i in range(len(AGE_THRESHOLDS) - 1))
    .drop("month")
)

df_activity_by_age

In [None]:
# Number of users whose first activity row falls in each month.
df_users_new = (
    df_months.join(
        # Keep only the first activity row per user. This is the user's
        # earliest month only if df_activity is in chronological order
        # (assumption, TODO confirm against parse.load_dfs).
        df_activity.select("userid", "month").unique("userid", keep="first"),
        on="month",
        how="left",
        coalesce=True,
    )
    # maintain_order=True keeps the months in df_months order. A plain
    # group_by returns groups in a non-deterministic order, and any
    # order-dependent computation on this frame (e.g. a cumulative sum of
    # "new") would then be wrong.
    .group_by("month", maintain_order=True)
    # count() ignores nulls, so months without new users report 0.
    .agg(new=col("userid").count())
)

df_users_new

In [None]:
# Running total of new users per month. cum_sum() accumulates in row order,
# so the result is only meaningful when df_users_new is in chronological
# order.
df_users_cum = df_users_new.select(
    col("month"),
    col("new").cum_sum().alias("cum"),
)

df_users_cum

In [None]:
# Number of deleted posts per month; the result column keeps the name
# "deleted".
df_posts_deleted = (
    df_months.join(
        df_posts.select("month", "deleted"),
        on="month",
        how="left",
        coalesce=True,
    )
    # maintain_order=True keeps the months in df_months order; a plain
    # group_by returns the groups in a non-deterministic order.
    .group_by("month", maintain_order=True)
    .agg(
        # Status codes counted as deleted:
        #   1: deleted, 3: deleted and closed on Metatalk
        # Nulls from months without posts are removed by the filter, so
        # those months report 0.
        col("deleted").filter(col("deleted").is_in([1, 3])).len()
    )
)

df_posts_deleted

In [None]:
# Share of each month's activity that comes from its most active users; one
# output column per entry of TOP_N, named after str(n) and rounded to three
# decimals.
# NOTE(review): n is multiplied by the number of distinct users to get the
# head size, so TOP_N presumably holds fractions (e.g. 0.01) - confirm in
# config.
df_activity_by_top_users = (
    df_months.join(
        df_activity.select("month", "userid"), on="month", how="left", coalesce=True
    )
    # The final select keeps only the share columns, so row position is the
    # only link back to a month. maintain_order=True keeps the groups in the
    # order the months appear in the joined frame; a plain group_by returns
    # them in a non-deterministic order, which would scramble the series.
    .group_by("month", maintain_order=True)
    .agg(
        # Total rows in the month. A month without activity still has one
        # null-userid row from the left join, so this is never 0.
        pl.len(),
        # Per-user row counts for the month, largest first.
        col("userid").unique_counts().alias("counts").sort(descending=True),
    )
    .select(
        (
            col("counts").list.head(col("counts").list.len() * n).list.sum()
            / col("len")
        ).alias(str(n))
        for n in TOP_N
    )
    .select(pl.all().round(3))
)

df_activity_by_top_users