In [None]:
%reload_ext autoreload
%autoreload 2

import datetime
import polars as pl
from polars import col

from calc import load_dfs

# Userid of the shared "Anonymous" account. Later cells use it both to exclude
# that account and to select the posts/comments attributed to it.
ANONYMOUS_USERID = 17564

# load_dfs returns six objects, unpacked positionally here.
joinyears, df_users, df_users_registered, df_posts_all, df_comments_all, df_activity_all = (
    load_dfs("../infodump")
)

In [None]:
from private.mungings import MUNGED_USERNAMES, MUNGED_USERS_NOT_WIPED


# Single-column frame of the userids listed in MUNGED_USERS_NOT_WIPED; the
# anti-join below removes them from the munged set.
df_munged_users_not_wiped = pl.DataFrame(
    data=MUNGED_USERS_NOT_WIPED,
    schema={"userid": pl.Int64},
    orient="row",
)

# (userid, name) pairs from MUNGED_USERNAMES, used as a fallback source of names.
df_munged_usernames = pl.DataFrame(
    data=MUNGED_USERNAMES,
    schema={"userid": pl.Int64, "name": pl.String},
    orient="row",
)

# Userids >= 1_000_000 are treated as munged ids here, see
# https://mefiwiki.com/wiki/Infodump#Userid_munging
df_munged_users = (
    df_users.filter(col("userid") >= 1_000_000)
    .join(df_munged_users_not_wiped, how="anti", on="userid")
    .join(df_munged_usernames, how="left", on="userid")
    .select(
        col("userid"),
        # prefer the name already in df_users; otherwise take the one from
        # df_munged_usernames (suffixed "_right" by the join)
        pl.coalesce("name", "name_right").alias("name"),
    )
    .sort("userid")
)

df_munged_users

In [None]:
# Userid that is searched for (as "user/<id>") inside post deletion reasons.
# NOTE(review): going by the dataframe name this is the account that closes
# users' posts -- confirm.
CLOSER_USERID = 344145

# Distinct non-Anonymous users having at least one post whose deletion reason
# mentions user/<CLOSER_USERID>, together with their name from df_users.
df_users_with_posts_deleted_by_closer = (
    df_posts_all.filter(
        # \b keeps the pattern from also matching longer ids that merely start
        # with the same digits (e.g. user/3441450, which is in the munged range)
        col("reason").str.contains(rf"user/{CLOSER_USERID}\b")
    )
    .select("userid")
    .unique()
    .filter(col("userid") != ANONYMOUS_USERID)
    .join(df_users, on="userid")
    .select("userid", "name")
)

df_users_with_posts_deleted_by_closer

In [None]:
# Union of the two wiped-user sources, reduced to one row per userid.
df_wiped_users = (
    pl.concat([df_munged_users, df_users_with_posts_deleted_by_closer])
    # A userid can occur in both inputs with different names (df_munged_users
    # carries the coalesced munged name). The default keep="any" gives no
    # guarantee which row survives, so keep the first one, i.e. the
    # df_munged_users row, to make the result deterministic.
    .unique("userid", keep="first", maintain_order=True)
    # inner join restricts to userids present in df_users; the selected "name"
    # is the left-hand one (df_users' copy is suffixed "_right" and dropped)
    .join(df_users, on="userid")
    .select("userid", "name")
    .sort("userid")
)

df_wiped_users

In [None]:
# Posts authored by wiped users, oldest first, without the category/month columns.
df_posts_by_wiped_users = (
    df_posts_all.join(df_wiped_users, on="userid", how="inner")
    # keep only posts deleted because of the wipe (we don't want everyday deletions)
    .filter(col("reason").str.contains("(?i)Poster's request"))
    .drop("category", "month")
    .sort(by="datestamp")
)

df_posts_by_wiped_users

In [None]:
# check >=2020 posts by munged IDs (tkolar comment)
# The bound is inclusive (>= 1_000_000) so that it agrees with the filter that
# builds df_munged_users; a strict ">" would skip userid 1_000_000 itself.

display(
    df_posts_by_wiped_users.filter(
        (col("datestamp") >= datetime.date(2020, 1, 1)) & (col("userid") >= 1_000_000)
    )
)

In [None]:
# Per wiped user: name and number of wipe-deleted posts, largest count first.
df_posts_by_wiped_users_agg = (
    df_posts_by_wiped_users.group_by("userid")
    .agg(
        col("name").first(),
        posts=pl.len(),
    )
    .sort(by="posts", descending=True)
)

# As of August 2025 this df contains 57 users, versus 60 in df_wiped_users.
# The 3 missing users made one post each, and it was deleted for everyday,
# non-wipe reasons.

df_posts_by_wiped_users_agg

In [None]:
# Column totals of `comments` and `favorites` over the wipe-deleted posts.
df_posts_by_wiped_users.select(col("comments").sum(), col("favorites").sum())

In [None]:
# Posts by Anonymous outside AskMe must (?) belong to wiped users, following
# Rhaomi's new approach.
# AskMe is ignored: there is no way to tell wipes from normal Anonymous questions.
# NOTE(review): the meaning of `deleted` codes 1 and 3 is not visible here.
df_anonymised_posts = (
    df_posts_all.filter(
        # multiple predicates are AND-ed together
        col("userid") == ANONYMOUS_USERID,
        col("site") != "askme",
        col("deleted") != 1,
        col("deleted") != 3,
    )
    .drop("category", "month")
    .sort(by="datestamp")
)

df_anonymised_posts

In [None]:
# Number of anonymised posts per site, largest first.
display(
    df_anonymised_posts.group_by("site")
    .agg(pl.len().alias("len"))
    .sort(by="len", descending=True)
)

In [None]:
# Comments attributed to Anonymous on every site except AskMe, oldest first.
df_anonymised_comments = (
    df_comments_all.filter(
        # multiple predicates are AND-ed together
        col("userid") == ANONYMOUS_USERID,
        col("site") != "askme",
    )
    .drop("month")
    .sort(by="datestamp")
)

df_anonymised_comments

In [None]:
# Number of anonymised comments per site, largest first.
display(
    df_anonymised_comments.group_by("site")
    .agg(pl.len().alias("len"))
    .sort(by="len", descending=True)
)