## Reddit dataset consolidation

This notebook gathers every CSV inside `Reddit Dataset/` (except the large `kaggle_RC_2019-05.csv`) and loads them with the correct headers provided in `headers.txt`.

In [1]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path("Reddit Dataset")
EXCLUDE_FILES = {"kaggle_RC_2019-05.csv"}  # huge generic dump; skip to keep things tractable

COLUMN_NAMES = [
    "text",
    "id",
    "subreddit",
    "meta",
    "time",
    "author",
    "ups",
    "downs",
    "authorlinkkarma",
    "authorkarma",
    "authorisgold",
]

csv_paths = sorted(
    path for path in DATA_DIR.glob("*.csv") if path.name not in EXCLUDE_FILES
)
meta_groups = sorted({path.stem.split("_", 1)[0] for path in csv_paths})

print(f"Found {len(csv_paths)} subreddit CSV files to combine.")
print("Meta groups:", meta_groups)
print("First 5 files:", [p.name for p in csv_paths[:5]])
print("Last 5 files:", [p.name for p in csv_paths[-5:]])


Found 49 subreddit CSV files to combine.
Meta groups: ['entertainment', 'gaming', 'humor', 'learning', 'lifestyle', 'news', 'television']
First 5 files: ['entertainment_comicbooks.csv', 'entertainment_harrypotter.csv', 'entertainment_movies.csv', 'entertainment_music.csv', 'entertainment_starwars.csv']
Last 5 files: ['television_gameofthrones.csv', 'television_himym.csv', 'television_mylittlepony.csv', 'television_startrek.csv', 'television_thewalkingdead.csv']


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
frames = []
total_filtered = 0

for csv_path in csv_paths:
    df = pd.read_csv(csv_path)

    # Drop exported pandas index + rogue placeholder column if present
    df = df.drop(columns=df.columns[0])
    if len(df.columns) > len(COLUMN_NAMES):
        df = df.drop(columns=df.columns[0])

    if len(df.columns) != len(COLUMN_NAMES):
        raise ValueError(
            f"Unexpected column count {len(df.columns)} in {csv_path.name}."
        )

    df.columns = COLUMN_NAMES

    meta_group = csv_path.stem.split("_", 1)[0]
    before = len(df)
    df = df[df["meta"] == meta_group]
    filtered = before - len(df)
    if filtered:
        total_filtered += filtered
        print(f"Filtered {filtered} malformed rows in {csv_path.name}")

    frames.append(df)

combined_df = pd.concat(frames, ignore_index=True)
print(f"\nCombined shape: {combined_df.shape[0]:,} rows × {combined_df.shape[1]} columns")
print(f"Dropped {total_filtered:,} mismatched rows overall.")

combined_df.head()


Filtered 1 malformed rows in entertainment_comicbooks.csv
Filtered 1 malformed rows in entertainment_harrypotter.csv
Filtered 1 malformed rows in entertainment_movies.csv

Combined shape: 2,423,702 rows × 11 columns
Dropped 3 mismatched rows overall.


Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
0,sometimes they have a difference of opinion s...,d01727e,comicbooks,entertainment,1455577000.0,TheStealthBox,5.0,0.0,208.0,32044.0,0.0
1,try polysuede or felt that is acidfree or pass...,d02fswl,comicbooks,entertainment,1455661000.0,mrindustrialist,1.0,0.0,1.0,75.0,0.0
2,take them in to a second hand book store amp ...,d01qm82,comicbooks,entertainment,1455615000.0,matthew_lane,2.0,0.0,250.0,7710.0,0.0
3,a lot of cities have ways of getting comics in...,d01k3vi,comicbooks,entertainment,1455597000.0,Daiteach,3.0,0.0,439.0,11111.0,0.0
4,i m probably in the minority but even the wo...,d01km27,comicbooks,entertainment,1455598000.0,Nejfelt,2.0,0.0,150.0,918.0,0.0


In [3]:
print("Records per meta subreddit (top 10):")
print(combined_df["meta"].value_counts().head(10))

combined_df.sample(3, random_state=42)

Records per meta subreddit (top 10):
meta
gaming           428443
news             408716
lifestyle        384494
humor            382197
television       321794
learning         271179
entertainment    226879
Name: count, dtype: int64


Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
1767257,i wish this sub would ban dumb shit like this ...,d01yzxb,libertarian,news,1455638000.0,AlCapone564,30.0,0.0,2794.0,1807.0,0.0
237144,if only mmr could get you attitude,d02kli8,dota2,gaming,1455668000.0,ShrikeGFX,1.0,0.0,276.0,2542.0,0.0
1747502,so basically you re fucked out of a good job o...,d02tety,conspiracy,news,1455682000.0,goober_boobz,1.0,0.0,190.0,2997.0,0.0


In [4]:
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

RECENT_DAYS = 5               # look for emergence in the most recent 5 days
BASELINE_DAYS = 15            # compare against the preceding 15 days
FRESHNESS_DAYS = 20           # ignore tokens first seen earlier than this
MIN_RECENT_USES = 15          # minimum contexts in the recent window
MAX_CONTEXTS_PER_TOKEN = 5
MAX_BASELINE_CONTEXTS = 20_000
MAX_RECENT_CONTEXTS = 4_000
COSINE_DUP_THRESHOLD = 0.90
TARGET_TERM_COUNT = 20

TOKEN_REGEX = r"(?P<token>[a-zA-Z][a-zA-Z0-9'#_+\-]{1,24})"

STOPWORDS = {
    "the","and","you","that","with","this","have","your","from","they","them",
    "what","when","were","would","there","could","should","about","because",
    "their","just","like","cant","dont","doesnt","im","ive","ill","lets",
    "was","for","are","but",
}

def normalize_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def sample_contexts(df: pd.DataFrame, max_total: int) -> pd.DataFrame:
    """Limit contexts per token and overall while keeping pandas happy."""
    sampled = (
        df.groupby("token", group_keys=False, sort=False)
          .apply(lambda g: g.sample(min(len(g), MAX_CONTEXTS_PER_TOKEN), random_state=42))
    )
    if len(sampled) > max_total:
        sampled = sampled.sample(max_total, random_state=42)
    return sampled.reset_index(drop=True)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df = combined_df.loc[:, ["meta", "time", "text"]].copy()
df["event_dt"] = pd.to_datetime(df["time"], unit="s", utc=True).dt.floor("D")

analysis_end = df["event_dt"].max()
recent_start = analysis_end - pd.Timedelta(days=RECENT_DAYS - 1)
baseline_start = recent_start - pd.Timedelta(days=BASELINE_DAYS)
fresh_cutoff = analysis_end - pd.Timedelta(days=FRESHNESS_DAYS)

window_mask = df["event_dt"].between(baseline_start, analysis_end)
df = df.loc[window_mask].copy()
df["text_norm"] = df["text"].fillna("").map(normalize_text)
df = df.loc[df["text_norm"].str.len() > 0]

print(f"Filtered to {len(df):,} rows within [{baseline_start.date()} → {analysis_end.date()}].")

token_matches = (
    df["text_norm"]
    .str.extractall(TOKEN_REGEX)
    .reset_index()
    .rename(columns={"level_0": "row_id", "token": "token"})
)

token_df = token_matches.merge(
    df[["meta", "event_dt", "text_norm"]],
    left_on="row_id",
    right_index=True,
    how="left",
)

token_df = token_df.loc[~token_df["token"].isin(STOPWORDS)]
token_df["context"] = token_df["token"] + " || " + token_df["text_norm"].str.slice(0, 220)
token_df = token_df.loc[:, ["meta", "event_dt", "token", "context"]]

print(f"Extracted {len(token_df):,} token-context pairs "
      f"({token_df['token'].nunique():,} unique tokens).")

baseline_df = token_df[
    token_df["event_dt"].between(baseline_start, recent_start - pd.Timedelta(days=1))
]
recent_df = token_df[
    token_df["event_dt"].between(recent_start, analysis_end) &
    token_df["event_dt"].ge(fresh_cutoff)
]

baseline_df = sample_contexts(baseline_df, MAX_BASELINE_CONTEXTS)
recent_df = sample_contexts(recent_df, MAX_RECENT_CONTEXTS)

print(f"Baseline contexts: {len(baseline_df):,} ({baseline_df['token'].nunique():,} tokens)")
print(f"Recent contexts:   {len(recent_df):,} ({recent_df['token'].nunique():,} tokens) "
      f"[{recent_start.date()} → {analysis_end.date()}]")


Filtered to 2,183,806 rows within [2016-01-29 → 2016-02-17].
Extracted 59,458,099 token-context pairs (60,070 unique tokens).


  .apply(lambda g: g.sample(min(len(g), MAX_CONTEXTS_PER_TOKEN), random_state=42))
  .apply(lambda g: g.sample(min(len(g), MAX_CONTEXTS_PER_TOKEN), random_state=42))


Baseline contexts: 20,000 (11,041 tokens)
Recent contexts:   4,000 (3,905 tokens) [2016-02-13 → 2016-02-17]


In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest

model = SentenceTransformer(MODEL_NAME, device=DEVICE)
print(f"Embedding on {DEVICE.upper()}…")

baseline_emb = model.encode(
    baseline_df["context"].tolist(),
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

recent_emb = model.encode(
    recent_df["context"].tolist(),
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

iso = IsolationForest(
    n_estimators=256,
    contamination=0.05,
    random_state=42,
    n_jobs=-1,
)
iso.fit(baseline_emb)

scores = -iso.score_samples(recent_emb)
recent_scored = recent_df.reset_index(drop=True).assign(
    anomaly_score=scores,
    embedding=list(recent_emb),
)

def build_stats(min_uses):
    term_stats = (
        recent_scored.groupby("token")
        .agg(
            mean_anomaly=("anomaly_score", "mean"),
            recent_uses=("token", "size"),
            first_seen=("event_dt", "min"),
            metas=("meta", lambda x: ", ".join(pd.Series(x).value_counts().head(2).index)),
            example_context=("context", "first"),
        )
    )
    centroids = (
        recent_scored.groupby("token")["embedding"]
        .apply(lambda vecs: np.vstack(vecs).mean(axis=0))
        .rename("centroid")
    )
    stats = (
        term_stats.join(centroids)
        .query("recent_uses >= @min_uses")
        .reset_index()
        .rename(columns={"token": "term"})
    )
    stats["novelty_score"] = stats["mean_anomaly"] * np.log1p(stats["recent_uses"])
    return stats.sort_values("novelty_score", ascending=False)

stats = build_stats(MIN_RECENT_USES)
if stats.empty:
    relaxed_min = max(3, MIN_RECENT_USES // 2)
    print(f"⚠️ No terms met recent_uses ≥ {MIN_RECENT_USES}; retrying with ≥ {relaxed_min}.")
    stats = build_stats(relaxed_min)

if stats.empty:
    print("❌ No candidate terms available even after relaxing thresholds.")
    candidates = pd.DataFrame(columns=[
        "term","mean_anomaly","recent_uses","first_seen",
        "metas","example_context","baseline_window","recent_window"
    ])
else:
    def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

    selected = []
    for _, row in stats.iterrows():
        emb = row["centroid"]
        if all(cosine_sim(emb, sel["centroid"]) < COSINE_DUP_THRESHOLD for sel in selected):
            selected.append(row)
        if len(selected) == TARGET_TERM_COUNT:
            break

    if not selected:
        print("⚠️ No terms passed the uniqueness filter; falling back to top-ranked terms.")
        selected = [row for _, row in stats.head(TARGET_TERM_COUNT).iterrows()]

    candidates = pd.DataFrame(selected).reset_index(drop=True)
    candidates["baseline_window"] = f"{baseline_start.date()} → {(recent_start - pd.Timedelta(days=1)).date()}"
    candidates["recent_window"] = f"{recent_start.date()} → {analysis_end.date()}"
    candidates = candidates.drop(columns="centroid")

display(candidates[[
    "term","mean_anomaly","recent_uses","first_seen",
    "metas","example_context","baseline_window","recent_window"
]])

print(f"\nReturned {len(candidates)} candidate terms ready for SIR modeling, "
      "semantic tracking, and downstream analysis.")


Embedding on CUDA…


Batches: 100%|██████████| 40/40 [00:04<00:00,  9.13it/s]
Batches: 100%|██████████| 8/8 [00:00<00:00,  8.43it/s]


⚠️ No terms met recent_uses ≥ 15; retrying with ≥ 7.
❌ No candidate terms available even after relaxing thresholds.


Unnamed: 0,term,mean_anomaly,recent_uses,first_seen,metas,example_context,baseline_window,recent_window



Returned 0 candidate terms ready for SIR modeling, semantic tracking, and downstream analysis.
