## Reddit dataset consolidation

This notebook gathers every CSV inside `Reddit Dataset/` (except the large `kaggle_RC_2019-05.csv`) and loads them with the correct headers provided in `headers.txt`.

In [1]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path('Reddit Dataset')
COLUMN_NAMES = [
    'text',
    'id',
    'subreddit',
    'meta',
    'time',
    'author',
    'ups',
    'downs',
    'authorlinkkarma',
    'authorkarma',
    'authorisgold',
]

csv_paths = sorted(
    path for path in DATA_DIR.glob('*.csv')
)
meta_groups = sorted({path.stem.split('_', 1)[0] for path in csv_paths})
print(f"Found {len(csv_paths)} subreddit CSV files to combine.")
print('Meta groups:', meta_groups)
print('First 5 files:', [p.name for p in csv_paths[:5]])
print('Last 5 files:', [p.name for p in csv_paths[-5:]])

Found 49 subreddit CSV files to combine.
Meta groups: ['entertainment', 'gaming', 'humor', 'learning', 'lifestyle', 'news', 'television']
First 5 files: ['entertainment_comicbooks.csv', 'entertainment_harrypotter.csv', 'entertainment_movies.csv', 'entertainment_music.csv', 'entertainment_starwars.csv']
Last 5 files: ['television_gameofthrones.csv', 'television_himym.csv', 'television_mylittlepony.csv', 'television_startrek.csv', 'television_thewalkingdead.csv']


In [2]:
frames = []
total_filtered = 0
for csv_path in csv_paths:
    df = pd.read_csv(csv_path)
    df = df.drop(columns=df.columns[0])
    if len(df.columns) > len(COLUMN_NAMES):
        df = df.drop(columns=df.columns[0])
    if len(df.columns) != len(COLUMN_NAMES):
        raise ValueError(
            f"Unexpected column count {len(df.columns)} in {csv_path.name}."
        )
    df.columns = COLUMN_NAMES
    meta_group = csv_path.stem.split('_', 1)[0]
    before = len(df)
    df = df[df['meta'] == meta_group]
    filtered = before - len(df)
    if filtered:
        total_filtered += filtered
        print(f"Filtered {filtered} malformed rows in {csv_path.name}")
    frames.append(df)

combined_df = pd.concat(frames, ignore_index=True)
print(f"Combined shape: {combined_df.shape[0]:,} rows x {combined_df.shape[1]} columns")
combined_df.head()

Filtered 1 malformed rows in entertainment_comicbooks.csv
Filtered 1 malformed rows in entertainment_harrypotter.csv
Filtered 1 malformed rows in entertainment_movies.csv
Combined shape: 2,423,702 rows x 11 columns


Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
0,sometimes they have a difference of opinion s...,d01727e,comicbooks,entertainment,1455577000.0,TheStealthBox,5.0,0.0,208.0,32044.0,0.0
1,try polysuede or felt that is acidfree or pass...,d02fswl,comicbooks,entertainment,1455661000.0,mrindustrialist,1.0,0.0,1.0,75.0,0.0
2,take them in to a second hand book store amp ...,d01qm82,comicbooks,entertainment,1455615000.0,matthew_lane,2.0,0.0,250.0,7710.0,0.0
3,a lot of cities have ways of getting comics in...,d01k3vi,comicbooks,entertainment,1455597000.0,Daiteach,3.0,0.0,439.0,11111.0,0.0
4,i m probably in the minority but even the wo...,d01km27,comicbooks,entertainment,1455598000.0,Nejfelt,2.0,0.0,150.0,918.0,0.0


In [3]:
print('Records per meta subreddit (top 10):')
print(
    combined_df['meta']
    .value_counts()
    .head(10)
)

combined_df.sample(3, random_state=42)

Records per meta subreddit (top 10):
meta
gaming           428443
news             408716
lifestyle        384494
humor            382197
television       321794
learning         271179
entertainment    226879
Name: count, dtype: int64


Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
1767257,i wish this sub would ban dumb shit like this ...,d01yzxb,libertarian,news,1455638000.0,AlCapone564,30.0,0.0,2794.0,1807.0,0.0
237144,if only mmr could get you attitude,d02kli8,dota2,gaming,1455668000.0,ShrikeGFX,1.0,0.0,276.0,2542.0,0.0
1747502,so basically you re fucked out of a good job o...,d02tety,conspiracy,news,1455682000.0,goober_boobz,1.0,0.0,190.0,2997.0,0.0


In [5]:
# Cell 1 – configuration + helpers for term detection
import re
from collections import Counter
import pandas as pd

# Detection window settings
RECENT_WINDOW_DAYS = 7          # "now" window we care about
BASELINE_WINDOW_DAYS = 21       # compare against the previous few weeks
FRESHNESS_DAYS = 35             # only keep terms first seen in this period
MIN_RECENT_FREQ = 20            # minimum token occurrences in recent window
MAX_BASELINE_FREQ = 50          # treat anything more frequent as "not novel"
TOP_K_TERMS = 20                # final number of candidates to keep

token_pattern = re.compile(r"[a-zA-Z][a-zA-Z0-9'#_+-]{1,24}")

STOPWORDS = {
    "the","and","you","that","with","this","have","your","from","they","them","what",
    "when","were","would","there","could","should","about","because","their","just",
    "like","cant","dont","doesnt","im","ive","ill","lets","was","for","are","but",
}

def normalize_text(text: str) -> str:
    return " ".join(text.lower().split())

def extract_terms(text: str) -> list[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in STOPWORDS]
    bigrams = [
        f"{a} {b}"
        for a, b in zip(tokens, tokens[1:])
        if a not in STOPWORDS or b not in STOPWORDS
    ]
    return tokens + bigrams

In [6]:
# Cell 2 – explode posts into daily term counts
term_df = (
    combined_df
    .assign(
        event_dt=pd.to_datetime(combined_df["time"], unit="s", utc=True).dt.floor("D"),
        text_norm=combined_df["text"].fillna("").map(normalize_text),
    )
    .loc[:, ["meta", "event_dt", "text_norm"]]
)

term_df["terms"] = term_df["text_norm"].map(extract_terms)
term_df = term_df.explode("terms").dropna(subset=["terms"])

daily_counts = (
    term_df.groupby(["meta", "event_dt", "terms"])
    .size()
    .rename("freq")
    .reset_index()
)

print(f"Exploded {len(term_df):,} term uses into "
      f"{daily_counts['terms'].nunique():,} unique terms across "
      f"{daily_counts['meta'].nunique()} meta communities.")
daily_counts.head()


Exploded 120,348,886 term uses into 646,888 unique terms across 7 meta communities.


Unnamed: 0,meta,event_dt,terms,freq
0,entertainment,2015-09-14 00:00:00+00:00,accepting,50
1,entertainment,2015-09-14 00:00:00+00:00,accepting new,50
2,entertainment,2015-09-14 00:00:00+00:00,account,50
3,entertainment,2015-09-14 00:00:00+00:00,account is,50
4,entertainment,2015-09-14 00:00:00+00:00,adhere,50


In [7]:
# Cell 3 – novelty scoring + top candidate list
analysis_end = daily_counts["event_dt"].max()
recent_start = analysis_end - pd.Timedelta(days=RECENT_WINDOW_DAYS - 1)
baseline_start = recent_start - pd.Timedelta(days=BASELINE_WINDOW_DAYS)

recent_mask = daily_counts["event_dt"].between(recent_start, analysis_end)
baseline_mask = daily_counts["event_dt"].between(baseline_start, recent_start - pd.Timedelta(days=1))
fresh_mask = daily_counts["event_dt"] >= analysis_end - pd.Timedelta(days=FRESHNESS_DAYS)

recent_freq = (
    daily_counts.loc[recent_mask]
    .groupby(["meta", "terms"])["freq"]
    .sum()
    .rename("recent_freq")
)

baseline_freq = (
    daily_counts.loc[baseline_mask]
    .groupby(["meta", "terms"])["freq"]
    .sum()
    .rename("baseline_freq")
)

first_seen = (
    daily_counts.loc[fresh_mask]
    .groupby(["meta", "terms"])["event_dt"]
    .min()
    .rename("first_seen")
)

scored = (
    recent_freq
    .to_frame()
    .join(baseline_freq, how="left")
    .join(first_seen, how="left")
    .fillna({"baseline_freq": 0})
    .reset_index()
)

scored = scored[
    (scored["recent_freq"] >= MIN_RECENT_FREQ) &
    (scored["baseline_freq"] <= MAX_BASELINE_FREQ) &
    (scored["first_seen"].notna())
]

scored["growth_ratio"] = (scored["recent_freq"] + 1) / (scored["baseline_freq"] + 1)
scored["novelty_score"] = scored["growth_ratio"] * scored["recent_freq"]

top_terms = (
    scored.sort_values("novelty_score", ascending=False)
    .head(TOP_K_TERMS)
    .assign(
        analysis_window=f"{recent_start.date()} → {analysis_end.date()}",
        baseline_window=f"{baseline_start.date()} → {(recent_start - pd.Timedelta(days=1)).date()}",
    )
)

display(top_terms[[
    "meta", "terms", "recent_freq", "baseline_freq",
    "growth_ratio", "novelty_score", "first_seen",
    "baseline_window", "analysis_window",
]])

print(f"\nSelected {len(top_terms)} candidate terms; "
      "use this list as input to the SIR + semantic tracking pipeline.")


Unnamed: 0,meta,terms,recent_freq,baseline_freq,growth_ratio,novelty_score,first_seen,baseline_window,analysis_window
236471,humor,bernie,191997,0.0,191998.0,36863040000.0,2016-02-12 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
290422,humor,sanders,188700,0.0,188701.0,35607880000.0,2016-02-13 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
236498,humor,bernie sanders,187600,0.0,187601.0,35193950000.0,2016-02-13 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
290428,humor,sanders bernie,186150,0.0,186151.0,34652010000.0,2016-02-16 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
264018,humor,is,178048,0.0,178049.0,31701270000.0,2016-02-11 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
262139,humor,in,95667,0.0,95668.0,9152271000.0,2016-02-12 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
266526,humor,kanye,77245,0.0,77246.0,5966867000.0,2016-02-14 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
614252,news,he,77094,0.0,77095.0,5943562000.0,2016-02-11 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
307323,humor,west,74898,0.0,74899.0,5609785000.0,2016-02-14 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17
266565,humor,kanye west,74348,0.0,74349.0,5527699000.0,2016-02-15 00:00:00+00:00,2016-01-21 → 2016-02-10,2016-02-11 → 2016-02-17



Selected 20 candidate terms; use this list as input to the SIR + semantic tracking pipeline.


NOTE: Above is very naive term detection with exact matches to establish baseline results. The below is a more refined version using embedding-based anomaly detection. Further updates required to refine memory use.

In [9]:
# Cell 2 – configure windows + build context datasets
import re
import numpy as np
import pandas as pd

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
BASELINE_DAYS = 21        # training window for "normal" language
RECENT_DAYS = 7           # window we want to flag anomalies in
FRESHNESS_DAYS = 35       # only examine tokens first seen recently
MIN_RECENT_USES = 10
MAX_CONTEXTS_PER_TOKEN = 5
MAX_BASELINE_CONTEXTS = 20000
MAX_RECENT_CONTEXTS = 6000

token_pattern = re.compile(r"[a-zA-Z][a-zA-Z0-9'#_+-]{1,24}")

def normalize_text(text: str) -> str:
    return re.sub(r"\s+", " ", text.lower().strip())

df = combined_df.copy()
df["event_dt"] = pd.to_datetime(df["time"], unit="s", utc=True).dt.floor("D")
df["text_norm"] = df["text"].fillna("").map(normalize_text)
df["tokens"] = df["text_norm"].str.findall(token_pattern)

token_df = (
    df.loc[:, ["meta", "event_dt", "text_norm", "tokens"]]
      .explode("tokens")
      .dropna(subset=["tokens"])
)
token_df["context"] = token_df["tokens"] + " || " + token_df["text_norm"].str.slice(0, 240)

analysis_end = token_df["event_dt"].max()
recent_start = analysis_end - pd.Timedelta(days=RECENT_DAYS - 1)
baseline_start = recent_start - pd.Timedelta(days=BASELINE_DAYS)
fresh_cutoff = analysis_end - pd.Timedelta(days=FRESHNESS_DAYS)

baseline_df = token_df[
    token_df["event_dt"].between(baseline_start, recent_start - pd.Timedelta(days=1))
]
recent_df = token_df[
    token_df["event_dt"].between(recent_start, analysis_end) &
    token_df["event_dt"].ge(fresh_cutoff)
]

def cap_contexts(group, max_rows):
    if len(group) <= max_rows:
        return group
    return group.sample(max_rows, random_state=42)

baseline_df = (
    baseline_df.groupby("tokens", group_keys=False)
               .apply(cap_contexts, MAX_CONTEXTS_PER_TOKEN)
               .sample(min(len(baseline_df), MAX_BASELINE_CONTEXTS), random_state=42)
               .reset_index(drop=True)
)

recent_df = (
    recent_df.groupby("tokens", group_keys=False)
             .apply(cap_contexts, MAX_CONTEXTS_PER_TOKEN)
             .sample(min(len(recent_df), MAX_RECENT_CONTEXTS), random_state=42)
             .reset_index(drop=True)
)

print(f"Baseline contexts: {len(baseline_df):,} across {baseline_df['tokens'].nunique():,} tokens")
print(f"Recent contexts:   {len(recent_df):,} across {recent_df['tokens'].nunique():,} tokens "
      f"({recent_start.date()} → {analysis_end.date()})")


MemoryError: Unable to allocate 73.1 MiB for an array with shape (76657067,) and data type bool

In [None]:
# Cell 3 – embed contexts, run anomaly detection, and list top emerging terms
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest

model = SentenceTransformer(MODEL_NAME)

baseline_emb = model.encode(
    baseline_df["context"].tolist(),
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

recent_emb = model.encode(
    recent_df["context"].tolist(),
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

iso = IsolationForest(
    n_estimators=256,
    contamination=0.05,
    random_state=42,
    n_jobs=-1,
)
iso.fit(baseline_emb)

recent_scores = -iso.score_samples(recent_emb)  # higher = more anomalous
recent_scored = recent_df.assign(anomaly_score=recent_scores)

agg = (
    recent_scored.groupby("tokens")
    .agg(
        mean_anomaly=("anomaly_score", "mean"),
        recent_uses=("tokens", "size"),
        first_seen=("event_dt", "min"),
        metas=("meta", lambda x: ", ".join(pd.Series(x).value_counts().head(2).index)),
    )
    .query("recent_uses >= @MIN_RECENT_USES")
    .sort_values("mean_anomaly", ascending=False)
)

top_terms = (
    agg.head(20)
       .assign(
           recent_window=f"{recent_start.date()} → {analysis_end.date()}",
           baseline_window=f"{baseline_start.date()} → {(recent_start - pd.Timedelta(days=1)).date()}",
       )
       .reset_index(names="term")
)

display(top_terms[[
    "term", "mean_anomaly", "recent_uses", "first_seen",
    "metas", "baseline_window", "recent_window",
]])

print(f"\nIdentified {len(top_terms)} embedding-level anomaly terms; "
      "feed their aggregated counts into the SIR pipeline next.")
