In [None]:
# =========================
# Reddit → 2 Clean Datasets
# =========================
# 1) reddit_raw.csv:
#    Platform, URL, Likes, Comments, Engagement, Text, Matched Keywords
# 2) reddit_labeled.csv:
#    Platform, URL, Engagement, Clean Text, Word Count, Toxicity Score, Label, Matched Keywords
#
# Notes:
# - Uses PRAW to search Reddit by keywords.
# - If Detoxify is available it’s used for toxicity; otherwise a simple lexicon fallback is used.
# - Files are saved under /content/ for Colab. (Uncomment the Drive section to save to Drive.)

!pip -q install praw detoxify torch torchvision torchaudio

import math
import os
import re
import sys
from typing import Dict, List

import pandas as pd

# --------- OPTIONAL: Save outputs to Google Drive ----------
# from google.colab import drive
# drive.mount('/content/drive')
# OUT_DIR = "/content/drive/MyDrive/reddit_exports"  # change if you want
OUT_DIR = "/content"  # default: local in Colab

# --------- CONFIG: Reddit app credentials ----------
# SECURITY: credentials are read from the environment instead of being
# hard-coded, so they never end up in a shared/committed notebook.
# Before running, set them, e.g.:
#   import os
#   os.environ["REDDIT_CLIENT_ID"] = "..."
#   os.environ["REDDIT_CLIENT_SECRET"] = "..."
# NOTE(review): any secret that was previously pasted into this cell should be
# treated as leaked and regenerated in the Reddit app settings.
CLIENT_ID     = os.environ.get("REDDIT_CLIENT_ID", "")      # Reddit app CLIENT_ID
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")  # Reddit app CLIENT_SECRET
USER_AGENT    = "hate-project:v1.0 (by u/yourusername)"  # something unique

# Keywords to search; also the candidate list for the "Matched Keywords" column
KEYWORDS = [
    "hate speech","go back to your country","immigrants","refugees",
    "politicians are corrupt","racism","racial slurs","white supremacy",
    "black people","Asian people","islamophobia","anti-semitism",
    "religious hate","Muslims","Jews","Christians","misogyny"
]

# Search params
TARGET_POSTS = 8          # per keyword (17 keywords × 8 = up to 136 results)
TIMEFILTER   = "month"     # 'hour' 'day' 'week' 'month' 'year' 'all'
SEARCH_SORT  = "relevance" # 'relevance' 'hot' 'top' 'new' 'comments'

# Output file locations, derived from OUT_DIR
RAW_CSV_PATH     = f"{OUT_DIR}/reddit_raw.csv"
LABELED_CSV_PATH = f"{OUT_DIR}/reddit_labeled.csv"

# --------- Text utilities ----------
# Pre-compiled patterns shared by the text-cleaning helpers.
APOSTROPHE_FIX = re.compile(r"’|‘")                  # curly single quotes
URL_RE = re.compile(r"(https?://\S+)|www\.\S+")      # http(s):// links and bare www. links
PUNCT_RE = re.compile(r"[^\w\s']")                   # anything except word chars, whitespace, apostrophes
MULTISPACE_RE = re.compile(r"\s+")                   # runs of whitespace

# Terms counted as negative by the lexicon-based toxicity fallback.
# NOTE(review): "go back" is a two-word phrase; a scorer that compares single
# whitespace-split tokens against this set will never hit it.
NEGATIVE_LEXICON = {
    # hate / prejudice vocabulary
    "hate", "hates", "hateful", "racist", "racism", "sexist",
    "bigot", "bigotry", "misogyny", "misogynist",
    # insults / dehumanising terms
    "scum", "trash", "idiot", "stupid", "dumb", "loser",
    "filthy", "disgusting", "inferior", "subhuman",
    # threat / exclusion vocabulary
    "terrorist", "invader", "parasite", "vermin",
    "kill", "die", "deport", "ban", "go back",
}

def clean_text(s: str) -> str:
    """Normalise raw post text for counting and lexicon scoring.

    Steps, in order: curly apostrophes become ``'``, URLs are replaced by a
    space, punctuation other than apostrophes is replaced by a space, the
    text is lower-cased, and whitespace runs are collapsed and trimmed.

    Returns an empty string when *s* is not a ``str``.
    """
    if not isinstance(s, str):
        return ""
    straight_quotes = APOSTROPHE_FIX.sub("'", s)
    without_urls = URL_RE.sub(" ", straight_quotes)
    lowered = PUNCT_RE.sub(" ", without_urls).lower()
    return MULTISPACE_RE.sub(" ", lowered).strip()

def word_count(s: str) -> int:
    """Return the number of whitespace-separated tokens in *s*.

    Non-string, empty, and whitespace-only inputs all count as 0.
    """
    if not isinstance(s, str):
        return 0
    # str.split() with no argument yields [] for empty/whitespace-only text.
    return len(s.split())

def find_matched_keywords(text: str, keywords: List[str]) -> List[str]:
    """Return the keywords that occur in *text* as whole words/phrases.

    Matching is case-insensitive (both sides are lower-cased). A keyword only
    counts when it is not glued to other word characters, so "Asian people"
    is no longer reported for "Caucasian people" as plain substring matching
    would do.

    Args:
        text: Text to scan; ``None`` or an empty string yields ``[]``.
        keywords: Candidate keywords or phrases.

    Returns:
        The matching keywords in their original spelling and in the order
        they appear in *keywords*.
    """
    haystack = (text or "").lower()
    if not haystack:
        return []

    def _occurs(keyword: str) -> bool:
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character (e.g. "c++") still match correctly.
        pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)"
        return re.search(pattern, haystack) is not None

    return [kw for kw in keywords if _occurs(kw)]

# --------- Toxicity scoring (Detoxify or fallback) ----------
def try_detoxify_model():
    """Try to load the Detoxify 'original' model.

    Returns the model instance, or ``None`` when importing or constructing it
    raises; in that case a notice is written to stderr so the caller can fall
    back to lexicon scoring.
    """
    try:
        import detoxify
        model = detoxify.Detoxify("original")
    except Exception:
        print("Detoxify not available, using lexicon fallback.", file=sys.stderr)
        model = None
    return model

def toxicity_score_detoxify(model, text: str) -> float:
    """Score *text* with a Detoxify-style model.

    Args:
        model: Object exposing ``predict(list_of_texts)``. The result is
            assumed to be a mapping from label name to a sequence of
            per-text scores — confirm against the Detoxify version in use.
        text: Raw text to score.

    Returns:
        The first ``'toxicity'`` score when that key is present and
        non-empty; otherwise the mean of the first score of every
        list/tuple-valued entry; ``0.0`` for blank or non-string text, when
        no usable scores exist, or when prediction fails.

    Scoring stays best-effort (a failure never aborts the pipeline), but the
    failure is reported on stderr so it is not mistaken for clean text.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    try:
        out = model.predict([text])
        if 'toxicity' in out and out['toxicity']:
            return float(out['toxicity'][0])
        # No dedicated toxicity entry: average whatever per-label scores exist.
        vals = [float(v[0]) for v in out.values() if isinstance(v, (list, tuple)) and v]
        return float(sum(vals) / len(vals)) if vals else 0.0
    except Exception as exc:
        print(
            f"Detoxify scoring failed ({type(exc).__name__}: {exc}); using score 0.0.",
            file=sys.stderr,
        )
        return 0.0

def toxicity_score_lexicon(clean: str, lexicon=None) -> float:
    """Return the share of tokens in *clean* that hit the negative lexicon.

    Single-word lexicon entries are matched against individual tokens.
    Multi-word entries (e.g. "go back") are matched against runs of
    consecutive tokens; previously such entries could never match because
    only single tokens were compared.

    Args:
        clean: Already-cleaned text; tokens are obtained with ``str.split()``.
        lexicon: Optional iterable of terms to use instead of the module-level
            ``NEGATIVE_LEXICON``.

    Returns:
        ``hits / token_count`` capped at 1.0, or ``0.0`` for empty or
        whitespace-only text.
    """
    terms = NEGATIVE_LEXICON if lexicon is None else lexicon
    toks = clean.split()
    if not toks:
        return 0.0

    # Split the lexicon once into single words and multi-word phrases.
    single_words = set()
    phrases = []
    for term in terms:
        parts = term.split()
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.append(parts)

    hits = sum(1 for tok in toks if tok in single_words)
    for parts in phrases:
        width = len(parts)
        hits += sum(
            1
            for start in range(len(toks) - width + 1)
            if toks[start:start + width] == parts
        )

    # Overlapping word and phrase hits could exceed the token count.
    return min(1.0, hits / len(toks))

def label_from_score(score: float, threshold: float) -> str:
    """Map a toxicity score to a label: "toxic" at or above *threshold*, else "non-toxic"."""
    if score >= threshold:
        return "toxic"
    return "non-toxic"

# --------- Reddit fetch ----------
def fetch_reddit_posts(keywords: List[str], target_posts=60, timefilter="month", sort="relevance") -> List[Dict]:
    """Search r/all once per keyword and return one row dict per unique post.

    Args:
        keywords: Search queries; also the candidate list for each row's
            "Matched Keywords".
        target_posts: Result limit passed to each keyword search.
        timefilter: Passed to the search as ``time_filter``.
        sort: Passed to the search as ``sort``.

    Returns:
        Row dicts with keys Platform, URL, Likes, Comments, Engagement, Text
        and Matched Keywords. A submission returned for several keywords is
        kept only once (first occurrence, identified by its URL).

    A failing search for one keyword is reported on stderr and skipped; rows
    collected before the failure are kept.
    """
    import praw

    reddit = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT,
        # Notebooks run inside an event loop, which makes PRAW print an
        # "use Async PRAW" warning on every request; this script is
        # deliberately synchronous, so turn that check off.
        check_for_async=False,
    )
    reddit.read_only = True

    rows: List[Dict] = []
    seen_urls = set()
    for kw in keywords:
        try:
            results = reddit.subreddit("all").search(kw, sort=sort, time_filter=timefilter, limit=target_posts)
            for submission in results:
                row = _submission_to_row(submission, keywords)
                url = row["URL"]
                # Only de-duplicate when there is a URL to compare on.
                if url and url in seen_urls:
                    continue
                if url:
                    seen_urls.add(url)
                rows.append(row)
        except Exception as e:
            print(f"Search error for '{kw}': {e}", file=sys.stderr)
            continue
    return rows


def _submission_to_row(submission, keywords: List[str]) -> Dict:
    """Convert one search result into the raw-dataset row dict."""
    title = getattr(submission, 'title', '') or ''
    body = getattr(submission, 'selftext', '') or ''
    full_text = f"{title}\n{body}".strip()

    # Prefer the Reddit permalink; fall back to the submission's own URL.
    permalink = getattr(submission, 'permalink', None)
    if permalink:
        url = f"https://www.reddit.com{permalink}"
    else:
        url = getattr(submission, 'url', '')

    likes = int(getattr(submission, "score", 0) or 0)
    comments = int(getattr(submission, "num_comments", 0) or 0)

    return {
        "Platform": "Reddit",
        "URL": url,
        "Likes": likes,
        "Comments": comments,
        "Engagement": likes + comments,
        "Text": full_text,
        "Matched Keywords": find_matched_keywords(full_text, keywords),
    }

# --------- Raw dataframe builder ----------
def build_raw_dataframe(rows: List[Dict]) -> pd.DataFrame:
    """Build the raw dataset frame from row dicts, with a fixed column order.

    Only the seven raw-dataset columns are kept; an empty *rows* list gives an
    empty frame that still has those columns.
    """
    column_order = [
        "Platform",
        "URL",
        "Likes",
        "Comments",
        "Engagement",
        "Text",
        "Matched Keywords",
    ]
    frame = pd.DataFrame(rows, columns=column_order)
    return frame

# --------- Labeled dataframe builder ----------
def build_labeled_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Derive the labeled dataset from the raw one.

    For each row the "Text" value is cleaned, its words are counted, and a
    toxicity score is computed: Detoxify on the original text (threshold 0.5)
    when the model loads, otherwise the lexicon score on the cleaned text
    (threshold 0.25).

    Returns a frame with columns Platform, URL, Engagement, Clean Text,
    Word Count, Toxicity Score, Label and Matched Keywords; Platform, URL,
    Engagement and Matched Keywords are carried over from *raw_df*.
    """
    model = try_detoxify_model()
    use_model = model is not None
    # The scoring backend is fixed for the whole run, so is the cut-off.
    cutoff = 0.5 if use_model else 0.25

    cleaned_texts, word_counts, scores, labels = [], [], [], []
    for _, record in raw_df.iterrows():
        original = str(record.get("Text", "") or "")
        cleaned = clean_text(original)

        if use_model:
            tox = toxicity_score_detoxify(model, original)
        else:
            tox = toxicity_score_lexicon(cleaned)

        cleaned_texts.append(cleaned)
        word_counts.append(word_count(cleaned))
        scores.append(float(tox))
        labels.append(label_from_score(tox, threshold=cutoff))

    # Pass-through columns come straight from the raw frame.
    columns = {
        "Platform": raw_df["Platform"],
        "URL": raw_df["URL"],
        "Engagement": raw_df["Engagement"],
        "Clean Text": cleaned_texts,
        "Word Count": word_counts,
        "Toxicity Score": scores,
        "Label": labels,
        "Matched Keywords": raw_df["Matched Keywords"],
    }
    return pd.DataFrame(columns)

# --------- Run pipeline ----------
# Fetch posts, build both frames, write both CSVs, then preview them.
# Any failure is reported on stderr together with a credentials hint.
try:
    if not CLIENT_ID or not CLIENT_SECRET or not USER_AGENT:
        raise ValueError("Please set CLIENT_ID, CLIENT_SECRET, and USER_AGENT at the top of the cell.")

    rows = fetch_reddit_posts(
        KEYWORDS,
        target_posts=TARGET_POSTS,
        timefilter=TIMEFILTER,
        sort=SEARCH_SORT,
    )
    raw_df = build_raw_dataframe(rows)
    labeled_df = build_labeled_dataframe(raw_df)

    raw_df.to_csv(RAW_CSV_PATH, index=False)
    labeled_df.to_csv(LABELED_CSV_PATH, index=False)

    print("Saved:")
    print(" -", RAW_CSV_PATH)
    print(" -", LABELED_CSV_PATH)
    # display() is provided by the notebook environment.
    display(raw_df.head())
    display(labeled_df.head())

except Exception as exc:
    print("Error:", exc, file=sys.stderr)
    print("\nTip: Make sure you filled Reddit credentials at the top of the cell.")


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Saved:
 - /content/reddit_raw.csv
 - /content/reddit_labeled.csv


Unnamed: 0,Platform,URL,Likes,Comments,Engagement,Text,Matched Keywords
0,Reddit,https://www.reddit.com/r/law/comments/1nifvhk/...,42675,9945,52620,"Attorney General Pam Bondi: ""There's free spee...",[hate speech]
1,Reddit,https://www.reddit.com/r/PoliticalCompassMemes...,869,439,1308,We’re bringing hate speech laws back baby!!,[hate speech]
2,Reddit,https://www.reddit.com/r/Conservative/comments...,1688,150,1838,Freedom of speech - not freedom from consequen...,[hate speech]
3,Reddit,https://www.reddit.com/r/Libertarian/comments/...,854,37,891,“There is no such thing as hate speech”,[hate speech]
4,Reddit,https://www.reddit.com/r/AskConservatives/comm...,109,316,425,"AG Bondi: ""There's free speech and then there'...",[hate speech]


Unnamed: 0,Platform,URL,Engagement,Clean Text,Word Count,Toxicity Score,Label,Matched Keywords
0,Reddit,https://www.reddit.com/r/law/comments/1nifvhk/...,52620,attorney general pam bondi there's free speech...,44,0.178996,non-toxic,[hate speech]
1,Reddit,https://www.reddit.com/r/PoliticalCompassMemes...,1308,we're bringing hate speech laws back baby,7,0.297615,non-toxic,[hate speech]
2,Reddit,https://www.reddit.com/r/Conservative/comments...,1838,freedom of speech not freedom from consequence...,125,0.001698,non-toxic,[hate speech]
3,Reddit,https://www.reddit.com/r/Libertarian/comments/...,891,there is no such thing as hate speech,8,0.007241,non-toxic,[hate speech]
4,Reddit,https://www.reddit.com/r/AskConservatives/comm...,425,ag bondi there's free speech and then there's ...,107,0.038068,non-toxic,[hate speech]
