In [None]:
import pandas as pd
from pathlib import Path

### Loading scraped raw data

In [None]:
# Load the raw scraped Reddit dataset (posts + comments) from Parquet.

# Base folder where the Parquet file lives.
# NOTE: this is a machine-specific absolute path; adjust it before running
# the notebook on a different machine.
BASE_PATH = Path("/Users/apple/Desktop/30112_python/Scrape_Reddit")

# Parquet file produced by the Arctic Shift scraping script (TEST mode)
parquet_path = BASE_PATH / "scraped_data" / "reddit_inflation_2020_2025_posts_and_comments_TEST.parquet"

# Sanity check: report the path, and stop with a clear message if it is
# missing instead of letting the reader hit a less obvious loader error.
print("Parquet path:", parquet_path)
print("Exists?", parquet_path.exists())
if not parquet_path.exists():
    raise FileNotFoundError(
        f"Scraped Parquet file not found: {parquet_path}. "
        "Check BASE_PATH or run the scraping step first."
    )

# Load the full Reddit dataset (posts + comments)
df = pd.read_parquet(parquet_path)
print("Original shape:", df.shape)
df.head()

### Cleaning missing and uninformative data

In [None]:
# Restrict the frame to the fields the analysis uses. Any expected column
# that the loaded data does not contain (e.g., num_comments on comments)
# is skipped rather than raising.
EXPECTED_COLS = [
    "type",          # "submission" or "comment"
    "id",
    "link_id",
    "parent_id",
    "subreddit",
    "title",
    "body",          # main text for comments, selftext for posts
    "created_utc",   # timestamp (epoch seconds)
    "score",
    "num_comments",  # only meaningful for submissions
]

# Walk EXPECTED_COLS in order so the resulting column order follows the list.
available = set(df.columns)
keep_cols = [name for name in EXPECTED_COLS if name in available]

# .copy() detaches the selection from the originally loaded frame.
df = df.loc[:, keep_cols].copy()
print("After column selection:", df.shape)
df.head()

In [None]:
# Convert created_utc (epoch seconds) into timezone-aware UTC datetimes and
# derive calendar columns from it.

# Show the raw dtype and a few raw values before touching the column.
print("Original created_utc dtype:", df["created_utc"].dtype)
print("Sample values before conversion:")
print(df["created_utc"].head())

# unit="s" makes pandas read the numbers as seconds since the Unix epoch;
# utc=True makes the resulting datetimes timezone-aware (UTC).
created = pd.to_datetime(df["created_utc"], unit="s", utc=True)
df["created_utc"] = created

# Show the same diagnostics after the conversion.
print("\nAfter conversion:")
print("New created_utc dtype:", df["created_utc"].dtype)
print("Sample datetime values:")
print(df["created_utc"].head())

# Calendar parts used for grouping later.
df["date"] = created.dt.date
df["year"] = created.dt.year
df["month"] = created.dt.month

# Row counts per year, in chronological order, to eyeball the covered range.
print("\nYear distribution:")
print(df["year"].value_counts().sort_index())

df.loc[:, ["created_utc", "date", "year", "month"]].head()

### Text cleaning & Preprocessing
(Organizing & Tokenizing)

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data files this notebook relies on.
# - "punkt" / "punkt_tab": tokenizer models used by word_tokenize. Newer NLTK
#   releases (3.9+) load "punkt_tab", older ones load "punkt", so both are
#   requested to keep the notebook runnable across versions.
# - "stopwords": the stopword lists read below.
# The cell is left active (not commented out) so that a fresh
# "Restart Kernel -> Run All" works on a new environment.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# English stopword list (e.g., "the", "and", "is")
STOPWORDS = set(stopwords.words("english"))

# Translation table that deletes every ASCII punctuation character
# listed in string.punctuation.
PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def clean_text_basic(text: str) -> str:
    """
    Normalize a raw text string for downstream tokenization.

    Steps, in order:
    - Return "" for anything that is not a str (None, NaN, numbers, ...)
    - Replace URLs (http..., https..., www....) with a space, matching
      case-insensitively so "HTTPS://..." and "WWW...." are removed too
    - Lowercase the text
    - Delete ASCII punctuation characters
    - Collapse runs of whitespace to a single space and trim the ends

    Parameters
    ----------
    text : str
        Raw text; non-string values are treated as missing.

    Returns
    -------
    str
        The cleaned text ("" for missing input).
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs before punctuation stripping; otherwise "http://a.b/c"
    # would be fused into the junk token "httpabc". IGNORECASE is needed
    # because this runs before lowercasing.
    text = re.sub(r"http\S+|www\.\S+", " ", text, flags=re.IGNORECASE)
    # Lowercase everything
    text = text.lower()
    # Remove punctuation characters
    text = text.translate(PUNCT_TABLE)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize_and_remove_stopwords(text: str):
    """
    Split a cleaned text string into word tokens and filter them.

    A token is kept only if it is purely alphabetic (numbers and leftover
    punctuation are dropped) and is not in STOPWORDS.
    Non-string input yields an empty list.
    Returns the surviving tokens as a list, in their original order.
    """
    if not isinstance(text, str):
        return []
    kept = []
    for token in word_tokenize(text):
        # Skip stopwords and anything containing non-letter characters.
        if token in STOPWORDS or not token.isalpha():
            continue
        kept.append(token)
    return kept

In [None]:
# Combine title + body into a single raw_text field so we always
# have all the context for a post or comment in one place.
# (For comments, title will be empty; for posts, body is the selftext.)
# Missing values become empty strings, the two fields are joined with one
# space, and the result is trimmed. Concatenating whole columns avoids a
# Python-level call per row (as a row-wise agg/apply would do).
title_text = df["title"].fillna("")
body_text = df["body"].fillna("")
df["raw_text"] = (title_text + " " + body_text).str.strip()

# Apply the cleaner to get a normalized string version.
df["clean_text"] = df["raw_text"].apply(clean_text_basic)

# Tokenize into a list of words (no stopwords, no punctuation).
df["tokens"] = df["clean_text"].apply(tokenize_and_remove_stopwords)

# Quick peek at the processed columns to make sure things look reasonable.
df[["type", "subreddit", "created_utc", "raw_text", "clean_text", "tokens"]].head()

In [None]:
# Persist the cleaned frame as Parquet (recommended for speed/size).
out_path = BASE_PATH.joinpath(
    "scraped_data",
    "reddit_inflation_2020_2025_posts_and_comments_TEST_clean.parquet",
)

# Make sure the destination folder exists; existing folders are left alone.
out_path.parent.mkdir(exist_ok=True, parents=True)

# index=False: the DataFrame index is not written to the file.
df.to_parquet(out_path, engine="pyarrow", index=False)
print("Saved cleaned data to:", out_path)