In [1]:
import pandas as pd
from pathlib import Path

### Loading scraped raw data

In [2]:
# Load standardize metadata

# Base folder where the Parquet file lives
BASE_PATH = Path("/Users/apple/Desktop/30112_python/Scrape_Reddit")

# Parquet file produced by the Arctic Shift scraping script (TEST mode)
parquet_path = BASE_PATH / "reddit_inflation_2020_2025_posts_and_comments_TEST.parquet"

# Sanity check: make sure the file exists before loading
print("Parquet path:", parquet_path)
print("Exists?", parquet_path.exists())

# Load the full Reddit dataset (posts + comments)
df = pd.read_parquet(parquet_path)
print("Original shape:", df.shape)
df.head()

Parquet path: /Users/apple/Desktop/30112_python/Scrape_Reddit/reddit_inflation_2020_2025_posts_and_comments_TEST.parquet
Exists? True
Original shape: (35023, 10)


Unnamed: 0,type,id,link_id,parent_id,subreddit,title,body,created_utc,score,num_comments
0,submission,shg9hy,shg9hy,,personalfinance,Mortgage options for buying house that you won...,"If a friend is living with me rent free, can t...",1643673217,1,6.0
1,submission,shg01r,shg01r,,personalfinance,Invest or pay extra on student loans?,"Hi personal finance,\n\nI'm trying to decide w...",1643672497,1,6.0
2,submission,shfohd,shfohd,,personalfinance,"New HVAC Rebates, don't want to miss anything.",Location: Ohio\n\nPurchased a Bryant Evolution...,1643671641,1,0.0
3,submission,shfgx2,shfgx2,,personalfinance,Which financial decision benefits me long term?,"Hello,\n\nI got two different financial decisi...",1643671106,1,14.0
4,submission,shej98,shej98,,personalfinance,File an amendment now?,I just submitted my federal return using credi...,1643668802,1,4.0


### Cleaning missing and unmeaningful data

In [4]:
# Keep only the columns we actually need for analysis.
# If some are missing (e.g., num_comments on comments), we just keep the ones that exist.
EXPECTED_COLS = [
    "type",          # "submission" or "comment"
    "id",
    "link_id",
    "parent_id",
    "subreddit",
    "title",
    "body",          # main text for comments, selftext for posts
    "created_utc",   # timestamp (epoch seconds)
    "score",
    "num_comments",  # only meaningful for submissions
]

keep_cols = [c for c in EXPECTED_COLS if c in df.columns]
df = df[keep_cols].copy()
print("After column selection:", df.shape)
df.head()

After column selection: (35023, 10)


Unnamed: 0,type,id,link_id,parent_id,subreddit,title,body,created_utc,score,num_comments
0,submission,shg9hy,shg9hy,,personalfinance,Mortgage options for buying house that you won...,"If a friend is living with me rent free, can t...",2022-01-31 23:53:37+00:00,1,6.0
1,submission,shg01r,shg01r,,personalfinance,Invest or pay extra on student loans?,"Hi personal finance,\n\nI'm trying to decide w...",2022-01-31 23:41:37+00:00,1,6.0
2,submission,shfohd,shfohd,,personalfinance,"New HVAC Rebates, don't want to miss anything.",Location: Ohio\n\nPurchased a Bryant Evolution...,2022-01-31 23:27:21+00:00,1,0.0
3,submission,shfgx2,shfgx2,,personalfinance,Which financial decision benefits me long term?,"Hello,\n\nI got two different financial decisi...",2022-01-31 23:18:26+00:00,1,14.0
4,submission,shej98,shej98,,personalfinance,File an amendment now?,I just submitted my federal return using credi...,2022-01-31 22:40:02+00:00,1,4.0


In [3]:
# Convert created_utc (epoch seconds) into a timezoneâ€‘aware datetime.
# Then add year/month/date columns for grouping later.
df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s", utc=True)
df["date"] = df["created_utc"].dt.date
df["year"] = df["created_utc"].dt.year
df["month"] = df["created_utc"].dt.month

df[["created_utc", "date", "year", "month"]].head()

Unnamed: 0,created_utc,date,year,month
0,2022-01-31 23:53:37+00:00,2022-01-31,2022,1
1,2022-01-31 23:41:37+00:00,2022-01-31,2022,1
2,2022-01-31 23:27:21+00:00,2022-01-31,2022,1
3,2022-01-31 23:18:26+00:00,2022-01-31,2022,1
4,2022-01-31 22:40:02+00:00,2022-01-31,2022,1


### Text cleaning & Preprocessing
(Organizing & Tokenizing)

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Run these once in a fresh environment (comment them out afterwards).
# They download NLTK data files to your machine.
nltk.download("punkt")
nltk.download("stopwords")

# English stopword list (e.g., "the", "and", "is")
STOPWORDS = set(stopwords.words("english"))

# Translation table to remove punctuation characters
PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def clean_text_basic(text: str) -> str:
    """
    Basic text cleaner:
    - Handle missing values
    - Remove URLs
    - Lowercase text
    - Remove punctuation
    - Collapse extra whitespace
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs (http..., www...)
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # Lowercase everything
    text = text.lower()
    # Remove punctuation characters
    text = text.translate(PUNCT_TABLE)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize_and_remove_stopwords(text: str):
    """
    Tokenize a cleaned text string, then:
    - Keep only alphabetic tokens (no numbers, no leftover punctuation)
    - Drop stopwords like "the", "and", etc.
    Returns a list of tokens.
    """
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS]
    return tokens


[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Combine title + body into a single raw_text field so we always
# have all the context for a post or comment in one place.
# (For comments, title will be empty; for posts, body is the selftext.)
df["raw_text"] = (
    df[["title", "body"]]
    .fillna("")         # replace NaN with empty strings
    .agg(" ".join, axis=1)  # join title and body with a space
    .str.strip()
)

# Apply the cleaner to get a normalized string version.
df["clean_text"] = df["raw_text"].apply(clean_text_basic)

# Tokenize into a list of words (no stopwords, no punctuation).
df["tokens"] = df["clean_text"].apply(tokenize_and_remove_stopwords)

# Quick peek at the processed columns to make sure things look reasonable.
df[["type", "subreddit", "created_utc", "raw_text", "clean_text", "tokens"]].head()


Unnamed: 0,type,subreddit,created_utc,raw_text,clean_text,tokens
0,submission,personalfinance,2022-01-31 23:53:37+00:00,Mortgage options for buying house that you won...,mortgage options for buying house that you won...,"[mortgage, options, buying, house, wont, live,..."
1,submission,personalfinance,2022-01-31 23:41:37+00:00,Invest or pay extra on student loans? Hi perso...,invest or pay extra on student loans hi person...,"[invest, pay, extra, student, loans, hi, perso..."
2,submission,personalfinance,2022-01-31 23:27:21+00:00,"New HVAC Rebates, don't want to miss anything....",new hvac rebates dont want to miss anything lo...,"[new, hvac, rebates, dont, want, miss, anythin..."
3,submission,personalfinance,2022-01-31 23:18:26+00:00,Which financial decision benefits me long term...,which financial decision benefits me long term...,"[financial, decision, benefits, long, term, he..."
4,submission,personalfinance,2022-01-31 22:40:02+00:00,File an amendment now? I just submitted my fed...,file an amendment now i just submitted my fede...,"[file, amendment, submitted, federal, return, ..."


In [10]:
# Save as Parquet (recommended for speed/size)

out_path = BASE_PATH / "scraped_data" / "reddit_inflation_2020_2025_posts_and_comments_TEST_clean.parquet"

# Create folders if they don't exist
out_path.parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(out_path, engine="pyarrow", index=False)
print("Saved cleaned data to:", out_path)


Saved cleaned data to: /Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data/reddit_inflation_2020_2025_posts_and_comments_TEST_clean.parquet
