In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
from collections import Counter
import re
import string
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Basic paths

# If needed in a fresh environment, run once:
nltk.download("vader_lexicon")

# ---------- Paths ----------
base_path = Path("/Users/apple/Desktop/30112_python/Scrape_Reddit")
cleaned_dir = base_path / "scraped_data"

in_path = cleaned_dir / "reddit_inflation_2020_2025_posts_and_comments_TEST_clean.parquet"

print("Loading:", in_path)
df = pd.read_parquet(in_path)
print("Loaded shape:", df.shape)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/apple/nltk_data...


Loading: /Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data/reddit_inflation_2020_2025_posts_and_comments_TEST_clean.parquet
Loaded shape: (35023, 10)


In [3]:
# Create Phase Labels
# These are temporary for now

def assign_temp_phase(dt: datetime) -> str:
    """
    Temporary phase definition based on year only.
    Replace later with CPI + policy-based phases.
    """
    if pd.isna(dt):
        return "unknown"

    year = dt.year
    if year <= 2021:
        return "early"
    elif year == 2022:
        return "peak"
    else:
        return "post"

# Ensure created_utc is datetime
if not pd.api.types.is_datetime64_any_dtype(df["created_utc"]):
    df["created_utc"] = pd.to_datetime(df["created_utc"], utc=True, errors="coerce")

df["phase_temp"] = df["created_utc"].apply(assign_temp_phase)

print("Phase counts:")
print(df["phase_temp"].value_counts(dropna=False))


Phase counts:
phase_temp
early    35023
Name: count, dtype: int64


In [4]:
# Keyword List and helper for is_inflation

KEYWORDS = [
    # Core inflation/econ
    "inflation",
    "cost of living",
    "cost-of-living",
    "high cost of living",
    "living costs",
    "price increases",
    "rising prices",
    "cpi",
    "consumer price index",
    "interest rates",
    "mortgage rates",
    "fed",
    "federal reserve",
    "rate hike",
    "rate hikes",
    "rate increase",
    "rate increases",
    # Everyday expenses
    "gas prices",
    "gas price",
    "grocery prices",
    "grocery bill",
    "food prices",
    "food bill",
    "rent increase",
    "rent hike",
    "higher rent",
    "rent is too high",
    "housing costs",
    "housing affordability",
    "property taxes",
    "electric bill",
    "electricity bill",
    "energy bill",
    "heating bill",
    "gas bill",
    "utility bills",
    "utilities",
    # Income / strain
    "wage stagnation",
    "wages not keeping up",
    "real wages",
    "paycheck to paycheck",
    "making ends meet",
    "can't afford",
    "cannot afford",
]
KEYWORDS_LOWER = [k.lower() for k in KEYWORDS]

def text_matches_keywords(text: str) -> bool:
    if not isinstance(text, str):
        return False
    t = text.lower()
    return any(k in t for k in KEYWORDS_LOWER)


In [None]:
## Build raw_text (fallback) and recompute is_inflation

# raw_text should already exist from the cleaning step (title + body).
# If not, create it here.
if "raw_text" not in df.columns:
    df["raw_text"] = (
        df[["title", "body"]]
        .fillna("")
        .agg(" ".join, axis=1)
        .str.strip()
    )

df["is_inflation"] = df["raw_text"].apply(text_matches_keywords)

print("is_inflation value counts:")
print(df["is_inflation"].value_counts(dropna=False))

# Manual quality check: print a few examples
print("\nSample inflation-related rows:")
if df["is_inflation"].any():
    sample_infl = df[df["is_inflation"]].sample(
        n=min(5, df["is_inflation"].sum()), random_state=42
    )
    for _, row in sample_infl.iterrows():
        print("----")
        print(row["created_utc"], "|", row["subreddit"], "|", row["type"])
        print(row["raw_text"][:400], "...\n")
else:
    print("No rows flagged as inflation-related in this TEST dataset.")


is_inflation value counts:
is_inflation
False    24269
True     10754
Name: count, dtype: int64

Sample inflation-related rows:
----
1970-01-01 00:00:01.641470158+00:00 | personalfinance | comment
If he pays 1680 by splitting everything, and only 550 if he doesn't pay half of taxes, it doesn't feel fair here. In another comment thread on this post it was figured that the taxes in this case are 67% of overall expenses. If I was moving someone in and they wanted me to pay almost $3k/mo while they pay around 500 and we make the same or they make a bit more, I would not move that person in. I'd ...

----
1970-01-01 00:00:01.643577614+00:00 | personalfinance | comment
I know an elderly person that got a call just like that. Like your great aunt and uncle, luckily one of their kids was visiting. The line, "Let me put my son on, he is a federal agent" got the scammer to hang up *real* quick. ...

----
1970-01-01 00:00:01.642356762+00:00 | personalfinance | comment
Separate accounts. 

I pay h

In [6]:
# Pilot analysis (on TEST data)

df_inf = df[df["is_inflation"]].copy()
print("\nInflation-related rows:", df_inf.shape[0])



Inflation-related rows: 10754


In [8]:
if not df_inf.empty:
    # Group inflation-related rows by month (period 'M') and post type,
    # count how many rows fall into each (month, type) combo.
    monthly_counts = (
        df_inf
        .groupby([df_inf["created_utc"].dt.to_period("M"), "type"])
        .size()                      # number of rows in each group
        .unstack(fill_value=0)       # turn 'type' into columns
        .rename_axis(index="month")  # label the index as 'month'
    )

    print("\nMonthly counts (first 10 rows):")
    print(monthly_counts.head(10))

    # Group by phase and type to see how many posts/comments per phase.
    phase_counts = (
        df_inf
        .groupby(["phase_temp", "type"])
        .size()
        .unstack(fill_value=0)
    )

    print("\nCounts by phase and type:")
    print(phase_counts)
else:
    print("No inflation-related rows; skipping counts.")



Monthly counts (first 10 rows):
type     comment  submission
month                       
1970-01     9535        1219

Counts by phase and type:
type        comment  submission
phase_temp                     
early          9535        1219


  .groupby([df_inf["created_utc"].dt.to_period("M"), "type"])


In [9]:
if not df_inf.empty:
    print("\nTop tokens per phase:")

    # If tokens column isn't there, we can't compute token frequencies.
    if "tokens" not in df_inf.columns:
        print("No 'tokens' column found; skipping token frequency by phase.")
    else:
        # Loop over each phase (early / peak / post).
        for phase in df_inf["phase_temp"].unique():
            # Keep only rows in this phase.
            phase_df = df_inf[df_inf["phase_temp"] == phase]

            # Flatten all token lists into one long list of tokens.
            all_tokens = [
                t
                for toks in phase_df["tokens"]
                if isinstance(toks, (list, tuple))   # skip bad entries
                for t in toks
            ]

            # Count token frequencies for this phase.
            counts = Counter(all_tokens)
            print(f"\nPhase: {phase}")
            print(counts.most_common(15))  # top 15 tokens



Top tokens per phase:
No 'tokens' column found; skipping token frequency by phase.


In [10]:
if not df_inf.empty:
    # If clean_text isn't present, create a simple cleaned version of raw_text.
    if "clean_text" not in df_inf.columns:
        def clean_text_basic(text: str) -> str:
            if not isinstance(text, str):
                return ""
            # Remove URLs.
            text = re.sub(r"http\S+|www\.\S+", " ", text)
            # Lowercase.
            text = text.lower()
            # Remove punctuation.
            text = text.translate(str.maketrans("", "", string.punctuation))
            # Collapse multiple spaces.
            text = re.sub(r"\s+", " ", text).strip()
            return text

        df_inf["clean_text"] = df_inf["raw_text"].apply(clean_text_basic)

    # Initialize VADER sentiment analyzer (gives Â± sentiment scores).
    sia = SentimentIntensityAnalyzer()

    def vader_compound(text: str) -> float:
        # Return compound score in [-1, 1]; 0 for empty/non-string.
        if not isinstance(text, str) or not text.strip():
            return 0.0
        return sia.polarity_scores(text)["compound"]

    # Compute a sentiment score for each row based on clean_text.
    df_inf["sentiment"] = df_inf["clean_text"].apply(vader_compound)

    # Average sentiment for each phase.
    sent_by_phase = df_inf.groupby("phase_temp")["sentiment"].mean().sort_index()
    print("\nAverage sentiment by phase:")
    print(sent_by_phase)

    # Average sentiment by month (using month period of created_utc).
    sent_by_month = (
        df_inf
        .groupby(df_inf["created_utc"].dt.to_period("M"))["sentiment"]
        .mean()
        .rename_axis(index="month")
    )
    print("\nAverage sentiment by month (first 10 rows):")
    print(sent_by_month.head(10))



Average sentiment by phase:
phase_temp
early    0.371401
Name: sentiment, dtype: float64

Average sentiment by month (first 10 rows):
month
1970-01    0.371401
Freq: M, Name: sentiment, dtype: float64


  .groupby(df_inf["created_utc"].dt.to_period("M"))["sentiment"]


In [None]:
# Optional: Save updated TEST file with new columns 

# Simpler name for the processed test data
out_path = cleaned_dir / "reddit_test_with_phase.parquet"

df.to_parquet(out_path, engine="pyarrow", index=False)

print("\nSaved updated TEST data with phase/is_inflation to:", out_path)


In [11]:
## Debug

print(df_inf["created_utc"].head(10))
print(df_inf["created_utc"].dtype)
print(df_inf["created_utc"].dt.year.value_counts().sort_index())

0   1970-01-01 00:00:01.643673217+00:00
1   1970-01-01 00:00:01.643672497+00:00
2   1970-01-01 00:00:01.643671641+00:00
3   1970-01-01 00:00:01.643671106+00:00
4   1970-01-01 00:00:01.643668802+00:00
5   1970-01-01 00:00:01.643668324+00:00
6   1970-01-01 00:00:01.643667109+00:00
7   1970-01-01 00:00:01.643665886+00:00
8   1970-01-01 00:00:01.643663882+00:00
9   1970-01-01 00:00:01.643662973+00:00
Name: created_utc, dtype: datetime64[ns, UTC]
datetime64[ns, UTC]
created_utc
1970    10754
Name: count, dtype: int64
