In [20]:
# Script 1: Reddit Scraper
# Purpose: Collect recent posts from target subreddits using keyword filters

import praw
import pandas as pd
from datetime import datetime, timedelta

# Initialize Reddit API client
reddit = praw.Reddit(
    client_id="WOQrgaEZpGvQZ3LHBW1ITA",
    client_secret="QJvMm5UUX7-WkLsHSNwSiy_fYryQLw",
    user_agent="ChicagoResearchBot/0.1 by u/kateleext"
)

# Parameters
subreddits = ["chicago", "AskChicago"]
timeframe = datetime.utcnow() - timedelta(days=120)
keywords = ["blackhawks"]
# Data collection
posts = []
for sub in subreddits:
    for keyword in keywords:
        for submission in reddit.subreddit(sub).search(keyword, sort="new", limit=100):
                posts.append({
                    "subreddit": sub,
                    "title": submission.title,
                    "text": submission.selftext,
                    "created_utc": datetime.utcfromtimestamp(submission.created_utc),
                    "score": submission.score,
                    "num_comments": submission.num_comments,
                    "url": submission.url
                })

# Save to CSV
df = pd.DataFrame(posts)
df.to_csv("reddit_chicago_hockey_posts.csv", index=False)

print(f"Saved {len(df)} posts to CSV.")

  timeframe = datetime.utcnow() - timedelta(days=120)
  "created_utc": datetime.utcfromtimestamp(submission.created_utc),


Saved 131 posts to CSV.


In [6]:
# Script 2: GPT-Based Situational Driver Tagging
# Purpose: Use OpenAI's structured output feature to classify Reddit posts by situational drivers and sentiment, but ONLY if the post includes a decision to attend a hockey game. Also includes engagement-level, context-type, and event-stage tagging.

import pandas as pd
import json
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(api_key="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")


# Load the Reddit posts
df = pd.read_csv("reddit_chicago_hockey_posts.csv")

# First, let's look at the dataframe columns to confirm
print("Available columns:", df.columns.tolist())

# Define the JSON schema for the response
drivers_schema = {
    "type": "object",
    "properties": {
        "include": {"type": "boolean"},
        "decision_summary": {"type": "string"},
        "factors_summary": {"type": "string"},
        "behavioral_insight": {"type": "string"},
        "evidence_quote": {"type": "string"},
        "drivers": {
            "type": "array",
            "items": {"type": "string"}
        },
        "sentiment": {
            "type": "string",
            "enum": ["Positive", "Neutral", "Negative"]
        },
        "fan_engagement_level": {
            "type": "string",
            "enum": ["Observer", "Casual Fan", "Participant", "Deep Fan"]
        },
        "hockey_context_type": {
            "type": "string",
            "enum": ["Professional", "Junior", "Recreational", "Pickup", "Watch Party", "Other"]
        },
        "event_stage": {
            "type": "string",
            "enum": ["Considering", "Planning", "Attending", "Reflecting"]
        }
    },
    "required": ["include", "decision_summary", "factors_summary", "behavioral_insight", "evidence_quote", "drivers", "sentiment", "fan_engagement_level", "hockey_context_type", "event_stage"],
    "additionalProperties": False
}

# Define the driver options and behavioral instruction
system_message = """You are an assistant trained to analyze Reddit posts and identify situational drivers behind a user's decision to attend a hockey game.

Assume the post is related to attending a hockey game if:
- The user says they are going to a game (past or future)
- They mention a game as part of an itinerary
- They are asking about game logistics (tickets, travel, parking, etc.)
- They are reflecting on attending a game
- They describe buying tickets, watching at the stadium, or planning to go

Only set \"include\": false if there is absolutely no indication the user attended or planned to attend a hockey game — such as general fandom, watching on TV, or abstract mentions of the team.

If include = true:
- Provide a detailed `decision_summary` and `factors_summary`
- In `factors_summary`, go beyond surface-level description. Explain *why* the user is motivated in this situation. Infer any emotional, social, or contextual pressures influencing the choice (e.g. hosting visitors, rare opportunity, personal milestone, weather, etc).
- Infer a concise `behavioral_insight` describing the moment of decision-making, emotional state, or conversion trigger
- Provide a direct `evidence_quote` from the post that supports your classification

### Situational Driver Options:
1. Social Bonding – Driven by desire to connect with others, peer planning, group activity.
2. Novelty/FOMO – Seeking a new experience or avoiding missing out on something different.
3. Convenience – Chosen due to location, ease, or timing.
4. Cost/Value Sensitivity – Influenced by price, discounts, or perceived affordability.
5. Affective State – Motivated by emotions such as boredom, stress relief, or celebration.
6. Weather-Driven – Decision influenced by weather conditions.
7. Out-of-Character Behavior – Not typical for the individual, explicitly or implicitly noted.
8. Peer Influence – Decision made due to persuasion or invitation from others.
9. Spontaneity – No prior planning; made on a whim or last-minute.
10. External Stimulus – Influenced by an ad, post, event listing, or recommendation.
"""

# Function to classify a single post
def classify_post(title, text):
    try:
        full_content = f"Title: {title}\n\nContent: {text}"

        input_messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": full_content}
        ]

        response = client.responses.create(
            model="gpt-4o-2024-08-06",
            input=input_messages,
            text={
                "format": {
                    "type": "json_schema",
                    "name": "reddit_post_analysis",
                    "schema": drivers_schema,
                    "strict": False
                }
            }
        )

        result = json.loads(response.output_text)
        return result if result.get("include") else None

    except Exception as e:
        print(f"Error processing post: {e}")
        return None

Available columns: ['subreddit', 'title', 'text', 'created_utc', 'score', 'num_comments', 'url']


In [9]:
# Process all posts and store results
all_results = []
print("Processing all posts...")
for index, row in df.iterrows():
    print(f"Processing post {index}/{len(df)}...")
    result = classify_post(row['title'], row['text'])
    if result:
        all_results.append(result)

# Create DataFrame from results and export to CSV
if all_results:
    output_df = pd.DataFrame(all_results)
    
    # Export to CSV
    output_df.to_csv('hockey_post_analysis.csv', index=False)
    print("\nAnalysis complete!")
    print(f"Processed {len(all_results)} qualifying posts")
    print("Results exported to hockey_post_analysis.csv")
else:
    print("No qualifying posts found in the dataset.")



Processing all posts...
Processing post 0/131...
Processing post 1/131...
Processing post 2/131...
Processing post 3/131...
Processing post 4/131...
Processing post 5/131...
Processing post 6/131...
Processing post 7/131...
Processing post 8/131...
Processing post 9/131...
Processing post 10/131...
Processing post 11/131...
Processing post 12/131...
Processing post 13/131...
Processing post 14/131...
Processing post 15/131...
Processing post 16/131...
Processing post 17/131...
Processing post 18/131...
Processing post 19/131...
Processing post 20/131...
Processing post 21/131...
Processing post 22/131...
Processing post 23/131...
Processing post 24/131...
Processing post 25/131...
Processing post 26/131...
Processing post 27/131...
Processing post 28/131...
Processing post 29/131...
Processing post 30/131...
Processing post 31/131...
Processing post 32/131...
Processing post 33/131...
Processing post 34/131...
Processing post 35/131...
Processing post 36/131...
Processing post 37/131..