# Get comments from Reddit

Imports

In [None]:
import os

import sys
from datetime import datetime as dt  # imports datetime to deal with dates

import pandas as pd  # imports pandas for data manipulation
import praw  # imports praw for reddit access
from dotenv import load_dotenv  # get login secrets
from openai import OpenAI
from praw import Reddit
from praw.models import MoreComments, Comment, Subreddit
from tqdm import tqdm  # show progress bars

load_dotenv()  # gets secrets

Helper function to get all comments from a post

In [None]:
def get_all_comments(comments_list: list[Comment], limit: int | None = None) -> list[Comment]:
    """
    Gets all comments from a list, even if there are MoreComments
    :param comments_list: The list of comments; it may also contain MoreComments
        placeholders, which are expanded recursively
    :param limit: Optional - the limit of MoreComments to read, or None for no limit.
        The count starts at zero in every (recursive) call, so the limit applies
        to each list separately.
    :return: A list of all comments, with only Comments
    """
    all_comments: list[Comment] = []
    more_comments_count: int = 0

    for item in comments_list:
        if not isinstance(item, MoreComments):
            # Regular Comment object
            all_comments.append(item)
            continue

        if limit is not None and more_comments_count >= limit:
            # The expansion budget is used up: skip this placeholder only.
            # Regular comments later in the list must still be collected, so
            # the loop keeps going instead of stopping here.
            continue

        try:
            # Get comments from MoreComments object
            new_comments: list[Comment] = item.comments()

            # Recursively process in case there are nested MoreComments
            all_comments.extend(get_all_comments(new_comments, limit))

            # Only successful expansions count against the limit
            more_comments_count += 1
        except Exception as e:
            # Best effort: one failed expansion should not lose the comments
            # that were already collected
            print(f"Error expanding MoreComments: {e}")

    return all_comments


# log into reddit api
# the username is read once so the login call, the check and the log message
# all use the same value
reddit_username: str | None = os.getenv("REDDIT_USERNAME")
reddit: Reddit = praw.Reddit(client_id=os.getenv("CLIENT_ID"),
                     client_secret=os.getenv("CLIENT_SECRET"),
                     user_agent='data collector for u/Delicious-Corner6100',
                     username=reddit_username,
                     password=os.getenv("REDDIT_PASSWORD"))

# make sure that we are logged in correctly
if reddit.user.me() != reddit_username:
    print("Failed to log in to Reddit :(")
    sys.exit(1)
# no nested double quotes inside the f-string: that form only parses on
# Python 3.12 and newer
print(f"Logged in to Reddit as {reddit_username}")

# create a DataFrame to store posts
# posted time is a datetime, comments is a list, everything else is a string
posts: pd.DataFrame = pd.DataFrame(columns=["Posted Time",
                              "Title",
                              "Author",
                              "Link",
                              "Content",
                              "Comments"])

# list subreddits to search
# set subreddits = ["all"] to search all of reddit
# skipcq: FLK-E501
# subreddits = ['cybersecurity', 'technology', 'k12sysadmin', 'toronto', 'canada', 'askTO', 'raleigh', 'linustechtips']
subreddits: list[str] = ["all"]

# 12/28/2024 (00:00 UTC), the date of the breach, in epoch time
# set once here because it is the same for every post
breach_time: int = 1735344000

# look through every subreddit
for subreddit_name in subreddits:
    # get the subreddit via api
    subreddit: Subreddit = reddit.subreddit(subreddit_name)
    # get the messages that meet a query (the same as the search bar)
    # sorts by relevance and gets only messages from the most recent year
    # change limit to the most appropriate limit
    # tqdm shows progress bar
    for post in tqdm(subreddit.search("powerschool data breach",
                                      sort="relevance",
                                      time_filter="year",
                                      limit=50),
                     desc=f"r/{subreddit} progress"):
        # check the time of the post
        if post.created_utc <= breach_time:
            # do not process posts created at or before the breach time
            continue
        # fromtimestamp without a tz argument gives a naive datetime in the
        # local timezone of the machine running this
        posted_time = dt.fromtimestamp(post.created_utc)

        # gets the link so we can review the post
        # skipcq FLK-E501
        link: str = f"https://www.reddit.com/r/{post.subreddit.display_name}/comments/{post.id}/"

        # the author is None when the account has been deleted, so fall back
        # to a placeholder instead of crashing on .name
        author_name: str = post.author.name if post.author is not None else "[deleted]"

        # collects the data necessary
        # replace newlines with spaces because they are easier to work with
        data: list = [posted_time,
                      post.title,
                      author_name,
                      link,
                      post.selftext.replace("\n", " "),
                      post.comments.list()]
        # adds the data to the dataframe
        posts.loc[len(posts)] = data

# Resolve every MoreComments placeholder up front, one expanded list per post
expanded_comments_lists = [
    get_all_comments(raw_comments)
    for raw_comments in tqdm(posts["Comments"], desc="Expanding MoreComments")
]

# The post with the most comments decides how many CommentN columns are needed
num_comments: int = max(map(len, expanded_comments_lists), default=0)

# One column per comment position: Comment1 .. Comment<num_comments>
comments: pd.DataFrame = pd.DataFrame(
    columns=[f"Comment{i}" for i in range(1, num_comments + 1)]
)

# Turn each expanded list into one row of comment text
for post_comments in tqdm(expanded_comments_lists, desc="Processing comments"):
    # Newlines become spaces so each comment stays on one line
    row: list[str | None] = [
        single_comment.body.replace("\n", " ") for single_comment in post_comments
    ]

    # Pad shorter posts with None so every row has num_comments entries
    row += [None] * (num_comments - len(row))

    # Append the row at the next free index
    comments.loc[len(comments)] = row

# Put the comment columns next to the post columns, then drop the raw comment
# objects because their text now lives in the CommentN columns
res: pd.DataFrame = pd.concat([posts, comments], axis=1).drop("Comments", axis=1)

# write the final dataframe to a csv
res.to_csv("posts.csv")

## Ollama AI

In [None]:
import requests

def ollama_ai(prompt: str, model: str = "gemma3:12b", *,
              url: str = "http://jacobs-ubuntu:11434/api/generate",
              timeout: float | None = 300.0) -> str | None:
    """
    Makes a request to an ollama server to run an AI query.
    :param prompt: String - the query for the AI.
    :param model: String - the model for ollama to run. Default is gemma3:12b.
    :param url: String - the generate endpoint of the ollama server.
    :param timeout: Seconds to wait for the server before giving up, or None to
        wait forever. Without a timeout an unreachable server blocks the caller
        indefinitely.
    :return: The text in the "response" field of the server's JSON reply, or
        None if the request failed or the reply had no such field.
    """
    req_data: dict[str, str | bool] = {
        "model": model,
        "prompt": prompt,
        # ask for a single JSON reply instead of a stream of chunks
        "stream": False
    }

    try:
        # makes a request to the ollama server
        response = requests.post(url, json=req_data, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        # .get() so a reply without a "response" field gives None, not KeyError
        return response.json().get("response")
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON
        print(f"Error making request: {e}")
        return None

## Google AI

In [None]:
import requests
from time import sleep

from google import genai  # google ai package
from google.ai.generativelanguage_v1 import GenerateContentResponse
from google.api_core import exceptions
from google.genai import Client

def google_ai(prompt: str, api_key: str, model: str = "gemma3") -> str | None:
    """
    Calls the Google/Gemini api to get their AI's answers
    :param prompt: The prompt to ask the AI
    :param api_key: Google api key
    :param model: Which AI model to ask - default is gemma3
    :return: String response of the AI, or None when the API quota was hit
        (the function pauses for 15 seconds before returning in that case;
        it does not retry, so the caller decides whether to call again)
    """
    # local import: the google.genai client reports HTTP errors through its
    # own error classes
    from google.genai import errors as genai_errors

    # get google gen ai client object
    client: Client = genai.Client(api_key=api_key)

    try:
        # get response from api and return it
        response = client.models.generate_content(
            model=model,
            contents=prompt,
        )

        return response.text
    except exceptions.ResourceExhausted as e:
        rate_limit_error: Exception = e
    except genai_errors.APIError as e:
        # 429 is the quota / rate limit status; anything else is a real error
        # and propagates exactly as it did before
        if e.code != 429:
            raise
        rate_limit_error = e

    print(f"{rate_limit_error}\nAPI limit reached. Taking a break for a few seconds...")
    sleep(15)
    return None

## OpenAI API

In [None]:
def openai_ai(prompt: str, api_key: str | None = None, model: str = "o4-mini", instructions: str = "") -> str | None:
    """
    Calls the OpenAI api to get their AI's response.
    :param prompt: The prompt to ask the AI.
    :param api_key: The OpenAI API key. When omitted (None), the value of the
        OPENAI_API_KEY environment variable is used.
    :param model: The model to use. Defaults to o4-mini
    :param instructions: The instructions that the model will receive.
    :return: String response from the AI, or None if anything went wrong.
    """
    if api_key is None:
        # fall back to the environment so callers do not have to pass the key
        api_key = os.getenv("OPENAI_API_KEY")

    try:
        client: OpenAI = OpenAI(
            api_key=api_key
        )

        response = client.responses.create(
            model=model,
            instructions=instructions,
            input=prompt
        )

        return response.output_text
    except Exception as e:
        # deliberate best effort: callers run this in long loops and treat
        # None as "no answer for this prompt"
        print(f"An error occurred: {e}")
        return None

## Labeling

In [None]:
# Label every quote in roles.csv with the fixed label set below
posts: pd.DataFrame = pd.read_csv("roles.csv")

# all quotes, in file order
quotes: list[str] = posts["quote"].tolist()

# Earlier open-ended prompt (the model could invent labels), kept for reference:
#ai_response: str | None = openai_ai(
#    prompt="Below is a quote about the PowerSchool data breach. "
#           "Label the data based on what was included. "
#           "For example, if the author of the message complained about the delay in notification, you can label the message as 'lack of communication'. "
#           "Additionally, if the author of the message worried about their children's data, you can label the message as 'worried about child data'. "
#           "You can label messages with as many labels as necessary and you may generate as many labels as you need. "
#           f"Message: {quote}",
#    api_key=os.getenv("OPENAI_API_KEY")
#)

# Fixed part of the prompt: the task description followed by the closed list
# of labels and what each one means. The quote is appended per request.
label_guide: str = (
    "Below is a quote about the PowerSchool data breach. "
    "Label the data based on what was included. Respond with a list of labels that fit the message. "
    "You can select as many labels as necessary, but you may not add labels that are not on this list: "
    "Lack of communication - This label means the author was not notified about the breach before they posted. "
    "Bad communication - This label means the author was notified about the breach before they posted, but they were still unsure about some aspect of the breach. "
    "Worried about data - This label means the author was worried about the impact of this data breach. For example, posters could be worried about identity theft or fraud. "
    "Not worried about data - This label means the author was not concerned about the data breach. For example, posters may assume that their data is already on the dark web, so this breach didn't affect them. "
    "Issues with remediation - This label means the author had problems when trying to protect themselves. For example, they may have encountered issues when signing up for credit monitoring. "
    "Feeling of inevitability - This label means the author felt that they had no other choice but to give data to insecure companies. "
    "Insufficient remedies - This label means the author felt that actions taken after the breach were not enough to fully protect themselves. "
    "Confused - This label means that the author did not know what to do after the data breach to protect themselves. "
    "Surprised - This label means that the author was surprised that PowerSchool was breached. "
    "Lost trust - This label means that the author no longer trusts companies with their data after the breach. "
    "Lack of funding - This label means that the author believes that schools do not have enough money to afford better systems. "
    "Outdated technology - This label means that the author believes that schools' technologies are so old they cause security vulnerabilities. "
    "High stress - This label means that the author believes that administrators and IT professionals do not have the ability to protect their systems due to high workloads. "
    "Lack of accountability - This label means that the author feels like PowerSchool was not adequately punished for the data breach. "
)

# the key does not change between requests, so look it up once
openai_api_key = os.getenv("OPENAI_API_KEY")

# one entry per quote; an entry is None when the request failed
responses: list[str | None] = []
for quote in tqdm(quotes, desc="AI Progress"):
    responses.append(
        openai_ai(prompt=f"{label_guide}Message: {quote}", api_key=openai_api_key)
    )

# store the answers next to their quotes and save
posts["labels"] = responses
posts.to_csv("labels.csv", index=False)

## Role selection

In [None]:
# open file with comments
posts: pd.DataFrame = pd.read_csv("posts.csv")

# find the CommentN columns by name, in file order
# (this also works when the file has no comment columns at all)
comment_columns: list[str] = [name for name in posts.columns
                              if name.startswith("Comment") and name[7:].isdigit()]
# get the number of comments
num_comments: int = len(comment_columns)

column_names: list[str] = comment_columns + ["Content"]

# get all quotes into a list
quotes: list[str] = posts[column_names].values.flatten().tolist()
# remove any empty values or comments that were deleted
quotes = [x for x in quotes if isinstance(x, str) and x != '[deleted]']

# fixed part of the prompt; the quote is appended per request
# NOTE(review): the prompt lists the roles as parent/student/teacher/admin but
# later tells the model to classify some authors as "general" - confirm which
# set of roles is intended before changing the wording.
role_guide: str = (
    "Read the following message about the PowerSchool data breach and determine the role of the person who wrote it. "
    "The author has one of the following roles: parent, student, teacher, admin. "
    "The first word of your response should be the role of the person. "
    "Provide an explanation for why you chose the role for that message. "
    "If you are not more than 50% confident in your classification, respond with unsure. "
    "Parents generally talk about their children, using words such as 'my son', 'my daughter', 'my child(ren)', etc. "
    "They also mention receiving messages from school boards about the data breach. "
    "Students have generally graduated since we can't collect data from people under 18. "
    "They usually mention that they graduated some years ago. "
    "Teachers usually mention that they have experience teaching. "
    "Administrators usually have more technical knowledge, but that is not a guaranteed factor. "
    "They talk with PowerSchool directly and manage a school's or district's powerschool instance. "
    "Note that administrators must be from a K-12 school, so postsecondary admins, admins that worked on tech other than PowerSchool, and PowerSchool employees should be classified as general. "
    "Additionally, simply knowing technical terms does not automatically make them an administrator. "
    "They need to have experience working in a K-12 school's IT department. "
    "If the person does not fit any of the labels, respond with the best fit label that you can think of. "
    "If the message does not relate to the PowerSchool data breach, respond with not relevant. "
    "Message: "
)

# the key does not change between requests, so look it up once
openai_api_key = os.getenv("OPENAI_API_KEY")

results: list[str | None] = []
for quote in tqdm(quotes, desc="AI Progress"):
    # call the openai api to determine role
    result: str | None = openai_ai(
        prompt=f"{role_guide}{quote}",
        api_key=openai_api_key
    )

    # always record the outcome, even a failed (None) or empty one: the two
    # columns below must have the same length, and skipping a quote would
    # make the DataFrame constructor fail after all requests were already made
    results.append(result if result else None)

# put the quote to the role and save as csv
out: pd.DataFrame = pd.DataFrame({
    "quote": quotes,
    "role": results
})

out.to_csv("roles.csv", index=False)

## Clean labels

In [None]:
import re

# Load labels.csv (the file written by the labeling step) for cleaning
df = pd.read_csv("labels.csv")

def clean_and_capitalize(text):
    """
    Cleans a (possibly multi-line) string one line at a time.
    Each line keeps only ASCII letters and whitespace, has runs of whitespace
    collapsed to one space, is stripped, and is then capitalized (first letter
    upper case, the rest lower case).
    :param text: The string to clean. Values that are not strings (for example
        the NaN pandas uses for an empty cell) are returned unchanged.
    :return: The cleaned lines joined with newlines again
    """
    if not isinstance(text, str):
        # missing cells are passed through instead of raising AttributeError
        return text

    cleaned_lines = []
    for line in text.split('\n'):
        # Remove everything except letters and whitespace; this already takes
        # out bullets, dashes and any other punctuation
        letters_only = re.sub(r'[^a-zA-Z\s]', '', line)
        # Collapse multiple spaces, strip, then capitalize first letter
        cleaned_lines.append(re.sub(r'\s+', ' ', letters_only).strip().capitalize())
    return '\n'.join(cleaned_lines)


def normalize_labels(text):
    """
    Normalizes a comma-separated list of labels.
    Each label is stripped of surrounding whitespace, then of surrounding
    double quotes, then of surrounding single quotes, and lower-cased.
    :param text: The comma-separated labels. Values that are not strings (for
        example the NaN pandas uses for an empty cell) are returned unchanged.
    :return: The normalized labels joined with ", "
    """
    if not isinstance(text, str):
        # missing cells are passed through instead of raising AttributeError
        return text

    # Split by commas, clean each label
    parts = [part.strip().strip('"').strip("'").lower() for part in text.split(',')]
    return ', '.join(parts)


# Keep only the first whitespace-separated word of every role answer ...
first_words = df["role"].str.split().str[0]
# ... and pass it through both cleaners, in this order
df["role"] = first_words.apply(clean_and_capitalize).apply(normalize_labels)

# Save the cleaned table without the index column
df.to_csv("labels_cleaned.csv", index=False)

## Format labels

In [None]:
# Load labels_formatted.csv; the grouped result at the end of this cell is
# written back to the same file
df = pd.read_csv("labels_formatted.csv")

def normalize_labels(text):
    """
    Normalizes a comma-separated list of labels.
    Each label is stripped of surrounding whitespace, then of surrounding
    double quotes, then of surrounding single quotes, and lower-cased.
    :param text: The comma-separated labels. Values that are not strings (for
        example the NaN pandas uses for an empty cell) are returned unchanged.
    :return: The normalized labels joined with ", "
    """
    if not isinstance(text, str):
        # missing cells are passed through instead of raising AttributeError
        return text

    # Split by commas, clean each label
    parts = [part.strip().strip('"').strip("'").lower() for part in text.split(',')]
    return ', '.join(parts)

# Normalize every label string first so rows that differ only in case,
# quoting or spacing end up in the same group
normalized_labels = df['Labels'].apply(normalize_labels)
df['Labels'] = normalized_labels

# One row per distinct label string, with the numeric columns summed
df_grouped = (
    df.groupby('Labels', as_index=False)
      .sum(numeric_only=True)
)

# Overwrite the input file with the grouped table
df_grouped.to_csv("labels_formatted.csv", index=False)

## Clean labels (remove non-letter characters)

In [None]:
import pandas as pd
import re

# Load the CSV file
df = pd.read_csv("labels.csv")  # the file written by the labeling step

# Choose the column to clean
column_name = "labels"  # the column the labeling step stores the AI answers in

# Helper applied to every value of the chosen column
def clean_text(text):
    """
    Removes every character that is not an ASCII letter, a comma, a newline
    or a space.
    :param text: The value to clean; it is converted with str() first
    :return: The cleaned string, or the value itself when pd.isna reports it
        as missing
    """
    if not pd.isna(text):
        return re.sub(r'[^a-zA-Z,\n ]', '', str(text))
    return text

# Clean the chosen column value by value
cleaned_column = df[column_name].map(clean_text)
df[column_name] = cleaned_column

# Write the result to its own file, without the index column
df.to_csv("cleaned_file.csv", index=False)

## Co-occurrence Heatmap Generation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter, defaultdict

# Input: the cleaned "labels" column; each cell holds labels separated by
# newlines and/or commas (possibly with extra whitespace)
data = pd.read_csv("cleaned_file.csv")["labels"]

cleaned_entries = []
for entry in data:
    if pd.isna(entry):
        continue
    # Turn commas into newlines so one split handles both separators
    pieces = (piece.strip() for piece in entry.replace(',', '\n').split('\n'))
    labels = [piece for piece in pieces if piece]
    # An entry with a single label has nothing to co-occur with
    if len(labels) > 1:
        cleaned_entries.append(labels)

# Count co-occurrences
co_counts = Counter()
all_labels = set()

for labels in cleaned_entries:
    # de-duplicate within the entry; sorting gives a consistent pair order
    unique_labels = sorted(set(labels))
    all_labels.update(unique_labels)
    for first, second in combinations(unique_labels, 2):
        # count both orderings so the matrix is symmetric
        co_counts[first, second] += 1
        co_counts[second, first] += 1

# Square matrix of zeros over the sorted labels, then fill in the counts
all_labels = sorted(all_labels)
co_matrix = pd.DataFrame(0, index=all_labels, columns=all_labels)

for (row_label, column_label), count in co_counts.items():
    co_matrix.at[row_label, column_label] = count

# Draw the matrix as an annotated heatmap, save it as a PNG and show it
heatmap_options = {
    "annot": True,                          # write the count in every cell
    "fmt": 'd',                             # counts are integers
    "cmap": 'YlGnBu',
    "square": True,
    "cbar_kws": {'label': 'Co-occurrence'},
}

plt.figure(figsize=(10, 8))
sns.heatmap(co_matrix, **heatmap_options)
plt.title("Label Co-occurrence Heatmap")
# slanted x labels so long label names stay readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig("cooccurrence_heatmap.png", dpi=300)
plt.show()
