# Data Analysis of Reddit SuicideWatch Posts

In [6]:
import pandas as pd
import os
from google import genai
from dotenv import load_dotenv
# Load environment variables via python-dotenv so that later cells can read
# settings (such as the Gemini API key) from os.environ.
load_dotenv()

True

In [7]:
# Read the Reddit posts CSV from the working directory into a DataFrame
loaded_df = pd.read_csv(filepath_or_buffer='reddit_suicidewatch.csv')

# Preview the top five rows (the default for head) as a quick sanity check.
# NOTE: the preview shows raw post text and author identifiers; clear the
# cell output before sharing the notebook.
loaded_df.head(n=5)

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,author_patreon_flair,author_flair_text_color,permalink,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
0,,SuicideWatch,I reached a really low point today. I am strug...,t2_xs124w9j5,False,,0,False,Help?,[],...,False,,/r/SuicideWatch/comments/1jdy3rk/help/,False,https://www.reddit.com/r/SuicideWatch/comments...,519431,1742277000.0,0,,False
1,,SuicideWatch,"Everyone wants to pathologize, moralize, and r...",t2_pfkg1yd73,False,,0,False,"There's no real help for people, because peopl...",[],...,False,dark,/r/SuicideWatch/comments/1jdy0mk/theres_no_rea...,False,https://www.reddit.com/r/SuicideWatch/comments...,519431,1742276000.0,0,,False
2,,SuicideWatch,hi everyone. i’ve tried to be GOOD for as long...,t2_1cyguorgnl,False,,0,False,i want to end things because i don’t like who ...,[],...,False,,/r/SuicideWatch/comments/1jdy040/i_want_to_end...,False,https://www.reddit.com/r/SuicideWatch/comments...,519431,1742276000.0,0,,False
3,,SuicideWatch,I don't want to wake up and be me anymore. Mor...,t2_1ld9rc3s1l,False,,0,False,I don't think I can continue with who I am.,[],...,False,,/r/SuicideWatch/comments/1jdxyyu/i_dont_think_...,False,https://www.reddit.com/r/SuicideWatch/comments...,519431,1742276000.0,0,,False
4,,SuicideWatch,I've attempted suicide in the past and i found...,t2_1lfbomi4g8,False,,0,False,"I'm not afraid to die, but i'm only here becau...",[],...,False,,/r/SuicideWatch/comments/1jdxylv/im_not_afraid...,False,https://www.reddit.com/r/SuicideWatch/comments...,519431,1742276000.0,0,,False


In [8]:
# Report how many rows (posts) were loaded
row_count = loaded_df.shape[0]
print(f"Length of Dataset: {row_count}")

Length of Dataset: 100


In [28]:
# Build the attribute-classification prompt and persist it to prompt.txt.
# The text is assembled in memory first so the file is written in one call,
# and the encoding is pinned so the result does not depend on the platform's
# default locale encoding. The file contents are unchanged.
prompt_lines = [
    "I have a dataset for which contains suicidal posts on Reddit.",
    "Return a list in the format: ",
    "Sensitive Attributes: ",
    "1. attribute1",
    "Insensitive Attributes: ",
    "1. attribute2",
    "",
    "Check whichever attribute from a dataset as listed below are sensitive attributes:",
    # One dataset column name per line, in DataFrame column order
    *(str(column) for column in loaded_df.columns),
]
with open('prompt.txt', 'w', encoding='utf-8') as file:
    # Every line, including the last one, is newline-terminated
    file.write("\n".join(prompt_lines) + "\n")
print("Prompt file created")

Prompt file created


In [29]:
# Send the saved prompt to Gemini and print the model's reply.

# Fail fast with a clear message when the key is absent, instead of silently
# handing api_key=None to the client.
api_key = os.environ.get('GOOGLE_GEMINI_API_KEY')
if not api_key:
    raise RuntimeError(
        "GOOGLE_GEMINI_API_KEY is not set; add it to the environment or the .env file."
    )
client = genai.Client(api_key=api_key)

# Read the prompt and close the file before the network call so the handle
# is not held open while waiting on the API.
with open('prompt.txt', 'r', encoding='utf-8') as file:
    prompt = file.read()

response = client.models.generate_content(
    model="gemini-2.0-flash", contents=prompt
)
print(response.text)

Here's an analysis of the attributes and a classification into sensitive and insensitive categories:

**Sensitive Attributes:**

*   **selftext:** The actual text of the Reddit post. This is highly sensitive as it contains the user's potentially suicidal thoughts, feelings, and intentions.
*   **title:** The title of the Reddit post can also contain sensitive information, especially if it directly expresses suicidal ideation or distress.
*   **author:** The username of the person who posted the content. It can lead to identification of the user.
*   **author_fullname:** Reddit's unique identifier for the author.
*   **subreddit:** While the subreddit itself might not always be sensitive, knowing that a user posted in a suicide-related subreddit is highly sensitive information.
*   **num_comments:** High comment count may reveal the post being related to a user who has many suicide related problems.

**Insensitive Attributes:**

*   **approved_at_utc:** The UTC timestamp of when the pos

In [9]:
# Columns treated as sensitive, transcribed from the model's reply.
# The order here determines the column order of the sensitive frame.
sensitive_attributes = [
    # free-text content written by the poster
    "selftext", "title",
    # identifiers that can point back to the poster
    "author", "author_fullname",
    # context the model also flagged as sensitive
    "subreddit", "num_comments",
]

In [10]:
# Split the dataset column-wise into two independent frames.
# Sensitive side: the listed columns, in list order; a missing label raises KeyError.
df_sensitive_attribute = loaded_df.loc[:, sensitive_attributes].copy(deep=True)
# Non-sensitive side: every remaining column, in original order.
df_non_sensitive_attribute = loaded_df.drop(labels=sensitive_attributes, axis="columns").copy(deep=True)

In [14]:
# Sanity-check the column split, then report what landed on each side.
sensitive_cols = df_sensitive_attribute.columns.tolist()
non_sensitive_cols = df_non_sensitive_attribute.columns.tolist()
total_cols = loaded_df.shape[1]

# Count-based invariants: one sensitive column per listed attribute, the
# remainder on the other side, and both sides together cover the original width.
assert len(sensitive_cols) == len(sensitive_attributes)
assert len(non_sensitive_cols) == len(loaded_df.columns) - len(sensitive_attributes)
assert (len(sensitive_cols) + len(non_sensitive_cols)) == len(loaded_df.columns) == total_cols

print("Sensitive Attributes Columns:", sensitive_cols)
print("Number of Sensitive Attributes Columns:", len(sensitive_cols))
print("Non-Sensitive Attributes Columns:", non_sensitive_cols)
print("Number of Non-Sensitive Attributes Columns:", len(non_sensitive_cols))

Sensitive Attributes Columns: ['selftext', 'title', 'author', 'author_fullname', 'subreddit', 'num_comments']
Number of Sensitive Attributes Columns: 6
Non-Sensitive Attributes Columns: ['approved_at_utc', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'upvote_ratio', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'removed_by_category', 

In [15]:
# Persist each half of the split to its own CSV, leaving out the row index
csv_outputs = {
    'reddit_suicidewatch_sensitive_attribute.csv': df_sensitive_attribute,
    'reddit_suicidewatch_non_sensitive_attribute.csv': df_non_sensitive_attribute,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)
print("Saved to CSV")

Saved to CSV
