In [125]:
import pandas as pd

# read csv file
df = pd.read_csv('data/combined_dataset_with_comments_cleaned.csv')

**Step 1: Post & comments lenth and comments score**

In [126]:
# Let's assume 'df' is your cleaned DataFrame with both posts and comments
# First, separate them if you haven't already
comments_df = df[df['type'] == 'comment'].copy()
posts_df = df[df['type'] == 'post'].copy()

# Define filtering conditions for COMMENTS
min_comment_length = 50  # Minimum number of characters to avoid short/"RIP" comments
max_comment_length = 500  # Maximum number of characters to avoid overly long comments
min_comment_score = 10     # Minimum upvote score

# Filter for substantive, upvoted comments
valuable_comments_filter = (
    (comments_df['cleaned_text'].str.len().between(min_comment_length, max_comment_length)) &
    (comments_df['score'] >= min_comment_score)
)

# Apply the filter
valuable_comments_df = comments_df[valuable_comments_filter].copy()

print(f"Original comments: {len(comments_df)}")
print(f"Valuable comments: {len(valuable_comments_df)}")
print(f"Filtered out {len(comments_df) - len(valuable_comments_df)} low-quality comments.")

Original comments: 366684
Valuable comments: 23263
Filtered out 343421 low-quality comments.


**Step 2: Empathic tone sentiment analysis with Textblob**

In [127]:
# Install textblob in your environment if you haven't
!pip install textblob



In [128]:
from textblob import TextBlob

# Define a function to get sentiment polarity
def get_sentiment(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return 0  # Neutral if there's an error

# Calculate sentiment for each valuable comment
# This might take a minute for large datasets
print("Calculating sentiment for valuable comments...")
valuable_comments_df['sentiment'] = valuable_comments_df['cleaned_text'].apply(get_sentiment)

# Now, filter for comments with positive sentiment (empathy, support)
empathic_comments_df = valuable_comments_df[valuable_comments_df['sentiment'] > 0.3].copy()

# Sort by the most upvoted AND most positive to see the best examples
most_empathic_comments = empathic_comments_df.sort_values(['score', 'sentiment'], ascending=[False, False])

print(f"\nEmpathic and valuable comments: {len(empathic_comments_df)}")
print("\nTop 5 most upvoted empathic comments:")
for _, row in most_empathic_comments.head(5).iterrows():
    print(f"[Score: {row['score']}, Sentiment: {row['sentiment']:.2f}]: {row['cleaned_text'][:200]}...")

Calculating sentiment for valuable comments...

Empathic and valuable comments: 7004

Top 5 most upvoted empathic comments:
[Score: 1443, Sentiment: 0.30]: sounds like you the vet did all they could. they really do go downhill fast. i experienced something similar in january when i lost my year old boy. i hope you're ok i send hugs...
[Score: 1432, Sentiment: 0.45]: please tell me your dad does not have access to this cat. i honestly fear what he might be willing to do while you're away to better your life....
[Score: 1280, Sentiment: 1.00]: im not exaggerating when i say that i would die for antonio. what an absolutely perfect gentleman!...
[Score: 1273, Sentiment: 0.42]: you already have sweetheart. i've been rescuing dogs for yrs. he can feel your love. and he won't be alone at the end. that's a beautiful legacy for a dog to have a proud, loving owner until the end. ...
[Score: 1193, Sentiment: 0.49]: i think from penelopes perspective it was the best way to die. no stress from goin

In [129]:
# Get the unique 'post_id's from our filtered, empathic comments
# These are the posts that sparked high-quality discussion
valuable_post_ids = empathic_comments_df['post_id'].unique()

# Filter the original posts DataFrame to only include these posts
valuable_posts_df = posts_df[posts_df['post_id'].isin(valuable_post_ids)].copy()

print(f"Original posts: {len(posts_df)}")
print(f"Posts that received valuable comments: {len(valuable_posts_df)}")

Original posts: 2687
Posts that received valuable comments: 1300


In [130]:
# Merge the valuable posts with the empathic comments on 'post_id'
# This creates a DataFrame where each row is a (post + comment) pair
empathic_data = pd.merge(
    valuable_posts_df[['post_id', 'cleaned_text', 'title']], # Data from the post
    empathic_comments_df[['post_id', 'cleaned_text', 'score', 'sentiment']], # Data from the comment
    on='post_id',
    how='inner',
    suffixes=('_post', '_comment') # This clarifies which 'cleaned_text' is which
)

# Rename columns for absolute clarity
empathic_data.rename(columns={
    'cleaned_text_post': 'grieving_post',
    'cleaned_text_comment': 'supportive_response',
    'score': 'response_score',
    'sentiment': 'response_sentiment'
}, inplace=True)

# Display the result
print(f"Created {len(empathic_data)} high-quality (post -> response) pairs!")
empathic_data.tail(3)

Created 3627 high-quality (post -> response) pairs!


Unnamed: 0,post_id,grieving_post,title,supportive_response,response_score,response_sentiment
3624,1bkf0zz,my first post on reddit. please let me know if...,Very old feral was euthanized. Heartbreak is i...,you are wonderful person. you did what you cou...,15,0.564286
3625,1bkf0zz,my first post on reddit. please let me know if...,Very old feral was euthanized. Heartbreak is i...,you were able to give him the gift of an easy ...,12,0.45873
3626,yk704f,i am newer to reddit and wasn't sure where to ...,feeling guilty over euthanasia,thank you! that's a really good idea to change...,88,0.45


**Step 3: Self-Reflection Filter**

In [131]:
# Create a function to detect self-referential comments.

import re

def is_self_reflective(text, threshold=2):
    """
    Checks if a comment is overly focused on the commenter themselves.
    Returns True if the number of first-person pronouns exceeds the threshold.
    """
    if not isinstance(text, str):
        return False
        
    # Define patterns to look for (using regex for word boundaries)
    first_person_patterns = r'\b(I|me|my|mine|we|us|our|ours)\b'
    
    # Find all matches
    matches = re.findall(first_person_patterns, text, flags=re.IGNORECASE)
    
    # Return True if the number of matches exceeds the threshold
    return len(matches) > threshold

# Test the function
test_comment = "I'm so sorry. I had to put my dog down last year and I know exactly how you feel. It was the hardest thing I ever did."
print(is_self_reflective(test_comment, threshold=2)) # Output: True (It has 5 first-person references)

test_comment2 = "I'm so sorry you're going through this. It's the hardest thing. Please be kind to yourself right now."
print(is_self_reflective(test_comment2, threshold=2)) # Output: False (It has only 1)

True
False


In [132]:
# Apply the filter to your DataFrame of empathic comments
# We want to KEEP comments that are NOT self-reflective
focus_on_op_df = empathic_data[~empathic_data['supportive_response'].apply(is_self_reflective)]

print(f"Valuable, empathic comments: {len(empathic_data)}")
print(f"Comments focused on the OP (not the commenter): {len(focus_on_op_df)}")
print(f"Filtered out {len(empathic_data) - len(focus_on_op_df)} self-reflective comments.")

Valuable, empathic comments: 3627
Comments focused on the OP (not the commenter): 2748
Filtered out 879 self-reflective comments.


In [133]:
focus_on_op_df.tail(3)

Unnamed: 0,post_id,grieving_post,title,supportive_response,response_score,response_sentiment
3624,1bkf0zz,my first post on reddit. please let me know if...,Very old feral was euthanized. Heartbreak is i...,you are wonderful person. you did what you cou...,15,0.564286
3625,1bkf0zz,my first post on reddit. please let me know if...,Very old feral was euthanized. Heartbreak is i...,you were able to give him the gift of an easy ...,12,0.45873
3626,yk704f,i am newer to reddit and wasn't sure where to ...,feeling guilty over euthanasia,thank you! that's a really good idea to change...,88,0.45


**Step 4: Unsolicited advice filter**

In [134]:
# Define phrases that often precede unsolicited advice
advice_phrases = [
    'you should', 'you need to', 'just try to', 'have you tried',
    'what I would do is', 'the best thing is to', 'why don\'t you'
]

# Create a function to detect advice-heavy comments
def is_advice_heavy(text):
    text = text.lower()
    for phrase in advice_phrases:
        if phrase in text:
            return True
    return False

# Apply the filter (we want to KEEP comments that are NOT advice-heavy)
non_advice_df = focus_on_op_df[~focus_on_op_df['supportive_response'].apply(is_advice_heavy)]

In [135]:
non_advice_df.shape

(2720, 6)

**Step 5: Leaving comments with empathy patterns**

In [136]:
# Define positive empathy patterns
empathy_patterns = {
    'validation': [
        r'\b(normal|natural|understandable|okay|valid|makes sense)\b',
        r'\b(of course you|it\'s no wonder|no surprise that|anyone would)\b',
        r'\b(you have every right|you are (not )?alone|(completely|totally) justified)\b',
        r'\b(feel that way|go through this|react that way|expected)\b',
        r'\b(part of the process|part of grieving|part of the journey)\b'
    ],
    'affirmation': [
        r'\b(right thing|best decision|loving choice|brave|strong|courageous)\b',
        r'\b(great pet parent|wonderful owner|amazing friend|did everything you could)\b',
        r'\b(final act of love|selfless act|put them first|gift of peace)\b',
        r'\b(they know you loved|they felt your love|honored their life)\b',
        r'\b(supported them|gave them a great life|fought for them)\b'
    ],
    'shared_humanity': [
        r'\b(we all|many of us|so many of us|anyone who has|everyone feels)\b',
        r'\b(I think most|I believe many|often the case|common experience)\b',
        r'\b(you are not alone|we understand|we\'ve been there|here for you)\b',
        r'\b(this community|in this together|know the pain|share your loss)\b'
    ],
    'feeling_words': [
        r'\b(pain|heartbroken|loss|grieving|miss|love|sad|anguish|hurt)\b',
        r'\b(devastat|mourn|heartache|emptiness|lonely|ache|longing|yearning)\b',
        r'\b(guilt|guilty|regret|what if|if only|should have|could have)\b',
        r'\b(thankful|grateful|treasure|blessed|lucky|joy|happy|smile|celebrate)\b',
        r'\b(peace|peaceful|comfort|healing|hope|better|time|patience|kind)\b'
    ],
    'permission_granting': [
        r'\b(allow yourself|give yourself permission|it\'s alright to)\b',
        r'\b(you can|you deserve to|you need to|be kind to yourself)\b',
        r'\b((it\'s|that\'s) okay to|permissible|acceptable)\b'
    ],
    'present_focus': [
        r'\b(right now|in this moment|today|at this time|for now)\b',
        r'\b(one (day|step) at a time|moment by moment| breathe|just get through)\b'
    ],
    'memory_honoring': [
        r'\b(beautiful memory|wonderful times|remember the love|celebrate their life)\b',
        r'\b(they would (want|thank)|honor them|keep them in your heart)\b',
        r'\b(tell us about|share a story|what was their|what did they love)\b',
        r'\b(paw prints|rainbow bridge|waiting for you|see them again)\b'
    ],
    'support_offering': [
        r'\b(I\'m here|here for you|listening|thinking of you|sending love)\b',
        r'\b(support|lean on me|reach out|if you need to talk|any time)\b',
        r'\b(wish I could help|wish I had words|my heart (goes out|is with))\b'
    ]
}

In [137]:
def calculate_empathy_score(text):
    score = 0
    text = text.lower()
    for pattern_name, pattern_list in empathy_patterns.items():
        for regex in pattern_list:
            matches = re.findall(regex, text, flags=re.IGNORECASE)
            score += len(matches)
    return score

# Calculate score for each comment
non_advice_df['empathy_score'] = non_advice_df['supportive_response'].apply(calculate_empathy_score)

# Filter for comments with a high empathy score
empathy_pattern_df = non_advice_df[non_advice_df['empathy_score'] >= 1].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_advice_df['empathy_score'] = non_advice_df['supportive_response'].apply(calculate_empathy_score)


In [138]:
empathy_pattern_df.shape

(1709, 7)

In [139]:
empathy_pattern_df.post_id.nunique()

876

In [140]:
empathy_pattern_df.tail(30)

Unnamed: 0,post_id,grieving_post,title,supportive_response,response_score,response_sentiment,empathy_score
3560,5uu29n,i am a wreck. my yr old hound mix fell off a c...,[RIP] My fur baby died suddenly in a hiking ac...,i'm so heartbroken for you but please find com...,53,0.5,3
3561,5uu29n,i am a wreck. my yr old hound mix fell off a c...,[RIP] My fur baby died suddenly in a hiking ac...,we all will have our time. the contents of tha...,10,0.45,3
3565,fclqin,my year old dog hadnt been doing well the past...,[help] my dog died and I didn’t get to say goo...,an excellent point. i've always thought if pos...,11,0.52,1
3566,fclqin,my year old dog hadnt been doing well the past...,[help] my dog died and I didn’t get to say goo...,yup. dying adults will often wait for their ch...,40,0.333333,1
3567,zp6py8,"last friday, out of nowhere, we had to put our...",My wife and I lost the bestest of peanuts and ...,she was beautiful. she got lucky when she foun...,73,0.454762,5
3571,12vmngr,my dog has ibd and regularly has flare ups. i ...,"My father said that when my dog dies, I’ll pro...",it's not wrong to love a soul who loves you un...,18,0.375,1
3572,12vmngr,my dog has ibd and regularly has flare ups. i ...,"My father said that when my dog dies, I’ll pro...",its not wrong to love your dog that much. your...,78,0.321131,4
3575,dsyw67,warning this is a long one. tldr at bottom. my...,[Help] How to forgive and love my dog after it...,try trading up games. you become the source of...,31,0.393878,2
3578,dsyw67,warning this is a long one. tldr at bottom. my...,[Help] How to forgive and love my dog after it...,exactly. food should never have been in the eq...,17,0.375,1
3581,hd9q8m,"reposted because i forgot the title tag, whoop...","[Fluff] One year ago today, we tried to rehome...",honestly unless theyre super old rehoming isnt...,20,0.367424,1


**Step 6: Manually assigning label to relevant replies**

In [143]:
# Prepare a file for manual review
to_review_df = empathy_pattern_df.sort_values(by=['post_id', 'response_score'], ascending=False).copy()
to_review_df['is_relevant'] = 0  # Placeholder for your manual rating
to_review_df.to_csv('for_manual_review.csv', index=True)

In [142]:
%pip install openpyxl

df = pd.read_csv("for_manual_review.csv")
df.to_excel("for_manual_review.xlsx", index=False)

Note: you may need to restart the kernel to use updated packages.


PermissionError: [Errno 13] Permission denied: 'for_manual_review.xlsx'

In [151]:
# 1. Use your in-memory DataFrame (already available as 'to_review_df')
original_df = to_review_df

# 2. Load the manually reviewed Excel file
reviewed_dataset_df = pd.read_excel('for_manual_review.xlsx')

# 3. Inspect the manual ratings (optional but good practice)
print("Manual ratings value counts:")
print(reviewed_dataset_df['is_relevant'].value_counts())
print("\nFirst few reviewed rows:")
print(reviewed_dataset_df[['Unnamed: 0', 'supportive_response', 'is_relevant']].head())

# 4. Perform the merge on the 'index' column
# We do a 'left' join to keep all rows from the original DataFrame.
# We only want to bring in the 'is_relevant' column from the manual file.
final_df = original_df.merge(
    reviewed_dataset_df[['Unnamed: 0', 'is_relevant']], # Select only the key and the rating
    left_index=True, right_on='Unnamed: 0', # Merge using the DataFrame index and Unnamed: 0
    how='left'
)

# 5. Verify the merge was successful
print(f"\nOriginal DataFrame shape: {original_df.shape}")
print(f"Merged DataFrame shape: {final_df.shape}")
print("\n'is_relevant' column in final DataFrame:")
print(final_df['is_relevant_y'].value_counts(dropna=False))

# 6. Create your final, curated Golden Dataset
# Select only the rows that were marked as 1 (Golden)
final_relevant_dataset = final_df[final_df['is_relevant_y'] == 1].copy()

# Select rows that were rejected (0) or not reviewed (NaN) for other analysis
rejected_dataset = final_df[final_df['is_relevant_y'] == 0]
not_reviewed_dataset = final_df[final_df['is_relevant_y'].isna()]

print(f"\n Final Relevant Dataset size: {len(final_relevant_dataset)}")
print(f"Rejected: {len(rejected_dataset)}")
print(f"Not Reviewed: {len(not_reviewed_dataset)}")

Manual ratings value counts:
is_relevant
1    1027
0     682
Name: count, dtype: int64

First few reviewed rows:
   Unnamed: 0                                supportive_response  is_relevant
0        1142  so much love in those eyes rdogsmirin all the ...            0
1        1143  such love and gratitude in her eyes. rest easy...            0
2        1147  thank you wombatfucker's wife for being with m...            0
3        3567  she was beautiful. she got lucky when she foun...            1
4        2041  tell cash to look for my logan and sam and wid...            0

Original DataFrame shape: (1709, 8)
Merged DataFrame shape: (1709, 10)

'is_relevant' column in final DataFrame:
is_relevant_y
1    1027
0     682
Name: count, dtype: int64

 Final Relevant Dataset size: 1027
Rejected: 682
Not Reviewed: 0


In [153]:
# Save the main DataFrame with the new manual rating column attached
final_df.to_parquet('final_with_relevance_classificator.parquet', index=False)

# Save the most important dataset: your golden examples
final_relevant_dataset.to_parquet('final_relevant_training_dataset.parquet', index=False)

# Optional: Save the rejected examples for later analysis
rejected_dataset.to_parquet('rejected_comments.parquet', index=False)

print("Datasets saved successfully.")

Datasets saved successfully.


In [154]:
final_relevant_dataset.post_id.nunique()

605

In [156]:
final_relevant_dataset.tail(20)

Unnamed: 0.1,post_id,grieving_post,title,supportive_response,response_score,response_sentiment,empathy_score,is_relevant_x,Unnamed: 0,is_relevant_y
1663,12zoe05,years wasnt nearly long enough to be wtih you ...,The light has gone out of my life. 13yr Danbi ...,i'm so sorry. she's beautiful. in time the lig...,16,0.39,1,0,809,1
1664,12zoe05,years wasnt nearly long enough to be wtih you ...,The light has gone out of my life. 13yr Danbi ...,"i am so sorry for your loss, op. she had the k...",14,0.44,1,0,810,1
1665,12zoe05,years wasnt nearly long enough to be wtih you ...,The light has gone out of my life. 13yr Danbi ...,i am sorry for loosing a best friend . but how...,13,0.426667,2,0,811,1
1666,12zoe05,years wasnt nearly long enough to be wtih you ...,The light has gone out of my life. 13yr Danbi ...,"she loved her parent so much you can tell, and...",10,0.45,1,0,812,1
1677,12rezt4,"my cat, milo, died today due to kidney failure...",My 3 yr old cat died while I was away for coll...,milo knows he was loved till the end. cats pic...,29,0.65625,1,0,3173,1
1679,12qrw6t,i'm frustrated with my senior dog's condition ...,I'm frustrated with my senior dog's condition ...,"natural death isn't always peaceful, and dying...",20,0.525,3,0,3210,1
1680,12pu2ch,im putting everything i can think of here. its...,Baby Tokyo had to go to sleep last night. I de...,shes beautiful and what wonderful whisker fire...,91,0.312143,1,0,3097,1
1681,12pu2ch,im putting everything i can think of here. its...,Baby Tokyo had to go to sleep last night. I de...,rest easy tokyo. you were loved and the world ...,33,0.566667,1,0,3098,1
1682,12pu2ch,im putting everything i can think of here. its...,Baby Tokyo had to go to sleep last night. I de...,did she matter to you? that is all that is imp...,23,0.325,3,0,3099,1
1684,12pu2ch,im putting everything i can think of here. its...,Baby Tokyo had to go to sleep last night. I de...,toki is an absolute sweetie! i bet they are pl...,11,0.5125,2,0,3104,1
