In [30]:
import pandas as pd

# Load data
from constants import posts_token, flagged_posts, participants_token, participants_table1
from functions import pull_redcap_report

# Fixed cohort sizes; later cells use them as percentage denominators.
total_participants = 45  # Known participants with Facebook data
pd_participants = 29  # Known participants with PD

# Pull both REDCap reports (posts first, then the participant table).
print("Loading data from REDCap reports...")
posts = pull_redcap_report(posts_token, flagged_posts)
participants = pull_redcap_report(participants_token, participants_table1)

# Expose REDCap's record_id under the key name used for the join below.
participants["participant_id"] = participants["record_id"]

# Attach participant attributes to every post row.
# The inner join keeps only posts whose participant_id appears in the participant
# table, and only participants that have at least one post.
# NOTE(review): assumes one row per record_id in the participant report; duplicate
# rows there would duplicate posts in the merged frame -- confirm.
posts = pd.merge(
    posts,
    participants.loc[:, ["participant_id", "gender", "pd_yesno", "dx_type", "diagnosis_date"]],
    how="inner",
    on="participant_id",
)



Loading data from REDCap reports...


In [43]:
# PD-relevant posting metrics over all posts.
# Reads `posts`, `total_participants` and `pd_participants` from the data-loading cell,
# and adds/overwrites the `timestamp`, `diagnosis_date` and `explicit_mention_pd`
# columns on `posts`.

# Ensure timestamp and diagnosis_date are in datetime format.
# errors='coerce' turns unparseable values into NaT; NaT compares False both ways,
# so such rows land in neither the before- nor the after-diagnosis group below.
posts["timestamp"] = pd.to_datetime(posts["timestamp"], errors='coerce')
posts["diagnosis_date"] = pd.to_datetime(posts["diagnosis_date"], errors='coerce')

# Detect explicit mentions of PD: "parkinson" anywhere in the text, or "pd" with no
# word character on either side. Case-insensitive; missing text counts as no mention.
posts["explicit_mention_pd"] = posts["text"].str.contains(r"parkinson|(?<!\w)pd(?!\w)", case=False, na=False)

# Ensure we only use participants who actually have posts
participants_with_posts = posts["participant_id"].unique()
# Despite the name, this is every participant whose dx_type is "Parkinson's disease"
# and who has any post at all; it is the zero-filled index for the PD-only statistics.
participants_with_pd_posts = posts[posts["dx_type"] == "Parkinson's disease"]["participant_id"].unique()

# Filter PD-related posts: flagged by either the classifier column or the manual label.
pd_posts = posts[(posts["nb_classification"] == 1) | (posts["manual_label_pd_relevant"] == 1)]
print(f"Identified {len(pd_posts)} posts classified as PD-relevant.")

# **Compute necessary metrics**

# **Total PD-related posts**
total_pd_posts = len(pd_posts)
print(f"Total PD-relevant posts: {total_pd_posts}")
print(f"Total number of participants with PD-relevant posts: {pd_posts['participant_id'].nunique()}")

# **Average number of PD-related posts per participant (including zeroes)**
pd_post_counts = pd_posts.groupby("participant_id").size()
# Reindex so participants with posts but no PD-relevant post contribute a 0.
pd_post_counts = pd_post_counts.reindex(participants_with_posts, fill_value=0)  # Only use participants with posts

avg_pd_posts_per_participant = pd_post_counts.mean()
std_pd_posts_per_participant = pd_post_counts.std()
print(f"Average number of PD-relevant posts per participant: {avg_pd_posts_per_participant:.2f} ± {std_pd_posts_per_participant:.2f}")


# **Percentage of participants with at least one PD-related post**
# NOTE(review): percentages divide by the hard-coded cohort sizes, while the means
# above divide by the participants actually present in `posts` -- confirm they agree.
participants_with_pd_post = (pd_post_counts > 0).sum()
percent_with_pd_post = (participants_with_pd_post / total_participants) * 100
print(f"{percent_with_pd_post:.2f}% of participants have at least one PD-relevant post.")

# **Percentage of participants explicitly referencing PD at least once**
explicit_pd_mentions = posts.groupby("participant_id")["explicit_mention_pd"].any().sum()
percent_explicit_pd = (explicit_pd_mentions / total_participants) * 100
print(f"{percent_explicit_pd:.2f}% of participants explicitly mentioned PD in at least one post.")

# **Statistics for individuals with PD**
pd_post_counts_pd = pd_posts[pd_posts["dx_type"] == "Parkinson's disease"].groupby("participant_id").size()
pd_post_counts_pd = pd_post_counts_pd.reindex(participants_with_pd_posts, fill_value=0)

avg_pd_posts_pd = pd_post_counts_pd.mean()
std_pd_posts_pd = pd_post_counts_pd.std()
print(f"Participants with PD made an average of {avg_pd_posts_pd:.2f} ± {std_pd_posts_pd:.2f} PD-relevant posts.")

# **Compute number of posts before and after diagnosis**
# A post stamped exactly at the diagnosis date counts as "after".
posts_before_diagnosis = pd_posts[pd_posts["timestamp"] < pd_posts["diagnosis_date"]].groupby("participant_id").size()
posts_after_diagnosis = pd_posts[pd_posts["timestamp"] >= pd_posts["diagnosis_date"]].groupby("participant_id").size()

# Ensure all PD participants are represented.
# The reindex also drops counts for anyone outside participants_with_pd_posts.
posts_before_diagnosis = posts_before_diagnosis.reindex(participants_with_pd_posts, fill_value=0)
posts_after_diagnosis = posts_after_diagnosis.reindex(participants_with_pd_posts, fill_value=0)

# **Compute means and standard deviations**
avg_posts_before_dx = posts_before_diagnosis.mean()
std_posts_before_dx = posts_before_diagnosis.std()

avg_posts_after_dx = posts_after_diagnosis.mean()
std_posts_after_dx = posts_after_diagnosis.std()
print(f"Before diagnosis, participants made {avg_posts_before_dx:.2f} ± {std_posts_before_dx:.2f} PD-relevant posts.")
print(f"After diagnosis, participants made {avg_posts_after_dx:.2f} ± {std_posts_after_dx:.2f} PD-relevant posts.")

# **Percentage of individuals with PD explicitly mentioning PD**
explicit_pd_mentions_pd = posts[(posts["explicit_mention_pd"]) & (posts["dx_type"] == "Parkinson's disease")]["participant_id"].nunique()
percent_explicit_pd_pd = (explicit_pd_mentions_pd / pd_participants) * 100
print(f"{percent_explicit_pd_pd:.2f}% of individuals with PD explicitly mentioned PD.")

# **Percentage of individuals with PD with at least one PD-relevant post**
participants_with_pd_relevant_post = (pd_post_counts_pd > 0).sum()
percent_with_pd_relevant_post = (participants_with_pd_relevant_post / pd_participants) * 100
print(f"{percent_with_pd_relevant_post:.2f}% of individuals with PD had at least one PD-relevant post.")

# **Store results in a dictionary**
results = {
    "Total PD-related posts": total_pd_posts,
    "Average PD posts per participant": avg_pd_posts_per_participant,
    "SD PD posts per participant": std_pd_posts_per_participant,
    "% of participants with at least one PD-relevant post": percent_with_pd_post,
    "% of participants explicitly mentioning PD": percent_explicit_pd,
    "Average PD posts per participant with PD": avg_pd_posts_pd,
    "SD PD posts per participant with PD": std_pd_posts_pd,
    "Average posts before diagnosis": avg_posts_before_dx,
    "SD posts before diagnosis": std_posts_before_dx,
    "Average posts after diagnosis": avg_posts_after_dx,
    "SD posts after diagnosis": std_posts_after_dx,
    "% of individuals with PD explicitly mentioning PD": percent_explicit_pd_pd,
    "% of individuals with PD with at least one PD-relevant post": percent_with_pd_relevant_post,
}

# **Display results as a DataFrame**
results_df = pd.DataFrame(list(results.items()), columns=["Metric", "Value"])
# A bare last expression is what makes Jupyter render the table; the assignment
# alone produces no output.
results_df

Identified 4701 posts classified as PD-relevant.
Total PD-relevant posts: 4701
Total number of participants with PD-relevant posts: 43
Average number of PD-relevant posts per participant: 104.47 ± 176.13
95.56% of participants have at least one PD-relevant post.
66.67% of participants explicitly mentioned PD in at least one post.
Participants with PD made an average of 99.21 ± 171.96 PD-relevant posts.
Before diagnosis, participants made 24.34 ± 58.36 PD-relevant posts.
After diagnosis, participants made 74.86 ± 156.02 PD-relevant posts.
68.97% of individuals with PD explicitly mentioned PD.
93.10% of individuals with PD had at least one PD-relevant post.


In [44]:
import pandas as pd

# PD-relevant posting metrics with exercise-related posts removed.
# Reads `posts`, `total_participants` and `pd_participants` from earlier cells.
# NOTE(review): this cell rebinds `posts` to the exercise-filtered frame. Running it
# twice, or re-running the all-posts metrics cell afterwards, computes on the
# already-filtered data; re-run the data-loading cell first to restore the full set.

# Ensure timestamp and diagnosis_date are in datetime format.
# errors='coerce' turns unparseable values into NaT; NaT compares False both ways,
# so such rows land in neither the before- nor the after-diagnosis group below.
posts["timestamp"] = pd.to_datetime(posts["timestamp"], errors='coerce')
posts["diagnosis_date"] = pd.to_datetime(posts["diagnosis_date"], errors='coerce')

# Detect explicit mentions of PD: "parkinson" anywhere in the text, or "pd" with no
# word character on either side. Case-insensitive; missing text counts as no mention.
posts["explicit_mention_pd"] = posts["text"].str.contains(r"parkinson|(?<!\w)pd(?!\w)", case=False, na=False)

# Ensure we only use participants who actually have posts.
# Both ID sets are taken BEFORE the exercise filter, so a participant whose only posts
# are exercise-related still counts (with zero posts) in the averages below.
participants_with_posts = posts["participant_id"].unique()
# Despite the name, this is every participant whose dx_type is "Parkinson's disease"
# and who has any post at all; it is the zero-filled index for the PD-only statistics.
participants_with_pd_posts = posts[posts["dx_type"] == "Parkinson's disease"]["participant_id"].unique()

# **Filter out exercise-related posts**
# Rows with a missing keyword_categories value are kept (na=False).
# .copy() makes the result an independent frame, so later `posts[col] = ...`
# assignments do not write into a slice and trigger SettingWithCopyWarning.
posts = posts[~posts["keyword_categories"].str.contains("exercise", case=False, na=False)].copy()
print(f"Filtered out exercise-related posts. Remaining posts: {len(posts)}")

# **Filter PD-related posts**: flagged by either the classifier column or the manual label.
pd_posts = posts[(posts["nb_classification"] == 1) | (posts["manual_label_pd_relevant"] == 1)]
print(f"Identified {len(pd_posts)} posts classified as PD-relevant (excluding exercise).")

# **Compute necessary metrics**

# **Total PD-related posts**
total_pd_posts = len(pd_posts)
print(f"Total PD-relevant posts (excluding exercise): {total_pd_posts}")
print(f"Total number of participants with PD-relevant posts: {pd_posts['participant_id'].nunique()}")

# **Average number of PD-related posts per participant (including zeroes)**
pd_post_counts = pd_posts.groupby("participant_id").size()
# Reindex so participants with posts but no PD-relevant post contribute a 0.
pd_post_counts = pd_post_counts.reindex(participants_with_posts, fill_value=0)  # Only use participants with posts

avg_pd_posts_per_participant = pd_post_counts.mean()
std_pd_posts_per_participant = pd_post_counts.std()
print(f"Average number of PD-relevant posts per participant (excluding exercise): {avg_pd_posts_per_participant:.2f} ± {std_pd_posts_per_participant:.2f}")

# **Percentage of participants with at least one PD-related post**
# NOTE(review): percentages divide by the hard-coded cohort sizes, while the means
# above divide by the participants actually present in `posts` -- confirm they agree.
participants_with_pd_post = (pd_post_counts > 0).sum()
percent_with_pd_post = (participants_with_pd_post / total_participants) * 100
print(f"{percent_with_pd_post:.2f}% of participants have at least one PD-relevant post (excluding exercise).")

# **Percentage of participants explicitly referencing PD at least once**
explicit_pd_mentions = posts.groupby("participant_id")["explicit_mention_pd"].any().sum()
percent_explicit_pd = (explicit_pd_mentions / total_participants) * 100
print(f"{percent_explicit_pd:.2f}% of participants explicitly mentioned PD in at least one post (excluding exercise).")

# **Statistics for individuals with PD**
pd_post_counts_pd = pd_posts[pd_posts["dx_type"] == "Parkinson's disease"].groupby("participant_id").size()
pd_post_counts_pd = pd_post_counts_pd.reindex(participants_with_pd_posts, fill_value=0)

avg_pd_posts_pd = pd_post_counts_pd.mean()
std_pd_posts_pd = pd_post_counts_pd.std()
print(f"Participants with PD made an average of {avg_pd_posts_pd:.2f} ± {std_pd_posts_pd:.2f} PD-relevant posts (excluding exercise).")

# **Compute number of posts before and after diagnosis**
# A post stamped exactly at the diagnosis date counts as "after".
posts_before_diagnosis = pd_posts[pd_posts["timestamp"] < pd_posts["diagnosis_date"]].groupby("participant_id").size()
posts_after_diagnosis = pd_posts[pd_posts["timestamp"] >= pd_posts["diagnosis_date"]].groupby("participant_id").size()

# Ensure all PD participants are represented.
# The reindex also drops counts for anyone outside participants_with_pd_posts.
posts_before_diagnosis = posts_before_diagnosis.reindex(participants_with_pd_posts, fill_value=0)
posts_after_diagnosis = posts_after_diagnosis.reindex(participants_with_pd_posts, fill_value=0)

# **Compute means and standard deviations**
avg_posts_before_dx = posts_before_diagnosis.mean()
std_posts_before_dx = posts_before_diagnosis.std()

avg_posts_after_dx = posts_after_diagnosis.mean()
std_posts_after_dx = posts_after_diagnosis.std()
print(f"Before diagnosis, participants made {avg_posts_before_dx:.2f} ± {std_posts_before_dx:.2f} PD-relevant posts (excluding exercise).")
print(f"After diagnosis, participants made {avg_posts_after_dx:.2f} ± {std_posts_after_dx:.2f} PD-relevant posts (excluding exercise).")

# **Percentage of individuals with PD explicitly mentioning PD**
explicit_pd_mentions_pd = posts[(posts["explicit_mention_pd"]) & (posts["dx_type"] == "Parkinson's disease")]["participant_id"].nunique()
percent_explicit_pd_pd = (explicit_pd_mentions_pd / pd_participants) * 100
print(f"{percent_explicit_pd_pd:.2f}% of individuals with PD explicitly mentioned PD (excluding exercise).")

# **Percentage of individuals with PD with at least one PD-relevant post**
participants_with_pd_relevant_post = (pd_post_counts_pd > 0).sum()
percent_with_pd_relevant_post = (participants_with_pd_relevant_post / pd_participants) * 100
print(f"{percent_with_pd_relevant_post:.2f}% of individuals with PD had at least one PD-relevant post (excluding exercise).")

# **Store results in a dictionary**
results = {
    "Total PD-related posts (excluding exercise)": total_pd_posts,
    "Average PD posts per participant (excluding exercise)": avg_pd_posts_per_participant,
    "SD PD posts per participant (excluding exercise)": std_pd_posts_per_participant,
    "% of participants with at least one PD-relevant post (excluding exercise)": percent_with_pd_post,
    "% of participants explicitly mentioning PD (excluding exercise)": percent_explicit_pd,
    "Average PD posts per participant with PD (excluding exercise)": avg_pd_posts_pd,
    "SD PD posts per participant with PD (excluding exercise)": std_pd_posts_pd,
    "Average posts before diagnosis (excluding exercise)": avg_posts_before_dx,
    "SD posts before diagnosis (excluding exercise)": std_posts_before_dx,
    "Average posts after diagnosis (excluding exercise)": avg_posts_after_dx,
    "SD posts after diagnosis (excluding exercise)": std_posts_after_dx,
    "% of individuals with PD explicitly mentioning PD (excluding exercise)": percent_explicit_pd_pd,
    "% of individuals with PD with at least one PD-relevant post (excluding exercise)": percent_with_pd_relevant_post,
}

# **Display results as a DataFrame**
results_df = pd.DataFrame(list(results.items()), columns=["Metric", "Value"])
# A bare last expression is what makes Jupyter render the table; the assignment
# alone produces no output.
results_df

Filtered out exercise-related posts. Remaining posts: 15346
Identified 3047 posts classified as PD-relevant (excluding exercise).
Total PD-relevant posts (excluding exercise): 3047
Total number of participants with PD-relevant posts: 42
Average number of PD-relevant posts per participant (excluding exercise): 67.71 ± 111.19
93.33% of participants have at least one PD-relevant post (excluding exercise).
64.44% of participants explicitly mentioned PD in at least one post (excluding exercise).
Participants with PD made an average of 63.86 ± 100.85 PD-relevant posts (excluding exercise).
Before diagnosis, participants made 15.10 ± 34.41 PD-relevant posts (excluding exercise).
After diagnosis, participants made 48.76 ± 89.44 PD-relevant posts (excluding exercise).
68.97% of individuals with PD explicitly mentioned PD (excluding exercise).
93.10% of individuals with PD had at least one PD-relevant post (excluding exercise).
