In [3]:
from functions import pull_redcap_report
from constants import posts_token, flagged_posts, participants_token, participants_table1
import pandas as pd

# Pull the two reports through the project helper pull_redcap_report; both
# results are used as pandas DataFrames below.
posts = pull_redcap_report(posts_token, flagged_posts)
participants = pull_redcap_report(participants_token, participants_table1)

# Expose the participants' record_id under the key name used for the merge.
participants['participant_id'] = participants['record_id']

# Attach participant-level attributes (gender, pd_yesno, diagnosis_date) to every post.
posts = posts.merge(
    participants[['participant_id', 'gender', 'pd_yesno', 'diagnosis_date']],  # join key + attributes only
    on='participant_id',      # key column, present in both frames
    how='left',               # keep every row of posts, matched or not
    validate='many_to_one',   # raise if a participant_id repeats in participants (would duplicate posts)
)


# Ground Truth Dataset

In [4]:
# Ground-truth subset: posts whose manual_label_pd_relevant value is not missing.
has_manual_label = posts['manual_label_pd_relevant'].notna()
training_posts = posts[has_manual_label]
print(f'Manually reviewed posts: {len(training_posts)}')

# Distinct participant IDs appearing in the labelled subset.
num_unique_participants = training_posts['participant_id'].nunique()
print(f'Number of participants: {num_unique_participants}')

# Row masks for the two pd_yesno values that are reported.
pd_yes_rows = training_posts['pd_yesno'] == 1
pd_no_rows = training_posts['pd_yesno'] == 0

# Distinct participant IDs within each pd_yesno group.
num_unique_pd_yes = training_posts.loc[pd_yes_rows, 'participant_id'].nunique()
num_unique_pd_no = training_posts.loc[pd_no_rows, 'participant_id'].nunique()

# Number of labelled posts whose manual label equals 1.
num_pd_relevant_posts = (training_posts['manual_label_pd_relevant'] == 1).sum()

print(f"Number of unique participants with pd_yesno = 1: {num_unique_pd_yes}")
print(f"Number of unique participants with pd_yesno = 0: {num_unique_pd_no}")
print(f"Number of PD-relevant posts in ground-truth dataset: {num_pd_relevant_posts}")

Manually reviewed posts: 6750
Number of participants: 19
Number of unique participants with pd_yesno = 1: 14
Number of unique participants with pd_yesno = 0: 5
Number of PD-relevant posts in ground-truth dataset: 2400


# Term Dictionary

In [6]:
import json

# Load the term dictionary. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's locale default when keywords contain non-ASCII
# characters.
with open("../term_dictionary/term_dictionary.json", "r", encoding="utf-8") as file:
    keyword_dict = json.load(file)

# Number of top-level keys in the dictionary.
num_keys = len(keyword_dict)

# Sum of len() over every value.
# NOTE(review): assumes each value is a collection of keywords — confirm against the JSON file.
num_entries = sum(len(values) for values in keyword_dict.values())

print(f"Number of keys: {num_keys}")
print(f"Total number of keyword entries: {num_entries}")

Number of keys: 52
Total number of keyword entries: 1153
