In [None]:
# Dependent libraries
from constants import participants_token, facebook_files, file_data, aime_recordids_withfacebook, posts_token, all_posts
from functions import pull_redcap_report, process_json_files, clean_and_deduplicate_text, safe_decode, redcap_upload
import pandas as pd

# Import data from REDCap to identify participants with Facebook data
redcap_data = pull_redcap_report(participants_token, facebook_files)

# Filter out record_ids without Facebook data. Take an explicit copy so the
# column assignments below act on an independent frame instead of a filtered
# slice of the original (avoids pandas' SettingWithCopyWarning).
redcap_data = redcap_data[redcap_data['fb_data'] != ''].copy()

# Coerce to str to fix int/str mismatches; record_id is compared against the
# entries of record_ids with == further down.
redcap_data['record_id'] = redcap_data['record_id'].apply(str)
redcap_data['your_posts_combined'] = redcap_data['your_posts_combined'].apply(str)

# Renumber rows 0..n-1 after filtering. Without drop=True the previous index
# is also kept as a column.
redcap_data = redcap_data.reset_index()

# Identify record_ids used in AIME2025 submission (all extant data as of January 2025)
record_ids = aime_recordids_withfacebook

# Extract text from JSON files affiliated with Posts, Comments and Reactions, and Groups exports,
# then upload each participant's text to REDCap.
# Save after each record_id because REDCap upload becomes temperamental when large
for record_id in record_ids:
    print(f'Now processing: Record ID: {record_id}')

    # Start from an empty text dataframe with the expected columns
    df_text = pd.DataFrame(columns=['record_id', 'json_file', 'timestamp', 'title', 'text', 'group'])

    # Pull JSON info for participant
    record_data = redcap_data[redcap_data['record_id'] == record_id]
    record_data = record_data.fillna("")

    # Extract text
    df_text = process_json_files(record_data, redcap_data, file_data, record_id, df_text)

    # Deduplicate text
    df_text = clean_and_deduplicate_text(df_text)

    # Ensure text is encoded as UTF-8
    df_text['text'] = df_text['text'].apply(safe_decode)

    # Assign participant_id to match record_id
    df_text['participant_id'] = record_id

    # Nothing to upload: skip before making any further REDCap calls
    if df_text.empty:
        print(f'Record ID {record_id} has no text to upload.')
        continue

    # Pull the posts report fresh for every participant.
    # NOTE(review): presumably earlier uploads in this loop change what this
    # report returns, which is why it is not hoisted out of the loop -- confirm.
    posts = pull_redcap_report(posts_token, all_posts)

    # Skip participants that are already present. Compare with record_id
    # directly instead of reading df_text.loc[0, 'participant_id']: that label
    # lookup raises KeyError when deduplication has dropped the row labelled 0.
    if int(record_id) in posts['participant_id'].values:
        print(f"Participant ID '{record_id}' already exists in df_redcap. Not uploading data.")
        continue

    # Set additional record_ids based on pre-existing data in REDCap
    redcap_max = posts['record_id'].max()
    # Dealing with an empty dataframe: max() is NaN when the report has no rows.
    # NOTE(review): the == 1 case is kept from the original; it presumably
    # covers an empty project that reports a single placeholder record -- confirm.
    if pd.isna(redcap_max) or redcap_max == 1:
        redcap_max = 0

    # Renumber rows 0..n-1 so the new record_ids are contiguous even if
    # deduplication left gaps in the index.
    df_text = df_text.reset_index(drop=True)
    df_text['record_id'] = df_text.index + 1 + redcap_max

    print(f'number of new entries: {len(df_text)}')
    print(f'max record_id to be found in redcap after upload: {redcap_max + len(df_text)}')
    # NOTE(review): response is not inspected, so the success message below is
    # printed regardless of what redcap_upload returned.
    response = redcap_upload(df_text)
    print(f'Data from {record_id} successfully uploaded.')
    print(f'Target length of Posts REDCap after upload attempt: {len(posts)+len(df_text)}')
