In [27]:
import pandas as pd
import random
import os
from pathlib import Path

In [2]:
# Load the survey responses into `df`; every later cell transforms this frame.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs as-is on this machine.
df = pd.read_csv(r"C:\Users\ercj\Desktop\plain_language_summarization_platform\google_forms\static_1\0226_filtered.csv")

In [3]:
def normalize_column(c):
    """Translate a raw survey column header into the notebook's naming scheme.

    After trimming surrounding whitespace:

    * ``familiar_<X>``   becomes ``<x>_familiar``
    * ``interested_<X>`` becomes ``<x>_interest``
    * ``method_<X>``     becomes ``<x>_topic`` (hyphens turned into underscores)
    * the attention-check and study-continuation questions get short fixed names
    * anything else is returned trimmed but otherwise unchanged
    """
    name = c.strip()

    # (prefix to remove, suffix to append); the first matching prefix wins.
    prefix_rules = (
        ("familiar_", "_familiar"),
        ("interested_", "_interest"),
        ("method_", "_topic"),
    )
    for prefix, suffix in prefix_rules:
        if not name.startswith(prefix):
            continue
        # str.replace drops every occurrence of the prefix text, not only the leading one.
        stem = name.replace(prefix, "").lower()
        if prefix == "method_":
            # Only the method columns have their hyphens normalised.
            stem = stem.replace("-", "_")
        return stem + suffix

    fixed_names = {
        "Which is a fruit?": "attention_question",
        "Would you like to participate in the next stage of this study?": "study_continuation",
    }
    return fixed_names.get(name, name)

# Apply normalize_column to every column header so later cells can address
# columns as "<topic>_familiar", "<topic>_interest" and "<topic>_topic".
df = df.rename(columns=normalize_column)

In [4]:
# Show the distribution of raw text answers for one interest column
# (this cell sits before the cell that maps answers to numeric scores).
df['chronic_lower_respiratory_diseases_interest'].value_counts()

chronic_lower_respiratory_diseases_interest
A little interested      11
Moderately interested     6
Interested                6
Not interested            5
Very interested           2
Name: count, dtype: int64

In [5]:
# Answer labels in ascending score order; each label is scored by its position (1-5).
_FAMILIARITY_LABELS = (
    "Not familiar at all",
    "Slightly familiar",
    "Familiar",
    "Moderately familiar",
    "Very familiar",
)
_INTEREST_LABELS = (
    "Not interested",
    "A little interested",
    "Moderately interested",
    "Interested",
    "Very interested",
)

# Survey answer text -> ordinal score.
familiarity = {label: score for score, label in enumerate(_FAMILIARITY_LABELS, start=1)}
interest = {label: score for score, label in enumerate(_INTEREST_LABELS, start=1)}

# Topic slugs, in the order the per-topic columns are processed.
_TOPIC_SLUGS = (
    "cancer",
    "heart_disease",
    "accidents",
    "stroke",
    "chronic_lower_respiratory_diseases",
    "alzheimers",
    "nephritis",
    "diabetes",
    "chronic_liver_disease",
    "obesity",
    "covid_19",
    "depression",
)

# Per-topic familiarity / interest column names (after normalize_column renaming).
familiar_cols = [f"{slug}_familiar" for slug in _TOPIC_SLUGS]
interest_cols = [f"{slug}_interest" for slug in _TOPIC_SLUGS]

In [6]:
# Replace each text answer with its ordinal score. Answers are trimmed first;
# any text missing from the scale dictionary becomes NaN via Series.map.
for cols, scale in ((familiar_cols, familiarity), (interest_cols, interest)):
    for col in cols:
        df[col] = df[col].str.strip().map(scale)

In [7]:
def fam_label(score):
    """Bucket a familiarity score: 3 or above is "high_fam", anything else "low_fam".

    A NaN score fails the ``>= 3`` comparison and is therefore labelled "low_fam".
    """
    return "high_fam" if score >= 3 else "low_fam"

In [8]:
def interest_label(score):
    """Bucket an interest score into "high_interest", "moderate_interest" or "low_interest".

    Scores of 4 and above are high, exactly 3 is moderate, and everything else
    (including NaN, which fails both comparisons) is low.
    """
    if score == 3:
        return "moderate_interest"
    return "high_interest" if score >= 4 else "low_interest"

In [9]:
# Snapshot `df` to the working directory. Run top to bottom, this happens after the
# answers are mapped to numeric scores and before they are replaced by labels.
# The row index is written too (index=False is not passed), so reading the file
# back yields an extra unnamed index column.
df.to_csv("new_revised_info.csv")

In [10]:
# Overwrite the numeric scores with their bucket labels, column by column.
# This mutates `df` in place, so re-running the cell on already-labelled
# columns would compare strings against integers and fail.
for cols, labeler in ((familiar_cols, fam_label), (interest_cols, interest_label)):
    for col in cols:
        df[col] = df[col].apply(labeler)


In [11]:
# Read back the snapshot written by df.to_csv("new_revised_info.csv") and load the abstracts table.
# NOTE(review): new_df is not referenced by any later cell visible here — confirm it is still needed.
new_df = pd.read_csv("new_revised_info.csv")
# The abstracts file is decoded as Latin-1; the path is absolute and machine-specific.
abstract_df = pd.read_csv(
    r"C:\Users\ercj\Desktop\plain_language_summarization_platform\final_abstracts\0218_combined_abstracts.csv",
    encoding="latin1"
)
# Normalise terms_chosen to a plain string: missing values become "",
# and ";" separators are rewritten as ",".
abstract_df["terms_chosen"] = (
    abstract_df["terms_chosen"]
    .fillna("")         
    .astype(str)      
    .str.replace(";", ",", regex=False)
)

In [12]:
# Maps the topic names found in the abstracts table to the short slugs used as
# prefixes of the survey columns ("<slug>_interest", "<slug>_familiar", "<slug>_topic").
# Keys must match the topic text after normalize_topic has been applied
# (trimmed, newlines replaced by spaces, straight apostrophes).
TOPIC_MAP = {
    "Cancer": "cancer",
    "Heart disease": "heart_disease",
    "Accidents (unintentional injuries)": "accidents",
    "Stroke (cerebrovascular diseases)": "stroke",
    "Chronic lower respiratory diseases": "chronic_lower_respiratory_diseases",
    "Nephritis, nephrotic syndrome, and nephrosis": "nephritis",
    "Diabetes": "diabetes",
    "Chronic liver disease and cirrhosis": "chronic_liver_disease",
    "Obesity": "obesity",
    "COVID-19": "covid_19",
    "Depression": "depression",
    "Alzheimer's disease": "alzheimers",
}

In [13]:
def normalize_topic(s):
    """Clean a topic string so it can be looked up in TOPIC_MAP.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    untouched. Otherwise the text is trimmed, embedded newlines become spaces,
    and typographic apostrophes are replaced by straight ones.
    """
    if not pd.isna(s):
        cleaned = s.strip()
        cleaned = cleaned.replace("\n", " ")
        return cleaned.replace("’", "'")
    return s

In [14]:
# Clean each topic string, then translate it to its slug; topics without a
# TOPIC_MAP entry come out as NaN.
abstract_df["topic"] = abstract_df["topic"].apply(normalize_topic).map(TOPIC_MAP)

In [15]:
# Sanity check: count abstracts whose topic is NaN (True) versus mapped (False)
# after the TOPIC_MAP lookup.
abstract_df["topic"].isna().value_counts()

topic
False    120
Name: count, dtype: int64

In [16]:
def create_piles(user_row, abstract_df):
    """Sort abstract ids into high / moderate / low interest piles for one participant.

    Parameters
    ----------
    user_row : mapping or pandas.Series
        One participant's answers. Must provide "<topic>_interest" for every
        topic, plus "<topic>_topic" for each topic marked "high_interest"
        (a comma-separated list of preferred methods, possibly blank/NaN).
    abstract_df : pandas.DataFrame
        Abstracts table with "topic", "method" and "abstract_id" columns.

    Returns
    -------
    tuple[list, list, list]
        ``(high_interest_ids, moderate_interest_ids, low_interest_ids)``.
        Topics whose interest value is none of the three known labels are
        left out of every pile.

    Notes
    -----
    For a high-interest topic, abstracts are narrowed to the participant's
    preferred methods. A blank *or missing (NaN/None)* method answer means
    "no preference" and keeps every abstract of that topic.
    """
    topics = [
        "cancer",
        "heart_disease",
        "accidents",
        "stroke",
        "chronic_lower_respiratory_diseases",
        "alzheimers",
        "nephritis",
        "diabetes",
        "chronic_liver_disease",
        "obesity",
        "covid_19",
        "depression",
    ]
    high_interest_ids = []
    moderate_interest_ids = []
    low_interest_ids = []

    for topic in topics:
        interest_col = f"{topic}_interest"
        method_col = f"{topic}_topic"

        interest_value = str(user_row[interest_col]).strip().lower()
        topic_abs = abstract_df[abstract_df["topic"] == topic]

        if interest_value == "high_interest":
            raw_methods = user_row[method_col]
            if pd.isna(raw_methods):
                # A missing answer must not be stringified: str(nan) == "nan"
                # would become a bogus method name and filter out every abstract.
                allowed_methods = []
            else:
                allowed_methods = [
                    m.strip().lower()
                    for m in str(raw_methods).split(",")
                    if m.strip() != ""
                ]

            if allowed_methods:
                filtered = topic_abs[
                    topic_abs["method"].str.lower().isin(allowed_methods)
                ]
            else:
                # No stated preference: keep all abstracts for this topic.
                filtered = topic_abs
            high_interest_ids.extend(filtered["abstract_id"].tolist())
        elif interest_value == "moderate_interest":
            moderate_interest_ids.extend(topic_abs["abstract_id"].tolist())
        elif interest_value == "low_interest":
            low_interest_ids.extend(topic_abs["abstract_id"].tolist())
        # Any other value (e.g. "nan") contributes to no pile.

    return high_interest_ids, moderate_interest_ids, low_interest_ids

In [17]:
def split_interest(hi, moderate, lo, min_needed=10):
    """Top up the high and low piles from the moderate pile.

    Works on copies of the three inputs. If the high pile holds fewer than
    ``min_needed`` ids, it takes ids from the front of the moderate pile until
    it reaches that size or the moderate pile runs out. The low pile is then
    topped up the same way from whatever moderate ids remain.

    Returns ``(hi, moderate, lo)`` as new lists.
    """
    high_pile, spare, low_pile = list(hi), list(moderate), list(lo)

    # The high pile is served first, then the low pile.
    for pile in (high_pile, low_pile):
        shortfall = min_needed - len(pile)
        if shortfall > 0:
            pile.extend(spare[:shortfall])
            spare = spare[shortfall:]

    return high_pile, spare, low_pile


In [18]:
# Smoke test on a single participant (the second row of df): build the piles,
# shuffle them, rebalance, and print the pile sizes.
# These assignments create notebook-level variables `hi`, `moderate` and `lo`.
hi, moderate, lo = create_piles(df.iloc[1], abstract_df)
# random.shuffle reorders in place; no seed is set in the cells visible here,
# so the order differs between runs.
random.shuffle(hi)
random.shuffle(moderate)
random.shuffle(lo)
hi, moderate, lo = split_interest(hi, moderate, lo)
pid = df.iloc[1]["Participant id"]
print(pid)
print(len(hi), len(moderate), len(lo))
print(hi[:10])

69839e00693d0f360c3e6821
8 0 110
[1002, 1003, 1006, 1009, 1008, 1001, 1007, 1005]


In [19]:
def assign_batches(hi_ids, mod_ids, lo_ids):
    """Draw nine abstracts into six batches, alternating high and low interest.

    The three id collections are copied and shuffled with the ``random``
    module, so the caller's lists are left untouched. Batches have sizes
    1, 2, 1, 2, 1, 2; within a batch the draws alternate between a "high"
    turn and a "low" turn, beginning with that batch's starting turn.

    On a high turn ids come from the high pile, falling back to moderate and
    then low when a pile is empty; a low turn uses the reverse order.

    Returns a dict keyed "batch1" .. "batch6" mapping to lists of abstract ids.
    Raises ValueError once all three piles are exhausted.
    """
    pools = {
        "high": list(hi_ids),
        "moderate": list(mod_ids),
        "low": list(lo_ids),
    }
    for tier in ("high", "moderate", "low"):
        random.shuffle(pools[tier])

    high_first = ("high", "moderate", "low")
    low_first = ("low", "moderate", "high")

    def draw(turn):
        """Pop one id from the first non-empty pool in this turn's fallback order."""
        order = high_first if turn == "high" else low_first
        for tier in order:
            if pools[tier]:
                return pools[tier].pop()
        raise ValueError("No abstracts left to assign.")

    def fill(size, start):
        """Collect `size` ids, flipping between high and low turns after each draw."""
        picked = []
        turn = start
        while len(picked) < size:
            picked.append(draw(turn))
            turn = "high" if turn != "high" else "low"
        return picked

    # (batch key, number of abstracts, starting turn); filled strictly in this order.
    plan = (
        ("batch1", 1, "high"),
        ("batch2", 2, "low"),
        ("batch3", 1, "high"),
        ("batch4", 2, "low"),
        ("batch5", 1, "low"),
        ("batch6", 2, "high"),
    )
    return {name: fill(size, start) for name, size, start in plan}

In [20]:
# Build batches from the notebook-level `hi`, `moderate` and `lo` piles
# (those of the single participant processed in the smoke-test cell).
batches = assign_batches(hi, moderate, lo)
# Batch key -> label stored in the "type" column of the output rows
# (looked up by build_user_row).
BATCH_TYPE_MAP = {
    "batch1": "static_1",
    "batch2": "static_2",
    "batch3": "interactive_3",
    "batch4": "interactive_4",
    "batch5": "finetuned_5",
    "batch6": "finetuned_6",
}

In [21]:
def build_user_row(user_row, batches, abstract_df):
    """Expand one participant's batches into a DataFrame with one row per abstract.

    Parameters
    ----------
    user_row : mapping or pandas.Series
        Must contain "Participant id".
    batches : dict
        Batch key -> list of abstract ids; every key must exist in BATCH_TYPE_MAP.
    abstract_df : pandas.DataFrame
        Abstracts table; rows are looked up by "abstract_id".

    Returns
    -------
    pandas.DataFrame
        Columns: user_id, type, abstract_title, abstract, human_written,
        abstract_id, terms, question_1..5, then for each question its
        answer choices and correct answers. Optional fields missing from the
        abstracts table default to "".
    """
    participant = user_row["Participant id"]
    indexed = abstract_df.set_index("abstract_id")
    question_numbers = range(1, 6)

    records = []
    for batch_name, abstract_ids in batches.items():
        batch_type = BATCH_TYPE_MAP[batch_name]

        for aid in abstract_ids:
            source = indexed.loc[aid]

            record = {
                "user_id": participant,
                "type": batch_type,
                "abstract_title": source["abstract_title"],
                "abstract": source["abstract"],
                "human_written": source["human_written_pls"],
                "abstract_id": aid,
                # Prefer "terms_chosen"; fall back to "terms", then to "".
                "terms": source.get("terms_chosen", source.get("terms", "")),
            }

            # Question texts first ...
            for n in question_numbers:
                record[f"question_{n}"] = source.get(f"question_{n}", "")

            # ... then each question's answer choices followed by its correct answers.
            for n in question_numbers:
                for suffix in ("answers_choices", "correct_answers"):
                    key = f"question_{n}_{suffix}"
                    record[key] = source.get(key, "")

            records.append(record)

    return pd.DataFrame(records)

In [22]:
# Build the batch assignment rows for every participant in df.
# A failure for one participant is reported and skipped so the rest still run.
all_users_rows = []
for idx, user_row in df.iterrows():
    try:
        hi, mod, lo = create_piles(user_row, abstract_df)
        hi, mod, lo = split_interest(hi, mod, lo)

        # Pass this participant's own moderate pile (`mod`), not a leftover
        # notebook-level variable from an earlier single-user experiment.
        batches = assign_batches(hi, mod, lo)
        df_user = build_user_row(user_row, batches, abstract_df)
        all_users_rows.append(df_user)
    except Exception as e:
        print(f"Error processing user at index {idx}, Participant id'={user_row['Participant id']}: {e}")

In [28]:
# Stack every participant's rows and append them to the shared batches file.
final_df = pd.concat(all_users_rows, ignore_index=True)
batches_path = Path(r"C:\Users\ercj\Desktop\plain_language_summarization_platform\final_user_batches.csv")
final_df.to_csv(
    batches_path,
    mode="a",
    # Write the column header only when the file is being created; appending
    # to an existing file must not repeat it. (Same rule as the approved-ids export.)
    header=not batches_path.exists(),
    index=False
)

In [30]:
# Append the distinct participant ids from this run to the approved-ids file.
file_path = Path(r"C:\Users\ercj\Desktop\plain_language_summarization_platform\approved_ids.csv")
unique_ids = df['Participant id'].unique()
approved = pd.DataFrame({'prolific_id': unique_ids})
# The header row is only written when the file does not exist yet.
write_header = not file_path.exists()
approved.to_csv(file_path, mode="a", header=write_header, index=False)