In [None]:
!pip install -U datasets ipywidgets together --quiet

In [None]:
pip install fsspec==2024.9.0

In [None]:
import os
import re
from typing import Any, Dict

import datasets
import requests
from google.colab import userdata, output
from ipywidgets import widgets
from scipy import stats
from together import Together

output.enable_custom_widget_manager()

In [None]:
# TRACK_OPTIONS is a "dictionary" that maps certain values to other values. In
# this case, our dictionary below maps the names of the Datathon tracks (e.g.,
# Medical Education) to the ID of the dataset in the Datathon (e.g., meded).
TRACK_OPTIONS = {
    "Medical Education": "meded",
    "Clinical Documentation": "clindoc",
    "Mental Health": "mentalhealth"
}

# TRACK is a variable that stores which track you and your team will be
# participating in. It starts out as the first track listed in TRACK_OPTIONS.
TRACK = list(TRACK_OPTIONS.keys())[0]

# We'll create a user-friendly widget so that you can select the track that
# you're participating in. We create this widget using the Dropdown object
# from the `widgets` package; it offers the track names and initially shows
# the current value of TRACK.
track_selection = widgets.Dropdown(
    options=TRACK_OPTIONS.keys(),
    description="Track:",
    value=TRACK
)

# Callback that keeps the TRACK variable in sync with the dropdown widget:
# whenever the widget reports a new value, that value is stored in TRACK.
# You don't need to study this function in detail.
def on_change(change: Dict[str, Any]) -> None:
    """Store the dropdown's newly selected value in the global TRACK.

    Args:
        change: Event dictionary handed to the callback by the widget. Only
            events whose "type" is "change" and whose "name" is "value" are
            acted on; for those, ``change["new"]`` becomes the new TRACK.
            Every other event is ignored.
    """
    global TRACK
    # Guard clause: anything that is not a value change leaves TRACK alone.
    if change["type"] != "change" or change["name"] != "value":
        return None
    TRACK = change["new"]

# Here, we're telling the Dropdown widget to call on_change whenever it
# changes, so that the TRACK variable is updated from the widget value.
track_selection.observe(on_change)

# Use the widget to select your Datathon track!
display(track_selection)

In [None]:
# Loading may print a warning that `HF_TOKEN` does not exist in your Colab
# secrets. That warning can be ignored for the purposes of this tutorial.
track_dataset_id = TRACK_OPTIONS[TRACK]
ds = datasets.load_dataset("mdplus/Datathon2024", data_dir=track_dataset_id)

# Convert the train and test splits to pandas DataFrames.
df_train, df_test = (ds[split].to_pandas() for split in ("train", "test"))

In [None]:
df_train.head()

In [None]:
df_train.meta_info.value_counts()

In [None]:
def extract_age(text):
    """Pull a patient age, expressed in years, out of free text.

    Looks for "<N>-week-old", then "<N>-month-old", then "<N>-year-old" and
    uses the first of those patterns (in that order) that occurs in the text.

    Returns:
        Weeks or months converted to years and rounded to two decimals
        (float), whole years as an int, or None when no pattern matches.
    """
    # (pattern, converter-to-years) pairs, tried in this order.
    age_patterns = (
        (r'\b(\d+)-week-old\b', lambda count: round(count / 52, 2)),
        (r'\b(\d+)-month-old\b', lambda count: round(count / 12, 2)),
        (r'\b(\d+)-year-old\b', lambda count: count),
    )
    for pattern, to_years in age_patterns:
        found = re.search(pattern, text)
        if found:
            return to_years(int(found.group(1)))
    return None

def extract_gender(text):
    """Infer a gender label from gendered words in free text.

    Terms are matched case-insensitively as whole words, so words that merely
    contain a term -- "human", "management", "mandible", "boyfriend" -- are
    not counted. Female terms are checked first; text that mentions both a
    female and a male term is therefore labelled 'Female'.

    Args:
        text: Free text to inspect. Non-string values (e.g. None or NaN from a
            missing cell) are treated as carrying no gender information.

    Returns:
        'Female', 'Male', or None when no gendered term is found.
    """
    if not isinstance(text, str):
        return None
    # The optional trailing "s" keeps simple plurals ("girls", "males") matching.
    if re.search(r'\b(?:woman|female|girl)s?\b', text, flags=re.IGNORECASE):
        return 'Female'
    if re.search(r'\b(?:man|male|boy)s?\b', text, flags=re.IGNORECASE):
        return 'Male'
    return None

In [None]:
import re
df_train['Age'] = df_train['question'].apply(extract_age)
df_train['Gender'] = df_train['question'].apply(extract_gender)

In [None]:
os.environ["TOGETHER_API_KEY"] = userdata.get("TOGETHER_API_KEY")

In [None]:
llm_client = Together()

In [None]:
modelID = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
def get_answer(row):
    """Ask the LLM to answer one multiple-choice question.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'question'
            string and an 'options' mapping of answer index -> answer text.

    Returns:
        The model's reply with surrounding whitespace removed, so that a reply
        padded with spaces or a trailing newline still compares equal to the
        bare answer index. A non-string reply is returned unchanged. If the
        request fails, the string "Error: <exception>" is returned instead of
        raising.
    """
    question = row['question']
    option = row['options']
    # One "<index>: <text>" line per answer option.
    option_lines = "\n".join(f"{key}: {value}" for key, value in option.items())
    content = (
        question + "\n\n" +
        option_lines +
        "\n\nselect the right answer and only give the answer index, i.e., A, B, C, D, E"
    )
    try:
        completion = llm_client.chat.completions.create(
            model=modelID,
            messages=[{
                "role": "user",
                "content": content
            }],
        )
        reply = completion.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: record the failure as the answer text.
        return f"Error: {e}"
    # Strip stray whitespace so exact-match scoring is not defeated by padding.
    return reply.strip() if isinstance(reply, str) else reply
df_train['LLM_Answer'] = df_train.apply(get_answer, axis = 1)

In [None]:
# Upload a results file and load it into a DataFrame.
# NOTE(review): only 'data4.csv' is read here, but later cells also use df_1,
# df_2 and df_3, which are not created anywhere in the visible part of this
# notebook -- presumably they were loaded the same way from other uploads.
# Confirm, and add those loads so the notebook runs from a fresh kernel.
from google.colab import files

# `uploaded` is looked up by file name ('data4.csv') below.
uploaded = files.upload()

import io
import pandas as pd

# The uploaded entry is wrapped in BytesIO so pandas can read it like a file.
df_4 = pd.read_csv(io.BytesIO(uploaded['data4.csv']))
df_4.head()

In [None]:
def categorize_age(num):
    """Map an age in years to a coarse age-group label.

    Args:
        num: Age in years. May be None or NaN when no age is available.

    Returns:
        "Toddler" (< 3), "Child" (3 to < 18), "Adult" (18 to < 60),
        "Senior" (>= 60), or None when the age is missing.
    """
    # A missing age must not be bucketed: every comparison with NaN is False,
    # so without this guard NaN would fall through to "Senior" (and None would
    # raise a TypeError). `num != num` is True only for NaN.
    if num is None or num != num:
        return None
    if num < 3:
        return "Toddler"
    if num < 18:
        return "Child"
    if num < 60:
        return "Adult"
    return "Senior"

In [None]:
df_4['Age_Category'] = df_4['Age'].apply(categorize_age)

In [None]:
df_3['Age_Category'] = df_3['Age'].apply(categorize_age)

In [None]:
df_2['Age_Category'] = df_2['Age'].apply(categorize_age)

In [None]:
df_1['Age_Category'] = df_1['Age'].apply(categorize_age)

In [None]:
# Tag every per-prompt result frame with its prompt label, then stack the
# frames (in prompt order) into one table with a fresh row index.
labelled_frames = {
    'Prompt 1': df_1,
    'Prompt 2': df_2,
    'Prompt 3': df_3,
    'Prompt 4': df_4,
}
for prompt_label, prompt_frame in labelled_frames.items():
    prompt_frame['Prompt ID'] = prompt_label
df_full = pd.concat(list(labelled_frames.values()), ignore_index=True)

In [None]:
# Score each row: 1 when the model's answer equals the reference answer index
# exactly, 0 otherwise. This is an exact equality check, so any extra text or
# whitespace in LLM_Answer counts as incorrect.
df_full['correct'] = (df_full['LLM_Answer'] == df_full['answer_idx']).astype(int)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Mean of `correct` for every (prompt, gender) pair, in long form ...
grouped_accuracy = (
    df_full
    .groupby(['Prompt ID', 'Gender'])['correct']
    .mean()
    .reset_index()
)
# ... reshaped to one row per prompt and one column per gender.
pivot_table = pd.pivot_table(
    grouped_accuracy,
    values='correct',
    index='Prompt ID',
    columns='Gender',
)

# Grouped bar chart: one cluster per prompt, one bar per gender.
fig, ax = plt.subplots(figsize=(10, 7))
pivot_table.plot.bar(ax=ax)
ax.set(
    title='Average Accuracy by Prompt and Gender',
    xlabel='Prompt ID',
    ylabel='Accuracy',
)
plt.xticks(rotation=0)
plt.legend(title='Gender', bbox_to_anchor=(1, 1))
plt.show()


In [None]:
# Make Age_Category an ordered categorical with the groups listed from
# youngest to oldest.
age_order = ['Toddler', 'Child', 'Adult', 'Senior']
df_full['Age_Category'] = pd.Categorical(df_full['Age_Category'], categories=age_order, ordered=True)

# Mean accuracy per (prompt, age group). `observed=False` is passed explicitly:
# grouping or pivoting on a categorical without it emits a FutureWarning in
# recent pandas because the default is slated to change. Passing it keeps the
# current behaviour and silences the warning.
grouped_accuracy = (
    df_full.groupby(['Prompt ID', 'Age_Category'], observed=False)['correct']
    .mean()
    .reset_index()
)
pivot_table = pd.pivot_table(
    grouped_accuracy,
    values='correct',
    index='Prompt ID',
    columns='Age_Category',
    observed=False,
)

# Grouped bar chart: one cluster per prompt, one bar per age group.
fig, ax = plt.subplots(figsize=(10, 7))
pivot_table.plot(kind='bar', ax=ax)
ax.set_title('Average Accuracy by Prompt and Age')
ax.set_xlabel('Prompt ID')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=0)
plt.legend(title='Age', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Make Age_Category an ordered categorical with the groups listed from
# youngest to oldest.
age_order = ['Toddler', 'Child', 'Adult', 'Senior']
df_full['Age_Category'] = pd.Categorical(df_full['Age_Category'], categories=age_order, ordered=True)

# Per-(prompt, age group) accuracy: the mean gives the bar heights and the
# standard error of the mean gives the error bars. `observed=False` is passed
# explicitly because grouping/pivoting on a categorical without it emits a
# FutureWarning in recent pandas (the default is slated to change).
grouped = df_full.groupby(['Prompt ID', 'Age_Category'], observed=False)['correct']
mean_accuracy = grouped.mean().reset_index(name='mean_correct')
sem_accuracy = grouped.sem().reset_index(name='sem_correct')

mean_pivot = pd.pivot_table(
    mean_accuracy, values='mean_correct', index='Prompt ID', columns='Age_Category',
    observed=False,
)
sem_pivot = pd.pivot_table(
    sem_accuracy, values='sem_correct', index='Prompt ID', columns='Age_Category',
    observed=False,
)

# Grouped bar chart with error bars taken from the matching SEM table.
fig, ax = plt.subplots(figsize=(10, 7))
mean_pivot.plot(kind='bar', yerr=sem_pivot, ax=ax, capsize=4)
ax.set_title('Average Accuracy by Prompt and Age')
ax.set_xlabel('Prompt ID')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=0)
plt.legend(title='Age', bbox_to_anchor=(1, 1))
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Per-(prompt, gender) accuracy: the mean gives the bar heights and the
# standard error of the mean gives the error bars.
grouped = df_full.groupby(['Prompt ID', 'Gender'])['correct']
mean_accuracy = grouped.mean().reset_index(name='mean_correct')
sem_accuracy = grouped.sem().reset_index(name='sem_correct')

# Both tables share the same layout: one row per prompt, one column per gender.
pivot_layout = dict(index='Prompt ID', columns='Gender')
mean_pivot = pd.pivot_table(mean_accuracy, values='mean_correct', **pivot_layout)
sem_pivot = pd.pivot_table(sem_accuracy, values='sem_correct', **pivot_layout)

# Grouped bar chart with error bars taken from the matching SEM table.
fig, ax = plt.subplots(figsize=(10, 7))
mean_pivot.plot.bar(yerr=sem_pivot, ax=ax, capsize=4)
ax.set(
    title='Average Accuracy by Prompt and Gender',
    xlabel='Prompt ID',
    ylabel='Accuracy',
)
plt.xticks(rotation=0)
plt.legend(title='Gender', bbox_to_anchor=(1, 1))
plt.show()


In [None]:
import seaborn as sns
sns.histplot(df_full.Age)

In [None]:
# Statistical Tests for Gender Comparison within Each Prompt
#
# For every prompt, compare the `correct` scores of rows labelled 'Male' with
# those labelled 'Female' using an independent two-sample t-test, then repeat
# the comparison once on all prompts pooled together. `stats` is expected to
# be `scipy.stats`.
# NOTE(review): `correct` is a 0/1 column, so these t-tests are effectively
# comparing two proportions; a chi-square or two-proportion test may be a
# better fit -- confirm the intended analysis.
print("\n--- Statistical Tests for Gender Comparison within Each Prompt ---")
# Maps prompt label -> p-value of that prompt's Male-vs-Female t-test.
gender_results_within_prompt = {}


for prompt in df_full['Prompt ID'].unique():
    print(f"\n--- Analyzing Gender Comparison for {prompt} ---")

    prompt_data = df_full[df_full['Prompt ID'] == prompt]

    # Skip prompts where fewer than two distinct gender labels are present.
    if prompt_data['Gender'].nunique() < 2:
        print(f"Not enough data for gender comparison in {prompt}. Skipping...")
        continue

    male_data = prompt_data[prompt_data['Gender'] == 'Male']['correct']
    female_data = prompt_data[prompt_data['Gender'] == 'Female']['correct']

    ttest_result = stats.ttest_ind(male_data, female_data)
    p_value = ttest_result.pvalue
    test_type = "T-test"
    gender_results_within_prompt[prompt] = p_value

    # p < 0.05 is reported as a significant difference.
    if p_value < 0.05:
        print(f"{test_type} for {prompt}: Significant difference (p-value = {p_value:.4f})")
    else:
        print(f"{test_type} for {prompt}: No significant difference (p-value = {p_value:.4f})")


# Same comparison, but pooling the rows of all prompts together.
print("\n--- Statistical Test for Gender Comparison Across All Prompts ---")
male_data_all_prompts = df_full[df_full['Gender'] == 'Male']['correct']
female_data_all_prompts = df_full[df_full['Gender'] == 'Female']['correct']

ttest_result_all = stats.ttest_ind(male_data_all_prompts, female_data_all_prompts)
p_value_all = ttest_result_all.pvalue
test_type_all = "T-test"

print(f"{test_type_all} test across all prompts: p-value = {p_value_all:.4f}")

if p_value_all < 0.05:
    print(f"Significant difference in accuracy between Male and Female across all prompts (p-value = {p_value_all:.4f})")
else:
    print(f"No significant difference in accuracy between Male and Female across all prompts (p-value = {p_value_all:.4f})")

# Recap of the per-prompt p-values collected in the loop above.
print("\nSummary of p-values for Gender Comparison Within Each Prompt:")
for prompt, p_val in gender_results_within_prompt.items():
    print(f"{prompt}: p-value = {p_val:.4f}")

In [None]:
# Statistical Test for Male and Female Comparison Across Prompts
#
# Separately for rows labelled 'Male' and rows labelled 'Female', test whether
# the `correct` scores differ between prompts. A Shapiro-Wilk normality test
# is run on each prompt's scores: one-way ANOVA is used only if every group
# has p > 0.05, otherwise Kruskal-Wallis is used. `stats` is expected to be
# `scipy.stats`.
# NOTE(review): `correct` is a 0/1 column, so the normality check will
# presumably fail in practice and the Kruskal-Wallis branch is the one that
# runs -- confirm this is the intended analysis.
# NOTE(review): `grouped_accuracy` is recomputed here but not used in this cell.
grouped_accuracy = df_full.groupby(['Prompt ID', 'Gender'])['correct'].mean().reset_index()

print("\n--- Statistical Test for Male Comparison Across Prompts ---")
male_data = df_full[df_full['Gender'] == 'Male']
# One Series of `correct` scores per prompt present among the male rows.
male_data_by_prompt = [male_data[male_data['Prompt ID'] == prompt]['correct'] for prompt in male_data['Prompt ID'].unique()]

# Index [1] of the Shapiro result is its p-value.
if all(stats.shapiro(data)[1] > 0.05 for data in male_data_by_prompt):
    anova_result_male = stats.f_oneway(*male_data_by_prompt)
    p_value_male = anova_result_male.pvalue
    test_type_male = "ANOVA"
else:
    kruskal_result_male = stats.kruskal(*male_data_by_prompt)
    p_value_male = kruskal_result_male.pvalue
    test_type_male = "Kruskal-Wallis"

print(f"{test_type_male} test for Male across prompts: p-value = {p_value_male:.4f}")

# p < 0.05 is reported as a significant difference.
if p_value_male < 0.05:
    print(f"Significant difference in accuracy for Male across the prompts (p-value = {p_value_male:.4f})")
else:
    print(f"No significant difference in accuracy for Male across the prompts (p-value = {p_value_male:.4f})")

# Same procedure for the rows labelled 'Female'.
print("\n--- Statistical Test for Female Comparison Across Prompts ---")
female_data = df_full[df_full['Gender'] == 'Female']

# One Series of `correct` scores per prompt present among the female rows.
female_data_by_prompt = [female_data[female_data['Prompt ID'] == prompt]['correct'] for prompt in female_data['Prompt ID'].unique()]

if all(stats.shapiro(data)[1] > 0.05 for data in female_data_by_prompt):
    # If data is normally distributed, use ANOVA
    anova_result_female = stats.f_oneway(*female_data_by_prompt)
    p_value_female = anova_result_female.pvalue
    test_type_female = "ANOVA"
else:
    # Use Kruskal-Wallis if normality is not assumed
    kruskal_result_female = stats.kruskal(*female_data_by_prompt)
    p_value_female = kruskal_result_female.pvalue
    test_type_female = "Kruskal-Wallis"

print(f"{test_type_female} test for Female across prompts: p-value = {p_value_female:.4f}")

if p_value_female < 0.05:
    print(f"Significant difference in accuracy for Female across the prompts (p-value = {p_value_female:.4f})")
else:
    print(f"No significant difference in accuracy for Female across the prompts (p-value = {p_value_female:.4f})")


In [None]:
# ANOVA test within each prompt
#
# For every prompt, fit `correct ~ C(Age_Category)` and test whether accuracy
# differs between age groups. When the ANOVA p-value is below 0.05, a
# Kruskal-Wallis test over the same age groups is run as a follow-up.
# `stats` is expected to be `scipy.stats`.

# `ols` and `sm` are the statsmodels formula API and top-level API; they are
# imported here because this cell is where they are used.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Maps prompt label -> ANOVA p-value for the age-group term.
results = {}
for prompt in df_full['Prompt ID'].unique():
    prompt_data = df_full[df_full['Prompt ID'] == prompt]

    model = ols('correct ~ C(Age_Category)', data=prompt_data).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"\nANOVA Results for {prompt}:")
    print(anova_table)

    # First row of the table is the age-group term. `.iloc[0]` selects it by
    # position explicitly; plain `[0]` on a label-indexed Series is deprecated.
    p_value = anova_table['PR(>F)'].iloc[0]
    results[prompt] = p_value

    if p_value < 0.05:
        print(f"\nSignificant difference found for {prompt}. Running post-hoc test...\n")
        # Only age groups actually present are compared; dropping missing
        # values avoids passing an empty sample for a NaN category.
        age_categories_present = prompt_data['Age_Category'].dropna().unique()
        posthoc = stats.kruskal(*[prompt_data[prompt_data['Age_Category'] == category]['correct']
                                  for category in age_categories_present])
        print("Kruskal-Wallis Test Result:", posthoc)

print("\nSummary of p-values from ANOVA for each prompt:")
print(results)

In [None]:
# Compare accuracy between prompts, separately within each age group.
#
# For every age group, the `correct` scores are split by prompt and tested
# with one-way ANOVA or Kruskal-Wallis, depending on a Shapiro-Wilk normality
# check. `stats` is expected to be `scipy.stats`.
age_groups = ['Toddler', 'Child', 'Adult', 'Senior']
# Maps age group -> p-value of the across-prompt test for that group.
results = {}

for age_group in age_groups:
    print(f"\n--- Analyzing {age_group} group ---")
    age_group_data = df_full[df_full['Age_Category'] == age_group]

    # Skip age groups with fewer than two rows in total.
    if len(age_group_data) < 2:
        print(f"Not enough data for {age_group}. Skipping...")
        continue


    # One Series of `correct` scores per prompt present in this age group.
    prompt_accuracy_data = [age_group_data[age_group_data['Prompt ID'] == prompt]['correct']
                            for prompt in age_group_data['Prompt ID'].unique()]

    # Kruskal-Wallis Test (if data is not normally distributed) or ANOVA (if data is normally distributed)
    # The normality test is run on the pooled scores of the whole age group,
    # not on each prompt's scores separately.
    # NOTE(review): `correct` is a 0/1 column, so this check will presumably
    # fail in practice and the Kruskal-Wallis branch is the one that runs --
    # confirm this is the intended analysis.
    if stats.shapiro(age_group_data['correct'])[1] > 0.05:  # Normality Test
        # One-Way ANOVA if data is normally distributed
        anova_result = stats.f_oneway(*prompt_accuracy_data)
        p_value = anova_result.pvalue
        test_type = "ANOVA"
    else:
        # Kruskal-Wallis test if data is not normally distributed
        kruskal_result = stats.kruskal(*prompt_accuracy_data)
        p_value = kruskal_result.pvalue
        test_type = "Kruskal-Wallis"

    print(f"{test_type} Test for {age_group}: p-value = {p_value:.4f}")

    results[age_group] = p_value

    # Interpretation: p < 0.05 is reported as a significant difference.
    if p_value < 0.05:
        print(f"Significant difference found in accuracy for {age_group} across prompts (p < 0.05).")
    else:
        print(f"No significant difference found in accuracy for {age_group} across prompts (p >= 0.05).")

# Recap of the p-values for the age groups that were not skipped.
print("\nSummary of Statistical Test p-values for each age group:")
for age_group, p_val in results.items():
    print(f"{age_group}: p-value = {p_val:.4f}")