In [122]:
import pandas as pd
import os
import re
import numpy as np
import json

In [123]:
def read_extract_demo_data(movie, base_path, subfolder):
    """Load one questiondata.csv file and return its demographics in wide form.

    The file read is ``<base_path>/<movie>/<subfolder>/questiondata.csv``. It is
    parsed as a header-less CSV whose three columns are named ``ID``,
    ``Question`` and ``Response``. Only rows whose ``Question`` starts with
    ``"demographics_"`` are kept; that prefix is removed and the rows are
    pivoted so each ``ID`` becomes one row and each question one column.

    Parameters
    ----------
    movie : str
        Name of the movie folder under ``base_path``.
    base_path : str
        Directory that contains one folder per movie.
    subfolder : str
        Folder inside the movie folder that holds ``questiondata.csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``age``, ``education``, ``gender``, ``race`` and
        ``hispanic``, in that order. A demographic question that never occurs
        in the file is returned as an all-NaN column.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when the same ID has more than one row
        for the same demographic question.
    """
    wanted_columns = ["ID", "age", "education", "gender", "race", "hispanic"]

    data_path = os.path.join(base_path, movie, subfolder, "questiondata.csv")
    # NOTE(review): comment="#" makes pandas drop everything after a '#' on a
    # line, so a response containing '#' would be cut short - confirm this
    # cannot happen in the raw files.
    question_data = pd.read_csv(data_path, header=None, comment="#")
    question_data.columns = ["ID", "Question", "Response"]

    # na=False treats rows with a missing Question as non-demographic.
    is_demo = question_data["Question"].str.startswith("demographics_", na=False)
    demo = question_data[is_demo].copy()
    demo["Question"] = demo["Question"].str.replace("demographics_", "", regex=False)

    demo_wide = demo.pivot(index="ID", columns="Question", values="Response").reset_index()

    # reindex (rather than demo_wide[[...]]) so a question that is absent from
    # this file becomes a NaN column instead of raising KeyError.
    demo_wide = demo_wide.reindex(columns=wanted_columns)

    return demo_wide

# Description

In [125]:
# Inputs for the Description task: one raw-data folder per movie, each holding
# the run sub-folder named below.
movie_list = ["busstop", "cmiyc_long", "keithreynolds", "theboyfriend", "therock", "theshoe"]
base_path = "../../Raw Data/Description Raw Psiturk Data Files"
subfolder = "2023-2024 Run"

# movie name -> ID-map rows for that movie with demographic columns attached
demo_data = {}

# ID map for the Description task (its Movie and OriginalID columns are used below)
id_map = pd.read_csv("../../Cleaned Data/Description_Cleaned_Data/description_user_id_map.csv")

for movie in movie_list:
    # wide demographics table for this movie's questiondata.csv
    demo = read_extract_demo_data(movie, base_path, subfolder)

    # rows of the ID map that belong to this movie
    id_map_movie = id_map.loc[id_map["Movie"] == movie]

    # Left join: every ID-map row is kept, with NaN demographics when the raw
    # file has no matching ID.
    subj_demo = pd.merge(
        id_map_movie,
        demo,
        how="left",
        left_on="OriginalID",
        right_on="ID",
    )

    demo_data[movie] = subj_demo

In [126]:
# Spot-check the merged table for one movie. It is looked up by key instead of
# through `subj_demo`, the loop variable left over from the cell above, so the
# cell does not depend on which iteration happened to run last.
demo_data["theshoe"]

Unnamed: 0,ID_x,OriginalID,Movie,ID_y,age,education,gender,race,hispanic
0,Drun2M06_637,debugHL4FS:debugtOXga,theshoe,debugHL4FS:debugtOXga,40-44,4,M,White,N
1,Drun2M06_507,debugDiJ9O:debugO6PbG,theshoe,debugDiJ9O:debugO6PbG,35-39,4,M,NONE,N
2,Drun2M06_401,debugAtEIJ:debugGClXC,theshoe,debugAtEIJ:debugGClXC,30-34,1,M,Black,N
3,Drun2M06_1878,debugoua4f:debugF88fo,theshoe,debugoua4f:debugF88fo,20-24,4,M,White,N
4,Drun2M06_214,debug5pC2H:debugPWjwm,theshoe,debug5pC2H:debugPWjwm,60-64,1,F,White,N
...,...,...,...,...,...,...,...,...,...
266,Drun2M06_979,debugQwsJQ:debugpSN4U,theshoe,debugQwsJQ:debugpSN4U,35-39,4,F,Black,N
267,Drun2M06_175,debug4s1QS:debugz31Tx,theshoe,debug4s1QS:debugz31Tx,25-29,5,M,White,N
268,Drun2M06_1562,debugfve6M:debug13htu,theshoe,debugfve6M:debug13htu,55-59,1,M,White,N
269,Drun2M06_1590,debuggwr5X:debugld0Tj,theshoe,debuggwr5X:debugld0Tj,30-34,2,M,White,N


In [127]:
def parse_age(age_str):
    """Turn an age-bracket label into an approximate numeric age.

    Parameters
    ----------
    age_str : object
        Raw value from the ``age`` column, e.g. ``"20-24"`` or ``"70+"``.

    Returns
    -------
    float
        The midpoint of a ``"low-high"`` bracket, ``low + 2.5`` for an
        open-ended ``"low+"`` label, or ``np.nan`` when the value is missing
        or matches neither form.
    """
    if pd.isna(age_str):
        return np.nan
    label = str(age_str).strip()
    # closed bracket such as "20-24"
    bracket = re.match(r"(\d+)\s*-\s*(\d+)", label)
    if bracket:
        low, high = map(int, bracket.groups())
        return (low + high) / 2
    # open-ended label such as "70+": no upper bound is given, so 2.5 is added
    # to the lower bound as an approximation
    open_ended = re.match(r"(\d+)\s*\+", label)
    if open_ended:
        return int(open_ended.group(1)) + 2.5
    return np.nan


def summarize_demographics(movie, df):
    """Build the summary record (counts, percentages, age stats) for one movie.

    Parameters
    ----------
    movie : str
        Label stored under the ``"Movie"`` key.
    df : pandas.DataFrame
        Subject table with ``gender`` and ``age`` columns. It is not modified.

    Returns
    -------
    dict
        One row for the summary table. Percentages use the total number of
        rows in ``df`` as the denominator, so rows with a missing or other
        gender value count towards the total. Age statistics are ``None``
        when no age could be parsed.
    """
    # Normalise case/whitespace so values like " m" are counted as "M".
    gender = df["gender"].astype(str).str.strip().str.upper()

    n_total = len(df)
    n_male = (gender == "M").sum()
    n_female = (gender == "F").sum()

    age_mid = df["age"].apply(parse_age)
    mean_age = age_mid.mean(skipna=True)
    sd_age = age_mid.std(skipna=True)

    return {
        "Movie": movie,
        "Total Subjects": n_total,
        "Male Count": n_male,
        "Male %": round(n_male / n_total * 100, 1) if n_total > 0 else 0,
        "Female Count": n_female,
        "Female %": round(n_female / n_total * 100, 1) if n_total > 0 else 0,
        "Mean Age (approx)": round(mean_age, 1) if not np.isnan(mean_age) else None,
        "SD Age (approx)": round(sd_age, 1) if not np.isnan(sd_age) else None
    }


# One summary row per movie for the Description data.
summary_list = [summarize_demographics(movie, df) for movie, df in demo_data.items()]

summary_df = pd.DataFrame(summary_list)
print(summary_df)

           Movie  Total Subjects  Male Count  Male %  Female Count  Female %  \
0        busstop             271         146    53.9           124      45.8   
1     cmiyc_long             271         149    55.0           120      44.3   
2  keithreynolds             271         153    56.5           115      42.4   
3   theboyfriend             270         145    53.7           123      45.6   
4        therock             270         155    57.4           112      41.5   
5        theshoe             271         161    59.4           109      40.2   

   Mean Age (approx)  SD Age (approx)  
0               40.1             13.0  
1               38.8             12.2  
2               38.8             12.2  
3               39.0             12.6  
4               39.8             12.4  
5               42.4             13.4  


# Prediction

In [129]:
# Inputs for the Prediction task: one raw-data folder per movie, each holding
# one sub-folder per entry in subfolder_list.
movie_list = ["busstop", "cmiyc_long", "keithreynolds", "theboyfriend", "therock", "theshoe"]
base_path = "../../Raw Data/Prediction Raw Psiturk Data Files"
subfolder_list = ["Old_2019", "New_2021"]

# movie name -> ID-map rows for that movie with demographic columns attached
demo_data_p = {}

# ID map for the Prediction task (its Movie and OriginalID columns are used below)
id_map = pd.read_csv("../../Cleaned Data/Prediction_Cleaned_Data/prediction_user_id_map.csv")

for movie in movie_list:
    # Collect the wide demographics table from every sub-folder, then stack them.
    demo_set = []
    for subfolder in subfolder_list:
        demo_current = read_extract_demo_data(movie, base_path, subfolder)
        demo_set.append(demo_current)
    demo = pd.concat(demo_set, ignore_index=True)

    # rows of the ID map that belong to this movie
    id_map_movie = id_map.loc[id_map["Movie"] == movie]

    # Left join: every ID-map row is kept, with NaN demographics when no raw
    # file has a matching ID.
    # NOTE(review): if the same ID occurs in more than one sub-folder, this
    # join repeats that subject's row - confirm IDs are unique across batches.
    subj_demo = pd.merge(
        id_map_movie,
        demo,
        how="left",
        left_on="OriginalID",
        right_on="ID",
    )

    demo_data_p[movie] = subj_demo

In [130]:
# The helpers are defined in this cell so that it can be run on its own.

def parse_age(age_str):
    """Turn an age-bracket label into an approximate numeric age.

    Parameters
    ----------
    age_str : object
        Raw value from the ``age`` column, e.g. ``"20-24"`` or ``"70+"``.

    Returns
    -------
    float
        The midpoint of a ``"low-high"`` bracket, ``low + 2.5`` for an
        open-ended ``"low+"`` label, or ``np.nan`` when the value is missing
        or matches neither form.
    """
    if pd.isna(age_str):
        return np.nan
    label = str(age_str).strip()
    # closed bracket such as "20-24"
    bracket = re.match(r"(\d+)\s*-\s*(\d+)", label)
    if bracket:
        low, high = map(int, bracket.groups())
        return (low + high) / 2
    # open-ended label such as "70+": no upper bound is given, so 2.5 is added
    # to the lower bound as an approximation
    open_ended = re.match(r"(\d+)\s*\+", label)
    if open_ended:
        return int(open_ended.group(1)) + 2.5
    return np.nan


def summarize_demographics(movie, df):
    """Build the summary record (counts, percentages, age stats) for one movie.

    Parameters
    ----------
    movie : str
        Label stored under the ``"Movie"`` key.
    df : pandas.DataFrame
        Subject table with ``gender`` and ``age`` columns. It is not modified.

    Returns
    -------
    dict
        One row for the summary table. Percentages use the total number of
        rows in ``df`` as the denominator, so rows with a missing or other
        gender value count towards the total. Age statistics are ``None``
        when no age could be parsed.
    """
    # Normalise case/whitespace so values like " m" are counted as "M".
    gender = df["gender"].astype(str).str.strip().str.upper()

    n_total = len(df)
    n_male = (gender == "M").sum()
    n_female = (gender == "F").sum()

    age_mid = df["age"].apply(parse_age)
    mean_age = age_mid.mean(skipna=True)
    sd_age = age_mid.std(skipna=True)

    return {
        "Movie": movie,
        "Total Subjects": n_total,
        "Male Count": n_male,
        "Male %": round(n_male / n_total * 100, 1) if n_total > 0 else 0,
        "Female Count": n_female,
        "Female %": round(n_female / n_total * 100, 1) if n_total > 0 else 0,
        "Mean Age (approx)": round(mean_age, 1) if not np.isnan(mean_age) else None,
        "SD Age (approx)": round(sd_age, 1) if not np.isnan(sd_age) else None
    }


# One summary row per movie for the Prediction data.
summary_list = [summarize_demographics(movie, df) for movie, df in demo_data_p.items()]

summary_df = pd.DataFrame(summary_list)
print(summary_df)

           Movie  Total Subjects  Male Count  Male %  Female Count  Female %  \
0        busstop             198         108    54.5            89      44.9   
1     cmiyc_long             181          96    53.0            85      47.0   
2  keithreynolds             184          96    52.2            87      47.3   
3   theboyfriend             169          88    52.1            80      47.3   
4        therock             182          91    50.0            88      48.4   
5        theshoe             181          95    52.5            82      45.3   

   Mean Age (approx)  SD Age (approx)  
0               35.1              9.8  
1               36.1             11.0  
2               36.2             11.6  
3               34.9             11.4  
4               36.1             10.9  
5               36.0             11.0  
