### Import packages

In [1]:
import pandas as pd
import os
import re
import numpy as np
import json
import os

Because of the video's length, data for cmiyc_long was collected in two separate experiments. Merge the data from both experiments first, and shift the counterbalance labels of the second experiment (0/1/2 become 6/7/8).

In [2]:
def merge_pilot_run_folder(
    base_dir: str,
    subfolders,
    filenames=("trialdata.csv", "eventdata.csv", "questiondata.csv"),
    adjust_subfolder="2",
):
    """Concatenate the per-experiment CSVs of one run into single files in ``base_dir``.

    For every name in ``filenames`` the file is read (without header) from each
    of ``subfolders`` under ``base_dir``; missing files are reported and skipped.
    In ``trialdata.csv`` of ``adjust_subfolder`` the last column is rewritten so
    that every ``"counterbalance": 0|1|2`` becomes ``6|7|8``.

    Returns a dict mapping each file name that was written to its output path.
    """
    counterbalance_re = re.compile(r'("counterbalance"\s*:\s*)([012])\b')

    def _shift_label(match):
        # Keep the key/colon prefix untouched and add 6 to the captured digit.
        return match.group(1) + str(int(match.group(2)) + 6)

    written = {}

    for csv_name in filenames:
        frames = []
        for folder in subfolders:
            source_path = os.path.join(base_dir, folder, csv_name)
            if os.path.exists(source_path):
                frame = pd.read_csv(source_path, header=None)
                needs_shift = csv_name == "trialdata.csv" and folder == adjust_subfolder
                if needs_shift:
                    # The datastring is the last column of trialdata.csv.
                    last_col = frame.columns[-1]
                    as_text = frame[last_col].astype(str)
                    frame[last_col] = as_text.str.replace(
                        counterbalance_re, _shift_label, regex=True
                    )
                frames.append(frame)
            else:
                print(f"⚠️ Warning: {source_path} not found")

        if not frames:
            print(f"❌ No data found for {csv_name}")
            continue

        target_path = os.path.join(base_dir, csv_name)
        combined = pd.concat(frames, ignore_index=True)
        combined.to_csv(target_path, index=False, header=False)
        written[csv_name] = target_path
        print(f"✅ Saved merged file to {target_path}")

    return written

In [3]:
# Merge the two experiment subfolders ("1" and "2") of each cmiyc_long run.
# Each call writes the merged trial/event/question CSVs directly into base_dir;
# the return value of the last call (a name -> path dict) is displayed by the cell.
base_dir = "../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2022- Pilot Run"
subfolders = ["1", "2"]
merge_pilot_run_folder(base_dir, subfolders)
base_dir =  "../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run"
subfolders = ["1", "2"]
merge_pilot_run_folder(base_dir, subfolders)

✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2022- Pilot Run/trialdata.csv
✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2022- Pilot Run/eventdata.csv
✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2022- Pilot Run/questiondata.csv
✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/trialdata.csv
✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/eventdata.csv
✅ Saved merged file to ../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/questiondata.csv


{'trialdata.csv': '../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/trialdata.csv',
 'eventdata.csv': '../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/eventdata.csv',
 'questiondata.csv': '../Raw Data/Description Raw Psiturk Data Files/cmiyc_long/2023-2024 Run/questiondata.csv'}

## Create Unique ID for participants

In [4]:
import os
import pandas as pd

base_path = "../Raw Data/Description Raw Psiturk Data Files"

movie_list = ["busstop", "cmiyc_long", "keithreynolds", "theboyfriend", "therock", "theshoe"]
subfolder_list = ["2022- Pilot Run", "2023-2024 Run"]

# Every distinct base ID (the part of "ID" before the first ":") seen in any trialdata.csv.
base_ids_set = set()

for movie in movie_list:
    movie_path = os.path.join(base_path, movie)
    for subfolder in subfolder_list:
        trial_path = os.path.join(movie_path, subfolder, "trialdata.csv")
        print(f"\n📁 Reading data for: {base_path} / {movie} / {subfolder}")
        if not os.path.exists(trial_path):
            print(f"⚠️ Not found: {trial_path}")
            continue

        df = pd.read_csv(trial_path, header=None, comment="#")
        df.columns = ["ID", "Trial", "Timestamp", "Datastring"]
        print("  Participants in trial_data:", df["ID"].nunique())
        df["BaseID"] = df["ID"].astype(str).str.split(":").str[0]
        base_ids_set |= set(df["BaseID"].unique())

# Number the base IDs in sorted order, starting at 1 and zero-padded to at least three digits.
sorted_ids = sorted(base_ids_set)
user_id_map = dict(
    zip(sorted_ids, (f"{number:03d}" for number in range(1, len(sorted_ids) + 1)))
)

user_id_map_df = pd.DataFrame(list(user_id_map.items()), columns=["OriginalBaseID", "NewUserID"])

# save id matching map
# user_id_map_df.to_csv("user_id_map.csv", index=False)

print("\n✅ Finished creating ID map, total unique BaseIDs:", len(user_id_map_df))



📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / busstop / 2022- Pilot Run
  Participants in trial_data: 27

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / busstop / 2023-2024 Run
  Participants in trial_data: 372

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / cmiyc_long / 2022- Pilot Run
  Participants in trial_data: 35

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / cmiyc_long / 2023-2024 Run
  Participants in trial_data: 402

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / keithreynolds / 2022- Pilot Run
  Participants in trial_data: 36

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / keithreynolds / 2023-2024 Run
  Participants in trial_data: 364

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / theboyfriend / 2022- Pilot Run
  Participants in trial_data: 24

📁 Reading data for: ../Raw Data/Description Raw Psiturk Data Files / theboyf

## Read Data

In [5]:
def read_data(base_path, movie, subfolder, ts_min=1663804800000):
    """Load the trial, event and question CSVs of one movie/run folder.

    Trial rows are kept only when their numeric ``Timestamp`` is greater than
    ``ts_min``. Event and question rows are kept only for IDs that still have
    at least one trial row after that cut.

    Returns ``(trial_data, event_data, question_data)``.
    """
    run_dir = os.path.join(base_path, movie, subfolder)

    def _load(csv_name, column_names):
        # All three files are header-less; lines are truncated at "#" by the parser.
        csv_path = os.path.join(run_dir, csv_name)
        return pd.read_csv(csv_path, header=None, comment="#", names=column_names)

    trials = _load("trialdata.csv", ["ID", "Trial", "Timestamp", "Datastring"])
    trials["Timestamp"] = pd.to_numeric(trials["Timestamp"], errors="coerce")
    is_recent = trials["Timestamp"] > ts_min
    trial_data = trials[is_recent].copy()

    recent_ids = set(trial_data["ID"].dropna().unique())

    events = _load("eventdata.csv", ["ID", "Event", "Duration", "Details", "Timestamp"])
    event_data = events[events["ID"].isin(recent_ids)].copy()

    questions = _load("questiondata.csv", ["ID", "Question", "Response"])
    question_data = questions[questions["ID"].isin(recent_ids)].copy()

    return trial_data, event_data, question_data

## Drop Incomplete Subjects

In [6]:
def filter_completed_and_audio_catch(trial_data, event_data, question_data):
    """Restrict all three frames to IDs that have an ``Audio_Catch`` row in question_data.

    An ID is kept when it appears in ``question_data`` and at least one of its
    rows has ``Question == "Audio_Catch"``. Returns filtered copies of
    ``(trial_data, event_data, question_data)``.
    """
    has_audio_catch = question_data["Question"] == "Audio_Catch"
    ids_in_questions = set(question_data["ID"].unique())
    ids_with_audio = set(question_data.loc[has_audio_catch, "ID"].unique())
    completed_ids = ids_in_questions.intersection(ids_with_audio)

    def _only_completed(frame):
        return frame[frame["ID"].isin(completed_ids)].copy()

    return (
        _only_completed(trial_data),
        _only_completed(event_data),
        _only_completed(question_data),
    )

In [7]:
def filter_repeat(trial_data, event_data, question_data):
    """Drop a fixed list of participant IDs that completed the study twice.

    Returns filtered copies of ``(trial_data, event_data, question_data)``.
    """
    repeat_ids = {
        "debugwSh8w:debugwpzZp",
        "debugJzmMB:debugGzFsZ",
        "debugValB4:debugIp2gE",
        "debugdmrft:debugrrSdu",
    }

    def _without_repeats(frame):
        is_repeat = frame["ID"].isin(repeat_ids)
        return frame[~is_repeat].copy()

    return (
        _without_repeats(trial_data),
        _without_repeats(event_data),
        _without_repeats(question_data),
    )

In [8]:
def filter_loops(trial_data, event_data, question_data, instr_threshold=2):
    """Remove participants whose experiment looped (restarted) due to a technical issue.

    For every participant ID, the Datastring JSON records with
    ``phase == "id_submit"`` and ``status`` equal to ``"begin"`` or ``"submit"``
    (case-insensitive) are counted. Participants with more than
    ``instr_threshold`` such records are dropped from all three frames, unless
    their ID is listed in ``EXEMPT_IDS`` below.

    Parameters
    ----------
    trial_data, event_data, question_data : pandas.DataFrame
        Frames with an ``ID`` column; ``trial_data`` also needs ``Datastring``.
    instr_threshold : int, default 2
        Largest number of matching records a participant may have and still be kept.

    Returns
    -------
    tuple of pandas.DataFrame
        Filtered copies of ``(trial_data, event_data, question_data)``.
    """

    def _count_id_submits(datastrings):
        total = 0
        for raw in datastrings.dropna():
            try:
                record = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                continue
            # Valid JSON that is not an object (e.g. a list or number) carries no phase/status.
            if not isinstance(record, dict):
                continue
            phase = str(record.get("phase")).lower()
            status = str(record.get("status")).lower()
            if phase == "id_submit" and status in {"begin", "submit"}:
                total += 1
        return total

    counts = trial_data.groupby("ID")["Datastring"].apply(_count_id_submits)

    # Honour the caller-supplied threshold (default 2 matches the previous hard-coded value).
    drop_ids = set(counts[counts > instr_threshold].index.astype(str))

    # subject looped but not in middle of experiment phase
    EXEMPT_IDS = {
        # busstop
        "debug85E42:debugHMIlp", "debug3xsl4:debugSL1Ao", "debug8tWP5:debugXYr1v",
        # cmiyc_long
        "debug5fU9J:debug3WJPv", "debugqtfx5:debug3XZfc", "debugXRIw3:debugcONSF",
        "debugvtFsp:debugU9FW1", "debug0oLsp:debugscuZ6", "debugmk4Qz:debugbPaAN",
        "debug09prG:debug7vNPf", "debugeJ3re:debugvdfbP", "debugEc6if:debugHdNZR",
        "debugt5oxx:debugO8o8R",
        # keithreynolds
        "debugxLb1V:debug8OsDX", "debugRpwLc:debugO3jhL", "debug4yg2J:debugHq0s6",
        # theboyfriend
        "debug6w2T2:debugDZHNH", "debugurgNU:debugOxZCP",
        # therock
        "debugvAgQy:debugY23jk",
        # theshoe
        "debugeIXxN:debugf7YGZ", "debugJG6DG:debugEgBS1", "debugKeAJA:debugNYy79",
        "debugWevWe:debugmQnOa", "debuglfcYg:debugLVi8o"
    }
    drop_ids -= {str(x) for x in EXEMPT_IDS}

    trial_data = trial_data[~trial_data["ID"].isin(drop_ids)].copy()
    event_data = event_data[~event_data["ID"].isin(drop_ids)].copy()
    question_data = question_data[~question_data["ID"].isin(drop_ids)].copy()

    return trial_data, event_data, question_data

In [9]:
def filter_trials(trial_data, question_data, correct_map):
    """Drop participants who failed a comprehension question or the audio catch.

    A participant is removed when any of their comprehension rows (``Question``
    containing ``"Comp_"``) has a numeric ``Response`` different from
    ``correct_map[Question]``, or when any of their audio rows (``Question``
    containing ``"Audio"``) has a ``Response`` different from their
    counterbalance label after mapping 6/7/8 to 0/1/2.

    Parameters
    ----------
    trial_data : pandas.DataFrame
        Trial rows with an ``ID`` column.
    question_data : pandas.DataFrame
        Needs ``ID``, ``Question``, ``Response`` and ``counterbalance`` columns.
        It is not modified.
    correct_map : dict
        Maps comprehension question names to the correct numeric response.

    Returns
    -------
    pandas.DataFrame
        The rows of ``trial_data`` whose ID passed every check.
    """
    def _phase_of(question):
        if isinstance(question, str):
            if "Comp_" in question:
                return "comprehension"
            if "Audio" in question:
                return "audio_catch"
        return None

    # Classify rows in a standalone Series so the caller's frame gains no extra column.
    phases = question_data["Question"].apply(_phase_of)

    comp = question_data[phases == "comprehension"].copy()
    comp["Response"] = pd.to_numeric(comp["Response"], errors="coerce")
    comp["acc"] = (comp["Response"] == comp["Question"].map(correct_map)).astype(int)

    audio = question_data[phases == "audio_catch"].copy()
    audio["Response"] = pd.to_numeric(audio["Response"], errors="coerce")
    audio["counterbalance"] = pd.to_numeric(audio["counterbalance"], errors="coerce")
    # Labels 6/7/8 are compared against responses 0/1/2.
    audio["counterbalance_norm"] = audio["counterbalance"].replace({6: 0, 7: 1, 8: 2})
    audio["acc"] = (audio["Response"] == audio["counterbalance_norm"]).astype(int)

    drop_ids = set(audio.loc[audio["acc"] == 0, "ID"]) | set(comp.loc[comp["acc"] == 0, "ID"])

    return trial_data[~trial_data["ID"].isin(drop_ids)]

### Add New Unique ID

Anonymized participant IDs were generated using a standardized format that includes task type, year, and movie number (e.g., "P2023M12_001"). 

Base IDs were extracted from the original ID column, mapped to unique numbers, and combined with the corresponding prefix. 

The original ID values were preserved in a new column (`OriginalID`) for traceability. This process was applied consistently to all datasets.


In [10]:
def assign_new_user_ids(trial_data, event_data, question_data, user_id_map, task, year, movie):
    """Replace the ``ID`` column of each frame with an anonymized ID.

    The new ID is ``f"{task}{year}M{movie:02d}_"`` followed by the number that
    ``user_id_map`` assigns to the base ID (the part of ``ID`` before the first
    ``":"``), left-padded with zeros to three characters. Base IDs missing from
    the map yield ``None``. The previous ``ID`` values are kept in a new
    ``OriginalID`` column.

    Returns relabelled copies of ``(trial_data, event_data, question_data)``.
    """

    def _relabel(frame):
        relabelled = frame.copy()
        base_ids = relabelled["ID"].astype(str).apply(lambda raw_id: raw_id.split(":")[0])
        unique_numbers = base_ids.map(user_id_map)

        relabelled["OriginalID"] = relabelled["ID"]

        id_prefix = f"{task}{year}M{int(movie):02d}_"

        def _format_id(number):
            if pd.notnull(number):
                return f"{id_prefix}{str(number).zfill(3)}"
            return None

        relabelled["ID"] = unique_numbers.apply(_format_id)

        # Helper columns of these names never survive in the result.
        return relabelled.drop(columns=["BaseID", "UniqueNum"], errors="ignore")

    return _relabel(trial_data), _relabel(event_data), _relabel(question_data)

## Extract Variables

In [11]:
def add_counterbalance(trial_df, event_df, question_df):
    """Attach each participant's counterbalance label to all three frames.

    The label is the single digit following ``"counterbalance":`` in trial rows
    whose Datastring contains ``prac_segB_desc``; anything other than "0".."8"
    becomes ``"ERROR"``. Labels are keyed by base ID (the part of ``ID`` before
    the first ``":"``) and written to a ``counterbalance`` column on copies of
    the inputs; IDs without a label row get NaN.
    """

    def _base_id(id_series):
        return id_series.astype(str).str.split(":").str[0]

    is_label_row = trial_df["Datastring"].str.contains("prac_segB_desc", na=False)
    label_rows = trial_df[is_label_row]

    digits = label_rows["Datastring"].str.extract(r'"counterbalance"\s*:\s*([0-9])')[0]
    allowed_labels = [str(i) for i in range(9)]
    labels = digits.where(digits.isin(allowed_labels), "ERROR")

    # Later rows overwrite earlier ones for the same base ID.
    cb_map = dict(zip(_base_id(label_rows["ID"]), labels))

    def _with_label(frame):
        return frame.assign(counterbalance=_base_id(frame["ID"]).map(cb_map))

    return _with_label(trial_df), _with_label(event_df), _with_label(question_df)

In [12]:
def add_onset(df):
    """Return a copy of ``df`` with an ``onset`` column for description rows.

    Works per participant (``ID``): viewing rows supply a ``vidseg -> offset``
    lookup, and each description row receives as ``onset`` the offset stored
    for segment ``vidpair - 1``. Rows whose phase is not "description" end up
    with NaN in ``onset``. All values are parsed from the ``Datastring`` text.
    """
    out = df.copy()

    # Classify each row by the "phase" value found in its Datastring.
    phase = out["Datastring"].str.extract(r'"phase"\s*:\s*"([^"]+)"')[0]
    is_view = phase.eq("viewing")
    is_desc = phase.eq("description")

    # Temporary numeric helper columns; they are dropped again before returning.
    out["_vidseg"]  = pd.to_numeric(out["Datastring"].str.extract(r'"vidseg"\s*:\s*"?(\d+)"?')[0], errors="coerce")
    out["_vidpair"] = pd.to_numeric(out["Datastring"].str.extract(r'"vidpair"\s*:\s*"?(\d+)"?')[0], errors="coerce")
    out["_offset"]  = pd.to_numeric(out["Datastring"].str.extract(r'"offset"\s*:\s*"?(\d+)"?')[0], errors="coerce")


    def _fill(group):
        # Lookup built from this participant's viewing rows: segment number -> offset.
        view_map = dict(zip(group.loc[is_view.loc[group.index], "_vidseg"],
                            group.loc[is_view.loc[group.index], "_offset"]))
        # Description rows of this participant (mask aligned on the group's index).
        idx = is_desc.loc[group.index]
        # Onset of a description = offset recorded for the segment numbered vidpair - 1.
        group.loc[idx, "onset"] = (group.loc[idx, "_vidpair"] - 1).map(view_map)
        return group

    # Selecting out.columns passes every column (including "ID") to _fill for each group.
    out = out.groupby("ID", group_keys=False, as_index=False, sort=False)[out.columns].apply(_fill)
    
    # Only description rows may carry an onset.
    out.loc[~is_desc, "onset"] = np.nan
    out.drop(columns=["_vidseg", "_vidpair", "_offset"], inplace=True)

    return out

In [13]:
def filter_and_extract_tag(df, movie, subfolder):
    """Keep description-phase rows and parse offset, tag and paired video from Datastring.

    Adds the columns ``offset`` (numeric, capped at 135 when ``movie`` is
    "theshoe"), ``tag``, ``paired_vid`` (numeric ``vidpair``), ``phase_type``
    ("practice" for tags starting with ``prac_``, "test" for ``test_``, else
    None) and ``data_set`` (set to ``subfolder``). Returns a new frame.
    """
    is_description = df["Datastring"].str.contains(
        r'"phase"\s*:\s*"description"', na=False, regex=True
    )
    out = df[is_description].copy()
    datastring = out["Datastring"]

    offsets = pd.to_numeric(
        datastring.str.extract(r'"offset"\s*:\s*"?(\d+)"?')[0], errors="coerce"
    )
    #correct offset issue in theshoe
    if movie == "theshoe":
        offsets = pd.to_numeric(offsets, errors="coerce").clip(upper=135)
    out["offset"] = offsets

    out["tag"] = datastring.str.extract(r'"tag"\s*:\s*"([^"]+)"')[0]
    out["paired_vid"] = pd.to_numeric(
        datastring.str.extract(r'"vidpair"\s*:\s*"?(\d+)"?')[0], errors="coerce"
    )

    def _phase_type(tag):
        if not isinstance(tag, str):
            return None
        if tag.startswith("prac_"):
            return "practice"
        if tag.startswith("test_"):
            return "test"
        return None

    out["phase_type"] = out["tag"].apply(_phase_type)

    out["data_set"] = subfolder

    return out


def extract_vidpath_description_time(df):
    """Parse vidpath, description stop number, time and importance from Datastring.

    Adds the columns ``vidpath``, ``description_stop``, ``time`` and
    ``importance`` to ``df`` **in place** and returns the same object.
    ``description_stop``, ``time``, ``importance`` and an existing ``offset``
    column (plus a ``confidence`` column, if a frame happens to carry one) are
    converted to numeric; values that cannot be parsed become NaN.
    """
    datastring = df["Datastring"]

    # video path of the described clip
    df["vidpath"] = datastring.str.extract(r'"vidpath"\s*:\s*"([^"]+)"')[0]

    # description stop number (quoted or bare value)
    df["description_stop"] = datastring.str.extract(r'"descstop"\s*:\s*"?(.*?)"?(?:,|})')[0]

    # time value stored under the "time" key
    df["time"] = datastring.str.extract(r'"time"\s*:\s*([0-9]+)')[0]

    # importance rating (quoted or bare value)
    df["importance"] = datastring.str.extract(r'"importance"\s*:\s*"?(.*?)"?(?:,|})')[0]

    # "importance" was previously left as text because only the non-existent
    # "confidence" column was listed here; "confidence" is kept for frames that have it.
    for col in ["description_stop", "time", "importance", "confidence", "offset"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

def extract_content(df):
    """Return a copy of ``df`` with the description text in ``description_content``.

    The text is the value of the ``"content"`` key in ``Datastring`` (escaped
    characters inside the JSON string are tolerated by the pattern). Escaped
    double quotes (``\\"``) in the captured text are turned back into plain
    double quotes; rows without a ``"content"`` key get NaN.
    """
    df = df.copy()

    content = df["Datastring"].str.extract(r'"content"\s*:\s*"((?:\\.|[^"\\])*)"')[0]

    # Literal replacement: the former regex replacement r'\\' produced a lone
    # backslash and dropped the quote character.
    df["description_content"] = content.str.replace('\\"', '"', regex=False)

    return df


In [14]:
def reorder_and_remove_columns(df):
    """Return ``df`` restricted to the known output columns, in the fixed output order.

    Columns that are absent from ``df`` are skipped; columns not in the list
    below are dropped.
    """
    output_order = (
        'ID', 'OriginalID', 'Timestamp', 'Trial', 'counterbalance', 'phase_type',
        'description_stop', "paired_vid", 'onset', 'offset', 'description_content',
        'importance', 'time', 'tag', 'vidpath', 'Datastring', 'data_set',
    )

    selected = []
    for name in output_order:
        if name in df.columns:
            selected.append(name)
    return df[selected]

## RUN DATA

In [15]:
def map_movie(movie):
    """Return the two-digit movie code for a movie folder name, or None if unknown."""
    movie_codes = {
        "busstop": "01",
        "cmiyc_long": "02",
        "keithreynolds": "03",
        "theboyfriend": "04",
        "therock": "05",
        "theshoe": "06",
    }
    return movie_codes.get(movie)
    
def map_year(year):
    """Return "2022" for the "2022- Pilot Run" folder and "run2" for anything else."""
    return "2022" if year == "2022- Pilot Run" else "run2"

In [16]:
base_path = "../Raw Data/Description Raw Psiturk Data Files"
movie_list = ["busstop", "cmiyc_long", "keithreynolds", "theboyfriend", "therock", "theshoe"]
subfolder_list = ["2022- Pilot Run", "2023-2024 Run"]
output_dir = "../Cleaned Data/Description_Cleaned_Data"

# Cleaned frame per movie, keyed by movie name.
datasets_dic = {}

for movie in movie_list:
    print(movie)
    # theshoe uses a different comprehension answer key than the other movies.
    if movie == "theshoe":
        correct_map = {"Comp_Q1":2, "Comp_Q2":4, "Comp_Q3":1, "Comp_Q4":3}
    else:
        correct_map = {"Comp_Q1":1, "Comp_Q2":3, "Comp_Q3":4, "Comp_Q4":1}

    datasets = []

    for subfolder in subfolder_list:
        trial_data, event_data, question_data = read_data(base_path, movie, subfolder)

        # Participant-level exclusions.
        trial_data, event_data, question_data = filter_completed_and_audio_catch(trial_data, event_data, question_data)
        trial_data, event_data, question_data = filter_repeat(trial_data, event_data, question_data)
        trial_data, event_data, question_data = filter_loops(trial_data, event_data, question_data, instr_threshold=2)

        # filter_trials reads the counterbalance column, so it must be added first.
        trial_data, event_data, question_data = add_counterbalance(trial_data, event_data, question_data)

        trial_data = filter_trials(trial_data, question_data, correct_map=correct_map)

        year_code = map_year(subfolder)
        movie_code = map_movie(movie)
        # Unpack into event_data (previously misspelled, which discarded the remapped frame).
        trial_data, event_data, question_data = assign_new_user_ids(
            trial_data, event_data, question_data, user_id_map, "D", year_code, movie_code
        )

        # Variable extraction on the trial rows.
        trial_data = add_onset(trial_data)
        trial_data = filter_and_extract_tag(trial_data, movie, subfolder)
        trial_data = extract_vidpath_description_time(trial_data)
        trial_data = extract_content(trial_data)
        trial_data = reorder_and_remove_columns(trial_data)

        trial_data = trial_data[trial_data["phase_type"]=="test"]
        print(len(trial_data))

        datasets.append(trial_data)

    df_all = pd.concat(datasets, ignore_index=True)
    # Normalize line endings and surrounding whitespace before de-duplicating on Datastring.
    df_all["Datastring"] = (
        df_all["Datastring"]
        .astype("string")
        .str.replace(r"\r\n?","\n", regex=True)
        .str.strip()
    )

    df_all = df_all.drop_duplicates(subset="Datastring", keep="first")
    print(f"total number of rows:{len(df_all)}")
    print(f"total number of participants: {df_all['ID'].nunique()}\n")
    datasets_dic[movie] = df_all

    filename = f"{movie}_description_cleaned.csv"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, filename)
    df_all.to_csv(output_path, index=False)

busstop
29
1264
total number of rows:1293
total number of participants: 277

cmiyc_long
88
1990
total number of rows:2078
total number of participants: 283

keithreynolds
39
1054
total number of rows:1093
total number of participants: 281

theboyfriend
43
1410
total number of rows:1453
total number of participants: 278

therock
31
990
total number of rows:1021
total number of participants: 279

theshoe
8
391
total number of rows:399
total number of participants: 277



In [17]:
# Display the cleaned frame stored for "theshoe" in datasets_dic.
datasets_dic["theshoe"]

Unnamed: 0,ID,OriginalID,Timestamp,Trial,counterbalance,phase_type,description_stop,paired_vid,onset,offset,description_content,importance,time,tag,vidpath,Datastring,data_set
0,D2022M06_2104,debugrLxU4:debugECI7s,1663855514159,12,6,test,1,2,70.0,100,a man is admiring a young woman wearing a read...,6,143454,test_seg2_desc,/static/video/G/filmfest_clip5_G_1.mp4,"{""phase"": ""description"", ""counterbalance"": 6, ...",2022- Pilot Run
1,D2022M06_059,debug1T3oX:debug5dpHY,1663855145506,12,4,test,1,2,50.0,80,An angry man wasn't happy at the man with the ...,5,96131,test_seg2_desc,/static/video/E/filmfest_clip5_E_1.mp4,"{""phase"": ""description"", ""counterbalance"": 4, ...",2022- Pilot Run
2,D2022M06_1936,debugn3LTo:debugv4knh,1663855582979,12,3,test,1,2,40.0,70,The guy is drinking his milk and leaning on a ...,4,137512,test_seg2_desc,/static/video/D/filmfest_clip5_D_1.mp4,"{""phase"": ""description"", ""counterbalance"": 3, ...",2022- Pilot Run
3,D2022M06_1936,debugn3LTo:debugv4knh,1663855697329,15,3,test,2,4,130.0,135,There was a black shoe on the ground that was ...,6,32742,test_seg4_desc,/static/video/D/filmfest_clip5_D_3.mp4,"{""phase"": ""description"", ""counterbalance"": 3, ...",2022- Pilot Run
4,D2022M06_1612,debugeUIa6:debug6BtOj,1663855644355,12,7,test,1,2,80.0,110,"as you had seen previously, the male and femal...",7,150966,test_seg2_desc,/static/video/H/filmfest_clip5_H_1.mp4,"{""phase"": ""description"", ""counterbalance"": 7, ...",2022- Pilot Run
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,Drun2M06_1664,debugfve6M:debug13htu,1713190449358,12,8,test,1,2,90.0,120,The man and woman continued to walk down the p...,7,87707,test_seg2_desc,/static/video/I/filmfest_clip5_I_1.mp4,"{""phase"": ""description"", ""counterbalance"": 8, ...",2023-2024 Run
395,Drun2M06_1696,debuggwr5X:debugld0Tj,1713359907781,12,3,test,1,2,40.0,70,"man drinks milk, sees a girl in a red jumper a...",4,54655,test_seg2_desc,/static/video/D/filmfest_clip5_D_1.mp4,"{""phase"": ""description"", ""counterbalance"": 3, ...",2023-2024 Run
396,Drun2M06_1696,debuggwr5X:debugld0Tj,1713360002141,15,3,test,2,4,130.0,135,A different shoe is located on the ground next...,5,17053,test_seg4_desc,/static/video/D/filmfest_clip5_D_3.mp4,"{""phase"": ""description"", ""counterbalance"": 3, ...",2023-2024 Run
397,Drun2M06_079,debug1y4xQ:debugPxTWt,1713360049561,12,3,test,1,2,40.0,70,the man took a sip of the milk and looked left...,5,60803,test_seg2_desc,/static/video/D/filmfest_clip5_D_1.mp4,"{""phase"": ""description"", ""counterbalance"": 3, ...",2023-2024 Run


In [18]:
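# NOTE: For an unknown reason, in these 90 rows the offset in the reference dataset is always 135, which does not match the raw data. The extracted time also does not match the Datastring.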

SyntaxError: invalid character '，' (U+FF0C) (3810768502.py, line 1)

In [None]:
1159：	debug85E42:debugHMIlp
1185： 	debug3xsl4:debugSL1Ao
1280：	debug8tWP5:debugXYr1v

1044: 	debugxLb1V:debug8OsDX
1120:	debugRpwLc:debugO3jhL
1227: 	debug4yg2J:debugHq0s6
bf
1114: 	debug6w2T2:debugDZHNH
1268:	debugurgNU:debugOxZCP
rock
1247：	debugvAgQy:debugY23jk
shoe:
1065:   debugeIXxN:debugf7YGZ
1154:   debugJG6DG:debugEgBS1
1160:   debugKeAJA:debugNYy79
1179:   debugWevWe:debugmQnOa
1209：  debuglfcYg:debugLVi8o

1218: 	debugqtfx5:debug3XZfc (result is problematic: practice trials were saved as test trials, and the loop did not occur before the test phase started)

These look exactly the same; it is unclear why they are detected as different...
cmiyc
1188：	debugsjpLu:debuggcMnC  
1179：  debugv3TI0:debugQrf6K 
1127：  debugOOwl7:debugLNlRv
theboyfriend
1216:   debugaphEO:debugehtVi
1297:   debuganYIb:debugHAgHr
therock
1150:   debugzqBlK:debugkBtyc


In [None]:
          debug5fU9J:debug3WJPv 1114:	debug5fU9J:debug3WJPv (loop before get to test)
		- debugf6XW7:debugSZd7Z
		- debugqtfx5:debug3XZfc 1218: 	debugqtfx5:debug3XZfc (result is problematic: practice trials were saved as test trials, and the loop did not occur before the test phase started)
		- debugUlmhP:debugnGPap
		- debug8LWOM:debug0zAxC
		- debugXPToG:debugUTp2R
		- debugUPs99:debugZn4ia
		- debugXiHAU:debugI7lJi
		- debugVv20k:debugyZbCt
		- debugXRIw3:debugcONSF 1107:	debugXRIw3:debugcONSF (loop before get to test)
		- debugvtFsp:debugU9FW1 1185：	debugvtFsp:debugU9FW1 (loop before get to test)
		- debug0oLsp:debugscuZ6 1134：	debug0oLsp:debugscuZ6 (loop before get to test)
		- debugftp27:debugFDL8k

    	- debugmk4Qz:debugbPaAN 1070: 	debugmk4Qz:debugbPaAN (loop before get to test)
		- debug09prG:debug7vNPf 1071:	debug09prG:debug7vNPf (loop before get to test)
		- debugzIGMU:debuggRqBn
		- debugeJ3re:debugvdfbP 1212: 	debugeJ3re:debugvdfbP (loop before get to test)
		- debugEc6if:debugHdNZR 1192:	debugEc6if:debugHdNZR (loop before get to test)
		- debugt5oxx:debugO8o8R 1183：	debugt5oxx:debugO8o8R (loop before get to test)