In [1]:
import pandas as pd
import numpy as np
import re


In [2]:
df_raw = pd.read_csv("cog.csv")

print(df_raw.shape)
print(df_raw.columns)
df_raw.head()


(32, 19)
Index(['Timestamp',
       'This form collects anonymous study behavior data for an academic project on study patterns and mental workload. No personal identifiers are collected. You may skip any question. ',
       'Q1. Your academic level ', 'Q2. Todayâ€™s date ',
       'Q3. Did you study today? ',
       'Q4. How many separate study sessions did you have today? ',
       'Q5a. Subject studied ', 'Q5b. Approximate start time (HH:MM)',
       '  Q5c. Approximate end time (HH:MM)  ',
       '  Q5d. Was this session planned in advance?  ',
       '  Q5e. How long did you plan to study? (minutes)  ',
       '  Q6. Did you take breaks during this session?  ',
       '  Q7. Did you switch tasks or subjects during the session?  ',
       '  Q8. Do you have an academic deadline within:  ',
       '  Q9. How mentally demanding did todayâ€™s study feel?  ',
       'Q10. How focused did you feel overall? ',
       'Q11. Right now, how mentally tired do you feel? ',
       '  Q12. Did 

Unnamed: 0,Timestamp,This form collects anonymous study behavior data for an academic project on study patterns and mental workload. No personal identifiers are collected. You may skip any question.,Q1. Your academic level,Q2. Todayâ€™s date,Q3. Did you study today?,Q4. How many separate study sessions did you have today?,Q5a. Subject studied,Q5b. Approximate start time (HH:MM),Q5c. Approximate end time (HH:MM),Q5d. Was this session planned in advance?,Q5e. How long did you plan to study? (minutes),Q6. Did you take breaks during this session?,Q7. Did you switch tasks or subjects during the session?,Q8. Do you have an academic deadline within:,Q9. How mentally demanding did todayâ€™s study feel?,Q10. How focused did you feel overall?,"Q11. Right now, how mentally tired do you feel?",Q12. Did todayâ€™s study happen mostly:,Q13. Any short comment about todayâ€™s study?
0,12/19/2025 20:04:55,I consent to anonymously share my responses fo...,PG,12/19/2025,Yes,2.0,Coding,12:00,14:30,No,240,No breaks,No,2-3 days,5,3,4,Afternoon,Wasted!!!!!
1,12/19/2025 20:05:01,I consent to anonymously share my responses fo...,PG,12/19/2025,Yes,1.0,Coding,8:00 AM,9:10 AM,No,approx 1 hour,Yes but very few,No,No upcoming deadline,1,4,3,Morning,nice
2,12/19/2025 20:22:31,I consent to anonymously share my responses fo...,UG,12/19/2025,No (Scroll down click next),,Coding,,,,,,,,2,2,4,Afternoon,
3,12/19/2025 20:26:06,I consent to anonymously share my responses fo...,PG,12/19/2025,Yes,2.0,Other,8 30 am,10:30 qm,No,There was not decided,No breaks,No,This week,4,3,2,Morning,
4,12/19/2025 20:27:04,I consent to anonymously share my responses fo...,PG,12/19/2025,Yes,2.0,Coding,2,3.3,Yes,240,Yes regularly,1-2 times,No upcoming deadline,3,3,3,Late night,Ghai Mt kar ðŸ˜…


In [3]:
df = df_raw.rename(columns={
    df_raw.columns[0]: "timestamp",
    df_raw.columns[1]: "consent",
    df_raw.columns[2]: "academic_level",
    df_raw.columns[3]: "date",
    df_raw.columns[4]: "studied_today_raw",
    df_raw.columns[5]: "num_sessions",
    df_raw.columns[6]: "subject",
    df_raw.columns[7]: "start_time_raw",
    df_raw.columns[8]: "end_time_raw",
    df_raw.columns[9]: "planned_duration_raw",
    df_raw.columns[10]: "breaks",
    df_raw.columns[11]: "task_switching",
    df_raw.columns[12]: "deadline",
    df_raw.columns[13]: "effort",
    df_raw.columns[14]: "focus",
    df_raw.columns[15]: "fatigue",
    df_raw.columns[16]: "time_of_day",
    df_raw.columns[17]: "comment",
    df_raw.columns[18]: "extra_col"   # garbage column, weâ€™ll drop it
})


In [6]:
df = df.drop(columns=["consent", "extra_col"], errors="ignore")


In [7]:
df["studied_today"] = np.where(
    df["subject"].notna() | df["start_time_raw"].notna(),
    1,
    0
)


In [8]:
df["date"] = pd.to_datetime(df["date"], errors="coerce")


In [9]:
def parse_time(t):
    if pd.isna(t):
        return np.nan

    t = str(t).lower().strip()
    t = t.replace(" ", "").replace(".", ":")

    # common garbage
    if t in ["", "nan", "qm", "mins"]:
        return np.nan

    # match hour, optional minute, optional am/pm
    m = re.match(r"^(\d{1,2})(:?(\d{1,2}))?(am|pm)?$", t)
    if not m:
        return np.nan

    hour = int(m.group(1))
    minute = int(m.group(3)) if m.group(3) else 0
    meridian = m.group(4)

    if meridian == "pm" and hour != 12:
        hour += 12
    if meridian == "am" and hour == 12:
        hour = 0

    if hour > 23 or minute > 59:
        return np.nan

    return f"{hour:02d}:{minute:02d}"


In [10]:
df["start_time"] = df["start_time_raw"].apply(parse_time)
df["end_time"] = df["end_time_raw"].apply(parse_time)


In [12]:
df["start_dt"] = pd.to_datetime(
    df["date"].astype(str) + " " + df["start_time"],
    errors="coerce"
)

df["end_dt"] = pd.to_datetime(
    df["date"].astype(str) + " " + df["end_time"],
    errors="coerce"
)

df["session_duration_min"] = (
    (df["end_dt"] - df["start_dt"]).dt.total_seconds() / 60
)


In [13]:
df.loc[
    (df["session_duration_min"] < 0) |
    (df["session_duration_min"] > 600),
    "session_duration_min"
] = np.nan


In [14]:
break_map = {
    "No breaks": 0,
    "Yes but very few": 1,
    "Yes regularly": 2
}
df["break_score"] = df["breaks"].map(break_map)


In [15]:
switch_map = {
    "No": 0,
    "1-2 times": 1,
    "Frequently": 2
}
df["switch_score"] = df["task_switching"].map(switch_map)


In [16]:
deadline_map = {
    "Next 24 hours": 3,
    "2-3 days": 2,
    "This week": 1.5,
    "More than a week away": 1,
    "No upcoming deadline": 0
}
df["deadline_weight"] = df["deadline"].map(deadline_map)


In [17]:
for col in ["effort", "focus", "fatigue"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [18]:
df["late_night"] = np.where(
    df["start_dt"].dt.hour >= 22,
    1,
    0
)


In [19]:
df[[
    "subject",
    "session_duration_min",
    "effort",
    "focus",
    "fatigue",
    "break_score",
    "switch_score",
    "deadline_weight",
    "late_night"
]].head(10)


Unnamed: 0,subject,session_duration_min,effort,focus,fatigue,break_score,switch_score,deadline_weight,late_night
0,Coding,150.0,,5,3,,,,0
1,Coding,70.0,,1,4,,,,0
2,Coding,,,2,2,,,,0
3,Other,,,4,3,,,,0
4,Coding,63.0,,3,3,,,,0
5,Coding,260.0,,4,4,,,,0
6,Coding,60.0,,1,3,,,,0
7,Reading,150.0,,1,3,,,,0
8,Other,60.0,,1,5,,,,0
9,Math,60.0,,4,4,,,,0


In [20]:
df.to_csv("cognitive_load_cleaned.csv", index=False)


In [31]:
cols_to_keep = [
    "academic_level",
    "date",
    "num_sessions",
    "subject",
    "session_duration_min",
    "break_score",
    "switch_score",
    "deadline_weight",
    "late_night",
    "effort",
    "focus",
    "fatigue",
    "studied_today"
]

df_model = df[cols_to_keep].copy()


In [32]:
df_model.shape
df_model.head()
df_model.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   academic_level        32 non-null     object        
 1   date                  32 non-null     datetime64[ns]
 2   num_sessions          28 non-null     object        
 3   subject               30 non-null     object        
 4   session_duration_min  22 non-null     float64       
 5   break_score           0 non-null      float64       
 6   switch_score          0 non-null      float64       
 7   deadline_weight       0 non-null      float64       
 8   late_night            32 non-null     int64         
 9   effort                0 non-null      float64       
 10  focus                 32 non-null     int64         
 11  fatigue               32 non-null     int64         
 12  studied_today         32 non-null     int64         
dtypes: datetime64[ns](1), 

In [33]:
numeric_cols = [
    "num_sessions",
    "session_duration_min",
    "break_score",
    "switch_score",
    "deadline_weight",
    "late_night",
    "effort",
    "focus",
    "fatigue",
    "studied_today"
]

df_model[numeric_cols] = df_model[numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)


In [34]:
df_model.to_csv("cognitive_load_model_ready.csv", index=False)


In [38]:
# Rule-based imputation (behavioral absence)
df_model["break_score"] = df_model["break_score"].fillna(0)
df_model["switch_score"] = df_model["switch_score"].fillna(0)
df_model["deadline_weight"] = df_model["deadline_weight"].fillna(0)

# Optional: mark missing duration explicitly
df_model["duration_missing"] = df_model["session_duration_min"].isna().astype(int)

# Final check
df_model.isna().sum()


academic_level           0
date                     0
num_sessions             5
subject                  2
session_duration_min    10
break_score              0
switch_score             0
deadline_weight          0
late_night               0
effort                  32
focus                    0
fatigue                  0
studied_today            0
duration_missing         0
dtype: int64