In [None]:

"""
Cell 1:
Import symptom data from an Excel workbook and export one filtered CSV per sheet.
"""
#!/usr/bin/env python3
import os
import pandas as pd

# --- Input: symptom-provocation workbook stored on Cortex ---
file_path = (
    '/vol/cortex/cd3/pesaranlab/OCD_Mapping_Foundation/'
    'sEEG_Pt_5_Symptom_provocation_wlabel.xlsx'
)
print("Exists?", os.path.exists(file_path))  # prints True when the workbook is reachable

# --- Columns to carry over: two time columns followed by six ratings ---
time_cols = ['Matlab Time', 'Timestamp']
rating_cols = [
    'Obsessions', 'Compulsions', 'Anxiety',
    'Energy', 'Depression', 'Distress',
]
wanted = [*time_cols, *rating_cols]

# --- Sheets to export (days 3–5 only) ---
sheets = ['Day3_02072025', 'Day4_02082025', 'Day5_02092025']

for sheet in sheets:
    # Header is taken from row 1; the blank row 2 is skipped.
    df = pd.read_excel(file_path, sheet_name=sheet, header=1, skiprows=[2])
    df.columns = df.columns.str.strip()

    # Keep the wanted columns that this sheet actually has, in `wanted` order.
    available = [name for name in wanted if name in df.columns]

    if available:
        df2 = df[available].copy()

        # Turn the literal string "missing" into NaN in the rating columns,
        # then discard rows in which every rating is NaN.
        ratings_present = [name for name in rating_cols if name in df2.columns]
        cleaned_ratings = df2[ratings_present].replace('missing', pd.NA)
        df2[ratings_present] = cleaned_ratings
        df2 = df2.dropna(how='all', subset=ratings_present)

        # One CSV per day, written next to the source workbook.
        out_path = (
            '/vol/cortex/cd3/pesaranlab/OCD_Mapping_Foundation/'
            f'{sheet}_filtered_symptom_data.csv'
        )
        df2.to_csv(out_path, index=False)
        print(f"Wrote {len(df2)} rows to {out_path}")
    else:
        print(f"⚠️  {sheet}: no matching columns found; skipping")


In [None]:
"""
Cell 2: Extra scripts for filling in missing MATLAB times.
Computes the correlation between MATLAB time and wall-clock time, then
predicts the MATLAB time for rows where it is missing.
"""
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np

fn = "Day3_02072025_filtered_symptom_data.csv"

# 1) Read the first two columns as raw data (no header row), skipping the
#    first two lines of the file.
# NOTE(review): a CSV written by Cell 1's `to_csv(..., index=False)` has only
# one header line, so skiprows=2 would also drop the first data row — confirm
# that this file really starts with two non-data lines.
df = pd.read_csv(fn, header=None, skiprows=2, usecols=[0, 1], names=["mat_raw", "ts_raw"])

# 2) Extract the leading digits of the raw label as the numeric MATLAB time
#    (rows without leading digits, e.g. "missing", become NaN).
df["mat_time"] = df["mat_raw"].astype(str).str.extract(r"^(\d+)")[0].astype(float)

# 3) Parse the HH:MM:SS timestamp into seconds since midnight
#    (unparseable timestamps become NaN via errors="coerce").
ts = pd.to_datetime(df["ts_raw"], format="%H:%M:%S", errors="coerce")
df["sec_of_day"] = ts.dt.hour * 3600 + ts.dt.minute * 60 + ts.dt.second

# 4) Drop rows where either value is NaN and compute the correlation
clean = df.dropna(subset=["mat_time", "sec_of_day"])
print("Pearson r:", clean["mat_time"].corr(clean["sec_of_day"]))

plt.figure(figsize=(8, 6))
plt.scatter(clean["sec_of_day"], clean["mat_time"], alpha=0.6)
plt.xlabel("Seconds since midnight")
plt.ylabel("MATLAB time")
plt.title(f"MATLAB Time vs. Wall-clock Time ({fn})")
plt.grid(True)

# Save BEFORE show(): once show() returns, the current figure may already be
# closed/replaced, in which case savefig() would write an empty image.
plt.savefig("mat_vs_time.png", dpi=300, bbox_inches="tight")
print("Saved plot to mat_vs_time.png")
plt.show()


# ---------------------------------------------------------------------------
# Predicting the MATLAB time for missing rows
# ---------------------------------------------------------------------------

# --- 1) identify the .part2 block and fit a line mat_time = m*sec + b ---
# flag all rows whose raw label contains "part2"
mask_part2 = df["mat_raw"].astype(str).str.contains("part2", na=False)

# Only rows with both values present can be used for the fit; a NaN in either
# column would otherwise propagate into the fitted coefficients.
part2_fit = df.loc[mask_part2, ["sec_of_day", "mat_time"]].dropna()
if len(part2_fit) < 2 or part2_fit["sec_of_day"].nunique() < 2:
    raise ValueError(
        f"Need at least two part2 rows with distinct timestamps to fit a line; "
        f"found {len(part2_fit)} usable row(s) in {fn}"
    )

# x = wall-clock seconds, y = MATLAB time
x = part2_fit["sec_of_day"]
y = part2_fit["mat_time"]

# fit a first-degree polynomial → slope m and intercept b
m, b = np.polyfit(x, y, 1)
print(f"part2 slope = {m:.4f}  intercept = {b:.1f}")

# --- 2) pick out the missing rows that belong to part2 (between first & last part2 secs) ---
tmin, tmax = x.min(), x.max()
mask_missing = (
    (df["mat_raw"] == "missing") &
    df["sec_of_day"].between(tmin, tmax)
)

r_part2 = x.corr(y)
print(f"Pearson r for .part2 = {r_part2:.3f}")

missing = df.loc[mask_missing, ["ts_raw", "sec_of_day"]].copy()
# predict MATLAB time from the fitted line
missing["pred_mat_time"] = m * missing["sec_of_day"] + b

# --- 3) print the estimates (one line per missing row) ---
print("\nPredicted MATLAB times for the missing .part2 rows:")
# Access fields by name so the module-level `ts` series is not overwritten.
for row in missing.itertuples(index=False):
    print(f"  {row.ts_raw}  →  {row.pred_mat_time:.0f}")





In [None]:

"""
Cell 3:
Convert the filtered CSV files to pickled Interval objects, writing a separate
file for each 'part' (segment) of each trial, so that there are 3 or 4 files
per patient/day/task for patient 5.
"""


import os
import pickle
import pandas as pd
import numpy as np
from temporaldata import Interval

# === Configuration ===
# Sampling rate (samples per second) for converting sample indices to seconds
FSAMPLE = 1200.0  # adjust to your data's sampling rate

# === Paths ===
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # `__file__` is not defined when this code runs in a notebook cell or an
    # interactive session; fall back to the current working directory.
    BASE_DIR = os.getcwd()
CSV_DIR = BASE_DIR  # CSV files live in the root of the project
LABEL_DIR = os.path.join(BASE_DIR, "processed_data", "labels")
# Create the output directory up front so later writes cannot fail on it.
os.makedirs(LABEL_DIR, exist_ok=True)


def process_csv(csv_path, window_sec=60.0):
    """
    Process a filtered symptom CSV and write Interval pickles per segment,
    then print a brief preview of each Interval object.

    Expects CSV with columns:
      Matlab Time, Timestamp, Obsessions, Compulsions, Anxiety, Energy, Depression, Distress

    The 'Matlab Time' values are parsed as "<sample index>-<segment label>":
    the leading digits give the sample index and the text after the first
    hyphen gives the segment. Rows without a hyphen have no segment and are
    not written to any pickle.

    Parameters:
      csv_path   -- path to a "*_filtered_symptom_data.csv" file; the part of
                    the file name before that suffix prefixes the output names.
      window_sec -- length in seconds of each interval; every interval ends at
                    the row's sample time and starts `window_sec` earlier
                    (default 60.0).

    Raises:
      KeyError -- if any of the required columns is absent from the CSV.

    Output files are written to LABEL_DIR as "<base>_<segment>_intervals.pkl".
    """
    base = os.path.basename(csv_path).replace("_filtered_symptom_data.csv", "")
    print(f"Processing file: {csv_path}")

    # Load CSV (header is first row)
    df = pd.read_csv(csv_path)
    # Normalize column names: trim, then remove inner spaces
    df.columns = df.columns.str.strip().str.replace(' ', '', regex=False)
    total = len(df)
    print(f"  Total rows: {total}")

    # Ensure required columns exist
    required = ["MatlabTime", "Timestamp", "Obsessions", "Compulsions",
                "Anxiety", "Energy", "Depression", "Distress"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns in {csv_path}: {missing}")

    matlab_time = df['MatlabTime'].astype(str)

    # Extract numeric sample index from the leading digits of 'MatlabTime'
    df['sample_idx'] = matlab_time.str.extract(r"^(\d+)")[0].astype(float)

    # Determine segment label by splitting after the first hyphen.
    # `n` must be passed by keyword: recent pandas versions reject it as a
    # positional argument.
    df['segment'] = matlab_time.str.split('-', n=1).str[1].str.strip()

    # groupby() below silently ignores rows whose segment is NaN; report them
    # so the data loss is visible.
    n_unlabelled = int(df['segment'].isna().sum())
    if n_unlabelled:
        print(f"  Skipping {n_unlabelled} rows without a segment label")

    # Compute interval start/end in seconds (window ends at the sample time)
    ends = (df['sample_idx'] / FSAMPLE).to_numpy()
    starts = ((df['sample_idx'] - window_sec * FSAMPLE) / FSAMPLE).to_numpy()

    # Stack rating columns into array (N x 6)
    rating_cols = ['Obsessions', 'Compulsions', 'Anxiety', 'Energy', 'Depression', 'Distress']
    ratings = df[rating_cols].astype(float).to_numpy()

    # Default stim flag = 0 for all intervals
    stim_arr = np.zeros((total,), dtype=int)
    # Combine ratings + stim into label array (N x 7)
    label_arr = np.hstack([ratings, stim_arr.reshape(-1, 1)])

    # Write one Interval pickle per segment and preview
    for segment, grp in df.groupby('segment'):
        # read_csv gives a default 0..N-1 index, so these labels are also the
        # row positions in the NumPy arrays built above.
        rows = grp.index.to_numpy()
        iv = Interval(
            start=starts[rows],
            end=ends[rows],
            label=label_arr[rows],
            timekeys=['start', 'end']
        )
        out_name = f"{base}_{segment}_intervals.pkl"
        out_path = os.path.join(LABEL_DIR, out_name)
        with open(out_path, 'wb') as pf:
            pickle.dump(iv, pf)
        print(f"  → Segment '{segment}': {len(rows)} intervals saved to {out_path}")
        # Preview first three intervals
        print(f"    starts[:3]: {iv.start[:3]}")
        print(f"    ends[:3]:   {iv.end[:3]}")
        print(f"    labels[:3]: {iv.label[:3]}")


# if __name__ == '__main__':
#     # Only process Patient 4 CSV
#     patient4_csv = os.path.join(CSV_DIR, 'Day4_02082025_filtered_symptom_data.csv')
#     if os.path.exists(patient4_csv):
#         process_csv(patient4_csv)
#     else:
#         raise FileNotFoundError(f"Patient 4 CSV not found at {patient4_csv}")



if __name__ == '__main__':
    # Process all *_filtered_symptom_data.csv files in the root directory.
    # os.listdir() returns names in arbitrary order; sort them so the
    # processing order (and the log output) is reproducible between runs.
    for fname in sorted(os.listdir(CSV_DIR)):
        if fname.endswith('_filtered_symptom_data.csv'):
            process_csv(os.path.join(CSV_DIR, fname))