In [2]:
import pandas as pd
# Display settings: let printed frames use their full width and every column.
pd.options.display.width = None
pd.options.display.max_columns = None

FILE        = "data/2025_ListofStudiesforPlanning.xlsx"
SHEET       = "selection"
HEADER_ROW  = 3   # zero-based, so the column names sit on the 4th row of the sheet

# --- Load ------------------------------------------------------------------
df = pd.read_excel(FILE, sheet_name=SHEET, header=HEADER_ROW)

# --- Parse dates -----------------------------------------------------------
# Convert the four milestone columns in one pass; values that cannot be
# parsed become NaT (errors="coerce") instead of raising.
date_cols = [
    "FPFV original date",
    "LPLV original date",
    "FPFV actual date",
    "LPLV actual date",
]
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors="coerce")

# --- Durations (whole days) ------------------------------------------------
# manual_days: span between the *original* FPFV and LPLV dates.
# actual_days: span between the *actual* FPFV and LPLV dates.
# delta_days : actual minus original (negative => shorter than the original span).
df["manual_days"] = df["LPLV original date"].sub(df["FPFV original date"]).dt.days
df["actual_days"] = df["LPLV actual date"].sub(df["FPFV actual date"]).dt.days
df["delta_days"]  = df["actual_days"].sub(df["manual_days"])

# Unsigned error expressed in years, using a 365-day year.
df["abs_error_years"] = df["delta_days"].abs().div(365)

# --- Report ----------------------------------------------------------------
# Restrict to the reporting columns, then drop studies missing either duration.
cols = [
    "Study no",
    "NCT CT.gov",
    "manual_days",
    "actual_days",
    "delta_days",
    "abs_error_years",
]
complete = df[cols].dropna(subset=["manual_days", "actual_days"])

# Only the leading 15 complete studies are shown and scored.
first15 = complete.iloc[:15]
print(first15.to_string(index=False))

# Mean absolute error over those rows, in years and converted back to days
# with the same 365-day year used above.
mae_years = first15["abs_error_years"].mean()
mae_days  = mae_years * 365
summary = (
    "\nMean absolute error (first 15 studies): "
    f"{mae_years:.2f} years  ≈ {mae_days:.0f} days"
)
print(summary)


Study no  NCT CT.gov  manual_days  actual_days  delta_days  abs_error_years
  PS0041 NCT06011733        565.0          463      -102.0         0.279452
  UP0119 NCT05292131        314.0          298       -16.0         0.043836
  AS0010 NCT03928704       1069.0         1453       384.0         1.052055
  AS0011 NCT03928743       1001.0         1201       200.0         0.547945
  HS0003 NCT04242446       1099.0         1096        -3.0         0.008219
  HS0004 NCT04242498       1096.0          940      -156.0         0.427397
  PS0015 NCT03536884        811.0         1883      1072.0         2.936986
  PS0032 NCT05020249        441.0          344       -97.0         0.265753
  EP0231 NCT06312566        186.0           71      -115.0         0.315068
  RA0043 NCT01550003       1447.0         4414      2967.0         8.128767
  RA0138 NCT04740814        504.0          501        -3.0         0.008219
  UP0085 NCT04163016        738.0         1068       330.0         0.904110
  SL0043 NCT