# Transform FMSA Article Submissions

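Load the raw CSV, parse the date column, sort the rows by date, and add a one-row lagged copy of the article-submission counts. Rows with missing values (including the first row, which has no previous value to lag from) are dropped, and the result is written to `FMSA_Articles_with_lag.csv` next to the original file.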

In [None]:
from pathlib import Path

import pandas as pd

# Look for the data file relative to the working directory first, then its
# parent, so the notebook works from either location.
possible_roots = [
    Path.cwd(),
    Path.cwd().parent,
]

source_csv = None
for root in possible_roots:
    candidate = root.joinpath("data", "FMSA_Articles.csv")
    if not candidate.exists():
        continue
    # First match wins; remember which root it was found under.
    repo_root, source_csv = root, candidate
    break
else:
    # Loop finished without a break: neither root holds the file.
    raise FileNotFoundError(
        "Could not locate data/FMSA_Articles.csv from the current or parent directory."
    )

# The output lives beside the input, with "_with_lag" appended to the stem.
output_csv = source_csv.with_name(f"{source_csv.stem}_with_lag.csv")
source_csv, output_csv

In [None]:
# utf-8-sig drops a leading byte-order mark if the file has one.
raw_df = pd.read_csv(source_csv, encoding="utf-8-sig")

# Rename an "index" column to "date" (no-op if absent), then parse the
# MM/DD/YY strings into datetimes.
df = raw_df.rename(columns={"index": "date"}).assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%m/%d/%y")
)

# Coerce the counts to numbers; anything unparseable becomes NaN and is
# treated as a hard error rather than silently carried forward.
article_series = pd.to_numeric(df["Article submissions"], errors="coerce")
if not article_series.notna().all():
    raise ValueError("Unable to parse 'Article submissions' into numeric values.")
df["Article submissions"] = article_series.astype("Int64")

# Order rows by date with a fresh 0..n-1 index so the lag refers to the
# preceding row in date order.
df = df.sort_values("date", ignore_index=True)
df["Article submissions lagged"] = (
    df["Article submissions"].shift(periods=1).astype("Int64")
)

# Serialize dates as ISO strings, keep only the output columns, and drop rows
# with any missing value (the first row has no previous value to lag from).
df_out = df.assign(date=df["date"].dt.strftime("%Y-%m-%d"))
df_out = df_out[
    ["date", "Article submissions", "Article submissions lagged"]
].dropna(how="any")

df_out.to_csv(output_csv, index=False)

print(f"Wrote transformed data to {output_csv}")
df_out.head()