In [3]:
import pandas as pd
import os

# Cleans utils/Geopolitical.csv: parses the "Date" column, converts "WUI" to
# numeric, drops rows where either value is missing or unparseable, sorts by
# date, and writes the result to Geopolitical_clean.csv under base_dir.
#
# Raises ValueError if a required column is missing or if no usable rows
# remain after cleaning.

# === 1️⃣ Paths ===
base_dir = "/h/kupfersk/cfpr_2026/data_limited_2026/"
file_path = os.path.join(base_dir, "utils/Geopolitical.csv")
output_path = os.path.join(base_dir, "Geopolitical_clean.csv")

# === 2️⃣ Load the file ===
df = pd.read_csv(file_path)

# --- Ensure column names are standardized (strip stray whitespace) ---
df.columns = [c.strip() for c in df.columns]

# --- Fail fast if a required column is absent ---
for required_col in ("Date", "WUI"):
    if required_col not in df.columns:
        raise ValueError(
            f"❌ Expected a '{required_col}' column but did not find one."
        )

# === 3️⃣ Clean date column ===
# errors="coerce" turns unparseable dates into NaT so they are dropped below
# instead of aborting the whole run.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# === 4️⃣ Clean WUI column (remove commas, quotes, and make numeric) ===
# Values are cast to str first so the replacements also work when pandas
# already inferred a numeric dtype; missing values become the string "nan",
# which to_numeric converts back to NaN.
wui_text = (
    df["WUI"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace('"', "", regex=False)
)
df["WUI"] = pd.to_numeric(wui_text, errors="coerce")

# === 5️⃣ Drop rows without dates or values ===
df = df.dropna(subset=["Date", "WUI"])

# Stop before saving if nothing survived: otherwise an empty file would
# overwrite the output and the date-range print below would fail on NaT
# with an unhelpful "NaTType does not support date" error.
if df.empty:
    raise ValueError(
        "❌ No rows with a valid 'Date' and 'WUI' remain after cleaning."
    )

# === 6️⃣ Sort and reset ===
# A stable sort keeps rows that share a date in their original file order,
# so the output is reproducible across runs.
df = df.sort_values("Date", kind="mergesort").reset_index(drop=True)

# === 7️⃣ Save cleaned version ===
df.to_csv(output_path, index=False)

print(f"✅ Cleaned Geopolitical data saved to: {output_path}")
print(f"📅 Date range: {df['Date'].min().date()} → {df['Date'].max().date()}")
print(f"🧮 Rows: {len(df)}")
print(df.head())


✅ Cleaned Geopolitical data saved to: /h/kupfersk/cfpr_2026/data_limited_2026/Geopolitical_clean.csv
📅 Date range: 2008-01-01 → 2025-08-01
🧮 Rows: 212
        Date      WUI
0 2008-01-01  17245.1
1 2008-02-01  16059.4
2 2008-03-01  11945.0
3 2008-04-01  14277.7
4 2008-05-01  14969.4
