# Task 1 — Structured Events Dataset (Brent Oil)

This notebook demonstrates how the project loads and validates the **structured events dataset** (see `data/events.csv`) and prepares it for analysis alongside Brent oil prices (`data/BrentOilPrices.csv`).

**Outputs produced in this notebook:**
- Basic validation (schema + minimum event count)
- Parsed events (date + category)
- Simple alignment of event dates to the closest available price observation

In [5]:
from __future__ import annotations

import csv
from dataclasses import dataclass
from datetime import datetime, date
from pathlib import Path

def find_project_root(start: Path) -> Path:
    start = start.resolve()
    for candidate in (start, *start.parents):
        if (candidate / 'data').is_dir() and (candidate / 'backend').is_dir():
            return candidate
        if (candidate / 'data').is_dir() and (candidate / 'frontend').is_dir():
            return candidate
    return start

ROOT = find_project_root(Path.cwd())
EVENTS_CSV = ROOT / 'data' / 'events.csv'
PRICES_CSV = ROOT / 'data' / 'BrentOilPrices.csv'

print('Root:', ROOT)
print('Events CSV exists:', EVENTS_CSV.exists(), '|', EVENTS_CSV)
print('Prices CSV exists:', PRICES_CSV.exists(), '|', PRICES_CSV)

Root: C:\Users\weldi\Desktop\week_11
Events CSV exists: True | C:\Users\weldi\Desktop\week_11\data\events.csv
Prices CSV exists: True | C:\Users\weldi\Desktop\week_11\data\BrentOilPrices.csv


In [6]:
@dataclass(frozen=True)
class Event:
    date: date
    event: str
    category: str


def parse_iso_date(value: str) -> date:
    # Expected format for events.csv is YYYY-MM-DD
    return datetime.strptime(value.strip(), '%Y-%m-%d').date()


def load_events(path: Path) -> list[Event]:
    if not path.exists():
        raise FileNotFoundError(f'Missing events dataset: {path}')

    with path.open('r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        required = {'date', 'event', 'category'}
        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
            raise ValueError(f'events.csv must contain columns {sorted(required)}; got {reader.fieldnames}')

        events: list[Event] = []
        for row in reader:
            d = parse_iso_date(row['date'])
            e = (row.get('event') or '').strip()
            c = (row.get('category') or '').strip()
            if not e or not c:
                continue
            events.append(Event(date=d, event=e, category=c))

    events.sort(key=lambda x: x.date)
    return events


events = load_events(EVENTS_CSV)
len(events), events[0], events[-1]

(15,
 Event(date=datetime.date(1990, 8, 2), event='Iraq invades Kuwait (Gulf War)', category='Geopolitical Conflict'),
 Event(date=datetime.date(2022, 10, 5), event='OPEC+ announces significant production cuts', category='Policy/OPEC'))

In [7]:
# Basic validation required by feedback: ensure we have at least 10–15 structured events
if len(events) < 10:
    raise ValueError(f'Expected at least 10 events; found {len(events)}')

# Show a small sample
for ev in events[:5]:
    print(ev.date.isoformat(), '|', ev.category, '|', ev.event)

1990-08-02 | Geopolitical Conflict | Iraq invades Kuwait (Gulf War)
2001-09-11 | Geopolitical Conflict | September 11 attacks
2003-03-20 | Geopolitical Conflict | U.S.-led invasion of Iraq
2008-09-15 | Economic Shock | Global financial crisis intensifies (Lehman collapse)
2011-02-15 | Geopolitical Conflict | Libyan civil war escalates


In [None]:
def parse_price_date(value: str) -> date:
    value = value.strip()
    # Brent datasets can contain multiple date formats depending on source/cleaning
    for fmt in (
        '%d-%b-%y',        # 20-May-87
        '%d-%b-%Y',        # 20-May-1987
        '%Y-%m-%d',        # 1987-05-20
        '%b %d, %Y',       # Apr 22, 2020
        '%b %d,%Y',        # Apr 22,2020 (no space)
    ):
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            pass
    raise ValueError(f'Unrecognized price date format: {value!r}')


def load_prices(path: Path) -> list[tuple[date, float]]:
    if not path.exists():
        raise FileNotFoundError(f'Missing prices dataset: {path}')

    with path.open('r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        # Expected columns: Date, Price
        required = {'Date', 'Price'}
        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
            raise ValueError(f'BrentOilPrices.csv must contain columns {sorted(required)}; got {reader.fieldnames}')

        out: list[tuple[date, float]] = []
        for row in reader:
            d = parse_price_date(row['Date'])
            p = float(str(row['Price']).replace(',', '').strip())
            out.append((d, p))

    out.sort(key=lambda x: x[0])
    return out


prices = load_prices(PRICES_CSV)
len(prices), prices[0], prices[-1]

ValueError: Unrecognized price date format: 'Apr 22, 2020'

In [None]:
# Align each event to the closest available price observation date
price_dates = [d for d, _ in prices]

def closest_price_date(d: date) -> date:
    # O(n) is fine here for a small demo; can be optimized with bisect if needed
    return min(price_dates, key=lambda x: abs((x - d).days))

aligned = []
for ev in events:
    cd = closest_price_date(ev.date)
    aligned.append((ev.date, cd, ev.category, ev.event))

for row in aligned[:10]:
    print(row[0].isoformat(), '→', row[1].isoformat(), '|', row[2])