# In [1]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
from collections import OrderedDict
from dateutil import parser
from typing import Dict
from datetime import datetime
import re
import json

# FastAPI application instance; the route handler below registers on it
# through the @app.post decorator.
app = FastAPI()

class TextRequest(BaseModel):
    """Request body carrying the text to scan for dates."""

    # Free-form input text; passed unchanged to extract_valid_dates.
    text: str

# Date regexes, most specific first, each paired with the display format used
# for its matches. The order matters: a longer match claims its span before a
# shorter pattern (ultimately the bare year) can re-match inside it.
_DATE_RULES = (
    (re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
     "{d.day:02}-{d.month:02}-{d.year}"),
    (re.compile(r'\b\d{1,2}[/-]\d{4}\b'),
     "{d.month:02}-{d.year}"),
    (re.compile(r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{4}\b'),
     "{d.month:02}-{d.year}"),
    (re.compile(r'\b\d{4}\b'),
     "{d.year}"),
)

_PHONE_RE = re.compile(r'(\+?\d[\d\s\-().]{7,})')
_LATLONG_RE = re.compile(r'(?i)\b-?\d{1,2}\.\d{4,10}\s*(?:,|\s+and\s+)?\s*-?\d{1,3}\.\d{4,10}(?=\b|\s)(?:\s*(?:latitude|longitude))?')
_DMS_RE = re.compile(r'\b\d{1,3}°\s*\d{1,2}(?:\'|’)?\s*\d{1,2}(?:\.\d+)?("?|’’)?\s*[NSEWnsew]\b')
# Rs./₹ followed by digits (currency amounts).
_CURRENCY_RE = re.compile(r'(Rs\.?\s?[\d,]+|₹\s?[\d,]+)')
_SENTENCE_BREAK_RE = re.compile(r'(?<=[.?!])\s+')

# Supplies the components a match does not contain (e.g. the day of
# "April 2020"). Without an explicit default dateutil fills them in from
# today's date, which made the sort order depend on when the code ran.
_PARSE_DEFAULT = datetime(2000, 1, 1)


def _near_phone_or_coordinate(text, start, end):
    """Return True if the window around text[start:end] holds a phone number or coordinate."""
    window_start = max(0, start - 50)
    context = text[window_start:start + 100]

    if _LATLONG_RE.search(context) or _DMS_RE.search(context):
        return True

    for phone in _PHONE_RE.finditer(context):
        phone_start = window_start + phone.start()
        phone_end = window_start + phone.end()
        # The phone regex greedily swallows trailing separators; trim back to
        # the last digit before comparing spans.
        while phone_end > phone_start and not text[phone_end - 1].isdigit():
            phone_end -= 1
        # A dash-separated date such as "12-05-2020" satisfies the phone regex
        # by itself. Only a phone-like run that reaches outside the candidate
        # counts as evidence that the digits are not a date.
        if phone_start < start or phone_end > end:
            return True

    return False


def _snippet_from(text, start):
    """Return up to four sentences taken from the 500 characters beginning at start."""
    window = text[start:start + 500].strip().replace("\n", " ")
    return " ".join(_SENTENCE_BREAK_RE.split(window)[:4])


def extract_valid_dates(text: str) -> Dict[str, str]:
    """Extract dates from free text together with a snippet of following text.

    Recognised forms are ``d/m/y`` and ``d-m-y`` (2-4 digit year), ``m/yyyy``
    and ``m-yyyy``, ``<Month name> yyyy`` and a bare four-digit year. Matches
    are parsed with dateutil using ``dayfirst=True``.

    A candidate is discarded when:

    * it overlaps a span already recognised by a more specific pattern, so
      "12/05/2020" does not also yield "05-2020" and "2020";
    * dateutil cannot parse it;
    * a currency amount (``Rs``/``₹`` followed by digits) appears in the text
      from 10 characters before to 15 characters after its start;
    * a phone-like digit run, decimal lat/long pair or DMS coordinate appears
      in the text from 50 characters before to 100 characters after its start.

    The context filters are proximity heuristics and may drop genuine dates
    that merely sit next to such numbers.

    Args:
        text: The text to scan.

    Returns:
        An ``OrderedDict`` in chronological order mapping a display date
        (``DD-MM-YYYY``, ``MM-YYYY`` or ``YYYY`` depending on the matched
        form) to a snippet of up to four sentences starting at the match.
        When several matches share a display date, the snippet of the first
        one in sorted order is kept.
    """
    entries = []
    claimed_spans = []

    for pattern, display_format in _DATE_RULES:
        for match in pattern.finditer(text):
            start, end = match.span()

            # Skip sub-matches of a date that a more specific pattern has
            # already recognised.
            if any(start < c_end and c_start < end
                   for c_start, c_end in claimed_spans):
                continue

            try:
                parsed_date = parser.parse(
                    match.group(),
                    dayfirst=True,
                    fuzzy=False,
                    default=_PARSE_DEFAULT,
                )
            except (ValueError, OverflowError):
                # Matched the regex but is not a real date (e.g. month 13).
                continue

            # Claim the span as soon as it parses, even if a context filter
            # rejects it below, so its fragments cannot slip through instead.
            claimed_spans.append((start, end))

            if _CURRENCY_RE.search(text[max(0, start - 10):start + 15]):
                continue
            if _near_phone_or_coordinate(text, start, end):
                continue

            entries.append((
                parsed_date,
                display_format.format(d=parsed_date),
                _snippet_from(text, start),
            ))

    # list.sort is stable, so equal dates keep their discovery order.
    entries.sort(key=lambda entry: entry[0])

    sorted_dict = OrderedDict()
    for _, display_date, snippet in entries:
        sorted_dict.setdefault(display_date, snippet)

    return sorted_dict

@app.post("/extract-dates")
def extract_dates(request: TextRequest):
    """Handle POST /extract-dates by returning the dates found in the posted text."""
    return extract_valid_dates(request.text)

