In [2]:
import re

# Mapping month names to numbers
MONTHS = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12'
}

def normalize_year(y):
    y = y.strip()
    if len(y) == 2:
        # Convert 2-digit year to 4-digit year with heuristic
        y2 = int(y)
        if y2 <= 49:
            return '20{:02d}'.format(y2)
        else:
            return '19{:02d}'.format(y2)
    elif len(y) == 4:
        return y
    else:
        # fallback, unrecognized year format
        return y

def normalize_day(d):
    # Remove ordinal suffixes: st, nd, rd, th
    d = d.lower()
    d = re.sub(r'(st|nd|rd|th)', '', d)
    d = d.zfill(2)
    return d

def parse_date(text):
    text = text.lower()

    # Patterns to extract dates:
    patterns = [
        # 1) Month Day, Year or Day Month Year (with optional ordinal suffix)
        r'(?P<month>\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b)[\s,.-]*(?P<day>\d{1,2}(?:st|nd|rd|th)?)\D{0,3}(?P<year>\d{2,4})',
        r'(?P<day>\d{1,2}(?:st|nd|rd|th)?)\s*(?:of\s*)?(?P<month>\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b)[\s,.-]*(?P<year>\d{2,4})?',
        # 2) ISO style YYYY-MM-DD or YYYY/MM/DD or YYYY.MM.DD
        r'(?P<year>\d{4})[./-](?P<month>\d{1,2})[./-](?P<day>\d{1,2})',
        # 3) Numeric: DD/MM/YYYY or DD-MM-YYYY or DD.MM.YYYY (assuming DD/MM/YYYY, default if ambiguous)
        r'(?P<day>\d{1,2})[./-](?P<month>\d{1,2})[./-](?P<year>\d{2,4})',
        # 4) Numeric with 2-digit year DD/MM/YY
        r'(?P<day>\d{1,2})[./-](?P<month>\d{1,2})[./-](?P<year>\d{2})',
        # Month and year only is ignored due to missing day
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            gd = match.groupdict()
            day = gd.get('day') or ''
            month = gd.get('month') or ''
            year = gd.get('year') or ''

            # Handle month as name or number
            if month.isdigit():
                mm = month.zfill(2)
            else:
                mm = MONTHS.get(month.strip(), None)
                if not mm:
                    continue  # skip if month not recognized

            # Normalize day and year
            if day:
                dd = normalize_day(day)
            else:
                continue  # No day found, skip

            if year:
                yy = normalize_year(year)
            else:
                continue  # Year missing, skip

            # Convert to integers for validation
            try:
                di = int(dd)
                mi = int(mm)
                yi = int(yy)
            except:
                continue  # invalid integers, skip

            # Validate ranges
            if not (1 <= di <= 31 and 1 <= mi <= 12 and 1000 <= yi <= 2999):
                continue

            # Handle ambiguous MM/DD/YYYY like 07/08/1990 or 12/25/2019 in US format
            # Detect if the pattern is ambiguous numeric withslashes/dots/hyphens:
            # We'll try a heuristic: if day>12 and month<=12 -> interpret as DD/MM/YYYY
            # if month>12 and day<=12 -> invalid month, skip
            # if both <=12, default to DD/MM/YYYY per user request

            # Confirm if input uses MM/DD/YYYY by some clues (no strict way here)
            # But user examples prefer DD/MM/YYYY always so return DD/MM/YYYY format

            return f"{dd}/{mm}/{yy}"

    return None


if __name__ == "__main__":
    tests = [
        "The event will take place on March 5, 2023.",
        "Her birthday is on 07/08/1990.",
        "The deadline is 2022-12-31.",
        "We met on 1st of January 2000.",
        "The deadline is 2020-02-28.",
        "Submit your report by 08/31/2021.",
        "They got married on 12/12/12.",
        "Vacation starts on 07/15/2021.",
        "The workshop is on February 29, 2024.",
        "The seminar is on 03/14/2022.",
    ]

    for test in tests:
        print(f"Input: {test}")
        print(f"Parsed: {parse_date(test)}\n")


Input: The event will take place on March 5, 2023.
Parsed: 05/03/2023

Input: Her birthday is on 07/08/1990.
Parsed: 07/08/1990

Input: The deadline is 2022-12-31.
Parsed: 31/12/2022

Input: We met on 1st of January 2000.
Parsed: 20/01/2000

Input: The deadline is 2020-02-28.
Parsed: 28/02/2020

Input: Submit your report by 08/31/2021.
Parsed: None

Input: They got married on 12/12/12.
Parsed: 12/12/2012

Input: Vacation starts on 07/15/2021.
Parsed: None

Input: The workshop is on February 29, 2024.
Parsed: 29/02/2024

Input: The seminar is on 03/14/2022.
Parsed: None



In [9]:
import re
import csv

def parse_date(text):
    """
    Extracts date information from a given text string and formats it as DD/MM/YYYY.

    This function uses regular expressions to identify various common date formats
    and converts month names to numerical representations. It handles 2-digit years
    by applying a heuristic (00-49 -> 20xx, 50-99 -> 19xx).

    Args:
        text (str): The input text containing a date.

    Returns:
        str: The extracted date in DD/MM/YYYY format, or "N/A" if no date is found.
    """
    # Convert text to lowercase for case-insensitive matching
    text_lower = text.lower()

    # Dictionary to map month names (full and abbreviated) to their two-digit numbers
    months = {
        'january': '01', 'jan': '01',
        'february': '02', 'feb': '02',
        'march': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'may': '05',
        'june': '06', 'jun': '06',
        'july': '07', 'jul': '07',
        'august': '08', 'aug': '08',
        'september': '09', 'sep': '09', 'sept': '09',
        'october': '10', 'oct': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Compile a regex pattern to match all month names for efficiency
    # re.escape is used to handle month names that might contain special regex characters
    month_names_pattern = '|'.join(re.escape(m) for m in months.keys())

    # --- Date Parsing Patterns (ordered by specificity/commonality) ---

    # Pattern 1: MM/DD/YYYY, MM-DD-YYYY, MM.DD.YYYY (prioritize if day > 12)
    # This pattern specifically targets US-style dates where the month comes first.
    # We apply a heuristic: if the second numerical part (which would be the day in MM/DD)
    # is greater than 12, we assume it's MM/DD/YYYY.
    match = re.search(r'\b(\d{1,2})[./-](\d{1,2})[./-](\d{4})\b', text_lower)
    if match:
        potential_month, potential_day, year = match.groups()
        if int(potential_day) > 12: # If the second number is > 12, it's likely a day in MM/DD/YYYY
            return f"{int(potential_day):02d}/{int(potential_month):02d}/{year}"

    # Pattern 2: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY
    # Examples: "07/08/1990", "31-12-2022", "02.04.2022", and ambiguous cases like "01/02/2022"
    # This pattern now acts as the primary for DD/MM/YYYY and the fallback for ambiguous X/Y/Z dates.
    match = re.search(r'\b(\d{1,2})[./-](\d{1,2})[./-](\d{4})\b', text_lower)
    if match:
        day, month, year = match.groups()
        # Format day and month with leading zeros if necessary
        return f"{int(day):02d}/{int(month):02d}/{year}"

    # Pattern 3: YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
    # Examples: "2022-12-31", "1987/11/23", "2020.12.15"
    # Matches four digits for year, two digits for month, two digits for day,
    # separated by '/', '-', or '.'
    match = re.search(r'\b(\d{4})[./-](\d{1,2})[./-](\d{1,2})\b', text_lower)
    if match:
        year, month, day = match.groups()
        # Format day and month with leading zeros if necessary
        return f"{int(day):02d}/{int(month):02d}/{year}"

    # Pattern 4: D/M/YY (e.g., "5/6/19")
    # Matches one or two digits for day, one or two digits for month, two digits for year,
    # separated by '/' or '-'
    match = re.search(r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{2})\b', text_lower)
    if match:
        day, month, year_short = match.groups()
        year_int = int(year_short)
        # Heuristic for 2-digit years:
        # If year is 00-49, assume 20xx (e.g., 19 -> 2019)
        # If year is 50-99, assume 19xx (e.g., 87 -> 1987)
        full_year = 2000 + year_int if year_int <= 49 else 1900 + year_int
        return f"{int(day):02d}/{int(month):02d}/{full_year}"

    # Pattern 5: Month DD, YYYY (e.g., "March 5, 2023", "April 03, 2020", "February 29, 2024")
    # Matches a month name, followed by one or two digits for day (optionally with 'st', 'nd', 'rd', 'th' suffix),
    # then allows for optional non-digit characters and spaces before the four-digit year.
    match = re.search(rf'\b({month_names_pattern})\s+(\d{{1,2}})(?:st|nd|rd|th)?\s*(?:[^0-9]*?)\s*(\d{{4}})\b', text_lower)
    if match:
        month_name, day, year = match.groups()
        month = months[month_name] # Convert month name to number
        return f"{int(day):02d}/{month}/{year}"

    # Pattern 6: DDth/st/nd/rd Month YYYY (e.g., "21st June, 2024", "1st of January 2000", "15th September, 2021")
    # Matches one or two digits for day (optionally with 'st', 'nd', 'rd', 'th' suffix),
    # an optional "of ", a month name, then allows for optional non-digit characters and spaces before the four-digit year.
    match = re.search(rf'\b(\d{{1,2}})(?:st|nd|rd|th)?\s+(?:of\s+)?({month_names_pattern})\s*(?:[^0-9]*?)\s*(\d{{4}})\b', text_lower)
    if match:
        day, month_name, year = match.groups()
        month = months[month_name] # Convert month name to number
        return f"{int(day):02d}/{month}/{year}"

    # If no date pattern is matched, return "N/A"
    return "N/A"

# --- Main execution part to read CSV and test the parser ---
def main():
    """
    Reads test cases from 'date_parser_testcases.csv', applies the parse_date function,
    and prints the results, comparing actual output with expected output.
    Also provides a summary of test results.
    """
    test_cases = []
    try:
        # Open the CSV file and read test cases
        with open('date_parser_testcases.csv', 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)  # Skip the primary header row
            for row in reader:
                # Ensure the row has exactly two columns (Input, Expected Output)
                if len(row) == 2:
                    # Explicitly skip the header row if it appears as a data row
                    if row[0].strip().lower() == 'input' and row[1].strip().lower() == 'expected output':
                        continue
                    test_cases.append({'input': row[0], 'expected_output': row[1]})
                else:
                    print(f"Skipping malformed row in CSV: {row}")

    except FileNotFoundError:
        print("Error: 'date_parser_testcases.csv' not found. Please ensure the file is in the same directory.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")
        return

    print("--- Running Date Parser Tests ---")
    print("-" * 50)

    total_tests = len(test_cases)
    passed_tests = 0
    failed_tests = 0

    # Iterate through each test case and evaluate the parser
    for i, case in enumerate(test_cases):
        input_text = case['input']
        expected_output = case['expected_output']
        actual_output = parse_date(input_text)

        status = "PASSED" if actual_output == expected_output else "FAILED"

        print(f"Test {i+1}:")
        print(f"  Input:    '{input_text}'")
        print(f"  Expected: '{expected_output}'")
        print(f"  Actual:   '{actual_output}'")
        print(f"  Status:   {status}")
        if status == "FAILED":
            print(f"  Mismatch: Expected '{expected_output}', Got '{actual_output}'")
            failed_tests += 1
        else:
            passed_tests += 1
        print("-" * 50)

    print("--- Tests Complete ---")


    print("")

    print("-" * 50)
    print("\n--- Test Summary ---")
    print(f"Total Tests: {total_tests}")
    print(f"Passed:      {passed_tests}")
    print(f"Failed:      {failed_tests}")
    if total_tests > 0:
        pass_rate = (passed_tests / total_tests) * 100
        print(f"Pass Rate:   {pass_rate:.2f}%")
    else:
        print("No tests were run.")
    print("-" * 50)


if __name__ == '__main__':
    main()


--- Running Date Parser Tests ---
--------------------------------------------------
Test 1:
  Input:    'The event will take place on March 5, 2023.'
  Expected: '05/03/2023'
  Actual:   '05/03/2023'
  Status:   PASSED
--------------------------------------------------
Test 2:
  Input:    'Her birthday is on 07/08/1990.'
  Expected: '07/08/1990'
  Actual:   '07/08/1990'
  Status:   PASSED
--------------------------------------------------
Test 3:
  Input:    'The deadline is 2022-12-31.'
  Expected: '31/12/2022'
  Actual:   '31/12/2022'
  Status:   PASSED
--------------------------------------------------
Test 4:
  Input:    'We met on 1st of January 2000.'
  Expected: '01/01/2000'
  Actual:   '01/01/2000'
  Status:   PASSED
--------------------------------------------------
Test 5:
  Input:    'The concert is scheduled for 15th September, 2021.'
  Expected: '15/09/2021'
  Actual:   '15/09/2021'
  Status:   PASSED
--------------------------------------------------
Test 6:
  Input:    

## Regular Expression Breakdown for `parse_date` Function

Here's a breakdown of each regex pattern used in the `parse_date` function:

---

### `r'\b(\d{1,2})[./-](\d{1,2})[./-](\d{4})\b'`

Matches formats like:  
**`DD/MM/YYYY`, `MM/DD/YYYY`, `DD-MM-YYYY`, `MM-DD-YYYY`, `DD.MM.YYYY`, `MM.DD.YYYY`**

**Explanation:**
- `\b` – Word boundary to ensure full date match.
- `(\d{1,2})` – Captures 1–2 digit day or month.
- `[./-]` – Accepts dot, slash, or hyphen as separator.
- `(\d{1,2})` – Captures the second part (day/month).
- `[./-]` – Another separator.
- `(\d{4})` – Captures a 4-digit year.
- `\b` – Ending word boundary.

**Usage:**  
This appears twice in the code:
1. **Pattern 1**: If the second value > 12 → `MM/DD/YYYY` → reorder fields.
2. **Pattern 2**: Used as fallback, assuming `DD/MM/YYYY`.

---

### `r'\b(\d{4})[./-](\d{1,2})[./-](\d{1,2})\b'`

Matches formats like:  
**`YYYY-MM-DD`, `YYYY/MM/DD`, `YYYY.MM.DD`**

**Explanation:**
- `(\d{4})` – Year (4 digits).
- `[./-]` – Separator.
- `(\d{1,2})` – Month.
- `[./-]` – Separator.
- `(\d{1,2})` – Day.
- `\b` – Word boundary.

---

### `r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{2})\b'`

Matches formats like:  
**`D/M/YY`, `DD-MM-YY`**

**Explanation:**
- `(\d{1,2})` – Day.
- `[/-]` – Separator.
- `(\d{1,2})` – Month.
- `[/-]` – Separator.
- `(\d{2})` – Year (2 digits).
- `\b` – Word boundary.

---

### `rf'\b({month_names_pattern})\s+(\d{{1,2}})(?:st|nd|rd|th)?\s*(?:[^0-9]*?)\s*(\d{{4}})\b'`

Matches formats like:  
**`Month DD, YYYY`**, **`Month DD YYYY`**  
(e.g., `"January 1st, 2023"` or `"Jan 12 2022"`)

**Explanation:**
- `({month_names_pattern})` – Month names (e.g., Jan, February, etc.).
- `\s+` – Space between month and date.
- `(\d{1,2})` – Day of the month.
- `(?:st|nd|rd|th)?` – Optional ordinal suffix.
- `(?:[^0-9]*?)` – Non-greedy non-digit characters (e.g., comma, “of”).
- `(\d{4})` – Year.

---

### `rf'\b(\d{{1,2}})(?:st|nd|rd|th)?\s+(?:of\s+)?({month_names_pattern})\s*(?:[^0-9]*?)\s*(\d{{4}})\b'`

Matches formats like:  
**`DDth Month YYYY`**, **`DDth of Month YYYY`**  
(e.g., `"5th of July 1999"`, `"21st March 2020"`)

**Explanation:**
- `(\d{1,2})` – Day.
- `(?:st|nd|rd|th)?` – Optional suffix.
- `(?:of\s+)?` – Optional "of" keyword.
- `({month_names_pattern})` – Month names.
- `(?:[^0-9]*?)` – Flexible for commas or other non-digits.
- `(\d{4})` – Year.

---

These patterns are written to comprehensively cover a wide variety of date formats and are processed in order to prioritize the most specific and common formats first.
