In [8]:
import re
import csv

# Dictionary to convert month names to numbers
MONTHS = {
    "january": "01", "february": "02", "march": "03", "april": "04",
    "may": "05", "june": "06", "july": "07", "august": "08",
    "september": "09", "october": "10", "november": "11", "december": "12",
    "jan": "01", "feb": "02", "mar": "03", "apr": "04",
    "jun": "06", "jul": "07", "aug": "08",
    "sep": "09", "oct": "10", "nov": "11", "dec": "12"
}

# Define regex patterns for matching dates
DATE_PATTERNS = [
    r"(\d{1,2})(st|nd|rd|th)?[ ]?(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[, ]+(\d{4})",  # 21st June, 2024
    r"(\d{1,2})[ ]?(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[, ]+(\d{4})",  # 21 June 2024
    r"(\d{1,2})/(\d{1,2})/(\d{4})",  # 21/06/2024
    r"(\d{1,2})-(\d{1,2})-(\d{4})",  # 21-06-2024
    r"(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]?(\d{1,2})(st|nd|rd|th)?,[ ]?(\d{4})",  # June 21, 2024
    r"(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]?(\d{1,2})[, ]?(\d{4})",  # June 21 2024
    r"(\d{4})/(\d{1,2})/(\d{1,2})",  # 2024/06/21
    r"(\d{4})-(\d{1,2})-(\d{1,2})",  # 2024-06-21
    r"(\d{4})[.](\d{1,2})[.](\d{1,2})",  # 2024.06.21
    r"(\d{1,2})[.](\d{1,2})[.](\d{4})"  # 21.06.2024
]

def parse_date(text):
    for pattern in DATE_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            groups = match.groups()
            day, month, year = None, None, None

            if len(groups) == 4:  # For formats like 21st June, 2024 or June 21, 2024
                if groups[0].isdigit():  # 21st June, 2024
                    day, month, year = groups[0], groups[2], groups[3]
                else:  # June 21, 2024
                    day, month, year = groups[1], groups[0], groups[3]
            elif len(groups) == 3:  # For formats like 21 June 2024 or 21/06/2024
                if groups[0].isdigit() and groups[1].isdigit():  # 21/06/2024
                    day, month, year = groups[0], groups[1], groups[2]
                elif groups[1].isalpha():  # 21 June 2024
                    day, month, year = groups[0], groups[1], groups[2]
                else:  # 2024/06/21 or 2024-06-21
                    year, month, day = groups[0], groups[1], groups[2]
            elif len(groups) == 5:  # For formats like June 21st, 2024
                month, day, year = groups[0], groups[1], groups[3]
            elif len(groups) == 6:  # For formats like June 21st, 2024 with more groups
                month, day, year = groups[0], groups[1], groups[3]

            if month:
                month = MONTHS.get(month.lower(), month)
            
            if day and month and year:
                return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
    return "Date not found"

# Read and process the dataset
def process_dataset(file_path):
    with open(file_path, mode='r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip header if there is one
        for row in csv_reader:
            text = row[0]
            parsed_date = parse_date(text)
            print(f"Original text: {text}")
            print(f"Parsed date: {parsed_date}\n")

# Example usage
process_dataset('date_parser_testcases.csv')


Original text: The event will take place on March 5, 2023.
Parsed date: 05/03/2023

Original text: Her birthday is on 07/08/1990.
Parsed date: 07/08/1990

Original text: The deadline is 2022-12-31.
Parsed date: 2022/12/31

Original text: We met on 1st of January 2000.
Parsed date: Date not found

Original text: The concert is scheduled for 15th September, 2021.
Parsed date: 15/09/2021

Original text: Let's catch up on 02.04.2022.
Parsed date: 02/04/2022

Original text: The project started on 5/6/19.
Parsed date: Date not found

Original text: He was born on 1987/11/23.
Parsed date: 1987/11/23

Original text: Christmas is on 25th Dec 2024.
Parsed date: 25/12/2024

Original text: The meeting is set for April 03, 2020.
Parsed date: 03/04/2020

Original text: Her birthdate, noted as 1997-05-20, is in the records.
Parsed date: 1997/05/20

Original text: Her appointment is on the 2nd of March, 2021.
Parsed date: Date not found

Original text: The exam date is 2021.11.10.
Parsed date: 2021/11