In [None]:
import sys
import os
import mmap
import datetime
import time
import threading
from concurrent.futures import ThreadPoolExecutor

In [None]:
def validate_date(date_str):
    """Return True if *date_str* is a real calendar date written as YYYY-MM-DD.

    The check is strict: the month and day must be zero-padded, because the
    value is later used as a literal prefix when matching log lines. Inputs
    that are not strings (e.g. ``None``) are reported as invalid rather than
    raising.

    Args:
        date_str: Candidate date string.

    Returns:
        bool: True only for a valid, canonically formatted date.
    """
    try:
        parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        return False
    # strptime tolerates non-padded fields such as "2024-1-5"; comparing with
    # the canonical ISO rendering rejects anything that is not exactly
    # YYYY-MM-DD.
    return parsed.date().isoformat() == date_str

In [None]:
def process_chunk(file_path, search_date, start, end, output_file, lock):
    """Append every log line in a byte range that starts with *search_date*.

    The requested range ``[start, end)`` is snapped to line boundaries so that
    each line of the file is handled by exactly one chunk: a chunk owns the
    lines whose first byte lies inside its range. A line that begins before
    ``start`` is left to the previous chunk, and a line that begins before
    ``end`` is read through to its terminating newline even if that lies
    beyond ``end``.

    Args:
        file_path: Path of the log file to scan.
        search_date: Prefix (normally ``YYYY-MM-DD``) a line must start with.
        start: Inclusive byte offset where this chunk begins.
        end: Exclusive byte offset where this chunk ends.
        output_file: File that matching lines are appended to (UTF-8).
        lock: Lock object guarding writes to *output_file*.

    I/O and mmap failures are reported on stdout and are not re-raised.
    """
    # An empty range has nothing to scan; returning early also avoids
    # mmap's ValueError when the file itself is empty.
    if start >= end:
        return

    try:
        prefix = search_date.encode("utf-8")
        with open(file_path, "rb") as file, mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            size = len(mm)
            end = min(end, size)

            # If the range begins mid-line, that line belongs to the previous
            # chunk: skip forward to the first line that starts in our range.
            if start > 0 and mm[start - 1:start] != b"\n":
                newline_at = mm.find(b"\n", start)
                start = size if newline_at == -1 else newline_at + 1

            # If the range stops mid-line, finish that line here; the next
            # chunk will skip its tail using the rule above.
            if end < size and mm[end - 1:end] != b"\n":
                newline_at = mm.find(b"\n", end)
                end = size if newline_at == -1 else newline_at + 1

            if start >= end:
                return

            # Filter on raw bytes and decode only the lines that are kept.
            logs = [
                line.decode("utf-8", errors="ignore")
                for line in mm[start:end].split(b"\n")
                if line.startswith(prefix)
            ]

        if logs:
            with lock:
                with open(output_file, "a", encoding="utf-8") as out:
                    out.write("\n".join(logs) + "\n")

    except (OSError, ValueError) as e:
        print(f"⚠️ Error processing chunk: {e}")


In [None]:
def get_file_size(file_path):
    """Look up how many bytes *file_path* occupies.

    If the file does not exist, an error message is printed and the process
    exits with status 1.
    """
    try:
        size_in_bytes = os.stat(file_path).st_size
    except FileNotFoundError:
        print(f"❌ Error: File '{file_path}' not found.")
        sys.exit(1)
    return size_in_bytes

In [None]:
def extract_logs_parallel(log_file, search_date, output_dir="output"):
    """Extract the lines for one date from a large log file using threads.

    The file is divided into equally sized byte ranges, one per worker. Each
    worker writes its matches to a private part file; the parts are then
    concatenated in chunk order, so the output keeps the line order of the
    input regardless of which thread finishes first. The output file is
    rewritten on every call (it is created empty when nothing matches), so
    repeated runs never accumulate duplicate lines.

    Args:
        log_file: Path of the log file to scan.
        search_date: Date prefix (``YYYY-MM-DD``) selecting the lines to keep.
        output_dir: Directory that receives ``output_<search_date>.txt``;
            created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"output_{search_date}.txt")

    file_size = get_file_size(log_file)
    # os.cpu_count() may return None, and using more workers than there are
    # bytes would only produce empty chunks.
    num_workers = max(1, min(os.cpu_count() or 1, file_size))
    chunk_size = file_size // num_workers

    part_files = [f"{output_file}.part{i}" for i in range(num_workers)]
    # Workers append to their part file, so clear leftovers from a run that
    # was interrupted before cleanup.
    for part in part_files:
        _remove_if_exists(part)

    lock = threading.Lock()
    try:
        if file_size > 0:
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                futures = []
                for i, part in enumerate(part_files):
                    start = i * chunk_size
                    # The last chunk absorbs the remainder of the division.
                    end = file_size if i == num_workers - 1 else start + chunk_size
                    futures.append(
                        executor.submit(process_chunk, log_file, search_date, start, end, part, lock)
                    )

                for future in futures:
                    future.result()  # Ensure all threads complete execution

        # Stitch the parts together in chunk order; "wb" truncates any
        # output left by a previous run.
        with open(output_file, "wb") as out:
            for part in part_files:
                try:
                    with open(part, "rb") as src:
                        while True:
                            block = src.read(1 << 20)
                            if not block:
                                break
                            out.write(block)
                except FileNotFoundError:
                    # A chunk without matches never creates its part file.
                    continue
    finally:
        for part in part_files:
            _remove_if_exists(part)

    print(f"✅ Logs for {search_date} saved in {output_file}")


def _remove_if_exists(path):
    """Delete *path*, doing nothing if it is already absent."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


In [None]:
def run_small_test():
    """Smoke-test the extractor on a four-line sample log.

    Writes the sample to ``test_logs.txt`` in the current directory and runs
    the extraction for 2024-12-01 against it.
    """
    sample_lines = (
        "2024-12-01 14:23:45 INFO User logged in",
        "2024-12-01 14:24:10 ERROR Failed to connect to the database",
        "2024-12-02 09:15:30 WARN Disk space running low",
        "2024-12-01 16:45:00 INFO File uploaded successfully",
    )
    sample_path = "test_logs.txt"

    # Every sample line, including the last, is newline-terminated.
    with open(sample_path, "w", encoding="utf-8") as handle:
        handle.write("".join(f"{line}\n" for line in sample_lines))

    print("\n🟢 Running test on small dataset...")
    extract_logs_parallel(sample_path, "2024-12-01")
    print("🟢 Test completed. Check 'output/output_2024-12-01.txt'.\n")


In [None]:
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the date to extract.
    if len(sys.argv) != 2:
        print("❌ Usage: python extract_logs.py YYYY-MM-DD")
        sys.exit(1)

    search_date = sys.argv[1]

    if not validate_date(search_date):
        print("❌ Error: Invalid date format. Use YYYY-MM-DD.")
        sys.exit(1)

    # Run a test on a small dataset before full execution
    run_small_test()

    log_file = "logs_2024.log"  # Replace with actual log file path
    print("\n🚀 Running on the full log file...\n")

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted.
    start_time = time.perf_counter()
    extract_logs_parallel(log_file, search_date)
    end_time = time.perf_counter()

    print(f"⏳ Execution time: {end_time - start_time:.2f} seconds")
