In [3]:
def normalize_lines(text):
    """
    Convert to lowercase and remove all whitespace and punctuation.

    A character is kept only if ``str.isalnum()`` is true for it after
    lowercasing, so letters and digits outside ASCII (for example "é")
    are preserved instead of being silently discarded along with the
    punctuation.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased text reduced to its alphanumeric characters, in
        their original order. Empty if the text has no such characters.
    """
    # join() assembles the result in one pass rather than re-copying the
    # accumulated string on every kept character.
    return ''.join(
        character for character in text.lower() if character.isalnum()
    )

def find_near_duplicate_lines(filename):
    """
    Report lines of a text file that become identical once normalized.

    Each line is stripped of surrounding whitespace and passed through
    normalize_lines(); lines sharing the same normalized form are grouped
    together. The number of groups with at least two members is printed,
    followed by the first two such groups with their 1-based line numbers
    and stripped original text. Nothing is returned.
    """
    lines_by_key = {}

    with open(filename, 'r', encoding='utf-8') as handle:
        for number, raw in enumerate(handle, start=1):
            original = raw.strip()
            key = normalize_lines(original)
            # Lines with nothing left after normalization are not grouped.
            if key:
                lines_by_key.setdefault(key, []).append((number, original))

    # Keep only keys shared by two or more lines, in first-seen order.
    duplicate_sets = [
        members for members in lines_by_key.values() if len(members) >= 2
    ]

    print(f"\nNumber of near-duplicate sets found: {len(duplicate_sets)}\n")

    # Only the first two sets are displayed.
    for position, members in enumerate(duplicate_sets[:2], start=1):
        print(f"Near-duplicate set {position}:")
        for number, original in members:
            print(f"  Line {number}: {original}")
        print("-" * 60)

# Run the report on the sample file; the relative path is resolved against
# the current working directory.
find_near_duplicate_lines("sample-file.txt")


Number of near-duplicate sets found: 3

Near-duplicate set 1:
  Line 1: Data science is an exciting field that combines statistics, computing, and real-world problem solving.
  Line 34: Data science is an exciting field that combines statistics computing and real world problem solving
------------------------------------------------------------
Near-duplicate set 2:
  Line 7: Machine learning is a major part of data science, allowing computers to learn patterns from data.
  Line 35: Machine learning is a major part of data science allowing computers to learn patterns from data
------------------------------------------------------------
