## Check for Duplicate Words Between Captions and Tags
----

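This script recursively walks a directory, finds every `.txt` file, and splits each file's content into tags and captions. It then checks whether any tag also appears as a whole word inside a caption and, for each hit, prints the caption to the terminal with the tag highlighted in a random color. The script assumes that you separate your tags with `,` and end each caption with `.,`: the content is split on commas, and any resulting element that contains a period is treated as a caption, while every other non-empty element is treated as a tag.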

In [3]:
import os
import random
import re
from rich import print
from rich.console import Console
from rich.style import Style
from rich.color import Color

def find_files(path, extension):
    """Lazily yield the path of every file under *path* ending in *extension*.

    The directory tree is walked with ``os.walk``; each yielded value is the
    containing directory joined with the file name. The match is a plain
    suffix test on the file name, so it is case-sensitive.
    """
    for directory, _subdirs, filenames in os.walk(path):
        yield from (
            os.path.join(directory, name)
            for name in filenames
            if name.endswith(extension)
        )

def process_file(file_path):
    """Report tags that also occur as whole words inside captions of one file.

    The file content is split on commas. Every element that contains a period
    is treated as a caption; every other non-empty element is treated as a
    tag. For each tag/caption pair in which the tag occurs as a whole word, a
    line is printed showing the tag and the caption, with the matched
    occurrences highlighted in a random bold colour. The file path is printed
    once, before the first hit; a file without hits prints nothing.

    Args:
        file_path: Path of the text file to inspect.
    """
    # Imported here because file content must be escaped before it is embedded
    # in Rich markup; otherwise a "[" in a caption, tag or path is parsed as a
    # markup tag and can raise or garble the output.
    from rich.markup import escape

    console = Console()

    # Read with an explicit encoding so the result does not depend on the
    # platform default; undecodable bytes are replaced instead of aborting
    # the whole directory scan.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    elements = [element.strip() for element in content.split(',')]
    captions = [element for element in elements if '.' in element]
    tags = [element for element in elements if element and '.' not in element]

    file_path_printed = False
    for tag in tags:
        # Lookarounds behave like \b for tags that start and end with a word
        # character, and still work for tags such as "name (series)" where \b
        # would demand a word character next to the parenthesis.
        pattern = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(tag)))
        safe_tag = escape(tag)
        for caption in captions:
            matches = list(pattern.finditer(caption))
            if not matches:
                continue
            if not file_path_printed:
                console.print(f'\n\n[bold]{escape(str(file_path))}[/bold]\n')
                file_path_printed = True
            color = Color.from_rgb(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
            style = Style(color=color, bold=True)

            # Highlight only the whole-word matches that triggered the report.
            # A plain str.replace would also colour the tag inside longer
            # words (e.g. "cat" inside "concatenate").
            pieces = []
            cursor = 0
            for match in matches:
                pieces.append(escape(caption[cursor:match.start()]))
                pieces.append(f"[{style}]{safe_tag}[/{style}]")
                cursor = match.end()
            pieces.append(escape(caption[cursor:]))
            highlighted_caption = ''.join(pieces)

            console.print(f'Tag "[{style}]{safe_tag}[/{style}]" found in caption "{highlighted_caption}"')

def main(path='C:\\Users\\kade\\Desktop\\training_dir_staging'):
    """Check every ``.txt`` file under *path* for tags repeated in captions.

    Args:
        path: Root directory to scan recursively. Defaults to the staging
            directory this script was originally written for, so calling
            ``main()`` with no argument behaves as before.
    """
    for file_path in find_files(path, '.txt'):
        process_file(file_path)


# Run the scan with the default directory when executed as a script.
if __name__ == "__main__":
    main()