In [3]:
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader

import os
from collections import defaultdict
import ollama

def collect_files(directory):
    """
    Walk ``directory`` recursively and group every file by its base name.

    Args:
        directory: Root folder to scan (sub-folders are included).

    Returns:
        A ``defaultdict(list)`` mapping each file name to a list of
        ``[file_path, parent_folder_name, "0"]`` entries, one entry per
        location where a file with that name exists. The third element is a
        status flag initialised to ``"0"``.
    """
    file_dict = defaultdict(list)
    total_files = 0
    for root, _dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            # Name of the folder that directly contains the file.
            parent_path = os.path.basename(os.path.dirname(file_path))
            file_dict[file].append([file_path, parent_path, "0"])
            total_files += 1

    # Report the real number of files; len(file_dict) would only count
    # distinct names and under-report whenever duplicates exist.
    print("Number of files: ", total_files)
    return file_dict

def find_duplicates(file_dict):
    """
    Return only the entries of ``file_dict`` whose name occurs more than once.

    A file name counts as duplicated when its list of locations holds at
    least two items; the comparison is by name only.
    """
    repeated = {}
    for name, locations in file_dict.items():
        if len(locations) > 1:
            repeated[name] = locations
    return repeated

def extract_final_answer(response):
    """
    Return the text that follows the last ``</think>`` tag in ``response``.

    When the tag is absent the whole response is returned. In both cases
    surrounding whitespace is stripped.
    """
    # rpartition yields the full string as the tail when the tag is missing,
    # which covers the "no tags found" fallback without a separate branch.
    _before, _tag, tail = response.rpartition("</think>")
    return tail.strip()



def main(directory=r"E:\__ilker\BİLGİ\Kitaplar", model="deepseek-r1:14b"):
    """
    Find files that share a name across folders and delete the less relevant copy.

    For every duplicated file name, the first two locations are submitted to
    an Ollama model, which is asked which of the two parent folder names the
    file name relates to most. The copy in the other folder is deleted.

    Files are matched by NAME ONLY; their contents are never compared, and
    deletion is permanent.

    Args:
        directory: Root folder to scan. Defaults to the original hard-coded
            location.
        model: Name of the Ollama model used for the classification.

    Returns:
        None. Progress and decisions are printed to standard output.
    """
    if not os.path.isdir(directory):
        print("Invalid directory path.")
        return

    file_dict = collect_files(directory)
    duplicates = find_duplicates(file_dict)

    if not duplicates:
        print("No duplicate files found.")
        return

    print("Duplicate files found:")
    print("Number of duplicates: ", len(duplicates))

    # Loop-invariant, so it is built once.
    system_message = """You are a binary classifier. Respond only with '1' for first choice or '2' with second choice.
                                Do not include any reasoning or explanations in your answer.
                             """

    for counter, (file, paths) in enumerate(duplicates.items(), start=1):
        print(counter, "-------------------")

        # Only the first two locations are compared; any further copies of
        # the same name are left untouched.
        first, second = paths[0], paths[1]
        foldername1 = first[1]
        foldername2 = second[1]

        user_prompt = f"""
                        Get the following file name:
                        {file}
                        
                        Is that filename related mostly with {foldername1} or {foldername2}?
                        """

        # Combine the system message and user prompt
        prompt = f"{system_message}\n\n{user_prompt}"

        response = ollama.generate(model=model, prompt=prompt)
        raw_answer = response.get("response", "") or ""

        # Drop the model's <think>...</think> reasoning, then ignore trailing
        # whitespace/punctuation so the last character is the actual verdict.
        # Slicing with [-1:] is safe even when the answer is empty.
        answer = extract_final_answer(raw_answer)
        choice = answer.rstrip(" \t\r\n.'\"`*")[-1:]

        if choice == "1":
            keep, drop = first, second
        elif choice == "2":
            keep, drop = second, first
        else:
            keep = drop = None

        if drop is not None:
            keep[2] = "1"
            drop[2] = "-1"

        for data in paths:
            print(f" {file}- {data[1:]}")

        if drop is None:
            # Never delete anything on an unusable answer; previously this
            # reused the path from an earlier iteration or crashed.
            print("Skipped, unexpected model answer: ", repr(answer))
            continue

        remove_file_path = drop[0]
        print("Remove: ", remove_file_path)
        try:
            # The duplicate located in the less relevant folder is removed.
            os.remove(remove_file_path)
        except OSError as exc:
            # One undeletable file should not abort the remaining work.
            print("Could not remove: ", remove_file_path, exc)



In [4]:
# Run the duplicate clean-up only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Number of files:  2037
No duplicate files found.
