In [1]:
import os
from dotenv import load_dotenv
from py_files.update_compare import compare_fd_then_create_updatedfd2, count_files_in_directories
from py_files.check_compare import just_compare, count_files_simple
from byaldi import RAGMultiModalModel


class FileComparisonManager:
    def __init__(self, exclusion_folder, exclusive_path, updated_fd, set1, set2):
        self.exclusion_folder = exclusion_folder
        self.exclusive_path = exclusive_path
        self.updated_fd = updated_fd
        self.set1 = set1
        self.set2 = set2
        self.result_path_dict = {}

    def compare_and_update_files(self):
        # self.result_path_dict = compare_fd_then_create_updatedfd2(
        self.result_path_dict = just_compare(
            self.set1, self.set2, self.updated_fd, self.exclusion_folder, self.exclusive_path
        )
        return self.result_path_dict

    def count_files_in_updated_directories(self):
        if self.result_path_dict:
            count_files_in_directories(self.result_path_dict)
        else:
            print("No files to count; please run compare_and_update_files first.")

class RAGModelManager:
    def __init__(self, hf_token, anthropic_api_key):
        os.environ["HF_TOKEN"] = hf_token
        os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

    @staticmethod
    def create_rag_model(index_path, added_filepath, add_pdf = False):
        model = RAGMultiModalModel.from_index(index_path, verbose=0)
        print(f"Len of model (before): {len(model.get_doc_ids_to_file_names())}")
        if add_pdf:
            model.add_to_index(input_item=added_filepath, store_collection_with_index=False)
        return model

class ProcessCronjob:
    def __init__(self):
        load_dotenv()
        self.hf_token = os.getenv("HF_TOKEN")
        self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
        self.file_manager = FileComparisonManager(
            exclusion_folder="./file_can_pass_or_loi",
            exclusive_path="./dont_care",
            updated_fd="./updated_fd_all_all_test1",
 
            set1 = [
                r"D:\RAG_models_files_backup\pdf_fd\Sale_3807_188",
                r"D:\RAG_models_files_backup\pdf_fd\CT_11184_2435_1854",
                r"D:\RAG_models_files_backup\pdf_fd\Prop_20344_1885_294",
                    ],            

            set2 = [
                r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
                r"D:\Vector Cloud\Procurement\Procurement",
                r"D:\Vector Cloud\Proposal\1. Project", 
            ]
            
            # set1=[
            #     "./examples/example_fd_to_compare/test_Sale",
            #     "./examples/example_fd_to_compare/test_CT",
            #     "./examples/example_fd_to_compare/test_Prop",
            # ],
            # set2=[
            #     "./examples/example_fd_to_compare/Sale_server",
            #     "./examples/example_fd_to_compare/CT_server",
            #     "./examples/example_fd_to_compare/Prop_server"
            # ]
        )
        self.rag_manager = RAGModelManager(self.hf_token, self.anthropic_api_key)

    def run(self):
        print("Running scheduled task...")
        result_path_dict = self.file_manager.compare_and_update_files()
        # print("Updated directories:", result_path_dict, end = '\n\n\n')
        self.file_manager.count_files_in_updated_directories()

        # Initialize and update RAG models
        models_info = {
            "Sale": {
                "attn_path": r"D:\RAG_models_files_test_compare\attn_fd\attn_CT_2435_1854",
                "added_path": ""
            },
            # "Proposal": {
            #     "attn_path": r"D:\RAG_models_files_test_compare\attn_fd\attn_Prop_1885_294_cp",
            #     "added_path": ""
            # },
            "CT": {
                "attn_path": r"D:\RAG_models_files_test_compare\attn_fd\attn_Sale_188_cp",
                "added_path": ""
            }
        }        
        for key, path in result_path_dict .items():
            if key.startswith('Sale_'):
                models_info['Sale']['added_path'] = path
            # elif key.startswith('Prop_'):
            #     result_path_dict['test_Prop_vs_Prop_server'] = path
            elif key.startswith('CT_'):
                models_info['CT']['added_path'] = path
                
        # models_info = {
        #     "Sale": {
        #         "attn_path": "./examples/example_attn_fd/attn_Sale_sample",
        #         "added_path": result_path_dict.get('test_Sale_vs_Sale_server')
        #     },
        #     "Proposal": {
        #         "attn_path": "./examples/example_attn_fd/attn_Prop_sample",
        #         "added_path": result_path_dict.get('test_Prop_vs_Prop_server')
        #     },
        #     "Procur": {
        #         "attn_path": "./examples/example_attn_fd/attn_CT_sample",
        #         "added_path": result_path_dict.get('test_CT_vs_CT_server')
        #     }
        # }

        
        for model_name, paths in models_info.items():
            if paths["added_path"]:
                index_path = os.path.abspath(paths["attn_path"])
                added_filepath = paths["added_path"]
                model = self.rag_manager.create_rag_model(index_path, added_filepath,)
                                                        #   add_pdf = True)
                print(f"Len of added_pdf: {count_files_simple(added_filepath)}")
                print(f"Len of model (after): {len(model.get_doc_ids_to_file_names())}")
                print(f"{model_name} model updated with file: {added_filepath}")                
                print('=' * 50)
                print('=' * 50)
                
process = ProcessCronjob()
process.run()

  from .autonotebook import tqdm as notebook_tqdm


Running scheduled task...

Merging PDFs for pair 1 (Sale_3807_188 vs 1. SI):
  - Total PDFs merged: 11
  - Merge PDF location: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_fd_all_all_test1\Sale_3807_188_vs_1. SI\merge_pdf

Results for pair 1 (Sale_3807_188 vs 1. SI):
  - Processed 0 unique DOC files
  - Processed 11 unique PDF files

Merging PDFs for pair 2 (CT_11184_2435_1854 vs Procurement):
  - Total PDFs merged: 18
  - Merge PDF location: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_fd_all_all_test1\CT_11184_2435_1854_vs_Procurement\merge_pdf

Results for pair 2 (CT_11184_2435_1854 vs Procurement):
  - Processed 1 unique DOC files
  - Processed 18 unique PDF files

Merging PDFs for pair 3 (Prop_20344_1885_294 vs 1. Project):
  - Total PDFs merged: 5
  - Merge PDF location: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_fd_all_all_test1\Prop_20344_1885_294_vs_1. Project\merge_pdf

Results for pair 3 (Prop_20344_1885_294 vs 1. Pr

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
  self.indexed_embeddings.extend(torch.load(file))


Len of model (before): 15471
Len of added_pdf: 11
Len of model (after): 15471
Sale model updated with file: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_fd_all_all_test1\Sale_3807_188_vs_1. SI\merge_pdf


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


Len of model (before): 3994
Len of added_pdf: 18
Len of model (after): 3994
CT model updated with file: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_fd_all_all_test1\CT_11184_2435_1854_vs_Procurement\merge_pdf


In [2]:
import os
import re
from collections import defaultdict

def analyze_and_delete_files(directory='.'):
    """
    Analyze PDF files in a directory, count redundant files, and delete them.
    
    Args:
        directory (str): Directory path to scan for files. Defaults to current directory.
    
    Returns:
        tuple: (total_files, groups_count, deleted_count, grouped_files)
    """
    # Get all PDF files in the directory
    all_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]
    
    # Dictionary to group related files
    groups = defaultdict(list)
    
    # Regular expression to match PO numbers
    po_pattern = re.compile(r'(PO-SI\d{7}-\d{2})')
    
    # Group files by PO number
    for filename in all_files:
        match = po_pattern.search(filename)
        if match:
            po_number = match.group(1)
            groups[po_number].append(filename)
    
    # Process each group and delete redundant files
    deleted_count = 0
    grouped_files = defaultdict(dict)
    
    for po_number, group_files in groups.items():
        if len(group_files) > 0:
            # Keep the longest filename
            to_keep = max(group_files, key=len)
            
            # Count and delete other files
            files_to_remove = [f for f in group_files if f != to_keep]
            deleted_count += len(files_to_remove)
            
            # Store group info
            grouped_files[po_number] = {
                'total': len(group_files),
                'deleted': len(files_to_remove),
                'kept': to_keep
            }
            
            # Delete files
            for file in files_to_remove:
                try:
                    os.remove(os.path.join(directory, file))
                    print(f"Deleted: {file}")
                except Exception as e:
                    print(f"Error deleting {file}: {e}")
    
    return len(all_files), len(groups), deleted_count, grouped_files

def main():
    """
    Main function to analyze files, display results, and delete redundant files.
    """
    try:
        main_dir = r'F:\orig\CT_new_merge_7234_1296_1712 - Copy'

        total_files, groups_count, deleted_count, grouped_files = analyze_and_delete_files(main_dir)
        
        print(f"\nOperation Summary:")
        print(f"Total PDF files found: {total_files}")
        print(f"Number of unique PO groups: {groups_count}")
        print(f"Files deleted: {deleted_count}")
        
        # Show detailed breakdown if files were deleted
        if deleted_count > 0:
            print("\nBreakdown by PO number:")
            for po_number, info in grouped_files.items():
                if info['deleted'] > 0:  # Only show groups where files were deleted
                    print(f"\n{po_number}:")
                    print(f"  Total files: {info['total']}")
                    print(f"  Files deleted: {info['deleted']}")
                    print(f"  Kept: {info['kept']}")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Deleted: PO-SI1603009-02-Fan via Fandis.pdf
Deleted: PO-SI1608018-10024-Meinberg - Hirschmann for Omnitech, Dong nam.pdf
Deleted: PO-SI1703005-01 - FiberTek for Yoko.pdf
Deleted: PO-SI1706012-01 to VTS REV 1.pdf
Deleted: PO-SI1706012-01 to VTS.pdf
Deleted: PO-SI1706015-02 - Antenna for Vietsov - Alpha Marine.pdf
Deleted: PO-SI1710024-03 - Belden for MP - Anixter.pdf
Deleted: PO-SI1710024-03 - cable for MP - shipment 2.pdf
Deleted: PO-SI1712037-03 - cable gland.pdf
Deleted: PO-SI1712037-03-rev1 - cable gland.pdf
Deleted: PO-SI1712037-04 - cable.pdf
Deleted: PO-SI1712037-05 - cable.pdf
Deleted: PO-SI1712037-06 - cable spare.pdf
Deleted: PO-SI1712037-06.pdf
Deleted: PO-SI1712037-07 - Pelco - VISG rev 1.pdf
Deleted: PO-SI1712037-07 - Pelco - VISG rev 2.pdf
Deleted: PO-SI1712037-07 - Pelco - VISG rev 3.pdf
Deleted: PO-SI1712037-07 - Pelco - VISG.pdf
Deleted: PO-SI1808035-01 - via VTS _.pdf
Deleted: PO-SI1808035-01 - via VTS.pdf

Operation Summary:
Total PDF files found: 9671
Number of uniqu

In [2]:
import os
import re
from collections import defaultdict

def analyze_files(directory='.'):
    """
    Analyze PDF files in a directory and count how many redundant files exist.
    
    Args:
        directory (str): Directory path to scan for files. Defaults to current directory.
    
    Returns:
        tuple: (total_files, groups_count, files_to_remove_count, grouped_files)
    """
    # Get all PDF files in the directory
    all_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]
    
    # Dictionary to group related files
    groups = defaultdict(list)
    
    # Regular expression to match PO numbers
    po_pattern = re.compile(r'(PO\d{7}-\d{2})')
    
    # Group files by PO number
    for filename in all_files:
        match = po_pattern.search(filename)
        if match:
            po_number = match.group(1)
            groups[po_number].append(filename)
    
    # Count files that would be removed (all files except longest in each group)
    files_to_remove_count = 0
    grouped_files = defaultdict(dict)
    
    for po_number, group_files in groups.items():
        if len(group_files) > 0:
            files_to_remove_count += len(group_files) - 1  # All files except one to keep
            grouped_files[po_number] = {
                'total': len(group_files),
                'to_remove': len(group_files) - 1,
                'example': max(group_files, key=len)  # longest filename as example
            }
    
    return len(all_files), len(groups), files_to_remove_count, grouped_files

def main():
    """
    Main function to analyze files and display results.
    """
    try:
        main_dir = r'F:\orig\CT_new_merge_7234_1296_1712 - Copy'

        total_files, groups_count, files_to_remove_count, grouped_files = analyze_files(main_dir)
        
        print(f"\nAnalysis Summary:")
        print(f"Total PDF files found: {total_files}")
        print(f"Number of unique PO groups: {groups_count}")
        print(f"Files that could be removed: {files_to_remove_count}")
        
        # Show detailed breakdown if there are groups with multiple files
        if files_to_remove_count > 0:
            print("\nBreakdown by PO number:")
            for po_number, info in grouped_files.items():
                if info['to_remove'] > 0:  # Only show groups with removable files
                    print(f"\n{po_number}:")
                    print(f"  Total files: {info['total']}")
                    print(f"  Could remove: {info['to_remove']}")
                    print(f"  Example: {info['example']}")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()


Analysis Summary:
Total PDF files found: 10238
Number of unique PO groups: 456
Files that could be removed: 567

Breakdown by PO number:

PO2201001-01:
  Total files: 2
  Could remove: 1
  Example: 1. PO2201001-01 - Autronica for HSTech.pdf

PO2201002-01:
  Total files: 4
  Could remove: 3
  Example: 1. PO2201002-01 rev 1 - patch cord for Yoko.pdf

PO2201003-01:
  Total files: 2
  Could remove: 1
  Example: 1. PO2201003-01- Fibertek for Fuji.pdf

PO2201004-01:
  Total files: 3
  Could remove: 2
  Example: PO2201004-01- Fibertek for CAD (revised).pdf

PO2201005-01:
  Total files: 3
  Could remove: 2
  Example: VECTOR VIETNAM PO2201005-01 dated 15.02.2022.pdf

PO2202006-01:
  Total files: 2
  Could remove: 1
  Example: 1. PO2202006-01 - Det-tronics for TNTE.pdf

PO2202007-01:
  Total files: 4
  Could remove: 3
  Example: PO2202007-01- Hirschmann for NTT rev2 - gui khach.pdf

PO2202008-01:
  Total files: 4
  Could remove: 3
  Example: 1. PO2202008-01 - Det-tronics for Bach Viet.pdf

PO22

### Count not remove

In [4]:
import os

def count_files(directory_path, keywords):
    # Initialize counters
    keyword_counts = {keyword: 0 for keyword in keywords}
    total_matched_files = 0
    numeric_files = []
    total_files = 0
    
    try:
        # Walk through directory
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                total_files += 1
                file_matched = False
                filename, extension = os.path.splitext(file)
                
                # Check if filename contains only numbers
                if filename.isdigit():
                    full_path = os.path.join(root, file)
                    numeric_files.append({
                        'filename': file,
                        'path': full_path
                    })
                
                # Check each keyword
                for keyword in keywords:
                    if keyword.lower() in file.lower():  # Case-insensitive search
                        keyword_counts[keyword] += 1
                        if not file_matched:
                            total_matched_files += 1
                            file_matched = True
        
        # Print keyword match results
        print("\nKeyword Match Results:")
        print("-" * 50)
        for keyword, count in keyword_counts.items():
            print(f"* Number of files has keywords '{keyword}': {count}")
            
        print(f"\nTotal unique files matching keywords: {total_matched_files}")
        
        # Print numeric filename results
        print("\nNumeric Filename Results:")
        print("-" * 50)
        print(f"Total files scanned: {total_files}")
        print(f"Files with only numbers in filename: {len(numeric_files)}")

        
    except Exception as e:
        print(f"Error: {e}")
    
    return keyword_counts, numeric_files

# Example usage
excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
        'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
        'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
        'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
        'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
        'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
        'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
        'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
        'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
        'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
        'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
        'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
        'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
        'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
        'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
        'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
        'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
        'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
        'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
        'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
        'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
        'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
        'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
        'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
        'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
        'bang ke chi tiet', 'STFFD-Change Order-Variation Order Request-Rev1', 'SGNIR0023',
        'SGNIR0022', 'SGNIR0021', 'SGNIR0020', 'SGNIR0019', 'SGNIR0018', 'SGNIR0017', 'SGNIR0016',
        'SGNIR0015', 'SGNIR0013', 'Product test report_', 'order confirmation 20', 'COO_CQ', 'COO_COC',
        'Confirmation Letter', 'CO issued by ', 'Biên bản nghiệm thu', 'Biên bản Kiểm tra Hàng hóa',
        'AWB 77', '08-195353-06-PO-VXM-VECTOR-HQC- signed PO_Page'  
        ]

directory = r'F:\orig\CT_new_merge_7234_1296_1712 - Copy'

keyword_counts, numeric_files = count_files(directory, excluded_substrings)


Keyword Match Results:
--------------------------------------------------
* Number of files has keywords 'F32': 0
* Number of files has keywords 'F60': 0
* Number of files has keywords 'phieu bao hanh': 0
* Number of files has keywords 'livery note': 0
* Number of files has keywords 'DNTT': 0
* Number of files has keywords 'ĐNTT': 0
* Number of files has keywords 'Shipping mark': 0
* Number of files has keywords 'Shipping Mark': 0
* Number of files has keywords 'shipping mark': 0
* Number of files has keywords 'SHIPPING MARK': 0
* Number of files has keywords 'De nghi thanh toan': 0
* Number of files has keywords 'thong bao giao hang': 0
* Number of files has keywords 'Thông báo giao hàng': 0
* Number of files has keywords 'phieu kiem hang': 0
* Number of files has keywords 'kiểm hàng': 0
* Number of files has keywords 'cu di cong tac': 0
* Number of files has keywords 'bao hanh': 0
* Number of files has keywords 'thong bao hang den': 0
* Number of files has keywords 'Thông báo hàng đ

### Remove files

In [None]:
import os

def count_and_remove_files(directory_path, keywords):
    # Initialize counters
    keyword_counts = {keyword: 0 for keyword in keywords}
    total_matched_files = 0
    numeric_files = []
    total_files = 0
    files_removed = 0
    
    try:
        # Walk through directory
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                total_files += 1
                file_matched = False
                filename, extension = os.path.splitext(file)
                file_path = os.path.join(root, file)
                should_remove = False
                
                # Check if filename contains only numbers
                if filename.isdigit():
                    numeric_files.append({
                        'filename': file,
                        'path': file_path
                    })
                    should_remove = True
                
                # Check each keyword
                for keyword in keywords:
                    if keyword.lower() in file.lower():  # Case-insensitive search
                        keyword_counts[keyword] += 1
                        if not file_matched:
                            total_matched_files += 1
                            file_matched = True
                        should_remove = True
                
                # Remove file if it matches either condition
                if should_remove:
                    try:
                        os.remove(file_path)
                        files_removed += 1
                    except Exception as e:
                        print(f"Error removing {file_path}: {e}")
        
        # Print keyword match results
        print("\nKeyword Match Results:")
        print("-" * 50)
        for keyword, count in keyword_counts.items():
            print(f"* Number of files has keywords '{keyword}': {count}")
            
        print(f"\nTotal unique files matching keywords: {total_matched_files}")
        
        # Print numeric filename results
        print("\nNumeric Filename Results:")
        print("-" * 50)
        print(f"Total files scanned: {total_files}")
        print(f"Files with only numbers in filename: {len(numeric_files)}")
        
        # Print removal summary
        print("\nRemoval Summary:")
        print("-" * 50)
        print(f"Total files removed: {files_removed}")

    except Exception as e:
        print(f"Error: {e}")
    
    return keyword_counts, numeric_files

# Example usage
excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
        'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
        'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
        'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
        'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
        'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
        'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
        'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
        'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
        'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
        'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
        'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
        'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
        'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
        'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
        'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
        'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
        'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
        'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
        'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
        'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
        'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
        'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
        'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
        'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
        'bang ke chi tiet', 'STFFD-Change Order-Variation Order Request-Rev1', 'SGNIR0023',
        'SGNIR0022', 'SGNIR0021', 'SGNIR0020', 'SGNIR0019', 'SGNIR0018', 'SGNIR0017', 'SGNIR0016',
        'SGNIR0015', 'SGNIR0013', 'Product test report_', 'order confirmation 20', 'COO_CQ', 'COO_COC',
        'Confirmation Letter', 'CO issued by ', 'Biên bản nghiệm thu', 'Biên bản Kiểm tra Hàng hóa',
        'AWB 77', '08-195353-06-PO-VXM-VECTOR-HQC- signed PO_Page', 'Work complet', ' HHNK 00',
        'HHNK + TBTP', ' VCND ', 'ihoadon.vn_031', 'Huynh Phuc Tho', 'EIR - CT190', ' VECTOR VIETNAM dated ',
        'DN190', 'DN - CT19', 'DHL AWB and CI', 'Delivery Ticket', 'Delivery Ninh Binh FertilizerToxic Gas  Detector',
        'CO by', 'CI for PO 44 ', 'Chung nhan xuat xu cap boi', 'Vo Tram Anh-', 'Vector Infotech Vietnam 22A7021v - CI Rev.3'
        ]

# directory = r'F:\orig\CT_in_Procur_merge_11184_orig - Copy'
directory = r'F:\orig\CT_new_merge_7234_1296_1712 - Copy'

keyword_counts, numeric_files = count_and_remove_files(directory, excluded_substrings)


Keyword Match Results:
--------------------------------------------------
* Number of files has keywords 'F32': 0
* Number of files has keywords 'F60': 0
* Number of files has keywords 'phieu bao hanh': 0
* Number of files has keywords 'livery note': 0
* Number of files has keywords 'DNTT': 0
* Number of files has keywords 'ĐNTT': 0
* Number of files has keywords 'Shipping mark': 0
* Number of files has keywords 'Shipping Mark': 0
* Number of files has keywords 'shipping mark': 0
* Number of files has keywords 'SHIPPING MARK': 0
* Number of files has keywords 'De nghi thanh toan': 0
* Number of files has keywords 'thong bao giao hang': 0
* Number of files has keywords 'Thông báo giao hàng': 0
* Number of files has keywords 'phieu kiem hang': 0
* Number of files has keywords 'kiểm hàng': 0
* Number of files has keywords 'cu di cong tac': 0
* Number of files has keywords 'bao hanh': 0
* Number of files has keywords 'thong bao hang den': 0
* Number of files has keywords 'Thông báo hàng đ