In [None]:
import pandas as pd
import glob
from pathlib import Path
import xml.etree.ElementTree as ET

# Widen pandas' per-cell display limit so long values are not truncated
desired_width = 1000  # Adjust this value as needed
pd.set_option('display.max_colwidth', desired_width)

# Location of the XML input: either a folder of XML files or one XML file
# (the path below names a single file).
directory_path = Path(r'C:\Users\41015078\Desktop\New folder\19008365_01032025_025549.XML')

# Build the list of files to scan. Globbing '<file>/*.xml' can never match
# anything, so a path that points at a file is used as-is; a folder is
# searched for *.xml entries.
if directory_path.is_file():
    xml_files = [str(directory_path)]
else:
    xml_files = glob.glob(str(directory_path / '*.xml'))

# One dictionary per G1 element found across all files
data_list = []

# Tags whose presence is reported as a 'yes'/'no' column
tags_to_check = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D1251', 'D1300']

for xml_file in xml_files:
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # The lookup searches everything below the document root, so the answer is
    # the same for every G1 element of this file: compute it once per file.
    tag_status = {
        tag: 'yes' if root.find(f".//{tag}") is not None else 'no'
        for tag in tags_to_check
    }

    # One output row per G1 element: its text as METER plus the tag flags
    for G1 in root.iter('G1'):
        data_list.append({'METER': G1.text, **tag_status})

# Create a DataFrame from the list of dictionaries
data_frame = pd.DataFrame(data_list)

# Display the DataFrame
display(data_frame)
# data_frame.to_csv(r'C:\Users\Dell\Desktop\h\tags.csv')


### Updated code for bulk XML files (final)

In [None]:
import pandas as pd
import glob
from pathlib import Path
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Widen pandas' per-cell display limit so long values are not truncated
desired_width = 1000  # Adjust this value as needed
pd.set_option('display.max_colwidth', desired_width)

# Folder containing the XML files
directory_path = Path(r'C:\Users\41015078\Desktop\New folder')

# Every path in that folder matching *.xml
xml_files = glob.glob(str(directory_path / '*.xml'))

# data_list: one dictionary per G1 element; faulty_xml: files that failed
data_list = []
faulty_xml = []

# Tags whose presence is reported as a 'yes'/'no' column
tags_to_check = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D1251', 'D1300']


for xml_file in tqdm(xml_files):
    try:
        root = ET.parse(xml_file).getroot()

        # The lookup searches everything below the document root, so the
        # answer is the same for every G1 element: compute it once per file.
        tag_status = {
            tag: 'yes' if root.find(f".//{tag}") is not None else 'no'
            for tag in tags_to_check
        }

        # One output row per G1 element: its text as METER plus the tag flags
        for G1 in root.iter('G1'):
            data_list.append({'METER': G1.text, **tag_status})
    except Exception:
        # Catch Exception rather than everything: a kernel interrupt
        # (KeyboardInterrupt) must stop the run, not be logged as a bad file.
        faulty_xml.append(xml_file)

# After the loop, save the faulty XML files to a CSV
faulty_xml_df = pd.DataFrame(faulty_xml, columns=['Faulty_XML_Files'])
faulty_xml_df.to_csv(r'C:\Users\41015078\Desktop\New folder\faulty_xml_files.csv', index=False)

print("Faulty XML files have been saved to 'faulty_xml_files.csv'")


print("No. of Faulty xmls")
print(len(faulty_xml))

# Create a DataFrame from the list of dictionaries
data_frame = pd.DataFrame(data_list)

# Display the DataFrame
display(data_frame)


Final XML Tags Parameter Status Script

In [1]:
import pandas as pd
import glob
from pathlib import Path
import xml.etree.ElementTree as ET
from tqdm import tqdm  # Changed back to regular tqdm
import concurrent.futures
import threading
from queue import Queue
import time
import os
import warnings
# Silence DeprecationWarning messages
warnings.filterwarnings(action="ignore", category=DeprecationWarning)


# Let pandas show up to this many characters per cell before truncating
desired_width = 1000  # Adjust this value as needed
pd.set_option('display.max_colwidth', desired_width)


# Folder holding the XML files to scan
directory_path = Path(r'Y:\FY2024_25\FEB25\EMD XML FILES\1\Completed')


# Every path in that folder matching *.xml
xml_files = glob.glob(str(directory_path.joinpath('*.xml')))


# Final result containers: extracted records, failed files, successful files
data_list, faulty_xml, normal_xml = [], [], []


# Tags reported as 'yes'/'no' columns: D1 through D11, plus D1251 and D1300
tags_to_check = [f'D{number}' for number in range(1, 12)] + ['D1251', 'D1300']


# Thread-safe queues the worker threads write their results into
data_queue, faulty_queue, normal_queue = Queue(), Queue(), Queue()


def process_xml_file(xml_file):
    """Parse one XML file and queue a record for each of its G1 elements.

    Every record holds the G1 element's text under 'METER' plus one
    'yes'/'no' entry per tag in ``tags_to_check``; 'yes' means the tag occurs
    somewhere below the document root. Records go to ``data_queue``.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        None. On success the path is put on ``normal_queue``. If any
        ``Exception`` is raised, the path is put on ``faulty_queue`` and the
        error is printed instead of being propagated.
    """
    try:
        root = ET.parse(xml_file).getroot()

        # The lookup covers the whole document, so the result is identical for
        # every G1 element of this file: compute it once instead of per G1.
        tag_status = {
            tag: 'yes' if root.find(f".//{tag}") is not None else 'no'
            for tag in tags_to_check
        }

        # One record per G1 element: its text as METER plus the tag flags
        for G1 in root.iter('G1'):
            data_queue.put({'METER': G1.text, **tag_status})

        # If we get here, the file was processed successfully
        normal_queue.put(xml_file)

    except Exception as e:
        # Record the failure and keep going so one bad file cannot stop the batch
        faulty_queue.put(xml_file)
        print(f"Error processing {xml_file}: {str(e)}")


def process_files_with_threading():
    """Run process_xml_file over every path in xml_files on a thread pool.

    Uses eight worker threads and advances a progress bar by one each time a
    submitted task completes. Returns once all tasks have finished.
    """
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=8)
    with pool:
        # Queue one task per file
        pending = []
        for path in xml_files:
            pending.append(pool.submit(process_xml_file, path))

        # Tick the bar as tasks complete, in whatever order they finish
        progress = tqdm(total=len(xml_files), desc="Processing XML files")
        with progress:
            for _ in concurrent.futures.as_completed(pending):
                progress.update(1)


def collect_results():
    """Move everything held in the result queues into the result lists.

    Drains data_queue into data_list, then faulty_queue into faulty_xml, then
    normal_queue into normal_xml, each until the queue reports empty.
    """
    transfers = (
        (data_queue, data_list),
        (faulty_queue, faulty_xml),
        (normal_queue, normal_xml),
    )
    for source, destination in transfers:
        while True:
            if source.empty():
                break
            destination.append(source.get())


# All three reports are written to this one folder. Create it before the scan
# starts so that a missing folder cannot make the save step fail after the
# (long) processing run has already completed.
output_dir = Path(r'C:\Users\41015078\Desktop\3Phase XML Tag Status FrontEnd')
output_dir.mkdir(parents=True, exist_ok=True)


# Start processing
print("Starting XML processing with threading...")
start_time = time.time()


process_files_with_threading()


print("Collecting results...")
collect_results()


# Save the faulty XML files to a CSV
faulty_xml_df = pd.DataFrame(faulty_xml, columns=['Faulty_XML_Files'])
faulty_xml_df.to_csv(output_dir / 'faulty_xml_files.csv', index=False)


# Save the normal XML files to a CSV
normal_xml_df = pd.DataFrame(normal_xml, columns=['Normal_XML_Files'])
normal_xml_df.to_csv(output_dir / 'normal_xml_files.csv', index=False)


# Create a DataFrame from the list of dictionaries
data_frame = pd.DataFrame(data_list)


# Save the processed data to a CSV
data_frame.to_csv(output_dir / 'processed_xml_data.csv', index=False)


# Elapsed time covers processing, result collection and the CSV writes
end_time = time.time()
processing_time = end_time - start_time


print(f"\nProcessing completed in {processing_time:.2f} seconds")
print(f"Total files processed: {len(xml_files)}")
print(f"Normal XML files: {len(normal_xml)}")
print(f"Faulty XML files: {len(faulty_xml)}")
print(f"Total records extracted: {len(data_list)}")


print("\nFiles have been saved:")
print("1. processed_xml_data.csv - Contains all extracted data")
print("2. normal_xml_files.csv - List of successfully processed XML files")
print("3. faulty_xml_files.csv - List of XML files that failed to process")


# Display the first few rows of the processed data
print("\nFinal XML files processed data:")
print(data_frame.head())


Starting XML processing with threading...


Processing XML files: 100%|██████████████████████████████████████████████████████████| 651/651 [01:07<00:00,  9.68it/s]

Collecting results...

Processing completed in 67.31 seconds
Total files processed: 651
Normal XML files: 651
Faulty XML files: 0
Total records extracted: 651

Files have been saved:
1. processed_xml_data.csv - Contains all extracted data
2. normal_xml_files.csv - List of successfully processed XML files
3. faulty_xml_files.csv - List of XML files that failed to process

Final XML files processed data:
      METER   D1   D2   D3   D4   D5  D6  D7   D8   D9  D10 D11 D1251 D1300
0  17008336  yes  yes  yes  yes  yes  no  no  yes  yes  yes  no    no    no
1  17012660  yes  yes  yes  yes  yes  no  no  yes  yes  yes  no    no    no
2  17013571  yes  yes  yes  yes  yes  no  no  yes  yes  yes  no    no    no
3  17010614  yes  yes  yes  yes  yes  no  no  yes  yes  yes  no    no    no
4  17013184  yes  yes  yes  yes  yes  no  no  yes  yes  yes  no    no    no



