In [None]:
import pandas as pd
import glob
import os

def header_differences(reference, headers):
    """Describe how ``headers`` deviates from ``reference``.

    Args:
        reference: Column names (in order) that every file is expected to have.
        headers: Column names (in order) actually found in a file.

    Returns:
        A list of indented, human-readable report lines. The list is empty
        when both header lists are equal (same names, same order).
    """
    if headers == reference:
        return []

    missing = set(reference) - set(headers)
    extra = set(headers) - set(reference)

    report = []
    # Sorted lists (rather than raw sets) keep the output stable between runs.
    if missing:
        report.append(f"  Missing columns: {sorted(missing, key=str)}")
    if extra:
        report.append(f"  Extra columns: {sorted(extra, key=str)}")
    if not report:
        # Same set of names but the lists differ: without this line the
        # mismatch would be flagged with no explanation at all.
        report.append(f"  Column order differs: {headers}")
    return report


# Get the parent directory path
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Find all files in the parent directory matching the webscrape pattern.
# glob returns matches in arbitrary order; sorting makes the choice of the
# reference file (the first one) reproducible.
csv_pattern = os.path.join(parent_dir, "webscrape_*.csv")
csv_files = sorted(glob.glob(csv_pattern))

# Store column headers for each file, keyed by base filename
all_headers = {}
consistent = True
first_file = None
reference_headers = None  # stays None when no file is found

print(f"Found {len(csv_files)} webscrape CSV files")

# Loop through each file and get headers
for file in csv_files:
    filename = os.path.basename(file)

    # nrows=0 reads only the header row, not the data
    try:
        headers = pd.read_csv(file, nrows=0).columns.tolist()
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row; treat it as "no columns"
        # instead of aborting the whole check.
        headers = []
        print(f"\nNote: {filename} is empty (no header row)")
    all_headers[filename] = headers

    # The first file's headers are the reference for all others
    if first_file is None:
        first_file = filename
        reference_headers = headers
        print(f"Reference headers from {first_file}: {reference_headers}")
        continue

    # Compare current file with reference
    differences = header_differences(reference_headers, headers)
    if differences:
        consistent = False
        print(f"\nMISMATCH in {filename}:")
        for line in differences:
            print(line)

# Print final result
if not csv_files:
    # Nothing was compared, so claiming "identical" would be misleading.
    print("\nNo webscrape CSV files found; nothing to compare")
elif consistent:
    print("\nAll files have identical column headers")
else:
    print("\nWarning: Column headers differ between files")

Found 66 webscrape CSV files
Reference headers from webscrape_2023-08-15_09-20-02.csv: ['title', 'company', 'location', 'salary', 'description']

All files have identical column headers!
