In [1]:
import pandas as pd
from pathlib import Path
from collections import defaultdict

# Compare the header rows of every CSV in csv_dir and report whether all files
# share one column layout or several different ones.
csv_dir = Path('../source/csv_results')


def read_csv_headers(directory):
    """Return ``{filename: [column names]}`` for each ``*.csv`` directly in ``directory``.

    Only the header row is parsed (``nrows=0``), so no data rows are loaded.
    Files are visited in sorted path order. An empty or missing directory
    yields an empty dict.
    """
    return {
        csv_path.name: list(pd.read_csv(csv_path, nrows=0).columns)
        for csv_path in sorted(Path(directory).glob('*.csv'))
    }


def group_files_by_columns(columns_by_file):
    """Group filenames by their exact, ordered tuple of column names.

    Returns a dict mapping each distinct column tuple to the list of filenames
    that have it; both structures and filenames keep first-seen order.
    """
    groups = {}
    for filename, cols in columns_by_file.items():
        groups.setdefault(tuple(cols), []).append(filename)
    return groups


def diff_columns(reference_cols, other_cols):
    """Compare two column sequences by name, ignoring order.

    Returns ``(common, only_in_reference, only_in_other)`` as sorted lists so
    the printed report is the same on every run.
    """
    reference, other = set(reference_cols), set(other_cols)
    return (
        sorted(reference & other),
        sorted(reference - other),
        sorted(other - reference),
    )


# Dictionary to store column names for each file
column_dict = read_csv_headers(csv_dir)
for filename, cols in column_dict.items():
    print(f"{filename}: {len(cols)} columns")

print(f"\n{'='*60}")
print(f"Total CSV files analyzed: {len(column_dict)}")
print(f"{'='*60}")

# Check if all column sets are identical (same names in the same order)
all_columns = list(column_dict.values())
first_columns = all_columns[0] if all_columns else []  # guard: no CSVs found
unique_column_sets = group_files_by_columns(column_dict)
all_identical = len(unique_column_sets) == 1

if not column_dict:
    print(f"\n⚠️  No CSV files found in {csv_dir}")
elif all_identical:
    print("\n✅ All CSV files have IDENTICAL column names!")
    print(f"\nColumn names ({len(first_columns)} columns):")
    for i, col in enumerate(first_columns, 1):
        print(f"  {i}. {col}")
else:
    print("\n⚠️  CSV files have DIFFERENT column names!")
    print(f"\nFound {len(unique_column_sets)} different column structures:\n")

    for i, (cols, files) in enumerate(unique_column_sets.items(), 1):
        print(f"\n--- Structure {i} ({len(cols)} columns) ---")
        print(f"Files with this structure: {', '.join(files)}")
        print(f"\nColumns:")
        for j, col in enumerate(cols, 1):
            print(f"  {j}. {col}")

    # Show differences between structures: every other structure is compared
    # against structure 1, so the report works for any number of structures.
    print(f"\n{'='*60}")
    print("DIFF ANALYSIS")
    print(f"{'='*60}")

    structures = list(unique_column_sets)
    reference_structure = structures[0]
    for k, cols in enumerate(structures[1:], 2):
        common, only_in_first, only_in_other = diff_columns(reference_structure, cols)

        print(f"\n--- Structure 1 vs structure {k} ---")
        print(f"Common columns: {len(common)}")
        print(f"Only in structure 1: {len(only_in_first)}")
        for col in only_in_first:
            print(f"  - {col}")

        print(f"Only in structure {k}: {len(only_in_other)}")
        for col in only_in_other:
            print(f"  - {col}")

        if not only_in_first and not only_in_other:
            # The tuples differ but the name sets match.
            print("  (same set of column names; ordering differs)")


Athabasca_Results_Nov6_2024.csv: 56 columns
Calgary_Results_Oct23.csv: 56 columns
Concordia_Results_Oct24.csv: 56 columns
Laval_Results_Oct20.csv: 56 columns
Manitoba_Results_Oct23.csv: 56 columns
McGill-Queens_Results_Oct20.csv: 56 columns
Memorial_Final_Title_List.csv: 56 columns
Nunavut_Results_Oct24.csv: 60 columns
PIMS_Results_Oct23.csv: 56 columns
Regina_Results_Oct24.csv: 56 columns
Toronto_Results_Oct19.csv: 56 columns
UAlberta_Final_List_April1_2025.csv: 56 columns
UBC_Results_Oct23.csv: 56 columns
UMontreal_Results_Oct24.csv: 56 columns
UQuebec_Results_Oct24.csv: 56 columns
Wilfrid_Laurier_Results_Oct23_Revised.csv: 57 columns

Total CSV files analyzed: 16

⚠️  CSV files have DIFFERENT column names!

Found 3 different column structures:


--- Structure 1 (56 columns) ---
Files with this structure: Athabasca_Results_Nov6_2024.csv, Calgary_Results_Oct23.csv, Concordia_Results_Oct24.csv, Laval_Results_Oct20.csv, Manitoba_Results_Oct23.csv, McGill-Queens_Results_Oct20.csv, Memori

In [1]:
import pandas as pd
from pathlib import Path
import warnings

# Extract a fixed set of columns from every CSV in csv_dir, tag each row with
# the file it came from, stack the results and save them as one CSV.

# Define columns to extract
columns_to_extract = [
    'ID',
    'TITLE_AUTHOR_DATE_COMBINED_NORMALIZED',
    'TITLE',
    'TITLE_REMAINDER',
    'AUTHOR',
    'AUTHOR_QUALIFIER',
    'AUTHOR_DATE',
    'EDITION',
    'BEGIN_PUBLICATION_DATE',
    'PUBDATE_260',
    'PUBDATE_264',
    'PUBPLACE_260',
    'PUBPLACE_264',
    'PUBLISHER_260',
    'PUBLISHER_264',
    'EXTENT',
    'ASSOCIATED_ISBNS',
    'ASSOCIATED_OCLC_NUMBERS',
    'LCCN',
    'TITLE_MATCH_COUNT',
    'RETENTIONS_APPLIED'
]

# Columns left to pandas' dtype inference. Every other extracted column is read
# as text so identifiers and dates keep leading zeros and never turn into
# floats such as "12345.0" when a file has blank cells.
# NOTE(review): TITLE_MATCH_COUNT is presumably an integer count - confirm.
NUMERIC_COLUMNS = frozenset({'TITLE_MATCH_COUNT'})

# Define path to CSV files and to the combined output
csv_dir = Path('../source/csv_results')
output_path = Path('../source/combined_results.csv')


def extract_columns(df, columns, source_name):
    """Return ``(subset, missing)`` for one source frame.

    ``subset`` is a new frame holding exactly ``columns`` in that order,
    preceded by a ``SOURCE_DATASET`` column filled with ``source_name``.
    Columns absent from ``df`` are added filled with ``None``. ``missing``
    lists those absent column names in the order they appear in ``columns``.
    ``df`` itself is not modified.
    """
    available = set(df.columns)
    present = [col for col in columns if col in available]
    missing = [col for col in columns if col not in available]

    subset = df[present].copy()
    # Add missing columns as empty (None) values so every frame has the same layout
    for col in missing:
        subset[col] = None

    # Reorder columns to match the requested list, then tag the source
    subset = subset[list(columns)]
    subset.insert(0, 'SOURCE_DATASET', source_name)
    return subset, missing


def load_source_csv(csv_path, columns, numeric_columns=NUMERIC_COLUMNS):
    """Read one CSV and return ``extract_columns`` output for it.

    Only the requested columns are parsed (``usecols``); requested columns that
    the file lacks are simply not read and are reported as missing. Columns not
    named in ``numeric_columns`` are read as ``str``. The file stem (filename
    without ``.csv``) is used as the source name.
    """
    csv_path = Path(csv_path)
    wanted = set(columns)
    text_dtypes = {col: str for col in columns if col not in numeric_columns}
    df = pd.read_csv(
        csv_path,
        usecols=lambda name: name in wanted,  # a callable tolerates absent columns
        dtype=text_dtypes,
    )
    return extract_columns(df, columns, csv_path.stem)


# Glob once so the announced count and the processed files cannot disagree
csv_files = sorted(csv_dir.glob('*.csv'))

# List to store dataframes
dfs_to_combine = []

print(f"Processing {len(csv_files)} CSV files...\n")

# Process each CSV file
for csv_file in csv_files:
    print(f"Processing: {csv_file.name}")

    df_subset, missing_columns = load_source_csv(csv_file, columns_to_extract)
    for col in missing_columns:
        warnings.warn(f"Column '{col}' not found in {csv_file.name}")

    present_count = len(columns_to_extract) - len(missing_columns)
    print(f"  ✓ Extracted {len(df_subset)} rows, {present_count}/{len(columns_to_extract)} columns present")
    if missing_columns:
        print(f"  ⚠️  Missing columns: {', '.join(missing_columns)}")

    dfs_to_combine.append(df_subset)

print(f"\n{'='*60}")

if dfs_to_combine:
    print("Combining all datasets...")

    # Combine all dataframes vertically
    combined_df = pd.concat(dfs_to_combine, axis=0, ignore_index=True)

    print(f"✓ Combined dataset shape: {combined_df.shape[0]:,} rows × {combined_df.shape[1]} columns")

    # Save to CSV
    combined_df.to_csv(output_path, index=False)

    print(f"✓ Saved to: {output_path}")
    print(f"\n{'='*60}")
    print("Summary by source:")
    print(combined_df['SOURCE_DATASET'].value_counts().sort_index())
    print(f"{'='*60}")
else:
    # pd.concat raises on an empty list; keep an empty, correctly shaped frame
    # and leave any existing output file untouched.
    combined_df = pd.DataFrame(columns=['SOURCE_DATASET', *columns_to_extract])
    print(f"⚠️  No CSV files found in {csv_dir} - nothing combined, nothing saved.")


Processing 16 CSV files...

Processing: Athabasca_Results_Nov6_2024.csv
  ✓ Extracted 225 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Calgary_Results_Oct23.csv
  ✓ Extracted 595 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Concordia_Results_Oct24.csv
  ✓ Extracted 187 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Laval_Results_Oct20.csv
  ✓ Extracted 5787 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Manitoba_Results_Oct23.csv
  ✓ Extracted 386 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: McGill-Queens_Results_Oct20.csv
  ✓ Extracted 7982 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Memorial_Final_Title_List.csv
  ✓ Extracted 253 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Nunavut_Results_Oct24.csv
  ✓ Extracted 100 rows, 20/21 col



  ✓ Extracted 9451 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: UAlberta_Final_List_April1_2025.csv
  ✓ Extracted 1167 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: UBC_Results_Oct23.csv
  ✓ Extracted 2406 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: UMontreal_Results_Oct24.csv
  ✓ Extracted 10907 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: UQuebec_Results_Oct24.csv
  ✓ Extracted 4783 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED
Processing: Wilfrid_Laurier_Results_Oct23_Revised.csv
  ✓ Extracted 881 rows, 20/21 columns present
  ⚠️  Missing columns: RETENTIONS APPLIED

Combining all datasets...
✓ Combined dataset shape: 47,161 rows × 22 columns
✓ Saved to: ../source/combined_results.csv

Summary by source:
SOURCE_DATASET
Athabasca_Results_Nov6_2024                225
Calgary_Results_Oct23                      595
Co