# Rename irregularly named files into a standard format

Starting point: 
Folders named by year, each containing PDF files with irregular filenames that indicate either a month or a season.

Ending point: 
A regular folder structure (one subfolder per year) with regular filenames of the form `SHHA-GRIT-<year>_<month>.pdf`, indicating the month and year of publication.

In [5]:
import os
import re
import shutil
from pathlib import Path

# Dictionary to map month names, abbreviations, and seasons to month numbers.
# Seasons are mapped to a single representative month.
MONTH_MAP = {
    "january": "01", "jan": "01",
    "february": "02", "feb": "02",
    "march": "03", "mar": "03",
    "april": "04", "apr": "04",
    "may": "05",
    "june": "06", "jun": "06",
    "july": "07", "jul": "07",
    "august": "08", "aug": "08",
    "september": "09", "sep": "09", "sept": "09",
    "october": "10", "oct": "10",
    "november": "11", "nov": "11",
    "december": "12", "dec": "12",
    "winter": "01", "spring": "03", "summer": "06", "fall": "09", "autumn": "09",
}

# Regular expression to capture numeric months (e.g., 01, 1, etc.)
MONTH_NUM_REGEX = re.compile(r'(\d{1,2})')

# A one- or two-digit number that is not part of a longer digit run (so the
# "20" and "14" inside "2014" do not count, but the "03" in "2014-03" does)
_STANDALONE_NUM_REGEX = re.compile(r'(?<!\d)(\d{1,2})(?!\d)')

# Runs of ASCII letters, used to split a lower-cased filename into words
_WORD_REGEX = re.compile(r'[a-z]+')


# Function to detect month from filename
def detect_month(filename):
    """
    Work out which month a filename refers to.

    Checks are tried in this order, and the first hit wins:

    1. A whole word of the filename equals a key of MONTH_MAP
       (e.g. "Summary_June_2014.pdf" -> "06").
    2. A key of MONTH_MAP appears anywhere inside the filename
       (e.g. "GRITJan2014.pdf" -> "01").
    3. The first one or two digits in the filename form a number 1-12
       (e.g. "GRIT 7.pdf" -> "07").
    4. Any standalone one- or two-digit number in the filename is 1-12
       (e.g. "GRIT_2014-03.pdf" -> "03").

    :param filename: The file name (not the full path) to inspect.
    :return: The month as a zero-padded two-character string ("01".."12"),
             or None when no month can be detected.
    """
    filename_lower = filename.lower()

    # Whole words first, so an accidental substring such as the "mar" in
    # "summary" cannot shadow a real month name elsewhere in the filename.
    words = set(_WORD_REGEX.findall(filename_lower))
    for month_name, month_num in MONTH_MAP.items():
        if month_name in words:
            return month_num

    # Fall back to substring matching for names glued to other text
    for month_name, month_num in MONTH_MAP.items():
        if month_name in filename_lower:
            return month_num

    # Check for numeric month (e.g., 01, 1, etc.) at the first digits found
    month_match = MONTH_NUM_REGEX.search(filename_lower)
    if month_match:
        month = month_match.group(1).zfill(2)  # Pad with zero if needed
        if 1 <= int(month) <= 12:  # Ensure it's a valid month
            return month

    # The first digits are often a year ("2014-03"); keep looking for a
    # standalone number that is a valid month instead of giving up.
    for candidate in _STANDALONE_NUM_REGEX.finditer(filename_lower):
        if 1 <= int(candidate.group(1)) <= 12:
            return candidate.group(1).zfill(2)

    return None

# Function to copy files to a new folder with subfolders by year, keeping only the largest file per year/month
def copy_largest_files_to_year_folders(base_folder, destination_folder):
    """
    Copy one PDF per year/month from year-named folders into a clean archive.

    Every immediate subfolder of ``base_folder`` whose name is all digits is
    treated as a year. Each PDF directly inside it is assigned a month with
    ``detect_month``; files for which no month is detected are skipped. When
    several files map to the same year and month, only the largest (by size
    in bytes) is kept. The kept files are copied to
    ``destination_folder/<year>/SHHA-GRIT-<year>_<month>.pdf``.

    :param base_folder: Folder containing the year-named subfolders.
    :param destination_folder: Folder to copy into; created if missing.
    :return: None.
    """
    base_path = Path(base_folder)
    dest_path = Path(destination_folder)

    # Ensure the destination folder exists
    dest_path.mkdir(parents=True, exist_ok=True)

    # (year, month) -> (size in bytes, source path) of the largest file seen.
    # The size is stored so the current winner is not re-stat'ed every time.
    largest_files = {}

    # Sorted iteration makes the winner deterministic when sizes tie
    # (the first path in sorted order is kept).
    for year_folder in sorted(base_path.iterdir()):
        if not (year_folder.is_dir() and year_folder.name.isdigit()):
            continue  # Not a year subfolder
        year = year_folder.name

        for file_path in sorted(year_folder.iterdir()):
            # Accept ".pdf" in any letter case, and only regular files
            # (a directory named "x.pdf" cannot be copied with shutil.copy).
            if file_path.suffix.lower() != ".pdf" or not file_path.is_file():
                continue

            month = detect_month(file_path.name)
            if not month:
                continue  # No month could be detected from the filename

            size = file_path.stat().st_size
            key = (year, month)
            if key not in largest_files or size > largest_files[key][0]:
                largest_files[key] = (size, file_path)

    # Copy the largest files to the destination folder
    for (year, month), (_, file_path) in largest_files.items():
        year_dest_folder = dest_path / year
        year_dest_folder.mkdir(exist_ok=True)

        new_filename = f"SHHA-GRIT-{year}_{month}.pdf"
        shutil.copy(file_path, year_dest_folder / new_filename)




In [6]:
# Example usage
# base_folder is the folder holding the year-named subfolders to read from;
# destination_folder is where the renamed copies are written.
# NOTE(review): the "**" in base_folder is passed through as literal characters
# (Path() does no wildcard expansion) — confirm the folder really has this name.
base_folder = "/Users/heidi/Documents/SHHA/GRIT/GRIT Archive **"
destination_folder = "/Users/heidi/Documents/SHHA/GRIT/GRIT_archive"
copy_largest_files_to_year_folders(base_folder, destination_folder)

In [7]:
from pathlib import Path

def count_pdf_files(folder):
    """
    Tally every entry matching ``*.pdf`` beneath a folder, at any depth.

    :param folder: The path of the folder to search recursively.
    :return: How many matching entries were found.
    """
    total = 0
    for _ in Path(folder).rglob("*.pdf"):
        total += 1
    return total

In [8]:
# Count the PDFs now present under the destination folder (all subfolders included)
count_pdf_files(destination_folder)

341

In [9]:
# Count the PDFs under the source folder for comparison. This can be higher than
# the destination count: the copy step keeps only the largest file per year/month
# and skips files whose month cannot be detected.
count_pdf_files(base_folder)

356

In [12]:
# Write file structure to a text file

import os


def write_folder_structure(folder_path, output_file):
    """
    Write an indented listing of a folder tree to a text file.

    Each directory is written as ``name/`` and each file as ``name``,
    indented four spaces per level below ``folder_path``. Directories and
    files are listed in sorted order so the output is reproducible.

    :param folder_path: Root folder to list. If it does not exist, the
                        output file is created empty.
    :param output_file: Path of the text file to (over)write, as UTF-8.
    :return: None.
    """
    # Normalise so a trailing separator does not produce an empty root name
    base = os.path.normpath(folder_path)

    with open(output_file, 'w', encoding='utf-8') as f:
        for root, dirs, files in os.walk(base):
            # Sorting in place also makes os.walk descend in sorted order
            dirs.sort()

            # Depth comes from the path relative to the root, which stays
            # correct even if the root's name recurs deeper in the tree.
            rel = os.path.relpath(root, base)
            level = 0 if rel == os.curdir else rel.count(os.sep) + 1

            indent = ' ' * 4 * level
            f.write(f'{indent}{os.path.basename(root)}/\n')
            sub_indent = ' ' * 4 * (level + 1)
            for file in sorted(files):
                f.write(f'{sub_indent}{file}\n')


folder_path = 'GRIT_archive'
output_file = 'folder_structure.txt'

write_folder_structure(folder_path, output_file)