Task 3: Organizing PDF Files for Companies

The company folders ("Apple docs" and "Tesla docs") must be located in the directory the notebook is run from (the current working directory).

In [17]:
import calendar
import os
import re
import shutil
from datetime import datetime
from typing import Optional

from PyPDF2 import PdfReader

In [18]:
def _list_new_pdfs(folder: str) -> list:
    """
    List the PDF files that sit directly inside a folder
    :param folder: absolute path to the folder to scan
    :return: Sorted list of file names (not paths) ending in '.pdf'
    """

    # Test every directory entry on its own: scanning a comma-joined listing
    # with a regex truncated names containing '-', '.', '(' or ','.
    return sorted(
        entry
        for entry in os.listdir(folder)
        if entry.lower().endswith('.pdf') and os.path.isfile(os.path.join(folder, entry))
    )


cwd = os.getcwd()

# The company folders are expected next to the notebook, in the working directory.
apple_docs_folder = os.path.join(cwd, 'Apple docs')
tesla_docs_folder = os.path.join(cwd, 'Tesla docs')

# Only PDFs at the top level of each company folder are listed;
# os.listdir does not descend into sub-folders.
apple_new_pdfs = _list_new_pdfs(apple_docs_folder)
tesla_new_pdfs = _list_new_pdfs(tesla_docs_folder)

Extract the date from the title of each new document

In [19]:
def read_pdf_first_page(pdf_file_name: str) -> str:
    """
    Read the first page of the PDF file
    :param pdf_file_name: absolute path to the PDF file
    :return: Text extracted from the first page; an empty string when the PDF
             has no pages or no text could be extracted
    """

    with open(pdf_file_name, 'rb') as opened_file:
        reader = PdfReader(opened_file)
        pages = reader.pages

        # A PDF without pages would otherwise raise IndexError on pages[0].
        if len(pages) == 0:
            return ''

        # Extract while the file is still open; normalise a missing result to ''
        # so callers can always run a regex over the returned value.
        first_page_data = pages[0].extract_text() or ''

    return first_page_data


def get_pdf_file_dates(pdf_file: str, company_folder: str) -> Optional[str]:
    """
    Get the date from the title in the PDF file
    :param pdf_file: name of the PDF file inside the company folder
    :param company_folder: absolute path to the company folder
    :return: First date of the form '{Month} {day}, {year}' found on the first
             page, or None when the page contains no such date
    """

    # Accept only real month names. A generic "capitalised word" pattern also
    # matches text such as 'Page 1, 2020', which cannot be parsed as a month
    # later on. calendar.month_name holds the locale's full month names.
    month_names = '|'.join(re.escape(name) for name in calendar.month_name[1:])
    date_pattern = r'(?:' + month_names + r')[ ]+\d+,[ ]+\d{4}'

    file_path = os.path.join(company_folder, pdf_file)
    first_page_data = read_pdf_first_page(file_path) or ''

    first_date = re.search(date_pattern, first_page_data)
    return first_date.group(0) if first_date else None

In [20]:
# Per company key: {PDF file name: date string found in its title, or None}.
pdf_extracted_dates = {
    docs_key: {pdf_name: get_pdf_file_dates(pdf_name, docs_folder) for pdf_name in new_pdfs}
    for docs_key, docs_folder, new_pdfs in (
        ('apple docs', apple_docs_folder, apple_new_pdfs),
        ('tesla docs', tesla_docs_folder, tesla_new_pdfs),
    )
}

Move the PDFs to the appropriate folders corresponding to the correct quarters
Bonus: Categorize the new documents

In [21]:
# Inclusive bounds of each quarter as [(first month, first day), (last month, last day)].
quarter_dates_ranges = dict(
    Q1=[(1, 1), (3, 31)],
    Q2=[(4, 1), (6, 30)],
    Q3=[(7, 1), (9, 30)],
    Q4=[(10, 1), (12, 31)],
)

In [22]:
def categorize_pdf_files(date: tuple) -> str:
    """
    Tell in which part of its quarter a date falls
    :param date: tuple (month, day) of the date to categorize
    :return: 'Cat1' from the quarter start up to and including the quarter
             midpoint, 'Cat2' from there to the quarter end; None when the
             date lies in none of the ranges in quarter_dates_ranges
    """

    for quarter_start, quarter_end in quarter_dates_ranges.values():
        # Midpoint = quarter start shifted by one month and 14 days.
        midpoint = (quarter_start[0] + 1, quarter_start[1] + 14)

        # (month, day) tuples compare month first, then day.
        if quarter_start <= date <= midpoint:
            return 'Cat1'
        if midpoint < date <= quarter_end:
            return 'Cat2'

    return None

In [23]:
def format_extracted_date(date: str) -> tuple:
    """
    Turn an extracted date string into a (month, day) pair
    :param date: Extracted date from the PDF file title in format '{Month} {day}, {year}'
    :return: Tuple of month number and day number
    """

    # The day is the second word once every comma has been removed.
    words_without_commas = date.replace(',', '').split()
    day = int(words_without_commas[1])

    # The month is the first word, parsed as a full month name.
    month_name = date.split()[0]
    month = datetime.strptime(month_name, "%B").month

    return month, day
    

def get_quarter(date: tuple) -> str:
    """
    Find the quarter a date belongs to
    :param date: date in format (month, day)
    :return: Name of the first quarter in quarter_dates_ranges whose inclusive
             range contains the date, in format 'Q{number}'; None if no range does
    """

    matching_quarters = (
        quarter_name
        for quarter_name, (first_day, last_day) in quarter_dates_ranges.items()
        if first_day <= date <= last_day
    )
    return next(matching_quarters, None)
        

def check_quarter_directory_exists(quarter: str, company_folder: str) -> None:
    """
    Make sure the quarter directory exists, creating it when missing
    :param quarter: the quarter of the document
    :param company_folder: absolute path of the company folder
    :return: None
    :raises FileExistsError: if the quarter path exists but is not a directory
    """

    quarter_path = os.path.join(company_folder, quarter)
    # exist_ok=True makes the call safe to repeat and avoids the gap between
    # a separate existence check and the creation.
    os.makedirs(quarter_path, exist_ok=True)
    

def _unused_destination(path: str) -> str:
    """Return path if nothing exists there, else the first free '<stem>_<n><ext>' (n = 2, 3, ...)."""

    if not os.path.exists(path):
        return path

    stem, extension = os.path.splitext(path)
    counter = 2
    while os.path.exists(f'{stem}_{counter}{extension}'):
        counter += 1
    return f'{stem}_{counter}{extension}'


def move_pdf_to_quarter_folder(pdf_file: str, date_extracted: str, company_folder: str) -> None:
    """
    Move the PDF file to the appropriate quarter folder
    :param pdf_file: name of the PDF file inside the company folder
    :param date_extracted: date in the format it is extracted from the file's title
    :param company_folder: absolute path of the company folder
    :return: None
    :raises ValueError: if the date belongs to no known quarter or category
    """

    date = format_extracted_date(date_extracted)
    quarter = get_quarter(date)
    category = categorize_pdf_files(date)

    # Without this guard an out-of-range date would end up in a 'None' folder.
    if quarter is None or category is None:
        raise ValueError(
            f'Cannot place {pdf_file!r}: date {date_extracted!r} is outside every known quarter'
        )

    check_quarter_directory_exists(quarter, company_folder)

    # The file is renamed to '<category>.pdf'. When that name is already taken
    # in the quarter folder, pick a numbered variant instead of overwriting it.
    destination_path = _unused_destination(
        os.path.join(company_folder, quarter, f'{category}.pdf')
    )
    pdf_abs_path = os.path.join(company_folder, pdf_file)

    shutil.move(pdf_abs_path, destination_path)


def organize_new_company_pdfs(company_name: str) -> None:
    """
    Organize the company's new PDF files into their respective quarter folders
    :param company_name: name of the company ('Apple' or 'Tesla'), matched case-insensitively
    :return: None
    :raises ValueError: if no documents folder is known for the company
    """

    company_docs = f'{company_name.lower()} docs'

    # Resolve the folder through the same key used in pdf_extracted_dates.
    # A substring test against the full folder path picks the wrong folder
    # whenever the working directory itself contains the company name.
    company_folders = {
        'apple docs': apple_docs_folder,
        'tesla docs': tesla_docs_folder,
    }
    if company_docs not in company_folders:
        raise ValueError(f'Unknown company: {company_name!r}')
    company_folder = company_folders[company_docs]

    pdfs = pdf_extracted_dates.get(company_docs, {})

    for pdf_file, extracted_date in pdfs.items():
        # Files whose title yielded no date are left where they are.
        if extracted_date:
            move_pdf_to_quarter_folder(pdf_file, extracted_date, company_folder)
                  

In [24]:
# Sort the new PDFs of each company into its quarter folders, Apple first.
for company in ('Apple', 'Tesla'):
    organize_new_company_pdfs(company)