In [58]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
import PyPDF2
pdf_dir = "pdfs/"
id_numders_csv = 'Lead_Proposals_PDFs.csv'

In [28]:
numbers = pd.read_csv(id_numders_csv)

There are a few ways to group the numbers. We'll use a default dict. This is a dicionary where each key has some data structure, even when it hasn't been created. For example, if we create number_groups = defaultdict(list) this will be a dictionary where all values are a list. We can then append to those lists. So each key will be a lead number, and the values of the dictionary will be a list of proposal numbers.

In [44]:
numbers

Unnamed: 0,Lead,Proposal
0,2023106,2023106
1,2023288,2023288
2,2023288,2023092
3,2022392,2022392
4,2022392,2022424
5,2022392,2022389


In [45]:
number_groups = defaultdict(list)

In [46]:
for lead, proposal in zip(numbers['Lead'], numbers['Proposal']):
    number_groups[lead].append(proposal)

In [47]:
number_groups

defaultdict(list,
            {2023106: [2023106],
             2023288: [2023288, 2023092],
             2022392: [2022392, 2022424, 2022389]})

So now we have groups of numbers, and we can use this to select files. This is a good user for pathlib. We create a path object where all the PDFs. We can then use regular expressions to get files based on number.

In [48]:
pdfs = Path(pdf_dir)

In [50]:
list(pdfs.glob('2023106*'))

[PosixPath('pdfs/2023106.pdf'), PosixPath('pdfs/2023106_COA.pdf')]

Note that this returns more path objects. For simplicity, you probably want just the file path as a string.

In [51]:
[i.as_posix() for i in pdfs.glob('2023106*')]

['pdfs/2023106.pdf', 'pdfs/2023106_COA.pdf']

Now let's write a function that takes a key from the dictionary and returns all files associated with it.

In [52]:
def pdf_file_getter(a_dict_key):
    files = []
    for proposal in number_groups[a_dict_key]:
        files.extend([i.as_posix() for i in pdfs.glob('{}*'.format(proposal))])
    return files

In [53]:
pdf_file_getter(2023106)

['pdfs/2023106.pdf', 'pdfs/2023106_COA.pdf']

In [54]:
pdf_file_getter(2023288)

['pdfs/2023288.pdf',
 'pdfs/2023288_COA.pdf',
 'pdfs/2023092_COA.pdf',
 'pdfs/2023092.pdf']

In [55]:
pdf_file_getter(2022392)

['pdfs/2022392_COA.pdf', 'pdfs/2022424_COA.pdf', 'pdfs/2022389_COA.pdf']

Now, a function that takes a list of files and combines them into one pdf

In [65]:
def pdf_combiner(pdf_list):
    pdfs = [PyPDF2.PdfFileReader(i) for i in pdf_list]
    pdfWriter = PyPDF2.PdfFileWriter()
    for pdf in pdfs:
        for pagenum in range(pdf.numPages):
            page = pdf.getPage(pagenum)
            pdfWriter.addPage(page)
    return pdfWriter

In [66]:
pdf_combiner(pdf_file_getter(2022392))

<PyPDF2.pdf.PdfFileWriter at 0x1242d8d90>

And finally, a function to write to disk.

In [67]:
def pdf_writer(pdf_object, file_name):
    with open(file_name, 'wb') as outfile:
        pdf_object.write(outfile)

In [68]:
num = 2022392
file_list = pdf_file_getter(num)
pdf_object = pdf_combiner(file_list)
pdf_writer(pdf_object, "{}_combined.pdf".format(num))

Create all files at once

In [70]:
for num in number_groups.keys():
    file_list = pdf_file_getter(num)
    pdf_object = pdf_combiner(file_list)
    pdf_writer(pdf_object, "{}_combined.pdf".format(num))

For shits and giggles, here's a single function to do everything.

In [71]:
def pdf_writer(number_csv, pdf_directory, output_suffix="_combined"):
    numbers = pd.read_csv(number_csv)
    pdf_dir = Path(pdf_directory)
    number_groups = defaultdict(list)
    for lead, proposal in zip(numbers['Lead'], numbers['Proposal']):
        number_groups[lead].append(proposal)
    def pdf_file_getter(a_dict_key):
        files = []
        for proposal in number_groups[a_dict_key]:
            files.extend([i.as_posix() for i in pdf_dir.glob('{}*'.format(proposal))])
        return files
    def pdf_combiner(pdf_list):
        pdfs = [PyPDF2.PdfFileReader(i) for i in pdf_list]
        pdfWriter = PyPDF2.PdfFileWriter()
        for pdf in pdfs:
            for pagenum in range(pdf.numPages):
                page = pdf.getPage(pagenum)
                pdfWriter.addPage(page)
        return pdfWriter
    def pdf_writer(pdf_object, file_name):
        with open(file_name, 'wb') as outfile:
            pdf_object.write(outfile)
    for num in number_groups.keys():
        file_list = pdf_file_getter(num)
        pdf_object = pdf_combiner(file_list)
        pdf_writer(pdf_object, "{0}{1}.pdf".format(num, output_suffix))

In [72]:
pdf_writer(id_numders_csv, pdf_dir)