In [1]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
import PyPDF2
# Directory that holds the source PDFs; wrapped in pathlib.Path further down.
pdf_dir = "pdfs/"
# CSV with the Lead, Proposal and PI NAME columns used throughout the notebook.
id_numders_csv = 'Lead_Proposals_PDFs.csv'

In [2]:
# load the lead/proposal ID table from the CSV named in the first cell
numbers = pd.read_csv(id_numders_csv)

There are a few ways to group the numbers. We'll use a `defaultdict`. This is a dictionary that creates a default value for any key the first time that key is accessed, even if the key hasn't been added yet. For example, if we create `number_groups = defaultdict(list)`, every missing key starts out as an empty list, so we can append to it right away. Each key will be a lead number, and each value will be the list of proposal numbers that belong to that lead.

In [3]:
# display the table: one row per proposal, with the lead proposal it belongs to
numbers

Unnamed: 0,Lead,Proposal,PI NAME
0,2023106,2023106,"Lewis, Allison"
1,2023288,2023288,"Snyder, Ben"
2,2023288,2023092,"Some, Loser"
3,2022392,2022392,"Sanders, Bernie"
4,2022392,2022424,"Doe, Jane"
5,2022392,2022389,"Random, Person"


In [4]:
# maps lead number -> list of proposal numbers; a missing key starts as an empty list
number_groups = defaultdict(list)

In [5]:
# file every proposal number under its lead number
for lead, proposal in zip(numbers['Lead'], numbers['Proposal']):
    number_groups[lead].append(proposal)

To sort each list so that the lead proposal is always first, we can use the built-in Python `sorted` function, but with a tweak. Normally, `sorted` sorts alphabetically or numerically, but its behavior can be changed by passing a lambda function as the `key`. To see how that works, let's practice on the PI names column first.

In [6]:
# PI names as a NumPy array, used below to demonstrate sorting with a key
a_list = numbers['PI NAME'].values
a_list

array(['Lewis, Allison', 'Snyder, Ben', 'Some, Loser', 'Sanders, Bernie',
       'Doe, Jane', 'Random, Person'], dtype=object)

In [7]:
# default sort: plain string order, which is by last name because names are "Last, First"
sorted(a_list)

['Doe, Jane',
 'Lewis, Allison',
 'Random, Person',
 'Sanders, Bernie',
 'Snyder, Ben',
 'Some, Loser']

In [8]:
# using a key to force S names to appear first
# considers anything starting with S to be key 1 and anything else 2
# sorts according to key; names with the same key keep their original order (the sort is stable)
sorted(a_list, key=lambda x: 1 if x.startswith('S') else 2)

['Snyder, Ben',
 'Some, Loser',
 'Sanders, Bernie',
 'Lewis, Allison',
 'Doe, Jane',
 'Random, Person']

In [9]:
# forcing a specific name to appear first: only that name gets key 1, everything else key 2
sorted(a_list, key=lambda x: 1 if x=='Lewis, Allison' else 2)

['Lewis, Allison',
 'Snyder, Ben',
 'Some, Loser',
 'Sanders, Bernie',
 'Doe, Jane',
 'Random, Person']

In [10]:
# rebuild the groups as a plain dict whose lists start with the lead proposal itself;
# the key is False (sorts first) only for the lead, and the stable sort keeps the rest in order
number_groups = {
    lead: sorted(proposals, key=lambda proposal: proposal != lead)
    for lead, proposals in number_groups.items()
}

For names, we would like a dictionary that maps each lead proposal number to the corresponding PI's last name. We get this by iterating over the lead proposal numbers, which are the keys of `number_groups`.

In [11]:
# the keys of number_groups are the lead proposal numbers
number_groups.keys()

dict_keys([2023106, 2023288, 2022392])

In [12]:
# use each key to subset the data: all rows that belong to that lead's group
for i in number_groups.keys():
    print(numbers.loc[(numbers['Lead']==i)])

      Lead  Proposal         PI NAME
0  2023106   2023106  Lewis, Allison
      Lead  Proposal      PI NAME
1  2023288   2023288  Snyder, Ben
2  2023288   2023092  Some, Loser
      Lead  Proposal          PI NAME
3  2022392   2022392  Sanders, Bernie
4  2022392   2022424        Doe, Jane
5  2022392   2022389   Random, Person


In [13]:
# need to make sure we get the right row, so also subset where lead==proposal
# (the lead proposal's own row is the one whose Proposal equals its Lead)
for i in number_groups.keys():
    print(numbers.loc[(numbers['Lead']==i) & (numbers['Lead']==numbers['Proposal'])])

      Lead  Proposal         PI NAME
0  2023106   2023106  Lewis, Allison
      Lead  Proposal      PI NAME
1  2023288   2023288  Snyder, Ben
      Lead  Proposal          PI NAME
3  2022392   2022392  Sanders, Bernie


In [14]:
# get just the pi name (still a one-element Series at this point)
for i in number_groups.keys():
    print(numbers.loc[(numbers['Lead']==i) & (numbers['Lead']==numbers['Proposal'])]['PI NAME'])

0    Lewis, Allison
Name: PI NAME, dtype: object
1    Snyder, Ben
Name: PI NAME, dtype: object
3    Sanders, Bernie
Name: PI NAME, dtype: object


In [15]:
# as a string: .values[0] pulls the single element out of the Series
for i in number_groups.keys():
    print(numbers.loc[(numbers['Lead']==i) & (numbers['Lead']==numbers['Proposal'])]['PI NAME'].values[0])

Lewis, Allison
Snyder, Ben
Sanders, Bernie


In [16]:
# split string to get last name: names are "Last, First", so keep the part before the comma
for i in number_groups.keys():
    print(numbers.loc[(numbers['Lead']==i) & \
                      (numbers['Lead']==numbers['Proposal'])] \
                      ['PI NAME'].values[0].split(',')[0])

Lewis
Snyder
Sanders


In [17]:
# build the lookup {lead proposal number: PI last name}
# a lead's own row is the one where Lead and Proposal hold the same number
lead_rows = numbers.loc[numbers['Lead'] == numbers['Proposal']]
names = {
    lead: lead_rows.loc[lead_rows['Lead'] == lead]['PI NAME'].values[0].split(',')[0]
    for lead in number_groups.keys()
}

So now we have groups of numbers, and we can use them to select files. This is a good use for pathlib. We create a `Path` object for the directory where all the PDFs are stored. We can then use glob patterns (wildcards such as `*`) to get files based on number.

In [18]:
# Path object for the PDF directory, so files can be found with glob patterns
pdfs = Path(pdf_dir)

In [19]:
# every file in the PDF directory whose name starts with this proposal number
list(pdfs.glob('2023106*'))

[PosixPath('pdfs/2023106.pdf'), PosixPath('pdfs/2023106_COA.pdf')]

Note that this returns a list of `Path` objects rather than strings. For simplicity, you probably want just the file path as a string.

In [20]:
# same matches, converted from Path objects to plain path strings
[i.as_posix() for i in pdfs.glob('2023106*')]

['pdfs/2023106.pdf', 'pdfs/2023106_COA.pdf']

Now let's write a function that takes a key from the dictionary and returns all files associated with it. We also want to make sure the file with just the id number appears first. We can do that by sorting by length.

In [21]:
def pdf_file_getter(a_dict_key):
    """Return the PDF paths for every proposal grouped under one lead number.

    Looks up ``a_dict_key`` in the notebook-level ``number_groups`` and, for
    each proposal number in that group (in the group's order), globs
    ``<number>*.pdf`` in the notebook-level ``pdfs`` directory. Within one
    proposal the shortest path comes first, so the bare ``<number>.pdf``
    precedes files such as ``<number>_COA.pdf``.

    Returns a flat list of POSIX-style path strings.
    """
    collected = []
    for proposal_number in number_groups[a_dict_key]:
        pattern = '{}*.pdf'.format(proposal_number)
        matches = [match.as_posix() for match in pdfs.glob(pattern)]
        # shortest name first puts the plain "<number>.pdf" at the front
        matches.sort(key=len)
        collected += matches
    return collected

In [22]:
# a lead with no other proposals in its group
pdf_file_getter(2023106)

['pdfs/2023106.pdf', 'pdfs/2023106_COA.pdf']

In [23]:
# a lead with one additional proposal: the lead's own files are listed first
pdf_file_getter(2023288)

['pdfs/2023288.pdf',
 'pdfs/2023288_COA.pdf',
 'pdfs/2023092.pdf',
 'pdfs/2023092_COA.pdf']

In [24]:
# a lead with two additional proposals
pdf_file_getter(2022392)

['pdfs/2022392_COA.pdf', 'pdfs/2022424_COA.pdf', 'pdfs/2022389_COA.pdf']

Now, a function that takes a list of files and combines them into one pdf

In [25]:
def pdf_combiner(pdf_list):
    """Merge the pages of several PDF files into one PyPDF2 writer object.

    ``pdf_list`` is an iterable of PDF paths. Every file is opened with
    ``PyPDF2.PdfFileReader`` and its pages are appended, file by file and page
    by page, to a new ``PyPDF2.PdfFileWriter``, which is returned. Nothing is
    written to disk here.
    """
    readers = [PyPDF2.PdfFileReader(path) for path in pdf_list]
    merged = PyPDF2.PdfFileWriter()
    # walk the pages lazily so each page is fetched and appended in turn
    all_pages = (
        reader.getPage(page_index)
        for reader in readers
        for page_index in range(reader.numPages)
    )
    for page in all_pages:
        merged.addPage(page)
    return merged

In [26]:
# returns a PdfFileWriter holding the combined pages; nothing is written to disk yet
pdf_combiner(pdf_file_getter(2022392))

<PyPDF2.pdf.PdfFileWriter at 0x116bc1fd0>

And finally, a function to write to disk.

In [27]:
def pdf_writer(pdf_object, file_name):
    """Write a combined PDF to disk.

    ``pdf_object`` is anything with a ``write(binary_file)`` method, such as
    the writer returned by ``pdf_combiner``. ``file_name`` is the output path;
    it is opened in binary write mode, so an existing file is overwritten.
    """
    with open(file_name, 'wb') as destination:
        pdf_object.write(destination)

In [28]:
# run the three steps for a single lead proposal
num = 2022392
# 1. find the files of every proposal in the group
file_list = pdf_file_getter(num)
# 2. merge their pages
pdf_object = pdf_combiner(file_list)
# 3. save as "<PI last name>_<lead number>_combined.pdf" in the working directory
pdf_writer(pdf_object, "{}_{}_combined.pdf".format(names[num], num))

Create all files at once

In [29]:
# write one combined PDF per lead proposal; `names` has exactly one entry per lead number
for num, last_name in names.items():
    file_list = pdf_file_getter(num)
    pdf_object = pdf_combiner(file_list)
    pdf_writer(pdf_object, "{}_{}_combined.pdf".format(last_name, num))

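Just for fun, here's a single function that does everything. Note that it reuses the name `pdf_combiner`, so once this cell has run, the earlier one-argument `pdf_combiner` is replaced by this version.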

In [30]:
def pdf_combiner(number_csv, pdf_directory, output_suffix="_combined"):
    """Combine the PDFs of every lead-proposal group into one file per lead.

    Reads the CSV, groups the proposal numbers under their lead number (the
    lead itself first), collects the matching ``<number>*.pdf`` files from
    ``pdf_directory`` and writes one merged PDF per lead, named
    ``<PI last name>_<lead number><output_suffix>.pdf``, into the current
    working directory. Uses the same PyPDF2 ``PdfFileReader`` /
    ``PdfFileWriter`` interface as the rest of this notebook.

    Parameters
    ----------
    number_csv : str or path
        CSV file with the columns ``Lead``, ``Proposal`` and ``PI NAME``;
        passed to ``pd.read_csv``.
    pdf_directory : str or path
        Directory that is searched for the source PDFs.
    output_suffix : str, optional
        Text placed between the lead number and ``.pdf`` in the output name.

    Raises
    ------
    IndexError
        If a lead number has no row whose ``Proposal`` equals its ``Lead``,
        because the PI name is taken from that row.
    """
    numbers = pd.read_csv(number_csv)
    pdf_dir = Path(pdf_directory)

    # lead number -> proposal numbers in its group
    number_groups = defaultdict(list)
    for lead, proposal in zip(numbers['Lead'], numbers['Proposal']):
        number_groups[lead].append(proposal)
    # stable sort with key 1 for the lead itself, so the lead comes first
    number_groups = {i: sorted(j, key=lambda x: 1 if x == i else 2)
                     for i, j in number_groups.items()}

    # lead number -> PI last name, from the row where Lead == Proposal
    names = {i: numbers.loc[(numbers['Lead'] == i) &
                            (numbers['Lead'] == numbers['Proposal'])]
             ['PI NAME'].values[0].split(',')[0]
             for i in number_groups.keys()}

    def _files_for_group(lead_number):
        # Collect the PDF paths of every proposal in one lead's group.
        files = []
        for proposal in number_groups[lead_number]:
            # Search the directory passed to this function, not a
            # notebook-level variable.
            files_for_id = [i.as_posix()
                            for i in pdf_dir.glob('{}*.pdf'.format(proposal))]
            # shortest name first puts the bare "<number>.pdf" at the front
            files.extend(sorted(files_for_id, key=len))
        return files

    def _combine(pdf_list):
        # Append all pages of the listed PDFs, in order, to one writer.
        readers = [PyPDF2.PdfFileReader(i) for i in pdf_list]
        pdf_writer_obj = PyPDF2.PdfFileWriter()
        for reader in readers:
            for pagenum in range(reader.numPages):
                pdf_writer_obj.addPage(reader.getPage(pagenum))
        return pdf_writer_obj

    def _write(pdf_object, file_name):
        # Save the merged PDF, overwriting any existing file of that name.
        with open(file_name, 'wb') as outfile:
            pdf_object.write(outfile)

    for num in number_groups:
        file_list = _files_for_group(num)
        pdf_object = _combine(file_list)
        _write(pdf_object,
               "{0}_{1}{2}.pdf".format(names[num], num, output_suffix))

In [31]:
# run the whole workflow with the CSV and PDF directory set in the first cell
pdf_combiner(id_numders_csv, pdf_dir)