In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
import fitz # need to do "pip install PyMuPDF" for this to work

In [2]:
# Folder holding the downloaded conference-call .pdf files.
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617")
# The titles workbook is written into the same folder; derive it from inputfolder
# so the directory is spelled out only once and the two paths cannot drift apart.
outputfilepath = inputfolder / "20210101_20220617_pdf_titles.xlsx"

In [3]:
# Get df_doc containing a list of titles (firm names) for 1 particular .pdf
def get_df_doc(file):
    """Return the conference-call titles (firm names) found in one .pdf.

    Parameters
    ----------
    file : pathlib.Path
        Path of the file to inspect. Anything whose suffix is not ".pdf"
        is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per title with columns 'Title' and 'filestem' (the file
        name without extension). An empty DataFrame is returned for
        non-pdf files and for pdfs with a table of contents but no
        qualifying bookmark.
    """
    if file.suffix != ".pdf":
        return pd.DataFrame()

    print("Reading:", file.name)
    # Use the document as a context manager so the file handle is released
    # even if reading the bookmarks fails.
    with fitz.open(file) as doc:
        toc = doc.get_toc()
        metadata = doc.metadata or {}

    # pdf has > 1 conference call and thus a table of contents.
    # `toc` may be an empty list (no bookmarks at all), so check before indexing.
    if toc and toc[0][1] == 'Table of Contents':
        # All titles are level-1 bookmarks.
        # The only incorrect level-1 bookmark is 'Table of Contents',
        # which can be filtered out with page > 1.
        titles = [title for lvl, title, page in toc if lvl == 1 and page > 1]
    # pdf only has 1 conference call and thus no table of contents
    else:
        # The title is the first keyword in the metadata. A missing keywords
        # entry is treated as an empty string so the file still gets a row.
        keywords = metadata.get('keywords') or ''
        titles = [keywords.split(',')[0]]

    if not titles:
        return pd.DataFrame()

    # Build the frame once instead of concatenating one row at a time.
    return pd.DataFrame({'Title': titles, 'filestem': [file.stem] * len(titles)})

In [4]:
# Aim: get a list of titles (firm names) for each .pdf, to compare against titles in .xls files
# Collect the per-pdf frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame on every file.
# sorted() makes the processing (and row) order deterministic across file systems.
doc_frames = []
for file in sorted(inputfolder.iterdir()):
    df_doc = get_df_doc(file)
    # Non-pdf files produce an empty frame; leave them out of the concat.
    if not df_doc.empty:
        doc_frames.append(df_doc)

# pd.concat raises on an empty list, so fall back to an empty frame.
# The per-pdf index is kept (not reset), as before.
df = pd.concat(doc_frames) if doc_frames else pd.DataFrame()

Reading: 20201229-20210101_1.pdf
Reading: 20210102-20210105_1.pdf
Reading: 20210106-20210109_1.pdf
Reading: 20210106-20210109_2.pdf
Reading: 20210106-20210109_3.pdf
Reading: 20210110-20210113_1.pdf
Reading: 20210110-20210113_10.pdf
Reading: 20210110-20210113_2.pdf
Reading: 20210110-20210113_3.pdf
Reading: 20210110-20210113_4.pdf
Reading: 20210110-20210113_5.pdf
Reading: 20210110-20210113_6.pdf
Reading: 20210110-20210113_7.pdf
Reading: 20210110-20210113_8.pdf
Reading: 20210110-20210113_9.pdf
Reading: 20210114-20210117_1.pdf
Reading: 20210114-20210117_2.pdf
Reading: 20210114-20210117_3.pdf
Reading: 20210114-20210117_4.pdf
Reading: 20210118-20210121_1.pdf
Reading: 20210118-20210121_2.pdf
Reading: 20210118-20210121_3.pdf
Reading: 20210118-20210121_4.pdf
Reading: 20210118-20210121_5.pdf
Reading: 20210122-20210125_1.pdf
Reading: 20210122-20210125_2.pdf
Reading: 20210122-20210125_3.pdf
Reading: 20210126-20210129_1.pdf
Reading: 20210126-20210129_10.pdf
Reading: 20210126-20210129_11.pdf
Reading

Reading: 20210420-20210423_8.pdf
Reading: 20210420-20210423_9.pdf
Reading: 20210424-20210427_1.pdf
Reading: 20210424-20210427_2.pdf
Reading: 20210424-20210427_3.pdf
Reading: 20210424-20210427_4.pdf
Reading: 20210424-20210427_5.pdf
Reading: 20210424-20210427_6.pdf
Reading: 20210424-20210427_7.pdf
Reading: 20210424-20210427_8.pdf
Reading: 20210424-20210427_9.pdf
Reading: 20210428-20210501_1.pdf
Reading: 20210428-20210501_10.pdf
Reading: 20210428-20210501_11.pdf
Reading: 20210428-20210501_12.pdf
Reading: 20210428-20210501_13.pdf
Reading: 20210428-20210501_14.pdf
Reading: 20210428-20210501_15.pdf
Reading: 20210428-20210501_16.pdf
Reading: 20210428-20210501_17.pdf
Reading: 20210428-20210501_18.pdf
Reading: 20210428-20210501_19.pdf
Reading: 20210428-20210501_2.pdf
Reading: 20210428-20210501_20.pdf
Reading: 20210428-20210501_21.pdf
Reading: 20210428-20210501_22.pdf
Reading: 20210428-20210501_23.pdf
Reading: 20210428-20210501_24.pdf
Reading: 20210428-20210501_25.pdf
Reading: 20210428-20210501_

Reading: 20210721-20210724_9.pdf
Reading: 20210725-20210728_1.pdf
Reading: 20210725-20210728_10.pdf
Reading: 20210725-20210728_11.pdf
Reading: 20210725-20210728_12.pdf
Reading: 20210725-20210728_13.pdf
Reading: 20210725-20210728_14.pdf
Reading: 20210725-20210728_15.pdf
Reading: 20210725-20210728_16.pdf
Reading: 20210725-20210728_17.pdf
Reading: 20210725-20210728_2.pdf
Reading: 20210725-20210728_3.pdf
Reading: 20210725-20210728_4.pdf
Reading: 20210725-20210728_5.pdf
Reading: 20210725-20210728_6.pdf
Reading: 20210725-20210728_7.pdf
Reading: 20210725-20210728_8.pdf
Reading: 20210725-20210728_9.pdf
Reading: 20210729-20210801_1.pdf
Reading: 20210729-20210801_10.pdf
Reading: 20210729-20210801_11.pdf
Reading: 20210729-20210801_12.pdf
Reading: 20210729-20210801_13.pdf
Reading: 20210729-20210801_14.pdf
Reading: 20210729-20210801_15.pdf
Reading: 20210729-20210801_16.pdf
Reading: 20210729-20210801_17.pdf
Reading: 20210729-20210801_18.pdf
Reading: 20210729-20210801_19.pdf
Reading: 20210729-2021080

Reading: 20211029-20211101_1.pdf
Reading: 20211029-20211101_2.pdf
Reading: 20211029-20211101_3.pdf
Reading: 20211029-20211101_4.pdf
Reading: 20211029-20211101_5.pdf
Reading: 20211029-20211101_6.pdf
Reading: 20211029-20211101_7.pdf
Reading: 20211029-20211101_8.pdf
Reading: 20211102-20211105_1.pdf
Reading: 20211102-20211105_10.pdf
Reading: 20211102-20211105_11.pdf
Reading: 20211102-20211105_12.pdf
Reading: 20211102-20211105_13.pdf
Reading: 20211102-20211105_14.pdf
Reading: 20211102-20211105_15.pdf
Reading: 20211102-20211105_16.pdf
Reading: 20211102-20211105_17.pdf
Reading: 20211102-20211105_18.pdf
Reading: 20211102-20211105_19.pdf
Reading: 20211102-20211105_2.pdf
Reading: 20211102-20211105_20.pdf
Reading: 20211102-20211105_21.pdf
Reading: 20211102-20211105_22.pdf
Reading: 20211102-20211105_23.pdf
Reading: 20211102-20211105_24.pdf
Reading: 20211102-20211105_25.pdf
Reading: 20211102-20211105_26.pdf
Reading: 20211102-20211105_27.pdf
Reading: 20211102-20211105_28.pdf
Reading: 20211102-202111

Reading: 20220306-20220309_15.pdf
Reading: 20220306-20220309_16.pdf
Reading: 20220306-20220309_17.pdf
Reading: 20220306-20220309_2.pdf
Reading: 20220306-20220309_3.pdf
Reading: 20220306-20220309_4.pdf
Reading: 20220306-20220309_5.pdf
Reading: 20220306-20220309_6.pdf
Reading: 20220306-20220309_7.pdf
Reading: 20220306-20220309_8.pdf
Reading: 20220306-20220309_9.pdf
Reading: 20220310-20220313_1.pdf
Reading: 20220310-20220313_2.pdf
Reading: 20220310-20220313_3.pdf
Reading: 20220310-20220313_4.pdf
Reading: 20220310-20220313_5.pdf
Reading: 20220310-20220313_6.pdf
Reading: 20220310-20220313_7.pdf
Reading: 20220314-20220317_1.pdf
Reading: 20220314-20220317_10.pdf
Reading: 20220314-20220317_11.pdf
Reading: 20220314-20220317_12.pdf
Reading: 20220314-20220317_13.pdf
Reading: 20220314-20220317_2.pdf
Reading: 20220314-20220317_3.pdf
Reading: 20220314-20220317_4.pdf
Reading: 20220314-20220317_5.pdf
Reading: 20220314-20220317_6.pdf
Reading: 20220314-20220317_7.pdf
Reading: 20220314-20220317_8.pdf
Rea

In [5]:
# Display the combined titles table. The index restarts for every pdf
# because the per-pdf frames were concatenated without resetting it.
df

Unnamed: 0,Title,filestem
0,MAHINDRA & MAHINDRA,20201229-20210101_1
1,SOCIAL REALITY INC,20201229-20210101_1
2,MIRAGEN THERAPEUTICS INC,20201229-20210101_1
3,DPW HOLDINGS INC,20201229-20210101_1
4,MCCORMICK & CO.,20201229-20210101_1
...,...,...
21,RIMINI STREET INC,20220602-20220605_4
22,EMCORE CORP.,20220602-20220605_4
23,AMBARELLA INC,20220602-20220605_4
24,RINGCENTRAL INC,20220602-20220605_4


In [6]:
print("Saving pdf to:", outputfilepath)
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
# Using the writer as a context manager saves and closes the workbook,
# and also releases the file if to_excel raises.
with pd.ExcelWriter(outputfilepath) as writer:
    df.to_excel(writer)
print("Saved!")

Saving pdf to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210101_20220617_pdf_titles.xlsx
Saved!


# Misc: Test a single .pdf file, with get_df_doc

In [30]:
# Run get_df_doc on one specific pdf; the function prints the name of the file it reads.
# Note: this overwrites the df_doc variable left over from the loop over inputfolder.
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210106-20210109_2.pdf")
df_doc = get_df_doc(inputfilepath)
# Uncomment the next line to display the resulting frame.
# df_doc

Reading: 20210106-20210109_2.pdf


# Misc: Test .pdf with only 1 conference call and thus no table of contents

In [22]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210106-20210109_3.pdf")
# Open the pdf as a context manager so the file handle is released after the read.
with fitz.open(inputfilepath) as doc:
    # The title is the first comma-separated keyword in the pdf metadata.
    title = doc.metadata['keywords'].split(',')[0]
title

'Meredith Corp'

In [23]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210623-20210626_5.pdf")
# Open the pdf as a context manager so the file handle is released after the read.
with fitz.open(inputfilepath) as doc:
    # The title is the first comma-separated keyword in the pdf metadata.
    title = doc.metadata['keywords'].split(',')[0]
title

'Johnson & Johnson'

In [24]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210701-20210704_2.pdf")
# Open the pdf as a context manager so the file handle is released after the read.
with fitz.open(inputfilepath) as doc:
    # The title is the first comma-separated keyword in the pdf metadata.
    title = doc.metadata['keywords'].split(',')[0]
title

'C&C Group PLC'

# Misc: Get number of pages

In [8]:
# Get number of pages
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20220505-20220508_22.pdf")
# Open the pdf as a context manager so the file handle is released once the count is read.
with fitz.open(inputfilepath) as doc:
    num_pages = doc.page_count
print(num_pages)

376
