In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

# Disable truncation of rendered DataFrames: a limit of None makes Jupyter show
# every column and every row.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [2]:
# Locations of the two Excel workbooks that the import cells load with pd.read_excel:
# the titles extracted from the .pdf files and the combined .xls export.
# NOTE(review): both are absolute paths inside one user's Windows profile, so the notebook
# cannot run on another machine until they are edited — consider deriving them from a
# single configurable base directory.
pdf_titles_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210101_20220617_pdf_titles.xlsx")
xls_combined_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_xls_20210101_20220617\20210101_20220617_xls_combined.xlsx")

In [3]:
# Functions
def print_test_result(number_of_failed_rows):
    """Print the number of offending rows and the resulting pass/fail verdict.

    Parameters
    ----------
    number_of_failed_rows : int
        Count of rows that violated the check being reported.

    The count is printed first, followed by "Test passed!" when the count is
    exactly zero and "Test failed!" for any other value. Nothing is returned.
    """
    print("Number of rows:", number_of_failed_rows)
    verdict = "Test passed!" if number_of_failed_rows == 0 else "Test failed!"
    print(verdict)

# Import files

In [4]:
# Load the titles extracted from the .pdf files and normalise them for comparison.
# The unnamed first column becomes 'index' and 'Title' becomes 'title_pdf'; only
# ('filestem', 'index', 'title_pdf') are kept, in that order.
pdf_titles = (
    pd.read_excel(pdf_titles_filepath)
    .rename(columns={'Unnamed: 0': 'index', 'Title': 'title_pdf'})
    .reindex(columns=['filestem', 'index', 'title_pdf'])
    .assign(
        # Trim surrounding whitespace, upper-case, then collapse runs of spaces
        # so that purely cosmetic differences do not register as mismatches.
        title_pdf=lambda frame: (
            frame['title_pdf']
            .str.strip()
            .str.upper()
            .str.replace(' +', ' ', regex=True)
        )
    )
)
pdf_titles.head(3)

Unnamed: 0,filestem,index,title_pdf
0,20201229-20210101_1,0,MAHINDRA & MAHINDRA
1,20201229-20210101_1,1,SOCIAL REALITY INC
2,20201229-20210101_1,2,MIRAGEN THERAPEUTICS INC


In [5]:
# Load the combined .xls export and derive the normalised title table from it.
# Only the columns at positions 0, 3 and 14 are read; the rename/reindex below
# expects them to supply 'Unnamed: 0', 'Title' and 'filestem'
# (presumably in some order matching the workbook layout — TODO confirm).
xls_combined = pd.read_excel(xls_combined_filepath, usecols=[0, 3, 14])
xls_titles = (
    xls_combined
    .rename(columns={'Unnamed: 0': 'index', 'Title': 'title_xls'})
    .reindex(columns=['filestem', 'index', 'title_xls'])
    .assign(
        # Same normalisation as applied to the .pdf titles: trim, upper-case,
        # collapse repeated spaces.
        title_xls=lambda frame: (
            frame['title_xls']
            .str.strip()
            .str.upper()
            .str.replace(' +', ' ', regex=True)
        )
    )
)
xls_titles.head(3)

Unnamed: 0,filestem,index,title_xls
0,20201229-20210101_1,0,MAHINDRA & MAHINDRA
1,20201229-20210101_1,1,SOCIAL REALITY INC
2,20201229-20210101_1,2,MIRAGEN THERAPEUTICS INC


# Manual replacements

In [6]:
# Manual replacements
# Some .pdf files contain only one conference call, in which case the title is taken
# from the file metadata. That title can differ from the one in the corresponding .xls
# file even though both refer to the same firm. Once a pair has been checked by hand,
# the .pdf title is rewritten here (in the pdf_titles DataFrame, not in the workbook)
# so that it matches the .xls spelling.
manual_title_replacements = {
    'MEREDITH CORP': 'MEREDITH CORPORATION',
    'BLACKBAUD INC': 'BLACKBAUD, INC.',
}
pdf_titles = pdf_titles.replace({'title_pdf': manual_title_replacements})

# Tests

In [7]:
# Test 1: every .pdf row should have an .xls row with the same (filestem, index).
# A left merge keeps all .pdf rows; the indicator column '_merge' is anything other
# than 'both' for rows that found no partner.
left_merge = pd.merge(pdf_titles, xls_titles, how='left', on=['filestem', 'index'], indicator=True)
print("The following are .pdf rows without a corresponding .xls row with the same (filestem, index):")
test_1 = left_merge.loc[lambda merged: merged['_merge'] != 'both']
print_test_result(len(test_1))
test_1

The following are .pdf rows without a corresponding .xls row with the same (filestem, index):
Number of rows: 0
Test passed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge


In [8]:
# Test 2: every .xls row should have a .pdf row with the same (filestem, index).
# A right merge keeps all .xls rows; rows whose '_merge' indicator is not 'both'
# found no partner on the .pdf side.
right_merge = pd.merge(pdf_titles, xls_titles, how='right', on=['filestem', 'index'], indicator=True)
print("The following are .xls rows without a corresponding .pdf row with the same (filestem, index):")
test_2 = right_merge.loc[lambda merged: merged['_merge'] != 'both']
print_test_result(len(test_2))
test_2

The following are .xls rows without a corresponding .pdf row with the same (filestem, index):
Number of rows: 0
Test passed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge


In [9]:
# Test 3: rows present in both sources must carry the same normalised title.
# The inner merge keeps only (filestem, index) pairs found on both sides; any row
# whose two title columns differ is reported.
print("The following are .pdf rows with a corresponding .xls row with the same (filestem, index),", 
      "\nbut having different titles (firm names):")
inner_merge = pd.merge(pdf_titles, xls_titles, how='inner', on=['filestem', 'index'], indicator=True)
test_3 = inner_merge.loc[lambda merged: merged['title_pdf'] != merged['title_xls']]
print_test_result(len(test_3))
test_3

The following are .pdf rows with a corresponding .xls row with the same (filestem, index), 
but having different titles (firm names):
Number of rows: 0
Test passed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge


In [11]:
# Summarise which filestems appear among the failing rows of each of the three tests,
# printed in test order (Test 1, Test 2, Test 3).
print("Problematic filestems to check:")
for failing_rows in (test_1, test_2, test_3):
    print(failing_rows['filestem'].unique())

Problematic filestems to check:
[]
[]
[]


# Misc Checks

In [29]:
# Check 1: count .pdf rows that share both filestem and title with at least one other row.
# keep=False marks every member of a duplicated group, not just the repeats.
print("The following are .pdf rows with the same filestem and title (firm name), which may be duplicates, but are most likely not:")
check_1 = pdf_titles.loc[pdf_titles.duplicated(subset=['filestem', 'title_pdf'], keep=False)]
print("Number of rows:", len(check_1))

The following are .pdf rows with the same filestem and title (firm name), which may be duplicates, but are most likely not:
Number of rows: 4295
