In [41]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

# Lift pandas' display caps so Jupyter renders every column and every row
# (per the pandas options docs, None means "no limit" for both settings).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [42]:
# Locations of the two Excel workbooks this notebook loads with pd.read_excel:
# the titles extracted from the .pdf downloads and the combined .xls export.
# NOTE(review): both are absolute paths inside one user's Windows Dropbox folder,
# so the notebook only runs as-is on that machine -- consider deriving them from
# a configurable base directory.
pdf_titles_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_pdf_20210101_20220617\20210101_20220617_pdf_titles.xlsx")
xls_combined_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\01_download_cc\01.1_xls_20210101_20220617\20210101_20220617_xls_combined.xlsx")

In [43]:
# Functions
def print_test_result(number_of_failed_rows):
    """Print the number of offending rows followed by a pass/fail verdict.

    Args:
        number_of_failed_rows: Count of rows that violate the check being run.
            The check counts as passed only when this equals 0.

    Returns:
        None. The two report lines are written to standard output.
    """
    print("Number of rows:", number_of_failed_rows)
    verdict = "Test passed!" if number_of_failed_rows == 0 else "Test failed!"
    print(verdict)

In [44]:
# Load the .pdf title workbook and reduce it to (filestem, index, title_pdf)
pdf_titles = (
    pd.read_excel(pdf_titles_filepath)
    .rename(columns={'Unnamed: 0': 'index', 'Title': 'title_pdf'})
    .reindex(columns=['filestem', 'index', 'title_pdf'])
)
# Normalise titles: trim the ends, then collapse each run of spaces to one space
pdf_titles['title_pdf'] = (
    pdf_titles['title_pdf']
    .str.strip()
    .str.replace(' +', ' ', regex=True)
)
pdf_titles.head(3)

Unnamed: 0,filestem,index,title_pdf
0,20201229-20210101_1,0,MAHINDRA & MAHINDRA
1,20201229-20210101_1,1,SOCIAL REALITY INC
2,20201229-20210101_1,2,MIRAGEN THERAPEUTICS INC


In [45]:
# Load only columns 0, 3 and 14 (by position) of the combined .xls workbook,
# then reduce them to (filestem, index, title_xls)
xls_combined = pd.read_excel(xls_combined_filepath, usecols=[0, 3, 14])
xls_titles = (
    xls_combined
    .rename(columns={'Unnamed: 0': 'index', 'Title': 'title_xls'})
    .reindex(columns=['filestem', 'index', 'title_xls'])
)
# Normalise titles: trim the ends, then collapse each run of spaces to one space
xls_titles['title_xls'] = (
    xls_titles['title_xls']
    .str.strip()
    .str.replace(' +', ' ', regex=True)
)
xls_titles.head(3)

Unnamed: 0,filestem,index,title_xls
0,20201229-20210101_1,0,MAHINDRA & MAHINDRA
1,20201229-20210101_1,1,SOCIAL REALITY INC
2,20201229-20210101_1,2,MIRAGEN THERAPEUTICS INC


In [46]:
# Test 1 -- a left join keeps every .pdf row; the '_merge' indicator column
# tells us which of them found no .xls partner on (filestem, index)
left_merge = pd.merge(pdf_titles, xls_titles, how='left', on=['filestem', 'index'], indicator=True)
print("The following are .pdf rows without a corresponding .xls row with the same (filestem, index):")
test_1 = left_merge.loc[left_merge['_merge'].ne('both')]
print_test_result(len(test_1))
test_1

The following are .pdf rows without a corresponding .xls row with the same (filestem, index):
Number of rows: 47
Test failed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge
166,20210106-20210109_3,1,Page 3,,left_only
167,20210106-20210109_3,2,Page 4,,left_only
168,20210106-20210109_3,3,Page 5,,left_only
169,20210106-20210109_3,4,Page 6,,left_only
170,20210106-20210109_3,5,Page 7,,left_only
171,20210106-20210109_3,6,Page 8,,left_only
172,20210106-20210109_3,7,Page 9,,left_only
173,20210106-20210109_3,8,Page 10,,left_only
174,20210106-20210109_3,9,Page 11,,left_only
175,20210106-20210109_3,10,Page 12,,left_only


In [47]:
# Test 2 -- a right join keeps every .xls row; the '_merge' indicator column
# tells us which of them found no .pdf partner on (filestem, index)
right_merge = pd.merge(pdf_titles, xls_titles, how='right', on=['filestem', 'index'], indicator=True)
print("The following are .xls rows without a corresponding .pdf row with the same (filestem, index):")
test_2 = right_merge.loc[right_merge['_merge'].ne('both')]
print_test_result(len(test_2))
test_2

The following are .xls rows without a corresponding .pdf row with the same (filestem, index):
Number of rows: 0
Test passed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge


In [48]:
# Test 3 -- among rows present in both sources, flag those whose titles disagree
print("The following are .pdf rows with a corresponding .xls row with the same (filestem, index),",
      "\nbut having different titles (firm names):")
# The inner join keeps only (filestem, index) pairs found in both frames
inner_merge = pd.merge(pdf_titles, xls_titles, how='inner', on=['filestem', 'index'], indicator=True)
# NOTE(review): `!=` treats two missing titles as different, so such rows are flagged too
test_3 = inner_merge.loc[inner_merge['title_pdf'] != inner_merge['title_xls']]
print_test_result(len(test_3))
test_3

The following are .pdf rows with a corresponding .xls row with the same (filestem, index), 
but having different titles (firm names):
Number of rows: 96
Test failed!


Unnamed: 0,filestem,index,title_pdf,title_xls,_merge
165,20210106-20210109_3,0,Page 2,MEREDITH CORPORATION,both
22251,20210623-20210626_5,0,Page 2,JOHNSON & JOHNSON,both
22486,20210701-20210704_2,0,Page 2,C&C GROUP PLC,both
30232,20210826-20210829_4,0,HELLENIC PETROLEUM,RAPID MICRO BIOSYSTEMS INC,both
30233,20210826-20210829_4,1,FJORDKRAFT HOLDING ASA,F45 TRAINING HOLDINGS INC,both
30234,20210826-20210829_4,2,TESSENDERLO GROUP NV,AINSWORTH GAME TECHNOLOGY LTD.,both
30235,20210826-20210829_4,3,ENAD GLOBAL 7 AB (PUBL),PETROCHINA CO. LTD.,both
30236,20210826-20210829_4,4,- EVENT TRANSCRIPT OF MILLER HOMES GROUP HOLDI...,CHALICE BRANDS LTD.,both
30237,20210826-20210829_4,5,YATSEN HOLDING LTD,BWX LTD,both
30238,20210826-20210829_4,6,SELECT SANDS CORP,PIVOTAL SYSTEMS CORP,both


In [53]:
# List the distinct filestems behind each test's offending rows, one test per line
print(
    test_1['filestem'].unique(),
    test_2['filestem'].unique(),
    test_3['filestem'].unique(),
    sep="\n",
)

['20210106-20210109_3' '20210623-20210626_5' '20210701-20210704_2'
 '20210826-20210829_4' '20220101-20220104_1']
[]
['20210106-20210109_3' '20210623-20210626_5' '20210701-20210704_2'
 '20210826-20210829_4' '20220101-20220104_1' '20220509-20220512_3']


In [61]:
# Check 1 -- list .pdf rows that share (filestem, title) with at least one other row
print("The following are .pdf rows with the same filestem and title (firm name), which may be duplicates, but are most likely not:")
# keep=False marks every member of a repeated group, not just the later repeats
check_1 = pdf_titles.loc[pdf_titles.duplicated(subset=['filestem', 'title_pdf'], keep=False)]
print("Number of rows:", len(check_1))
check_1

The following are .pdf rows with the same filestem and title (firm name), which may be duplicates, but are most likely not:
Number of rows: 4295


Unnamed: 0,filestem,index,title_pdf
4,20201229-20210101_1,4,MCCORMICK & CO.
5,20201229-20210101_1,5,MCCORMICK & CO.
41,20210102-20210105_1,23,UGI CORPORATION
45,20210102-20210105_1,27,UGI CORPORATION
52,20210102-20210105_1,34,TELEDYNE TECHNOLOGIES INC.
53,20210102-20210105_1,35,DENTSPLY SIRONA INC
54,20210102-20210105_1,36,DENTSPLY SIRONA INC
55,20210102-20210105_1,37,TELEDYNE TECHNOLOGIES INC.
63,20210102-20210105_1,45,SVB FINANCIAL GROUP
64,20210102-20210105_1,46,SVB FINANCIAL GROUP
