In [1]:
import openpyxl
import xlsxwriter
import os, sys, glob, re
from pathlib import Path
import pandas as pd

In [2]:
# --- Configuration: edit these three values to re-run on another machine / date range ---
# Root folder of the project (everything below is derived from it).
PROJECT_ROOT = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call")
# Date range that names both the input and the output sub-folders.
DATE_RANGE = "20210101-20220617"
# Name of this pipeline step; used for both its code folder and its output folder.
STEP_NAME = "05_breakup_entryfiles_combined_into_individual_entryfiles"

# Input: the combined entry file produced by step 04.5.
entryfiles_combined_filepath = (
    PROJECT_ROOT
    / "output"
    / "04_match_firm_names_to_gvkeys"
    / "04.5_firm_names_matched"
    / DATE_RANGE
    / ("entryfilescombined_with_gvkeycountry_" + DATE_RANGE + ".xlsx")
)

# Outputs: folder for the individual entry files, and the paragraph record saved inside it.
indv_entryfile_outputfolder = PROJECT_ROOT / "output" / STEP_NAME / DATE_RANGE
paragraph_record_output_filepath = indv_entryfile_outputfolder / "entryfiles_paragraph_record.xlsx"

# Reference files: header template for the entry files, and the keyword list (one per line).
template_filepath = PROJECT_ROOT / "code" / STEP_NAME / "reference_files" / "entryfile_template.xlsx"
keywords_filepath = (
    PROJECT_ROOT
    / "code"
    / "03_identify_paragraphs_containing_keywords"
    / "reference_files"
    / "keywords.txt"
)

# Import files

In [3]:
# Import entryfiles_combined.
# load_workbook gives back a workbook; the rest of the notebook needs a sheet object,
# so pick the first sheet by name (the workbook is expected to contain only one sheet).
entryfiles_combined_workbook = openpyxl.load_workbook(entryfiles_combined_filepath)
entryfiles_combined_sheet_name = entryfiles_combined_workbook.sheetnames[0]
entryfiles_combined = entryfiles_combined_workbook[entryfiles_combined_sheet_name]

In [4]:
# Import template (only its header rows are used later).
# As with the combined entry file, keep the first sheet of the workbook as the sheet object.
template_workbook = openpyxl.load_workbook(template_filepath)
template_sheet_name = template_workbook.sheetnames[0]
template = template_workbook[template_sheet_name]

In [5]:
# Import keywords: a headerless, tab-separated text file whose first column holds the keywords.
keywords = pd.read_csv(keywords_filepath, sep="\t", header=None)
# Lower-case every keyword and keep them as a plain Python list.
keyword_list = keywords[0].str.lower().tolist()

# Create paragraph record

In [6]:
def create_paragraph_record(number_of_paragraphs, paragraphs_per_file=500):
    """Build the table that records which paragraphs go into which individual entry file.

    Paragraphs are numbered from 1 and split into consecutive chunks of
    `paragraphs_per_file`; the last chunk holds whatever remains.

    Parameters
    ----------
    number_of_paragraphs : int
        Total number of paragraphs to distribute (0 gives an empty record).
    paragraphs_per_file : int, default 500
        Maximum number of paragraphs per individual entry file.

    Returns
    -------
    pandas.DataFrame
        One row per entry file with the columns
        'File Name' (1, 2, ...), 'Starting Row' (1, 501, ...),
        'Ending Row' (500, 1000, ..., number_of_paragraphs) and
        'Name of RA' (empty string, to be filled in by hand).

    Raises
    ------
    ValueError
        If `number_of_paragraphs` is negative or `paragraphs_per_file` is not positive.
    """
    if paragraphs_per_file <= 0:
        raise ValueError("paragraphs_per_file must be a positive integer")
    if number_of_paragraphs < 0:
        raise ValueError("number_of_paragraphs must not be negative")

    # Ceiling division: a partially filled last file still counts as a file.
    number_of_files = -(-number_of_paragraphs // paragraphs_per_file)

    file_names = [file_index + 1 for file_index in range(number_of_files)]
    starting_rows = [paragraphs_per_file * file_index + 1 for file_index in range(number_of_files)]
    # The last file ends at the last paragraph rather than at a full multiple of the chunk size.
    ending_rows = [
        min(paragraphs_per_file * (file_index + 1), number_of_paragraphs)
        for file_index in range(number_of_files)
    ]

    df = pd.DataFrame(columns = ['File Name', 'Starting Row', 'Ending Row', 'Name of RA'])
    df['File Name'] = file_names
    df['Starting Row'] = starting_rows
    df['Ending Row'] = ending_rows
    df['Name of RA'] = ""

    return df

In [7]:
# Create paragraph record
# The first row of entryfiles_combined is treated as a header, so it is not counted as a paragraph.
number_of_paragraphs = entryfiles_combined.max_row - 1
paragraph_record = create_paragraph_record(number_of_paragraphs)

# Save paragraph record
# Create the output folder first so a run for a new date range does not fail on a missing directory.
paragraph_record_output_filepath.parent.mkdir(parents=True, exist_ok=True)
paragraph_record.to_excel(paragraph_record_output_filepath, index=False)
print("Saved paragraph record to:", paragraph_record_output_filepath)

Saved paragraph record to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\05_breakup_entryfiles_combined_into_individual_entryfiles\20210101-20220617\entryfiles_paragraph_record.xlsx


# Hard-coded variables

In [8]:
# Maps each (0-based) column of entryfiles_combined (key)
# to the (0-based) column of the entry file template it is written to (value).
combined_to_template_col_dict = {
    0: 2,    # Keyword
    1: 5,    # Paragraph
    2: 6,    # Date
    3: 8,    # Title
    4: 9,    # Subtitle
    5: 44,   # Report
    6: 7,    # gvkey
    7: 45,   # hqcountry
}

# Height applied to every data row, based on entryfiles_template
row_height = 15

# Width of each of the 46 columns (in column order), based on entryfiles_template
col_width = [
    13.29, 14.43, 30.43, 8.29, 8.29, 84.29, 13.57, 13.71, 61.57, 35,
    12.29, 11.43, 11.43, 13.43, 13.29, 10.71, 11.86, 10.86, 12.86, 18.29,
    34, 15.71, 19.71, 14.29, 22.14, 17.29, 29.14, 10.14, 8.29, 8.29,
    9.71, 8.29, 8.86, 16.71, 15.86, 21.86, 17.29, 21.57, 20, 17,
    24.86, 21, 9.29, 10, 14.57, 19,
]

# Functions

In [9]:
def add_formats(indv_entryfile):
    """Register the two cell formats used in an individual entry file.

    Parameters
    ----------
    indv_entryfile : workbook object exposing `add_format` (an xlsxwriter Workbook in this notebook)
        The workbook the formats are added to.

    Returns
    -------
    tuple
        `(bold_format, cell_format)`: a bold, centred format (used for headers and
        bolded keywords) and the default centred, text-wrapping cell format.
    """
    # Add the bold format.
    # Vertical centring is spelled "vcenter"; plain "center" only affects horizontal alignment,
    # so the previous "valign": "center" never centred the text vertically.
    bold_format = indv_entryfile.add_format({"bold": True, "align": "center", "valign": "vcenter"})

    # Add the default cell format
    cell_format = indv_entryfile.add_format()
    cell_format.set_align('center') # horizontal center
    cell_format.set_align('vcenter') # vertical center
    cell_format.set_text_wrap()

    return bold_format, cell_format

In [10]:
def set_formats_for_rows_and_columns(worksheet, row_height, col_width, cell_format,
                                     first_data_row=2, number_of_data_rows=500):
    """Apply the default height/width and cell format to the data area of a worksheet.

    Parameters
    ----------
    worksheet : worksheet object exposing `set_row` and `set_column`
        The worksheet to format.
    row_height : number
        Height applied to every data row.
    col_width : sequence of numbers
        Width of each column, in column order; one column is formatted per entry.
    cell_format : format object
        Format applied to every formatted row and column.
    first_data_row : int, default 2
        0-based index of the first data row (the rows before it are header rows).
    number_of_data_rows : int, default 500
        How many data rows to format, starting at `first_data_row`.
    """
    # Set row height and format for the data rows
    for row_index in range(first_data_row, first_data_row + number_of_data_rows):
        worksheet.set_row(row_index, row_height, cell_format)
    # Set column width and cell format for every column that has a width defined
    for col_index, width in enumerate(col_width):
        worksheet.set_column(col_index, col_index, width, cell_format)

In [11]:
def modify_paragraph_by_bolding_one_keyword(para_with_keywords_bolded, keyword, bold_format):
    """Insert `bold_format` before every occurrence of `keyword` in a fragment list.

    The fragment list is the argument list later handed to `write_rich_string`:
    plain strings, optionally preceded by a format object that applies to the
    string right after it.

    Parameters
    ----------
    para_with_keywords_bolded : list
        Strings and format objects (anything that is not a `str` is treated as a format).
    keyword : str
        Keyword to bold. Matched literally and case-insensitively.
    bold_format : format object
        Format inserted before each matched occurrence.

    Returns
    -------
    list
        A new fragment list. Matched text keeps the casing it has in the paragraph.
        Fragments that are already preceded by a format are left untouched, so a
        fragment is never bolded twice and two formats never end up side by side.
        A string fragment without any match is passed through unchanged.
    """
    # Nothing sensible can be matched with an empty or non-string keyword
    # (e.g. a missing value read from the keyword file): return the fragments as they are.
    if not isinstance(keyword, str) or keyword == "":
        return list(para_with_keywords_bolded)

    # Escape the keyword so characters such as "(" or "." are matched literally.
    keyword_pattern = re.compile(re.escape(keyword), flags=re.I)

    para_with_keywords_bolded_new = []
    previous_chunk_is_format = False

    for chunk in para_with_keywords_bolded:
        # Format object: keep it and remember that it applies to the next string
        if not isinstance(chunk, str):
            para_with_keywords_bolded_new.append(chunk)
            previous_chunk_is_format = True
            continue

        # String that is already bold (a format precedes it): keep it in one piece
        if previous_chunk_is_format:
            para_with_keywords_bolded_new.append(chunk)
            previous_chunk_is_format = False
            continue

        # Normal string without the keyword: nothing to do
        matches = list(keyword_pattern.finditer(chunk))
        if not matches:
            para_with_keywords_bolded_new.append(chunk)
            continue

        # Normal string with the keyword: emit text-before, bold format, matched text, ...
        # E.g. "The hurdle rate is 7%." -> ['The ', bold_format, 'hurdle rate', ' is 7%.']
        # E.g. "Hurdle rate" -> [bold_format, 'Hurdle rate']
        position = 0
        for match in matches:
            if match.start() > position:
                para_with_keywords_bolded_new.append(chunk[position:match.start()])
            para_with_keywords_bolded_new.append(bold_format)
            para_with_keywords_bolded_new.append(match.group(0))
            position = match.end()
        # Text after the last occurrence (empty strings are never emitted here)
        if position < len(chunk):
            para_with_keywords_bolded_new.append(chunk[position:])

    return para_with_keywords_bolded_new

In [12]:
def get_paragraph_with_keywords_bolded(para, keyword_list, bold_format):
    """Return `para` as a fragment list in which every keyword is preceded by `bold_format`.

    Starts from the single-element list `[para]` and runs
    `modify_paragraph_by_bolding_one_keyword` once per keyword, in the order of
    `keyword_list`, feeding each result into the next pass.
    """
    fragments = [para]
    for one_keyword in keyword_list:
        fragments = modify_paragraph_by_bolding_one_keyword(fragments, one_keyword, bold_format)
    return fragments

# Main loop

In [13]:
# Format: worksheet.write(row, col, value)
# Paragraphs sit in rows 2 to entryfiles_combined.max_row of entryfiles_combined,
# so the paragraph number `row` goes from 1 to entryfiles_combined.max_row-1.

# Number of paragraphs per individual entry file.
# Keep in sync with the chunk size of the paragraph record and with the number of
# rows formatted by set_formats_for_rows_and_columns (both 500).
PARAGRAPHS_PER_ENTRYFILE = 500
last_paragraph = entryfiles_combined.max_row - 1

# Make sure the output folder exists before the first workbook is written
indv_entryfile_outputfolder.mkdir(parents=True, exist_ok=True)

# One iteration per individual entry file; first_paragraph = 1, 501, ...
for first_paragraph in range(1, last_paragraph + 1, PARAGRAPHS_PER_ENTRYFILE):
    # Create new individual entry file - workbook and worksheet
    indv_entryfile_filename = str(first_paragraph // PARAGRAPHS_PER_ENTRYFILE + 1) + ".xlsx"
    print("Entry file:", indv_entryfile_filename)
    indv_entryfile_filepath = indv_entryfile_outputfolder / indv_entryfile_filename

    # The with-block closes (and thereby saves) the workbook even if a row fails,
    # instead of leaving it open.
    with xlsxwriter.Workbook(indv_entryfile_filepath) as indv_entryfile:
        worksheet = indv_entryfile.add_worksheet()

        # Set formats for rows and columns
        bold_format, cell_format = add_formats(indv_entryfile)
        set_formats_for_rows_and_columns(worksheet, row_height, col_width, cell_format)

        # Write out the header rows (the first 2 rows)
        for row_header in range(2):
            for col_header in range(template.max_column):
                worksheet.write(row_header, col_header, template.cell(row_header+1, col_header+1).value, bold_format)

        # Paragraphs of this entry file (1-500, 501-1000, ..., <..>-last_paragraph)
        last_paragraph_in_file = min(first_paragraph + PARAGRAPHS_PER_ENTRYFILE - 1, last_paragraph)
        for row in range(first_paragraph, last_paragraph_in_file + 1):
            # Get paragraph, and the same paragraph with keywords bolded
            para = entryfiles_combined.cell(row+1, 1+1).value
            para_with_keywords_bolded = get_paragraph_with_keywords_bolded(para, keyword_list, bold_format)

            # Data rows start below the 2 header rows
            row_indv_entryfile = (row - first_paragraph) + 2

            # Write out all mapped columns of the entry file (this includes the plain paragraph in col 5)
            for col, template_col in combined_to_template_col_dict.items():
                worksheet.write(row_indv_entryfile, template_col, entryfiles_combined.cell(row+1, col+1).value)
            # Write out paragraph with keywords bolded, on top of the plain paragraph.
            # NOTE(review): xlsxwriter appears to reject rich strings with too few fragments
            # (e.g. no keyword found) with a warning, in which case the plain paragraph
            # written above is what ends up in the cell - confirm against the xlsxwriter docs.
            worksheet.write_rich_string(row_indv_entryfile, 5, *para_with_keywords_bolded)
            # Write out paragraph number
            worksheet.write(row_indv_entryfile, 1, row)

print("---")
print("Done!")

Entry file: 1.xlsx
Entry file: 2.xlsx
Entry file: 3.xlsx
Entry file: 4.xlsx
Entry file: 5.xlsx
Entry file: 6.xlsx
Entry file: 7.xlsx
Entry file: 8.xlsx
Entry file: 9.xlsx
Entry file: 10.xlsx
Entry file: 11.xlsx
Entry file: 12.xlsx
Entry file: 13.xlsx
Entry file: 14.xlsx
---
Done!
