In [1]:
import pandas as pd
import numpy as np
import os
from cleantext import clean
from collections import defaultdict
import re
import sys
import argparse
from pathlib import Path

# Example command:
# python identify_paragraphs_containing_keywords.py C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617 C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords C:\Users\jasonjia\Dropbox\Projects\conference_call\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt

In [62]:
# Set the max number of columns and rows to display in Jupyter Notebooks
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Parse arguments
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Identify paragraphs containing keywords')
    parser.add_argument('inputfolder', help="input folder containing the .csv files", type=str)
    parser.add_argument('outputfolder', help="output folder containing the paragraphs", type=str)
    parser.add_argument('keywords_filepath', help="filepath of the keywords.txt file", type=str)
    args = parser.parse_args()
    inputfolder = Path(args.inputfolder)
    outputfolder = Path(args.outputfolder)
    keywords_filepath = Path(args.keywords_filepath)

print("Input folder:", inputfolder)
print("Output folder:", outputfolder)
print("Keywords filepath:", keywords_filepath)

In [9]:
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords")
keywords_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt")

# Functions

In [3]:
def clean_str(s):
    t = s.replace("\\n", " ")
    t = t.replace("\\", "")
    if len(t) == 0:
        return ""
    if t[0] == '"':
        t = t[1:-1]
    return clean(t, fix_unicode = True, no_line_breaks = True, lower = False)

In [5]:
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

In [6]:
def extractNumbers(text):
    digits = re.findall(r"[(\d.)]+", text)
    digits = [i for i in digits if i != "."]
    return digits

In [7]:
def check_n_prev_and_next(p1, p2, k, n = 5):
    inverted_n = n * -1 ### Good for indexing purposes
    start = p1.index(k)
    end = start + len(k)
    p1_start_chunk, p1_end_chunk = p1[:start], p1[end:]
    p2_start_chunk, p2_end_chunk = p2[:start], p2[end:]
    p1_preceding_n_words, p2_preceding_n_words = " ".join(p1_start_chunk.split()[inverted_n:]), " ".join(p2_start_chunk.split()[inverted_n:])
    p1_succeding_n_words, p2_succeding_n_words = " ".join(p1_end_chunk.split()[:n]), " ".join(p2_end_chunk.split()[:n])
    if (p1_preceding_n_words == p2_preceding_n_words) and (p1_succeding_n_words == p2_succeding_n_words):
        return True
    else:
        return False

In [46]:
def split_conf_call_into_paras(call):
    paras_list = [t.replace("\n"," ") for t in re.split(r"\n\s*\n+", call) if len(t)>0 and not re.match(r"^\s*[0-9]+$",t)]
    return paras_list

In [88]:
def get_all_paras_containing_keywords_from_a_conf_call(row, keywords, clean = True, lower = True):
    # Split conference call into paragraphs.
    # This is an imperfect split - sometimes it breaks in the middle of a sentence
    call = str(row["Call"])
    paras_list = split_conf_call_into_paras(call)
    found_keywords, found_in_paras = [], []

    report_id = row["Report"]
    for para in paras_list:
        for keyword in keywords:
            # Lower case
            if lower == True:
                para = para.lower()
                keyword = keyword.lower()
            # Clean paragraph
            if clean == True:
                para = clean_str(para)
            # Search for keyword
            if keyword in para:
                found_keywords.append(keyword)
                found_in_paras.append(para)
    
    # Create df 
    # Note that found_keywords and found_in_paras are lists. 
    # report_id is an int, but it will be broadcasted to a list.
    df = pd.DataFrame({"keyword": found_keywords, "paragraph": found_in_paras, "report_id": report_id})
    return df 

In [82]:
def get_all_paras_containing_keywords_from_a_csv_file(csv_file):
    df_csv_file = pd.read_csv(csv_file)
    df_combined = pd.DataFrame()
    for _, row_conf_call in df_csv_file.iterrows():
        df = get_all_paras_containing_keywords_from_a_conf_call(row_conf_call, keywords, clean = False, lower = True)
        df_combined = pd.concat([df_combined, df])
    return df_combined

In [67]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617\20201229-20210101_1.csv")

In [89]:
df_csv_file = pd.read_csv(csv_file)
df_combined = []
get_all_paras_containing_keywords_from_a_conf_call(df_csv_file.iloc[0], keywords, clean = False, lower = True)
get_all_paras_containing_keywords_from_a_csv_file(csv_file)

Unnamed: 0,keyword,paragraph,report_id
0,occ,pawan kumar goenka - mahindra & mahindra limi...,71149754
0,occ,"lawrence e. kurzius - mccormick & company, inc...",71118206
0,occ,"lawrence e. kurzius - mccormick & company, inc...",71118386
0,interest rate,"after this, we completed an $8.3 million overs...",71118254
0,occ,"following the presentation of these proposals,...",71118259
0,interest rate,"in addition, in recent years, we're also goin...",71118480
1,cost of capital,improve and achieve a return on capital which...,71118480
2,cost of capital,unidentified company representative [interpret...,71118480
0,interest rate,so apart from getting the resolution done with...,71117411
0,cost of capital,"at its core, this is a favorable recapitalizat...",71114841


In [72]:
csv_file

WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output/02_process_cc/02.2_csv_20210101_20220617/20201229-20210101_1.csv')

In [24]:
# Main loop
keywords = pd.read_csv(keywords_filepath, sep = "\t", header = None)[0]

In [26]:
for csv_file in inputfolder.iterdir():    
    df_combined = get_all_paragraphs_containing_keywords_from_a_csv_file(csv_file)
    unclean_data = pd.concat(df_combined).reset_index(drop = True)
    unclean_data = unclean_data.merge(df, on = "Report", how = "left") 
    unclean_data["File"] = file
    unclean_data["HasNumber"] = unclean_data["Para"].apply(has_numbers)
    outputfilepath = Path(outputfolder / file)
    unclean_data.to_parquet(outputfilepath, compression = "gzip")

KeyError: 'Para'