In [1]:
import pandas as pd
import numpy as np
import os
from cleantext import clean
from collections import defaultdict
import re
import sys
import argparse
from pathlib import Path

# Example command:
# python identify_paragraphs_containing_keywords.py C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617 C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords C:\Users\jasonjia\Dropbox\Projects\conference_call\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt

In [2]:
# Set the max number of columns and rows to display in Jupyter Notebooks
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords")
keywords_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt")

In [4]:
# Import keywords
keywords = pd.read_csv(keywords_filepath, sep = "\t", header = None)[0]

# Functions

In [5]:
def clean_str(s):
    t = s.replace("\\n", " ")
    t = t.replace("\\", "")
    if len(t) == 0:
        return ""
    if t[0] == '"':
        t = t[1:-1]
    return clean(t, fix_unicode = True, no_line_breaks = True, lower = False)

In [6]:
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

In [7]:
def extractNumbers(text):
    digits = re.findall(r"[(\d.)]+", text)
    digits = [i for i in digits if i != "."]
    return digits

In [8]:
def split_conf_call_into_paras(call):
    paras_list = [t.replace("\n"," ") for t in re.split(r"\n\s*\n+", call) if len(t)>0 and not re.match(r"^\s*[0-9]+$",t)]
    return paras_list

In [9]:
def get_full_keyword(keyword, start, end, para):
    # Keyword is the first word in the paragraph
    if start == 0:
        full_keyword = keyword + " "
    # Keyword is the last word in the paragraph
    elif end == len(para) - 1:
        full_keyword = " " + keyword
    # Keyword is in the middle of a paragraph
    else:
        full_keyword = " " + keyword + " "
    return full_keyword

In [10]:
def keyword_in_paragraph(keyword, para):
    # Note that keyword must stand on its own as a word/phrase.
    # E.g. "The OCC ..." has a keyword, but "The occasion" doesn't. 
    # Idea: Check if the "full keyword" is in para, by adding appropriate spaces to the front and/or back of the keyword.
    
    # Check index of first and last char of the keyword, for every occurence of the keyword in the paragraph.
    result = re.finditer(keyword, para)
    for match_obj in result:
        start, end = match_obj.span() 
        full_keyword = get_full_keyword(keyword, start, end, para)
        # Check if full keyword is in paragraph
        if full_keyword in para:
            return True
    return False

In [11]:
def paragraph_contains_percent(para):
    para = para.lower()
    percent_list = ["%", "per cent", "percent", "percentage"]
    for i in percent_list:
        if i in para:
            return True
    return False

In [12]:
def get_all_paras_containing_keywords_from_a_conf_call(row, keywords, clean = True, lower = True):
    # Split conference call into paragraphs.
    # This is an imperfect split - sometimes it breaks in the middle of a sentence
    call = str(row["Call"])
    paras_list = split_conf_call_into_paras(call)
    found_keywords, found_in_paras = [], []

    report_id = row["Report"]
    for para in paras_list:
        # Paragraph must contain some kind of "%"
        if paragraph_contains_percent(para):
            for keyword in keywords:
                # Lower case
                if lower == True:
                    para = para.lower()
                    keyword = keyword.lower()
                # Clean paragraph
                if clean == True:
                    para = clean_str(para)
                # Check if keyword is in paragraph
                if keyword_in_paragraph(keyword, para):
                    found_keywords.append(keyword)
                    found_in_paras.append(para)
    
    # Create df 
    # Note that found_keywords and found_in_paras are lists. report_id is an int, but it will be broadcasted to a list.
    df = pd.DataFrame({"Keyword": found_keywords, "Paragraph": found_in_paras, "Report": report_id})
    return df 

In [13]:
def get_all_paras_containing_keywords_from_a_csv_file(df_csv_file):
    df_combined = pd.DataFrame()
    for _, row_conf_call in df_csv_file.iterrows():
        df = get_all_paras_containing_keywords_from_a_conf_call(row_conf_call, keywords, clean = False, lower = True)
        df_combined = pd.concat([df_combined, df])
    return df_combined

# Main loop

In [14]:
for csv_filepath in inputfolder.iterdir():    
    csv_filename = csv_filepath.name
    print("Processing:", csv_filename)
    # Get all paragraphs containing keywords from 1 .csv file, as df_combined
    df_csv_file = pd.read_csv(csv_filepath)
    df_combined = get_all_paras_containing_keywords_from_a_csv_file(df_csv_file)
    
    # Merge paragraphs with .xls info
    df_final = df_combined.merge(df_csv_file, on = 'Report', how = "left")
    df_final["file"] = csv_filename
    df_final["hasnumber"] = df_final["Paragraph"].apply(has_numbers)
    print("Number of paras containing keywords:", df_final.shape[0])
    
    # Save as parquet file (to save memory)
    outputfilename = csv_filepath.stem + '.gzip'
    outputfilepath = Path(outputfolder / outputfilename)
    df_final.to_parquet(outputfilepath, compression = "gzip")
    print("Saved df to:", outputfilepath)
    print("---")

Processing: 20201229-20210101_1.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20201229-20210101_1.gzip
---
Processing: 20210102-20210105_1.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210102-20210105_1.gzip
---
Processing: 20210106-20210109_1.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210106-20210109_1.gzip
---
Processing: 20210106-20210109_2.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210106-20210109_2.gzip
---
Processing: 20210106-20210109_3.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\confe

Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210126-20210129_5.gzip
---
Processing: 20210126-20210129_6.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210126-20210129_6.gzip
---
Processing: 20210126-20210129_7.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210126-20210129_7.gzip
---
Processing: 20210126-20210129_8.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210126-20210129_8.gzip
---
Processing: 20210126-20210129_9.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_para

Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210207-20210210_9.gzip
---
Processing: 20210211-20210214_1.csv
Number of paras containing keywords: 14
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210211-20210214_1.gzip
---
Processing: 20210211-20210214_10.csv
Number of paras containing keywords: 16
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210211-20210214_10.gzip
---
Processing: 20210211-20210214_11.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210211-20210214_11.gzip
---
Processing: 20210211-20210214_12.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_ident

Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210223-20210226_11.gzip
---
Processing: 20210223-20210226_12.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210223-20210226_12.gzip
---
Processing: 20210223-20210226_13.csv
Number of paras containing keywords: 27
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210223-20210226_13.gzip
---
Processing: 20210223-20210226_14.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210223-20210226_14.gzip
---
Processing: 20210223-20210226_15.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identi

Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210227-20210302_5.gzip
---
Processing: 20210227-20210302_6.csv
Number of paras containing keywords: 11
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210227-20210302_6.gzip
---
Processing: 20210227-20210302_7.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210227-20210302_7.gzip
---
Processing: 20210227-20210302_8.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210227-20210302_8.gzip
---
Processing: 20210227-20210302_9.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_para

Number of paras containing keywords: 21
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210311-20210314_7.gzip
---
Processing: 20210311-20210314_8.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210311-20210314_8.gzip
---
Processing: 20210315-20210318_1.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210315-20210318_1.gzip
---
Processing: 20210315-20210318_10.csv
Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210315-20210318_10.gzip
---
Processing: 20210315-20210318_11.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_p

Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210412-20210415_3.gzip
---
Processing: 20210412-20210415_4.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210412-20210415_4.gzip
---
Processing: 20210412-20210415_5.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210412-20210415_5.gzip
---
Processing: 20210412-20210415_6.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210412-20210415_6.gzip
---
Processing: 20210416-20210419_1.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210428-20210501_17.gzip
---
Processing: 20210428-20210501_18.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210428-20210501_18.gzip
---
Processing: 20210428-20210501_19.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210428-20210501_19.gzip
---
Processing: 20210428-20210501_2.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210428-20210501_2.gzip
---
Processing: 20210428-20210501_20.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify

Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210502-20210505_4.gzip
---
Processing: 20210502-20210505_5.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210502-20210505_5.gzip
---
Processing: 20210502-20210505_6.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210502-20210505_6.gzip
---
Processing: 20210502-20210505_7.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210502-20210505_7.gzip
---
Processing: 20210502-20210505_8.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210510-20210513_17.gzip
---
Processing: 20210510-20210513_18.csv
Number of paras containing keywords: 13
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210510-20210513_18.gzip
---
Processing: 20210510-20210513_19.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210510-20210513_19.gzip
---
Processing: 20210510-20210513_2.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210510-20210513_2.gzip
---
Processing: 20210510-20210513_20.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify

Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210518-20210521_14.gzip
---
Processing: 20210518-20210521_15.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210518-20210521_15.gzip
---
Processing: 20210518-20210521_16.csv
Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210518-20210521_16.gzip
---
Processing: 20210518-20210521_17.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210518-20210521_17.gzip
---
Processing: 20210518-20210521_2.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify

Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210530-20210602_5.gzip
---
Processing: 20210530-20210602_6.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210530-20210602_6.gzip
---
Processing: 20210530-20210602_7.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210530-20210602_7.gzip
---
Processing: 20210530-20210602_8.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210530-20210602_8.gzip
---
Processing: 20210530-20210602_9.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210615-20210618_7.gzip
---
Processing: 20210619-20210622_1.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210619-20210622_1.gzip
---
Processing: 20210619-20210622_2.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210619-20210622_2.gzip
---
Processing: 20210619-20210622_3.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210619-20210622_3.gzip
---
Processing: 20210623-20210626_1.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210721-20210724_9.gzip
---
Processing: 20210725-20210728_1.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210725-20210728_1.gzip
---
Processing: 20210725-20210728_10.csv
Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210725-20210728_10.gzip
---
Processing: 20210725-20210728_11.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210725-20210728_11.gzip
---
Processing: 20210725-20210728_12.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_p

Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210729-20210801_9.gzip
---
Processing: 20210802-20210805_1.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210802-20210805_1.gzip
---
Processing: 20210802-20210805_10.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210802-20210805_10.gzip
---
Processing: 20210802-20210805_11.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210802-20210805_11.gzip
---
Processing: 20210802-20210805_12.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_p

Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210802-20210805_9.gzip
---
Processing: 20210806-20210809_1.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210806-20210809_1.gzip
---
Processing: 20210806-20210809_10.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210806-20210809_10.gzip
---
Processing: 20210806-20210809_11.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210806-20210809_11.gzip
---
Processing: 20210806-20210809_2.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_pa

Number of paras containing keywords: 15
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210814-20210817_1.gzip
---
Processing: 20210814-20210817_2.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210814-20210817_2.gzip
---
Processing: 20210814-20210817_3.csv
Number of paras containing keywords: 16
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210814-20210817_3.gzip
---
Processing: 20210814-20210817_4.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210814-20210817_4.gzip
---
Processing: 20210814-20210817_5.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_para

Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210907-20210910_3.gzip
---
Processing: 20210907-20210910_4.csv
Number of paras containing keywords: 11
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210907-20210910_4.gzip
---
Processing: 20210907-20210910_5.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210907-20210910_5.gzip
---
Processing: 20210907-20210910_6.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20210907-20210910_6.gzip
---
Processing: 20210907-20210910_7.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_parag

Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211001-20211004_1.gzip
---
Processing: 20211005-20211008_1.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211005-20211008_1.gzip
---
Processing: 20211005-20211008_2.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211005-20211008_2.gzip
---
Processing: 20211005-20211008_3.csv
Number of paras containing keywords: 18
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211005-20211008_3.gzip
---
Processing: 20211005-20211008_4.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_parag

Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211025-20211028_20.gzip
---
Processing: 20211025-20211028_21.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211025-20211028_21.gzip
---
Processing: 20211025-20211028_22.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211025-20211028_22.gzip
---
Processing: 20211025-20211028_23.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211025-20211028_23.gzip
---
Processing: 20211025-20211028_24.csv
Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identif

Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211102-20211105_20.gzip
---
Processing: 20211102-20211105_21.csv
Number of paras containing keywords: 20
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211102-20211105_21.gzip
---
Processing: 20211102-20211105_22.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211102-20211105_22.gzip
---
Processing: 20211102-20211105_23.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211102-20211105_23.gzip
---
Processing: 20211102-20211105_24.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_ident

Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211106-20211109_5.gzip
---
Processing: 20211106-20211109_6.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211106-20211109_6.gzip
---
Processing: 20211106-20211109_7.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211106-20211109_7.gzip
---
Processing: 20211106-20211109_8.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211106-20211109_8.gzip
---
Processing: 20211106-20211109_9.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211106-2021

Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211114-20211117_6.gzip
---
Processing: 20211114-20211117_7.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211114-20211117_7.gzip
---
Processing: 20211114-20211117_8.csv
Number of paras containing keywords: 17
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211114-20211117_8.gzip
---
Processing: 20211114-20211117_9.csv
Number of paras containing keywords: 11
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211114-20211117_9.gzip
---
Processing: 20211118-20211121_1.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_para

Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211204-20211207_6.gzip
---
Processing: 20211208-20211211_1.csv
Number of paras containing keywords: 23
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211208-20211211_1.gzip
---
Processing: 20211208-20211211_2.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211208-20211211_2.gzip
---
Processing: 20211208-20211211_3.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211208-20211211_3.gzip
---
Processing: 20211208-20211211_4.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20211208-2021

Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220210-20220213_5.gzip
---
Processing: 20220210-20220213_6.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220210-20220213_6.gzip
---
Processing: 20220210-20220213_7.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220210-20220213_7.gzip
---
Processing: 20220210-20220213_8.csv
Number of paras containing keywords: 0
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220210-20220213_8.gzip
---
Processing: 20220214-20220217_1.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220222-20220225_27.gzip
---
Processing: 20220222-20220225_28.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220222-20220225_28.gzip
---
Processing: 20220222-20220225_29.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220222-20220225_29.gzip
---
Processing: 20220222-20220225_3.csv
Number of paras containing keywords: 13
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220222-20220225_3.gzip
---
Processing: 20220222-20220225_30.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify

Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220306-20220309_14.gzip
---
Processing: 20220306-20220309_15.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220306-20220309_15.gzip
---
Processing: 20220306-20220309_16.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220306-20220309_16.gzip
---
Processing: 20220306-20220309_17.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220306-20220309_17.gzip
---
Processing: 20220306-20220309_2.csv
Number of paras containing keywords: 9
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify

Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220322-20220325_5.gzip
---
Processing: 20220322-20220325_6.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220322-20220325_6.gzip
---
Processing: 20220322-20220325_7.csv
Number of paras containing keywords: 7
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220322-20220325_7.gzip
---
Processing: 20220322-20220325_8.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220322-20220325_8.gzip
---
Processing: 20220326-20220329_1.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220423-20220426_5.gzip
---
Processing: 20220423-20220426_6.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220423-20220426_6.gzip
---
Processing: 20220423-20220426_7.csv
Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220423-20220426_7.gzip
---
Processing: 20220427-20220430_1.csv
Number of paras containing keywords: 8
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220427-20220430_1.gzip
---
Processing: 20220427-20220430_10.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_pa

Number of paras containing keywords: 12
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220501-20220504_17.gzip
---
Processing: 20220501-20220504_18.csv
Number of paras containing keywords: 14
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220501-20220504_18.gzip
---
Processing: 20220501-20220504_19.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220501-20220504_19.gzip
---
Processing: 20220501-20220504_2.csv
Number of paras containing keywords: 15
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220501-20220504_2.gzip
---
Processing: 20220501-20220504_20.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identi

Number of paras containing keywords: 11
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220509-20220512_10.gzip
---
Processing: 20220509-20220512_11.csv
Number of paras containing keywords: 2
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220509-20220512_11.gzip
---
Processing: 20220509-20220512_12.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220509-20220512_12.gzip
---
Processing: 20220509-20220512_13.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220509-20220512_13.gzip
---
Processing: 20220509-20220512_14.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_ident

Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220513-20220516_2.gzip
---
Processing: 20220513-20220516_3.csv
Number of paras containing keywords: 1
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220513-20220516_3.gzip
---
Processing: 20220513-20220516_4.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220513-20220516_4.gzip
---
Processing: 20220513-20220516_5.csv
Number of paras containing keywords: 10
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220513-20220516_5.gzip
---
Processing: 20220513-20220516_6.csv
Number of paras containing keywords: 5
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220513-2022

Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220529-20220601_4.gzip
---
Processing: 20220529-20220601_5.csv
Number of paras containing keywords: 6
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220529-20220601_5.gzip
---
Processing: 20220529-20220601_6.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220529-20220601_6.gzip
---
Processing: 20220602-20220605_1.csv
Number of paras containing keywords: 4
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20220602-20220605_1.gzip
---
Processing: 20220602-20220605_2.csv
Number of paras containing keywords: 3
Saved df to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragr

# Tests for a single .csv file

In [23]:
csv_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\02_process_cc\02.2_csv_20210101_20220617\20201229-20210101_1.csv")

In [24]:
csv_filename = csv_filepath.name
df_csv_file = pd.read_csv(csv_filepath)
df_combined = get_all_paras_containing_keywords_from_a_csv_file(df_csv_file)
df_combined

Unnamed: 0,Keyword,Paragraph,Report
0,occ,pawan kumar goenka - mahindra & mahindra limi...,71149754
0,occ,"lawrence e. kurzius - mccormick & company, inc...",71118206
0,occ,"lawrence e. kurzius - mccormick & company, inc...",71118386
0,interest rate,"after this, we completed an $8.3 million overs...",71118254
0,occ,"following the presentation of these proposals,...",71118259
0,interest rate,"in addition, in recent years, we're also goin...",71118480
1,cost of capital,improve and achieve a return on capital which...,71118480
2,cost of capital,unidentified company representative [interpret...,71118480
0,interest rate,so apart from getting the resolution done with...,71117411
0,cost of capital,"at its core, this is a favorable recapitalizat...",71114841


In [25]:
# Merge with .xls info
unclean_data = df_combined.merge(df_csv_file, on = 'Report', how = "left")
unclean_data["file"] = csv_filename
unclean_data["hasnumber"] = unclean_data["Paragraph"].apply(has_numbers)
unclean_data

Unnamed: 0,Keyword,Paragraph,Report,PPV,TOC,Title,Subtitle,Date,Pages,Price,Contributor,Analyst,Language,Collection,Call,file,hasnumber
0,occ,pawan kumar goenka - mahindra & mahindra limi...,71149754,N,Y,MAHINDRA & MAHINDRA,MAHM.NS - Event Transcript of Mahindra and Mah...,2021-01-01,24,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nAnish Shah Mahindr...,20201229-20210101_1.csv,True
1,occ,"lawrence e. kurzius - mccormick & company, inc...",71118206,N,Y,MCCORMICK & CO.,MKC.N - Event Transcript of McCormick & Compan...,2020-12-30,12,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nKasey A. Jenkins M...,20201229-20210101_1.csv,True
2,occ,"lawrence e. kurzius - mccormick & company, inc...",71118386,N,Y,MCCORMICK & CO.,MKC.N - Event Brief of McCormick & Company Inc...,2020-12-30,15,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nKasey A. Jenkins M...,20201229-20210101_1.csv,True
3,interest rate,"after this, we completed an $8.3 million overs...",71118254,N,Y,HUT 8 MINING CORP,HUT.TO - Event Transcript of Hut 8 Mining Corp...,2020-12-30,9,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDemetrios Vaiopoul...,20201229-20210101_1.csv,True
4,occ,"following the presentation of these proposals,...",71118259,N,Y,WPX ENERGY INC,WPX.N - Event Transcript of WPX Energy Inc con...,2020-12-30,4,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nRichard E. Muncrie...,20201229-20210101_1.csv,False
5,interest rate,"in addition, in recent years, we're also goin...",71118480,N,Y,UNICAJA,UNI.MC - Event Transcript of Unicaja Banco SA ...,2020-12-30,11,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nJaime Hernández Ma...,20201229-20210101_1.csv,False
6,cost of capital,improve and achieve a return on capital which...,71118480,N,Y,UNICAJA,UNI.MC - Event Transcript of Unicaja Banco SA ...,2020-12-30,11,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nJaime Hernández Ma...,20201229-20210101_1.csv,True
7,cost of capital,unidentified company representative [interpret...,71118480,N,Y,UNICAJA,UNI.MC - Event Transcript of Unicaja Banco SA ...,2020-12-30,11,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nJaime Hernández Ma...,20201229-20210101_1.csv,True
8,interest rate,so apart from getting the resolution done with...,71117411,N,Y,JAIN IRRIGATION SYSTEMS,JAIR.NS - Event Transcript of Jain Irrigation ...,2020-12-30,8,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nAnil Bhavarlal Jai...,20201229-20210101_1.csv,False
9,cost of capital,"at its core, this is a favorable recapitalizat...",71114841,N,Y,GEG.OQ - EVENT TRANSCRIPT OF FOREST INVESTMENT...,GEG.OQ - Event Transcript of Forest Investment...,2020-12-29,7,Subscription,THOMSON REUTERS STREETEVENTS,"RESEARCH DEPARTMENT, ET AL",English,INV,\n CORPORATE PARTICIPANTS\nPeter Andrew Reed ...,20201229-20210101_1.csv,False


In [30]:
# Save as parquet file
parquet_filename = csv_filepath.stem + '.gzip'
outputfilepath = Path(outputfolder / parquet_filename)
print(outputfilepath)
unclean_data.to_parquet(outputfilepath, compression = "gzip")

C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\20201229-20210101_1.gzip


# Test for keyword_in_paragraph

In [49]:
keyword = "occ"
target_string = "asd"
result = re.finditer(keyword, target_string)
for match_obj in result:
    start, end = match_obj.span() 
    print(start, end)