This notebook batch-processes a folder of text files with coreference resolution: every later mention of an entity is replaced by its first mention.

In [1]:
import os
import spacy
from spacy.tokens import Doc

from tqdm import tqdm

In [2]:
######################
# Helper Functions
######################

def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output.

    Clusters are read from the span groups in ``doc.spans`` whose key starts
    with ``"coref_cluster"``. Within each cluster the first span is used as the
    antecedent, and every later span in that cluster is replaced by the
    antecedent's text. Tokens outside any replaced span are emitted unchanged.

    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx -> text to emit in place of that token
    replacements = {}

    for key, cluster in doc.spans.items():
        if not key.startswith("coref_cluster"):
            continue

        mentions = list(cluster)
        if not mentions:
            # An empty span group has no antecedent, so there is nothing to resolve.
            continue

        antecedent_text = mentions[0].text
        for mention in mentions[1:]:
            # The replacement stands in for the WHOLE mention, so it must carry
            # the whitespace that followed the mention's last token. Using the
            # first token's whitespace would turn "... the company." into
            # "... <antecedent> ." for multi-token mentions.
            replacements[mention[0].idx] = antecedent_text + mention[-1].whitespace_

            # The remaining tokens of the mention are dropped from the output.
            for token in mention[1:]:
                replacements[token.idx] = ""

    # Rebuild the text token by token: mapped tokens use their replacement,
    # all others keep their original text and trailing whitespace.
    pieces = [
        replacements[token.idx]
        if token.idx in replacements
        else token.text + token.whitespace_
        for token in doc
    ]
    return "".join(pieces)

In [3]:
######################
# Process
######################

nlp = spacy.load("en_coreference_web_trf")

# folders
# NOTE: the output folder name says "CovarianceRes" although the processing is
# coreference resolution; the path is kept unchanged so existing outputs and
# anything reading them keep working.
input_folder = "./data/BBC/News Articles/business"
output_folder = "./data/BBC/News Articles_w_CovarianceRes/business"

# create output folder if it doesn't exist
# (exist_ok=True replaces the separate exists() check)
os.makedirs(output_folder, exist_ok=True)

# Only .txt files are processed. Filtering up front makes the progress-bar
# total accurate, and sorting makes the processing order deterministic
# (os.listdir returns entries in arbitrary order).
txt_file_names = sorted(
    name for name in os.listdir(input_folder) if name.endswith(".txt")
)

# loop through input files
for file_name in tqdm(txt_file_names):
    # create input and output file paths
    input_path = os.path.join(input_folder, file_name)
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    output_path = os.path.join(output_folder, output_file_name)

    # open input file and read text
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Run the coref pipeline once and resolve references. The resolved string
    # is written directly: re-parsing it with nlp() only to read .text back
    # returns the same string (spaCy tokenization is non-destructive) at the
    # cost of a second full transformer pass per file.
    doc = nlp(text)
    resolved_text = resolve_references(doc)

    # write resolved text to output file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(resolved_text)

    # tqdm.write prints the message without breaking the progress bar
    tqdm.write(f"done: {output_file_name}")


  0%|▏                                                                               | 1/510 [00:07<1:02:21,  7.35s/it]

done: 001.txt


  0%|▎                                                                                 | 2/510 [00:13<58:34,  6.92s/it]

done: 002.txt


  1%|▍                                                                                 | 3/510 [00:17<46:50,  5.54s/it]

done: 003.txt


  1%|▋                                                                                 | 4/510 [00:25<54:38,  6.48s/it]

done: 004.txt


  1%|▊                                                                                 | 5/510 [00:30<47:43,  5.67s/it]

done: 005.txt


  1%|▉                                                                                 | 6/510 [00:32<38:45,  4.61s/it]

done: 006.txt


  1%|█▏                                                                                | 7/510 [00:36<36:33,  4.36s/it]

done: 007.txt


  2%|█▎                                                                                | 8/510 [00:42<40:31,  4.84s/it]

done: 008.txt


  2%|█▍                                                                                | 9/510 [00:45<35:53,  4.30s/it]

done: 009.txt


  2%|█▌                                                                               | 10/510 [00:49<34:05,  4.09s/it]

done: 010.txt


  2%|█▋                                                                               | 11/510 [00:52<31:13,  3.75s/it]

done: 011.txt


  2%|█▉                                                                               | 12/510 [00:56<33:10,  4.00s/it]

done: 012.txt


Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
  2%|█▉                                                                               | 12/510 [00:59<40:54,  4.93s/it]


KeyboardInterrupt: 