In [7]:
import random
import re
from typing import List
from pathlib import Path
from tqdm import tqdm

In [8]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

# Directory that holds the raw COCA spoken-genre text files.
coca_dir = "../data/coca/text/text_spoken_kde/"

# Load only the files matching the w_spok_201*.txt glob inside that directory
# (not the whole directory); the records end up in the 'train' split.
dataset = load_dataset(
    'text',
    data_files=coca_dir + 'w_spok_201*.txt',
)
train_dataset = dataset['train']

In [9]:
# Peek at one randomly chosen record: print the first 100 characters of its text.
example_line = random.choice(train_dataset)
print(example_line['text'][:100])

##4103113 @!ROBERT-JEFFRESS-@1# We want a candidate who is a good , moral person , or do we want a c


## Preprocessing (text cleaning)

Goal: from COCA's spoken genre, produce a .txt file of newline-separated sentences, cleaning up the formatting along the way (including speaker codes and other inline tags).

In [10]:
# Draw a random record index, then show the index, the record's length in
# characters, and a 100-character preview of its text.
example_string_id = random.randint(0, len(train_dataset) - 1)
example_string = train_dataset[example_string_id]['text']
print(
    f'{example_string_id=}',
    f'{len(example_string)=}',
    example_string[:100],
    sep='\n',
)

example_string_id=1809
len(example_string)=11366
##4103331 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Now to the latest on Gary Giordano. @!ROBIN-ROBERTS-@1


In [12]:
def separate_chunks(text: str) -> List[str]:
    """
    Break one COCA record into its constituent chunks.

    COCA text is made of scrambled chunks joined by a run of ten
    space-separated "@" characters. A run that is cut off (e.g. at the end
    of a file) does not match the delimiter and is left in the text.

    Returns the chunks in their original order, without the delimiter or
    the single space on either side of it.
    """
    chunk_delimiter = ' @ @ @ @ @ @ @ @ @ @ '
    chunks = text.split(chunk_delimiter)
    return chunks


# Split the sampled record into chunks, then report the number of chunks,
# the length of every chunk, and a 50-character preview of each one.
example_chunks = separate_chunks(example_string)
print(len(example_chunks), list(map(len, example_chunks)))
for chunk in example_chunks:
    print(chunk[:50])

11 [1178, 1204, 1152, 1016, 1015, 1052, 946, 893, 1115, 1145, 440]
##4103331 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Now 
@GRAPHICS @!DAN-HARRIS-@1-ABC-# @(Voiceover) And r
, behind me . And she wasn't. @!DAN-HARRIS-@1-ABC-
before somebody is satisfied . I do miss her . And
exactly what happened . Okay . @!DAN-ABRAMS-@1-ABC
well , that , that he 's thinking about the attorn
those are some of the questions you asked . You kn
he 's , he 's talked about it so often that that c
you have gone through interrogation , and you get 
taken . @!ROBIN-ROBERTS-@1-A# @(Off-camera) ... yo
not offering specific details that I would think a


In [14]:
def remove_speaker_and_other_tags(chunk: str, remove_nonspeaker_tags=True) -> str:
    """
    DEPRECATED: splitting the text at these tags is preferable to removing them.

    Replace every speaker tag (ex: @!BOB:) in one chunk with " . ".
    When `remove_nonspeaker_tags` is True, other "@"-prefixed tags
    (ex: @(End-of-clip)) are replaced as well.

    A tag is only matched together with the whitespace in front of it, so a
    tag at the very start of the chunk is left untouched.
    """
    if remove_nonspeaker_tags:
        tag_regex = re.compile(r"\s+@\S+")
    else:
        tag_regex = re.compile(r"\s+@!\S+")
    return tag_regex.sub(" . ", chunk)

def split_by_speaker_and_other_tags(
        chunk: str,
        remove_nonspeaker_tags=True,
        ) -> List[str]:
    r"""
    Splits one chunk into turns at speaker tags (ex: @!BOB) and optionally
        at other tags (ex: @(End-of-clip)). The tags themselves are dropped.

    chunk: text of one COCA chunk.
    remove_nonspeaker_tags: also split at (and drop) things like
        @(End-of-clip).
        speaker tag: @!BOB  non-speaker tag @BOB (no "!")
        Does not remove long portions inside of @(Clip-from-previous blocks

    Returns the text segments found between tags, in order; segments that
        are empty or whitespace-only (e.g. between two consecutive tags)
        are discarded.

    Notes:
        - A match consumes the tag, an optional " :" and the one whitespace
            character that follows (\s); the whitespace in front of the tag
            stays at the end of the preceding segment.
        - A tag that ends the chunk has no whitespace after it (it may even
            be truncated by the chunk border), so end-of-string is accepted
            in place of that whitespace; otherwise the tag would leak into
            the last segment.
        - Speaker tags are inconsistently either marked as
            "@!BOB", "@!BOB:", "@!BOB :", "@!BOB ( voiceover ) :",
            and more. ( voiceover ) is currently not captured.

    """
    tag_prefix = r"@" if remove_nonspeaker_tags else r"@!"
    pattern = tag_prefix + r"\S+(?:\s:)?(?:\s|$)"
    segments = re.split(pattern, chunk)
    return [segment for segment in segments if segment.strip()]


# Pick one chunk at random and compare the two ways of handling tags:
# blanket removal (deprecated) versus splitting the chunk into turns.
example_chunk_id = random.randint(0, len(example_chunks)-1)
example_chunk = example_chunks[example_chunk_id]
example_turns = split_by_speaker_and_other_tags(example_chunk)

print(f'{example_chunk_id=}')
print(example_chunk)
print(remove_speaker_and_other_tags(example_chunk))
print('----')
for turn_number, turn in enumerate(example_turns):
    print(turn_number, turn)

example_chunk_id=4
exactly what happened . Okay . @!DAN-ABRAMS-@1-ABC-# @(Voiceover) But to go on and then be so belligerent , and to antagonize often the people who are doing the interviews does n't help him at all . And so I 'm not certain that in the end this was a net win for Giordano. @!ROBIN-ROBERTS-@1-A# @(Off-camera) But you think , Michael , that there is a reason , because many people watching both interviews were saying there 's no emotion from him when it comes to Robyn . And just when he kept saying - I 've answered this 50 , 60 times , I do n't wan na do it again . You say there 's a reason for that . @!DOCTOR-MICHAEL-WEL# @(Off-camera) I feel , I feel a little bit more careful about interpreting what I 'm seeing in an interview when you have an attorney right next to someone . It 's rehearsed . Your interview was rehearsed . Another interview was rehearsed . And to whatever degree an attorney can control his client , he 'll even allow that interview to go forward . @!DOC

In [18]:
def split_turn_into_sentences(
        turn: str,
        ) -> List[str]:
    """
    Splits one tag-free turn (as separated by split_by_speaker_and_other_tags)
        into sentences.
    Since COCA has space-separated punctuation, splits are done by:
        [' . ', ' ? ', ' ! ']

    Returns the list of sentences. Each sentence keeps its closing
        punctuation (" .", " ?" or " !") but not the space after it. A turn
        without any delimiter is returned unchanged as a one-element list.
    """
    delimiters = [' . ', ' ? ', ' ! ']
    # The capturing group makes re.split keep the delimiters, so the result
    # alternates sentence, delimiter, sentence, ...
    pattern = '(' + "|".join(map(re.escape, delimiters)) + ')'
    splits = re.split(pattern, turn)
    if len(splits) == 1:
        return splits

    # For multi-sentence utterances, we must manually re-combine punctuation
    out = []
    for idx, split in enumerate(splits):
        if not split:
            continue
        if idx % 2 == 0: # is sentence
            out.append(split)
        elif out: # is delimiter
            out[-1] += split[:-1] # don't include space after punctuation
        # A delimiter with no sentence before it (the turn starts with
        # punctuation) has nothing to attach to and is dropped.
    return out
    
# Sample one turn at random; its sentence split is the cell's displayed output.
turn = example_turns[random.randint(0, len(example_turns)-1)]
split_turn_into_sentences(turn)

["But you think , Michael , that there is a reason , because many people watching both interviews were saying there 's no emotion from him when it comes to Robyn .",
 "And just when he kept saying - I 've answered this 50 , 60 times , I do n't wan na do it again .",
 "You say there 's a reason for that ."]

In [19]:
def split_chunk_into_sentences(
        chunk: str,
        exclude_first_and_last_sentences=True,
        remove_nonspeaker_tags=True,
        ) -> List[str]:
    """
    Turns one COCA chunk into a flat list of sentences.

    The chunk is first cut into turns with `split_by_speaker_and_other_tags`;
        every turn is then cut into sentences with
        `split_turn_into_sentences`.

    exclude_first_and_last_sentences: drop the first and the last sentence
        of the result, since the chunk border has likely cut them into
        fragments
    remove_nonspeaker_tags: passed through to
        `split_by_speaker_and_other_tags`
    """
    sentences = [
        sentence
        for turn in split_by_speaker_and_other_tags(chunk, remove_nonspeaker_tags)
        for sentence in split_turn_into_sentences(turn)
    ]
    if exclude_first_and_last_sentences:
        return sentences[1:-1]
    return sentences

# Sample a chunk, report which one was drawn, and display its sentences
# with the (likely fragmentary) first and last sentences dropped.
chunk_number = random.randint(0, len(example_chunks)-1)
print(f'{chunk_number=}')
example_chunk = example_chunks[chunk_number]
example_sentences = split_chunk_into_sentences(
    example_chunk,
    exclude_first_and_last_sentences=True,
)
example_sentences

chunk_number=5


["I do n't think people in America recognize what it 's like to be sitting here , talking and recognizing how many people are listening to you and then have your freedom jeopardized at the same time , recognizing that you have so much hanging on every word .",
 'So I think the , the pressure ... ',
 'Mm-hmm .',
 "... operating within it does affect someone 's expression .",
 "But that 's , but that 's the attorney 's job .",
 "I mean , the attorney 's job ... ",
 "That 's right .",
 "That 's right .",
 'I agree .',
 'Sure .',
 "... should be to make sure , and , and he was totally unrehearsed , I mean , in my view , meaning , I think Giordano should have spent more time thinking about his answers and how he 's gon na respond ."]

In [20]:
# Print the raw chunk at index chunk_number for comparison with its sentence list.
print(example_chunks[chunk_number])

well , that , that he 's thinking about the attorney next to him . I do n't think people in America recognize what it 's like to be sitting here , talking and recognizing how many people are listening to you and then have your freedom jeopardized at the same time , recognizing that you have so much hanging on every word . @!DOCTOR-MICHAEL-WEL# @(Off-camera) So I think the , the pressure ... @!ROBIN-ROBERTS-@1-A# @(Off-camera) Mm-hmm . @!DOCTOR-MICHAEL-WEL# @(Off-camera) ... operating within it does affect someone 's expression . @!DAN-ABRAMS-@1-ABC-# @(Off-camera) But that 's , but that 's the attorney 's job . I mean , the attorney 's job ... @!DOCTOR-MICHAEL-WEL# @(Off-camera) That 's right . That 's right . I agree . Sure . @!DAN-ABRAMS-@1-ABC-# @(Off-camera) ... should be to make sure , and , and he was totally unrehearsed , I mean , in my view , meaning , I think Giordano should have spent more time thinking about his answers and how he 's gon na respond . And it seem to me either

### Putting it all together

In [21]:
def clean_coca_file(
        input_file_path: Path,
        output_dir_path: Path,
        split_by='chunk', # 'sentence'
        overwrite=True,
        exclude_first_and_last_sentences=True,
        remove_nonspeaker_tags=True,
        ) -> None:
    """
    Cleans one raw COCA text file and writes the result, newline-separated,
        to `<input file stem>_cleaned.txt` inside `output_dir_path`.

    input_file_path: raw COCA text file to read.
    output_dir_path: directory for the cleaned file; created (with parents)
        when missing.
    split_by: 'chunk' writes one chunk per line, with the chunk text left
        as-is; 'sentence' writes one sentence per line, as produced by
        `split_chunk_into_sentences`.
    overwrite: when False, an already existing output file is not replaced
        and FileExistsError is raised instead.
    exclude_first_and_last_sentences, remove_nonspeaker_tags: passed to
        `split_chunk_into_sentences`; only used when split_by='sentence'.

    Raises:
        AssertionError: the input file does not exist or `split_by` is invalid.
        FileExistsError: `overwrite` is False and the output file exists.
    """
    assert input_file_path.exists(), f'File "{input_file_path}" not found'
    assert split_by in ['chunk', 'sentence'], 'Invalid split method: choose from ["chunk", "sentence"]'
    dataset_dict = load_dataset('text', data_files=str(input_file_path))
    dataset = dataset_dict['train']

    # `overwrite` protects the output *file*; the directory is shared by all
    # cleaned files, so it is fine for it to exist already.
    output_dir_path.mkdir(parents=True, exist_ok=True)
    output_file_path = output_dir_path / (input_file_path.stem + '_cleaned.txt')

    # Mode 'x' makes open() raise FileExistsError instead of truncating.
    mode = 'w' if overwrite else 'x'
    # The context manager closes the file even if processing fails midway;
    # the explicit encoding keeps the output independent of the platform default.
    with open(output_file_path, mode, encoding='utf-8') as f:
        for line in tqdm(dataset):
            chunks = separate_chunks(line['text'])
            if split_by == 'chunk':
                f.write('\n'.join(chunks) + '\n')
            elif split_by == 'sentence':
                for chunk in chunks:
                    sentences = split_chunk_into_sentences(
                        chunk,
                        exclude_first_and_last_sentences,
                        remove_nonspeaker_tags,
                    )
                    f.write('\n'.join(sentences) + '\n')
        


In [23]:
# Run the cleaner on a single file (w_spok_2000.txt) in 'chunk' mode.
clean_coca_file(
    input_file_path=Path("../data/coca/text/text_spoken_kde/w_spok_2000.txt"),
    output_dir_path=Path("../data/coca_spoken/text_chunk_cleaned/"),
    split_by='chunk'
)

100%|██████████| 3025/3025 [00:00<00:00, 24878.86it/s]


In [24]:
# Clean every raw spoken-genre file in 'chunk' mode.
# glob('*.txt') skips sub-directories and stray non-text entries that
# iterdir() would hand to the cleaner; sorting makes the processing order
# deterministic instead of filesystem-dependent.
for file in sorted(Path("../data/coca/text/text_spoken_kde/").glob('*.txt')):
    clean_coca_file(
        input_file_path=file,
        output_dir_path=Path("../data/coca_spoken/text_chunk_cleaned/"),
        split_by='chunk'
    )

100%|██████████| 2399/2399 [00:00<00:00, 19427.00it/s]
100%|██████████| 3025/3025 [00:00<00:00, 28911.07it/s]
100%|██████████| 2771/2771 [00:00<00:00, 25081.07it/s]
100%|██████████| 3079/3079 [00:00<00:00, 25170.03it/s]
100%|██████████| 1913/1913 [00:00<00:00, 18782.34it/s]
100%|██████████| 563/563 [00:00<00:00, 10866.87it/s]
100%|██████████| 2324/2324 [00:00<00:00, 21597.32it/s]
100%|██████████| 972/972 [00:00<00:00, 10217.81it/s]
100%|██████████| 2647/2647 [00:00<00:00, 23244.61it/s]
100%|██████████| 2132/2132 [00:00<00:00, 20658.49it/s]
100%|██████████| 1006/1006 [00:00<00:00, 10582.01it/s]
100%|██████████| 1783/1783 [00:00<00:00, 5893.63it/s]
100%|██████████| 1670/1670 [00:00<00:00, 16039.33it/s]
100%|██████████| 760/760 [00:00<00:00, 7800.72it/s]
100%|██████████| 720/720 [00:00<00:00, 7330.01it/s]
100%|██████████| 1472/1472 [00:00<00:00, 15450.53it/s]
100%|██████████| 1426/1426 [00:00<00:00, 11973.65it/s]
100%|██████████| 1296/1296 [00:00<00:00, 9772.13it/s]
100%|██████████| 1452/