In [541]:
import random
import re
from typing import List
from pathlib import Path
from tqdm import tqdm

In [443]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

# Relative path of the directory holding the raw COCA spoken-genre text files.
coca_dir = "../data/coca/text/text_spoken_kde/"

# Earlier variant that pointed the loader at the whole directory:
# dataset = load_dataset('text', data_dir=coca_dir)
# Load only the files whose names match the w_spok_201*.txt glob.
dataset = load_dataset('text', data_files=coca_dir+'w_spok_201*.txt')
# Keep the 'train' split; the exploration cells below index into it.
train_dataset = dataset['train']

In [466]:
# Peek at one randomly chosen record; only its first 100 characters are printed.
example_line = random.choice(train_dataset)
print(example_line['text'][:100])

##4072661 @!NANCY-GRACE-HOST : A 5-year-old Florida girl tucked into bed , five hours later , shes g


## Preprocessing (text cleaning)

Goal: From COCA's spoken genre, produce a .txt file of newline-separated sentences, with formatting artifacts such as speaker codes and other tags cleaned out.

In [497]:
# Pick a random record by index (randint is inclusive on both ends, hence the -1)
# so that the chosen id can be reported together with the text.
example_string_id = random.randint(0, len(train_dataset) - 1)
example_string = train_dataset[example_string_id]['text']
# The f-string "=" specifier prints both the expression and its value.
print(f'{example_string_id=}')
print(f'{len(example_string)=}')
# Show only the first 100 characters of the record.
print(example_string[:100])

example_string_id=81
len(example_string)=15290
##4072612 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Oh , come on upstairs here , Sam . Now , the dos and d


In [498]:
def separate_chunks(text: str) -> List[str]:
    """
    Break one COCA text into its chunks.

    COCA texts are made of scrambled chunks separated by a run of ten
    space-separated "@" signs (the run may be cut off at the end of a file).
    Returns the chunks, in order, as a list of strings.
    """
    chunk_separator = ' @ @ @ @ @ @ @ @ @ @ '
    chunks = text.split(chunk_separator)
    return chunks


# Split the example record into chunks, then report how many chunks there are
# and the character length of each one.
example_chunks = separate_chunks(example_string)
print(len(example_chunks), [len(chunk) for chunk in example_chunks])
# Preview the first 50 characters of every chunk.
for chunk in example_chunks:
    print(chunk[:50])

15 [929, 972, 1084, 1047, 1018, 982, 1020, 929, 1053, 1054, 943, 1122, 1023, 1053, 767]
##4072612 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Oh ,
. @!STEVE-HARVEY-@1-AB# @(Off-camera) Yeah . @!ROB
couple . We have , let 's see , Robert is a 51-yea
lets Christine know they 're having a wonderful da
right there . Just meeting somebody . @GRAPHICS @G
whole evening right there . @!ROBIN-ROBERTS-@1-A# 
audience are here all shaking , you , you like tha
to know before you waste a lot of time , emotion ,
that lady keep talking about anyway . @!ROBIN-ROBE
@!DEE-DEE-@130'S-PUB# There are better products no
right . @!ROBIN-ROBERTS-@1-A# @(Off-camera) It was
noticed he was a little ... @!ROBIN-ROBERTS-@1-A# 
, you know , I do n't like your hair . What are yo
you know , they are going out on a second date . O
skating . @!GEORGE-STEPHANOPOU# @(Off-camera) Also


In [617]:
def remove_speaker_and_other_tags(chunk: str, remove_nonspeaker_tags=True) -> str:
    """
    DEPRECATED: splitting the text on these tags works better than deleting them.

    Replaces every speaker tag (ex: @!BOB:) in one chunk with " . ".
    When `remove_nonspeaker_tags` is true, other tags (ex: @(End-of-clip))
    are replaced as well. Returns the resulting string.
    """
    if remove_nonspeaker_tags:
        # Any whitespace-preceded token that starts with "@".
        tag_regex = r"\s+@\S+"
    else:
        # Only whitespace-preceded tokens that start with "@!".
        tag_regex = r"\s+@!\S+"
    return re.compile(tag_regex).sub(" . ", chunk)

def split_by_speaker_and_other_tags(
        chunk: str, 
        remove_nonspeaker_tags=True,
        ) -> List[str]:
    """
    Splits one chunk into turns at speaker tags (ex: @!BOB) and optionally
        at other tags (ex: @(End-of-clip)). The tags themselves are dropped.

    chunk: text of one COCA chunk.
    remove_nonspeaker_tags: also split at (and drop) things like @(End-of-clip). 
        speaker tag: @!BOB  non-speaker tag @BOB (no "!")
        Does not remove long portions inside of @(Clip-from-previous blocks

    Returns the segments found between tags, in order; segments that are
        empty or whitespace-only are discarded.

    Notes:
        - The pattern consumes exactly one whitespace character after a tag,
        so a turn does not start with a space but keeps any whitespace that
        preceded the next tag (a turn typically ends with "... . ").
        - A tag sitting at the very end of the chunk (no whitespace after
        it, e.g. because the chunk border cut the text there) is matched
        too, so it does not leak into the output as a turn.
        - Speaker tags are inconsistently either marked as 
            "@!BOB", "@!BOB:", "@!BOB :", "@!BOB ( voiceover ) :", 
            and more. ( voiceover ) is currently not captured.

    """
    tag_start = r"@" if remove_nonspeaker_tags else r"@!"
    # tag token, an optional detached " :", then one whitespace char or end of string
    pattern = tag_start + r"\S+(?:\s:|)(?:\s|\Z)"
    segments = re.split(pattern, chunk)
    # Consecutive tags and leading/trailing tags leave blank segments behind.
    return [segment for segment in segments if segment.strip()]


# Choose one chunk of the example record at random and print it unmodified.
example_chunk_id = random.randint(0, len(example_chunks)-1)
print(f'{example_chunk_id=}')
example_chunk = example_chunks[example_chunk_id]
print(example_chunk)
# For comparison: the deprecated approach, which replaces tags with " . ".
print(remove_speaker_and_other_tags(example_chunk))
print('----')
# Current approach: split the chunk into turns at the tags and list them.
example_turns = split_by_speaker_and_other_tags(example_chunk)
for turn_number, turn in enumerate(example_turns):
    print(turn_number, turn)

example_chunk_id=0
##4072612 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Oh , come on upstairs here , Sam . Now , the dos and don'ts of a first date . What can you do on a first date to make sure there is a second one ? A third one ? We found some brave souls who let us follow them on a blind date to find out what they 're doing right , and what they could be doing better . Joining us now with advice for our singles is " GMA 's " relationship guru , and the author of the best-selling relationship book , " Act Like A Lady , Think Like A Man , " Mr. Steve Harvey . And you always come with your own audience too here . @GRAPHICS @GRAPHICS @!STEVE-HARVEY-@1-AB# @(Off-camera) I bring a crowd with me . I always do better with people watching me . I do n't know what , a little showoff thing , maybe , I do n't know . @!ROBIN-ROBERTS-@1-A# @(Off-camera) There 's something , get on with that , but I 'm , I 'm going to let that go . @!STEVE-HARVEY-@1-AB#
##4072612 .  .  Oh , come on upstairs here , Sam . 

In [679]:
def split_turn_into_sentences(
        turn: str, 
        # exclude_sentences_with_ellipses=False
        ) -> List[str]:
    """
    Splits one tag-free turn (as separated by split_by_speaker_and_other_tags) 
        into sentences.
    Since COCA has space-separated punctuation, splits are done by:
        [' . ', ' ? ', ' ! ']

    Returns a list of sentences, each keeping its closing punctuation
        (ex: 'A third one ?'). A turn containing none of the delimiters is
        returned unchanged as a one-element list. Punctuation found at the
        very start of a turn has no sentence to attach to and is dropped.
    """
    delimiters = [' . ', ' ? ', ' ! ']
    # The capturing group makes re.split keep the delimiters in its output,
    # so sentences sit at even indices and delimiters at odd indices.
    pattern = '(' + "|".join(map(re.escape, delimiters)) + ')'
    splits = re.split(pattern, turn)
    if len(splits) == 1:
        return splits
    
    # For multi-sentence utterances, we must manually re-combine punctuation
    out = []
    for idx, split in enumerate(splits):
        if not split:
            continue
        if not (idx % 2): # is sentence
            out.append(split)
        elif out: # is delimiter
            out[-1] += split[:-1] # don't include space after punctuation
        # else: the turn began with a delimiter; without this guard
        # out[-1] would raise IndexError on the still-empty list.
    return out
    
# Try the sentence splitter on one randomly chosen turn of the example chunk;
# the returned list is the cell's displayed value.
turn = example_turns[random.randint(0, len(example_turns)-1)]
split_turn_into_sentences(turn)

['Oh , come on upstairs here , Sam .',
 "Now , the dos and don'ts of a first date .",
 'What can you do on a first date to make sure there is a second one ?',
 'A third one ?',
 "We found some brave souls who let us follow them on a blind date to find out what they 're doing right , and what they could be doing better .",
 'Joining us now with advice for our singles is " GMA \'s " relationship guru , and the author of the best-selling relationship book , " Act Like A Lady , Think Like A Man , " Mr. Steve Harvey .',
 'And you always come with your own audience too here .']

In [691]:
def split_chunk_into_sentences(
        chunk: str,
        exclude_first_and_last_sentences=True,
        remove_nonspeaker_tags=True,
        ) -> List[str]:
    """
    Turns one COCA chunk into a flat list of sentences by first cutting it
        into turns (`split_by_speaker_and_other_tags`) and then cutting
        every turn into sentences (`split_turn_into_sentences`).

    exclude_first_and_last_sentences: drop the first and last sentence,
        since the chunk border has probably cut them into fragments
    remove_nonspeaker_tags: forwarded to `split_by_speaker_and_other_tags`
    """
    all_sentences = [
        sentence
        for turn in split_by_speaker_and_other_tags(chunk, remove_nonspeaker_tags)
        for sentence in split_turn_into_sentences(turn)
    ]
    if not exclude_first_and_last_sentences:
        return all_sentences
    return all_sentences[1:-1]

# Run the full chunk -> sentences pipeline on a randomly chosen chunk.
chunk_number = random.randint(0, len(example_chunks)-1)
example_chunk = example_chunks[chunk_number]
example_sentences = split_chunk_into_sentences(example_chunk, 
                                               exclude_first_and_last_sentences=True)
print(f'{chunk_number=}')
# Bare expression: the sentence list is shown as the cell's output.
example_sentences

chunk_number=3


["I 'm glad that we got together today to have this wonderful date together .",
 'Yes .',
 "It 's really a pleasure to , to have this nice date with you .",
 'I enjoy talking to you .',
 "I mean , I think , I think , you 're , you 're actually , you could be a friend of mine .",
 'Really .',
 'I really like you .',
 "You 're very sweet .",
 'Very nice person .',
 'I want to say , I like Robert .',
 'I did too , I like him .',
 "I like how he 's all like this , and he 's really relaxed .",
 'See , he had that little swagger. ',
 'I know , he did .',
 'He slumped in his seat .',
 'He was handling it .',
 'Little leery when he kept his legs crossed the entire time .',
 "I do n't know what was going on .",
 'But he started with a hug .']

In [692]:
# Print the raw chunk that was just split, to compare it with the sentence list.
print(example_chunks[chunk_number])

lets Christine know they 're having a wonderful date . @!ROBERT-@151-YEAR-O# I 'm glad that we got together today to have this wonderful date together . @!CHRISTINE-@146-YEA# Yes . @!ROBERT-@151-YEAR-O# It 's really a pleasure to , to have this nice date with you . I enjoy talking to you . I mean , I think , I think , you 're , you 're actually , you could be a friend of mine . Really . I really like you . You 're very sweet . Very nice person . @!ROBIN-ROBERTS-@1-A# @(Off-camera) I want to say , I like Robert . @!STEVE-HARVEY-@1-AB# @(Off-camera) I did too , I like him . @!ROBIN-ROBERTS-@1-A# @(Off-camera) I like how he 's all like this , and he 's really relaxed . @!STEVE-HARVEY-@1-AB# @(Off-camera) See , he had that little swagger. @!ROBIN-ROBERTS-@1-A# @(Off-camera) I know , he did . @!STEVE-HARVEY-@1-AB# @(Off-camera) He slumped in his seat . He was handling it . Little leery when he kept his legs crossed the entire time . I do n't know what was going on . @!ROBIN-ROBERTS-@1-A# @(

### Putting it all together

In [689]:
def clean_coca_file(
        input_file_path: Path,
        output_dir_path: Path,
        overwrite=True,
        exclude_first_and_last_sentences=True,
        remove_nonspeaker_tags=True,
        ) -> None:
    """
    Cleans one raw COCA text file and writes the result as a text file with
        one sentence per line.

    The output is written to `output_dir_path` (created if missing) under
        the input file's stem plus '_cleaned.txt'.

    input_file_path: raw COCA text file to read; must exist.
    output_dir_path: directory that receives the cleaned file.
    overwrite: when False, refuse to replace an already existing output
        file (FileExistsError) instead of overwriting it.
    exclude_first_and_last_sentences, remove_nonspeaker_tags: forwarded to
        `split_chunk_into_sentences`.

    Raises:
        AssertionError: if `input_file_path` does not exist.
        FileExistsError: if `overwrite` is False and the output file exists.
    """
    assert input_file_path.exists(), f'File "{input_file_path}" not found'

    output_file_path = output_dir_path / (input_file_path.stem + '_cleaned.txt')
    # Check before loading anything so a refused overwrite fails fast.
    if not overwrite and output_file_path.exists():
        raise FileExistsError(f'File "{output_file_path}" already exists')
    # The directory is shared by all cleaned files, so it may already exist.
    output_dir_path.mkdir(parents=True, exist_ok=True)

    dataset_dict = load_dataset('text', data_files=str(input_file_path))
    dataset = dataset_dict['train']

    # `with` closes the file even if cleaning raises part-way through.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        for line in tqdm(dataset):
            for chunk in separate_chunks(line['text']):
                sentences = split_chunk_into_sentences(chunk,
                                                       exclude_first_and_last_sentences,
                                                       remove_nonspeaker_tags)
                # Skip chunks that yield nothing, otherwise a blank line
                # would be written for each of them.
                if sentences:
                    f.write('\n'.join(sentences) + '\n')
        
# Trial run on a single file before processing the whole directory.
clean_coca_file(
    input_file_path=Path("../data/coca/text/text_spoken_kde/w_spok_2000.txt"),
    output_dir_path=Path("../data/coca_spoken/text_cleaned/"),
)

100%|██████████| 3025/3025 [00:00<00:00, 5782.66it/s]


In [694]:
# Clean every entry of the raw spoken-genre directory into the same output directory.
# NOTE(review): iterdir() yields every directory entry in arbitrary order; this
# presumably relies on the directory containing only COCA .txt files - confirm.
for file in Path("../data/coca/text/text_spoken_kde/").iterdir():
    clean_coca_file(file, Path("../data/coca_spoken/text_cleaned/"))

Generating train split: 2399 examples [00:00, 7528.44 examples/s]
100%|██████████| 2399/2399 [00:00<00:00, 6725.94it/s]
100%|██████████| 3025/3025 [00:00<00:00, 6972.86it/s]
Generating train split: 2771 examples [00:00, 18821.67 examples/s]
100%|██████████| 2771/2771 [00:00<00:00, 6742.26it/s]
Generating train split: 3079 examples [00:00, 22134.69 examples/s]
100%|██████████| 3079/3079 [00:00<00:00, 7683.01it/s]
Generating train split: 1913 examples [00:00, 15499.17 examples/s]
100%|██████████| 1913/1913 [00:00<00:00, 4472.69it/s]
Generating train split: 563 examples [00:00, 7163.92 examples/s]
100%|██████████| 563/563 [00:00<00:00, 2252.76it/s]
Generating train split: 2324 examples [00:00, 21070.03 examples/s]
100%|██████████| 2324/2324 [00:00<00:00, 5892.23it/s]
Generating train split: 972 examples [00:00, 8249.32 examples/s]
100%|██████████| 972/972 [00:00<00:00, 2186.51it/s]
Generating train split: 2647 examples [00:00, 21737.80 examples/s]
100%|██████████| 2647/2647 [00:00<00:00, 