In [442]:
import random
import re
from typing import List

In [443]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

# Directory (relative to this notebook) that holds the COCA spoken-genre text files.
coca_dir = "../data/coca/text/text_spoken_kde/"

# Alternative kept for reference: load every file in the directory instead of a glob subset.
# dataset = load_dataset('text', data_dir=coca_dir)
# Load only the files whose names match `w_spok_201*.txt`
# (presumably the 2010s transcripts -- confirm against the directory listing).
dataset = load_dataset('text', data_files=coca_dir+'w_spok_201*.txt')
# Rows are accessed below as dicts with a 'text' field; everything lands in the 'train' split.
train_dataset = dataset['train']

In [466]:
# Sanity check: draw one random row and preview the first 100 characters of its text.
# No seed is set, so the row shown changes on every run.
example_line = random.choice(train_dataset)
print(example_line['text'][:100])

##4072661 @!NANCY-GRACE-HOST : A 5-year-old Florida girl tucked into bed , five hours later , shes g


## Preprocessing (text cleaning)

Goal: From COCA's spoken genre, produce a .txt file of newline-separated sentences, cleaning up the formatting along the way (including speaker codes and other non-speech tags).

In [497]:
# Pick a random row index; random.randint is inclusive on both ends, hence the `- 1`.
example_string_id = random.randint(0, len(train_dataset) - 1)
# Keep the row's full text around: later cells split it into chunks.
example_string = train_dataset[example_string_id]['text']
# Report which row was drawn, how long it is, and a 100-character preview.
print(f'{example_string_id=}')
print(f'{len(example_string)=}')
print(example_string[:100])

example_string_id=81
len(example_string)=15290
##4072612 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Oh , come on upstairs here , Sam . Now , the dos and d


In [498]:
def separate_chunks(text: str) -> List[str]:
    """
    Break one COCA text into its scrambled chunks.

    Chunks are delimited by ten space-separated "@" characters (the
    delimiter may be cut off at the end of a file, in which case it is
    not recognised here).

    Returns the list of chunks in their original order; a text without
    any delimiter comes back as a single-element list.
    """
    chunk_delimiter = ' @ @ @ @ @ @ @ @ @ @ '
    chunks = text.split(chunk_delimiter)
    return chunks


# Split the example text into chunks, then show how many there are and how long each is.
example_chunks = separate_chunks(example_string)
print(len(example_chunks), [len(chunk) for chunk in example_chunks])
# Preview the first 50 characters of every chunk to see where the chunk borders fall.
for chunk in example_chunks:
    print(chunk[:50])

15 [929, 972, 1084, 1047, 1018, 982, 1020, 929, 1053, 1054, 943, 1122, 1023, 1053, 767]
##4072612 @!ROBIN-ROBERTS-@1-A# @(Off-camera) Oh ,
. @!STEVE-HARVEY-@1-AB# @(Off-camera) Yeah . @!ROB
couple . We have , let 's see , Robert is a 51-yea
lets Christine know they 're having a wonderful da
right there . Just meeting somebody . @GRAPHICS @G
whole evening right there . @!ROBIN-ROBERTS-@1-A# 
audience are here all shaking , you , you like tha
to know before you waste a lot of time , emotion ,
that lady keep talking about anyway . @!ROBIN-ROBE
@!DEE-DEE-@130'S-PUB# There are better products no
right . @!ROBIN-ROBERTS-@1-A# @(Off-camera) It was
noticed he was a little ... @!ROBIN-ROBERTS-@1-A# 
, you know , I do n't like your hair . What are yo
you know , they are going out on a second date . O
skating . @!GEORGE-STEPHANOPOU# @(Off-camera) Also


In [522]:
def remove_speaker_and_other_tags(chunk: str, remove_other_tags=True) -> str:
    """
    Remove from one chunk speaker tags (ex: @!BOB:) and optionally
    other tags (ex: @(End-of-clip)).

    chunk: the text of a single COCA chunk.
    remove_other_tags: if True, every whitespace-delimited token starting
        with "@" is removed; if False, only tokens starting with "@!"
        (speaker tags) are removed.

    Returns the chunk with each tag (and the whitespace before it)
    replaced by " . ", so that a tag always acts as a sentence boundary.
    A tag at the very beginning of the chunk is replaced as well.
    """
    tag = r"@\S+" if remove_other_tags else r"@!\S+"
    # `^` lets a tag match when the chunk starts with it; requiring only
    # leading whitespace would leave such a tag in the output.
    pattern = r"(?:^|\s+)" + tag
    return re.sub(pattern, " . ", chunk)

def split_by_speaker_and_other_tags(
        chunk: str,
        remove_other_tags=True,
        ) -> List[str]:
    """
    Splits one chunk by speaker tags (ex: @!BOB) and optionally
        other tags (ex: @(End-of-clip)).

    chunk: the text of a single COCA chunk.
    remove_other_tags: also splits on (and thereby removes) things like
        @(End-of-clip).
        Does not remove long portions inside of @(Clip-from-previous blocks

    Returns the non-blank segments between tags, in order. The tags
    themselves are not part of the output.

    Notes:
        - Only the whitespace before a tag is consumed, so the first word
        of a turn starts with a space. Consuming the whitespace after the
        tag as well would remove it, but be aware that this would break
        pattern matching of consecutive tags.
        - A tag at the very beginning of the chunk is also split on, so
        it does not leak into the first segment.
        - Speaker tags are inconsistently either marked as
            "@!BOB", "@!BOB:", "@!BOB :", "@!BOB ( voiceover ) :",
            and more. ( voiceover ) is currently not captured.

    """
    tag = r"@\S+" if remove_other_tags else r"@!\S+"
    # Non-capturing group: re.split must not emit the separators themselves.
    pattern = r"(?:^|\s+)" + tag
    segments = re.split(pattern, chunk)
    # Consecutive tags (and a tag at the start) leave empty strings behind.
    return [segment for segment in segments if segment.strip()]

# Pick a random chunk from the example text (randint is inclusive, hence the -1).
example_chunk_id = random.randint(0, len(example_chunks)-1)
print(f'{example_chunk_id=}')
example_chunk = example_chunks[example_chunk_id]
# Show the raw chunk, then the same chunk with every tag replaced by " . ".
print(example_chunk)
print(remove_speaker_and_other_tags(example_chunk))
print('----')
# Split the chunk into turns and list them with their position.
example_turns = split_by_speaker_and_other_tags(example_chunk)
for turn_number, turn in enumerate(example_turns):
    print(turn_number, turn)

example_chunk_id=8
that lady keep talking about anyway . @!ROBIN-ROBERTS-@1-A# @(Off-camera) Denise , is that you , Denise ? Get out of , get out of Steve 's ear . All right . Our next couple , our next couple , Dane Potter . Dane is a 35-year-old actor . Dee Dee is in her 30 's and works in public relations . This is their first date . @GRAPHICS @!ROBIN-ROBERTS-@1-A# @(Voiceover) Dane starts things off with flowers for Dee Dee . @!DANE-@135-YEAR-OLD# I 'm Dane . @!DEE-DEE-@130'S-PUB# I 'm Dee Dee . @!DANE-@135-YEAR-OLD# Nice to meet you . @!DEE-DEE-@130'S-PUB# You too . @!DANE-@135-YEAR-OLD# These are for you . @!DEE-DEE-@130'S-PUB# Thank you , these are beautiful . @GRAPHICS @!ROBIN-ROBERTS-@1-A# @(Voiceover) At the table , Dane does not pull out Dee Dee 's chair or take her coat . When Dane asks about Dee Dee 's look , how much is too much ? @!DANE-@135-YEAR-OLD# You got the earthy look going . @!DEE-DEE-@130'S-PUB# It did n't have anything to do with like being earthy . It really w

In [530]:
def split_turn_into_sentences(
        turn: str,
        ) -> List[str]:
    """
    Splits one tag-free turn (as separated by split_by_speaker_and_other_tags)
        into sentences.
    Since COCA has space-separated punctuation, splits are done by:
        [' . ', ' ? ', ' ! ']
    The punctuation mark stays attached to the sentence it ends (minus
    the delimiter's trailing space).

    Returns a list of sentences. Every sentence after the first gets a
    leading space for consistency; the first keeps whatever the turn
    started with. A turn without any delimiter is returned unchanged as
    a one-element list. Whitespace-only text left over after a final
    delimiter is dropped rather than emitted as an empty sentence.
    """
    delimiters = [' . ', ' ? ', ' ! ']
    # The capturing group makes re.split keep the delimiters, giving
    # [text, delimiter, text, ..., text] (always an odd number of items).
    pattern = '(' + "|".join(map(re.escape, delimiters)) + ')'
    pieces = re.split(pattern, turn)
    if len(pieces) == 1:
        return pieces

    sentences = []
    for idx in range(0, len(pieces), 2):
        body = pieces[idx]
        has_delimiter = idx + 1 < len(pieces)
        # Re-attach the punctuation without the delimiter's trailing space.
        punctuation = pieces[idx + 1][:-1] if has_delimiter else ''
        if not has_delimiter and not body.strip():
            # Turn ended on a delimiter: nothing but whitespace remains.
            continue
        prefix = '' if idx == 0 else ' '
        sentences.append(prefix + body + punctuation)
    return sentences
    
# Try the sentence splitter on one randomly chosen turn from the example chunk;
# the returned list is the cell's displayed output.
turn = example_turns[random.randint(0, len(example_turns)-1)]
split_turn_into_sentences(turn)

[' Denise , is that you , Denise ?',
 " Get out of , get out of Steve 's ear .",
 ' All right .',
 ' Our next couple , our next couple , Dane Potter .',
 ' Dane is a 35-year-old actor .',
 " Dee Dee is in her 30 's and works in public relations .",
 ' This is their first date .']

In [531]:
def split_chunk_into_sentences(
        chunk: str,
        exclude_first_and_last_sentences=True,
        ) -> List[str]:
    """
    Turn one COCA chunk into a flat list of sentences by first cutting
        it into turns (`split_by_speaker_and_other_tags`) and then
        cutting every turn into sentences (`split_turn_into_sentences`).

    exclude_first_and_last_sentences: drop the first and last sentence
        of the result, since the chunk border has most likely cut them
        into fragments
    """
    all_sentences = [
        sentence
        for turn in split_by_speaker_and_other_tags(chunk)
        for sentence in split_turn_into_sentences(turn)
    ]
    if exclude_first_and_last_sentences:
        return all_sentences[1:-1]
    return all_sentences

# Run the full chunk -> sentences pipeline on a randomly chosen chunk of the example text.
chunk_number = random.randint(0, len(example_chunks)-1)
example_chunk = example_chunks[chunk_number]
# Border sentences are dropped because the chunk boundary usually cuts them off.
example_sentences = split_chunk_into_sentences(example_chunk, 
                                               exclude_first_and_last_sentences=True)
print(f'{chunk_number=}')
# Bare expression: the sentence list is the cell's displayed output.
example_sentences

chunk_number=14


[" Also , we 're going to take a look inside Oprah 's closet .",
 ' She is giving everyone a chance to have a piece of her wardrobe .',
 " She 's going to auction it off for charity .",
 ' And you can , you can bid on it .',
 " Also , we 're going to get some tips on how to declutter your own closet .",
 " You can now walk in Oprah 's shoes .",
 " I do n't want to walk in Oprah 's shoes , but I 'm sure a lot of people out there do .",
 " And we 're going to get to Kathryn Bigelow today .",
 ' We promise we will do that today .',
 ' We are going to have the story on Kathryn Bigelow .',
 ' Last half hour .']

In [532]:
# Print the raw chunk that was just split, for comparison with the extracted sentences.
print(example_chunks[chunk_number])

skating . @!GEORGE-STEPHANOPOU# @(Off-camera) Also , we 're going to take a look inside Oprah 's closet . She is giving everyone a chance to have a piece of her wardrobe . She 's going to auction it off for charity . And you can , you can bid on it . @!GEORGE-STEPHANOPOU# @(Voiceover) Also , we 're going to get some tips on how to declutter your own closet . You can now walk in Oprah 's shoes . I do n't want to walk in Oprah 's shoes , but I 'm sure a lot of people out there do . @!ROBIN-ROBERTS-@1-A# @(Off-camera) And we 're going to get to Kathryn Bigelow today . @!GEORGE-STEPHANOPOU# @(Off-camera) We promise we will do that today . @!ROBIN-ROBERTS-@1-A# @(Off-camera) We are going to have the story on Kathryn Bigelow . Last half hour . COMMERCIAL BREAK ' 
