In [44]:
from typing import List

In [2]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

coca_dir = "../data/coca/text/text_spoken_kde/"

# Read the files matching the w_spok_201*.txt glob inside `coca_dir`
# with the generic 'text' loader.
# (Whole-directory alternative: load_dataset('text', data_dir=coca_dir))
dataset = load_dataset('text', data_files=f"{coca_dir}w_spok_201*.txt")


# Pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably required because tokenize_function pads with
# padding='max_length' and GPT-2 defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Define a tokenization function
def tokenize_function(examples):
    """
    Tokenize the 'text' field of `examples` with the notebook-level
    `tokenizer`, truncating and padding to 'max_length'.
    Returns whatever the tokenizer call returns.
    """
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Apply tokenization
# batched=True: tokenize_function is called on batches of examples rather
# than on one example at a time. The result keeps the 'text' column and
# gains 'input_ids' (both are read in later cells).
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Initialize the data collator
# mlm=False switches masked-language-modelling off.
# NOTE(review): presumably meant for causal LM training with GPT-2 — confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Show the full raw text of the training example at index 1.
print(tokenized_dataset['train'][1]['text'])

##4072514 @!JOHN-EDWARDS-@1FOR# The story 's false , it 's completely untrue . I 've been in love with the same woman for 30-plus years , as anybody who has been around us knows . I do n't respond to these lies . I would welcome participating in a paternity test . Be happy to participate in one . I know that it 's not possible that this child could be mine . @!ELIZABETH-EDWARDS-# I have seen a picture of the baby . I have no idea . It does n't look like my children . @!ANDREW-YOUNG-@1FOR# To fake the paternity . To get a doctor to fake the DNA results . And he asked , he asked me and Sherri to steal a diaper from the baby so that he could secretly do a DNA test to find out if it was indeed his child . @!TERRY-MORAN-@1-ABC# @(Off-camera) Well that is just painful to watch . That 's the John Edwards saga . That broke again this week . We 'll talk about that and the remarkable week in politics . Our " Roundtable " - George Will , @ @ @ @ @ @ @ @ @ @ . Welcome to all of you . Let 's begin 

In [5]:
# Preview the first 100 characters of every training document.
# A plain loop replaces the side-effect list comprehension, so no throwaway
# list of `None` values is built and no trailing `print()` is needed to keep
# that list out of the cell output.
for example in tokenized_dataset['train']:
    print(example['text'][:100])


##4072514 @!JOHN-EDWARDS-@1FOR# The story 's false , it 's completely untrue . I 've been in love wi
##4072515 @!TERRY-MORAN-@1-ABC# @(Off-camera) And to talk about the political road ahead , I 'm join
##4072516 @!TERRY-MORAN-@1-ABC# @(Off-camera) Good morning . It 's a brand new ballgame here in Wash
##4072517 @!CHRIS-CUOMO-@1-ABC# @(Off-camera) Millions of us were joined in a common cause tonight w
##4072518 @!TERRY-MORAN-@1-ABC# @(Off-camera) Good evening , I 'm Terry Moran , and we 're going to 
##4072519 @!CYNTHIA-MCFADDEN-@# @(Off-camera) Good evening , I 'm Cynthia McFadden . We begin tonigh
##4072520 @!GEORGE-STEPHANOPOU# @(Off-camera) Okay , Robin . It is the political upset of the centur
##4072521 @!ROBIN-ROBERTS-@1-A# @(Off-camera) But first in this half hour , an update on the four-ye
##4072522 @!JAY-LENO-@1HOST-# What 's the best prank you ever pulled ? @!JIMMY-KIMMEL-@1HOS# I told 
##4072523 @!JAKE-TAPPER-@1-ABC# @(Off-camera) Now , to the two men in charge of the relief

In [15]:
# Number of token ids stored for training example 3.
print(len(tokenized_dataset['train'][3]['input_ids']))
# Number of characters in the raw text of training example 1.
# NOTE(review): the two lines index different examples (3 vs 1) — confirm
# this is intended rather than a leftover from editing.
print(len(tokenized_dataset['train'][1]['text']))

1024
26459


## Preprocessing (text cleaning)

Goal: From COCA's spoken genre, produce a .txt file of newline-separated sentences, cleaning up the formatting along the way (including speaker codes and other markup tags).

In [30]:
import random
import re

In [27]:
# Pick the text of one training document at random as the running example.
# (For a fixed example instead: tokenized_dataset['train'][1]['text'])
example_string = tokenized_dataset['train'][random.randrange(len(tokenized_dataset['train']))]['text']
example_string

'##4103538 @!GIFFORD , co-host : So listen , for everybody that had to win , somebody had to lose , so we send our love to all the people in Baltimore and ... @!HODA-KOTB-co-host: @!Right-Well-even-i# @!GIFFORD: ... elsewhere . @!KOTB: OK , let \'s start with this . If you \'re not a football fan at all , let \'s pretend , but you do like to watch because it \'s the big hoopla , there are two things I know everyone watched and it was the National Anthems that were ... @!GIFFORD: Oh , yes . Oh , yes . @!KOTB: ... sung before each game . Now one person -- it was Steven Tyler and Kristin Chenoweth . Steven Tyler sparked a lot of controversy . @!GIFFORD: Are there two people more different ? More disalike than -- is that a -- is that a word ? @!KOTB: That , too . @!GIFFORD: Yeah . Yeah . @!KOTB: All right . So let \'s watch Steven Tyler as he sang the National Anthem , please . @!GIFFORD: Study in contrasts . @(Clip-of-Steven-Tyl @!KOTB: Now he got knocked @ @ @ @ @ @ @ @ @ @ He -- that \'

In [348]:
def separate_chunks(text: str) -> List[str]:
    """
    Split one COCA document into its chunks.

    Chunks are delimited by a run of ten space-separated "@" characters.
    The delimiter is only recognised with a space on both sides, so a
    run that is cut off (e.g. at the end of the file) stays in the text.
    Returns the pieces between delimiters; a text without any delimiter
    comes back as a single-element list.
    """
    chunk_delimiter = ' @ @ @ @ @ @ @ @ @ @ '
    chunks = text.split(chunk_delimiter)
    return chunks


# Split the example document and report the chunk count and each chunk's length.
example_chunks = separate_chunks(example_string)
print(len(example_chunks), list(map(len, example_chunks)))
example_chunks

22 [960, 923, 841, 846, 914, 933, 900, 880, 891, 905, 880, 953, 939, 869, 848, 875, 821, 875, 902, 918, 922, 185]


["##4103538 @!GIFFORD , co-host : So listen , for everybody that had to win , somebody had to lose , so we send our love to all the people in Baltimore and ... @!HODA-KOTB-co-host: @!Right-Well-even-i# @!GIFFORD: ... elsewhere . @!KOTB: OK , let 's start with this . If you 're not a football fan at all , let 's pretend , but you do like to watch because it 's the big hoopla , there are two things I know everyone watched and it was the National Anthems that were ... @!GIFFORD: Oh , yes . Oh , yes . @!KOTB: ... sung before each game . Now one person -- it was Steven Tyler and Kristin Chenoweth . Steven Tyler sparked a lot of controversy . @!GIFFORD: Are there two people more different ? More disalike than -- is that a -- is that a word ? @!KOTB: That , too . @!GIFFORD: Yeah . Yeah . @!KOTB: All right . So let 's watch Steven Tyler as he sang the National Anthem , please . @!GIFFORD: Study in contrasts . @(Clip-of-Steven-Tyl @!KOTB: Now he got knocked",
 "He -- that 's how he sings . @!KO

In [349]:
def remove_speaker_and_other_tags(chunk: str, remove_other_tags: bool = True) -> str:
    """
    Replace the tags in one chunk with " . ".

    chunk: text of a single chunk.
    remove_other_tags: if True, every token starting with "@" is replaced
        (speaker tags such as @!BOB: and other tags such as @(End-of-clip));
        if False, only tokens starting with "@!" (speaker tags) are replaced.

    A tag is matched together with the whitespace in front of it, or at the
    very start of the chunk: a chunk can begin directly with a tag, and such
    a tag has no preceding whitespace to match.

    Returns the chunk with each matched tag replaced by " . ".
    """
    tag = r"@\S+" if remove_other_tags else r"@!\S+"
    # (?:^|\s+): also catch a tag sitting at index 0 of the chunk
    pattern = r"(?:^|\s+)" + tag
    return re.sub(pattern, " . ", chunk)

def split_by_speaker_and_other_tags(
        chunk: str, 
        remove_other_tags: bool = True,
        exclude_first_and_last_sentences: bool = True,
        ) -> List[str]:
    r"""
    Splits one chunk into turns at speaker tags (ex: @!BOB:) and optionally
        at other tags (ex: @(End-of-clip)).
    remove_other_tags: if True, split at every token starting with "@";
        if False, split only at tokens starting with "@!" (other tags such
        as @(End-of-clip) then stay inside the turns).
        Does not remove long portions inside of @(Clip-from-previous blocks
    exclude_first_and_last_sentences: drop the first and last pieces, because
        they are likely fragments split by the chunk border

    Returns the list of turns; consecutive tags yield empty strings.

    Notes:
        - A tag is matched together with the whitespace in front of it, or at
        the very start of the chunk, so a chunk that begins directly with a
        tag yields an empty first piece instead of keeping the tag.
        - Pattern makes first word in turn start with a space.
        To remove it, add an \s at the end of the pattern, but be aware
        that this will block pattern matching of consecutive tags.
        - The docstring is a raw string so that "\s" above is not an
        invalid escape sequence.
    """
    tag = r"@\S+" if remove_other_tags else r"@!\S+"
    # (?:^|\s+): also catch a tag sitting at index 0 of the chunk
    pattern = r"(?:^|\s+)" + tag
    out = re.split(pattern, chunk)
    return out[1:-1] if exclude_first_and_last_sentences else out

# Demo on one randomly chosen chunk: raw text, tag-stripped text, then the turns.
example_chunk = example_chunks[random.randrange(len(example_chunks))]
print(example_chunk)
print(remove_speaker_and_other_tags(example_chunk))
print('----')
example_turns = split_by_speaker_and_other_tags(example_chunk)
for turn in example_turns:
    print(turn)

doll . @!KOTB: Sweet . @!GIFFORD: Sweetheart . @!KOTB: Yeah . @!GIFFORD: Sweetheart . She was in her adorable little yoga clothes and she did get a short shrift because we got lost in the building , and so last my husband was seen was right over by the kitchen talking to Trudie Styler . And ... @!KOTB: That 's Sting 's wife , by the way . @!GIFFORD: Sting 's wife ... @!KOTB: Mm-hmm . @!GIFFORD: ... by the way , OK ? Sting . And so I took my mom downstairs to go have a little piece of her birthday cake . @!KOTB: Yeah . And ? @!GIFFORD: Ticktock , where 's Frank ? Not for 15 more minutes . @!KOTB: Where was he ? @!GIFFORD: Finally , I was walking down the steps to go -- to go home and there 's Trudie with all her girlfriends and I go -- and she goes , ' Oh , my God , your husband is so gorgeous ! I ca n't believe -- he told me how old he is . My God , he 's gorgeous ! I
doll . .  Sweet . .  Sweetheart . .  Yeah . .  Sweetheart . She was in her adorable little yoga clothes and she did get

In [350]:
def split_turn_into_sentences(turn: str) -> List[str]:
    """
    Splits one tag-free turn (as separated by split_by_speaker_and_other_tags) 
        into a list of sentences.
    Since COCA has space-separated punctuation, splits are done by:
        [' . ', ' ? ', ' ! ']
    Each sentence keeps its closing punctuation; every sentence after the
    first is given a leading space for consistency. A turn without any
    delimiter is returned unchanged as a single-element list.
    """
    delimiters = (' . ', ' ? ', ' ! ')
    # The capturing group makes re.split keep the delimiters, so `pieces`
    # alternates sentence, delimiter, sentence, ..., sentence (odd length).
    pattern = '(' + '|'.join(re.escape(d) for d in delimiters) + ')'
    pieces = re.split(pattern, turn)

    sentences = []
    for idx in range(0, len(pieces), 2):
        sentence = pieces[idx] if idx == 0 else ' ' + pieces[idx]
        if idx + 1 < len(pieces):
            # Re-attach the punctuation, minus the delimiter's trailing space
            sentence += pieces[idx + 1][:-1]
        sentences.append(sentence)
    return sentences
    
# Try the sentence splitter on one randomly chosen turn.
turn = example_turns[random.randrange(len(example_turns))]
split_turn_into_sentences(turn)

[' Mm-hmm .']

In [380]:
def split_chunk_into_sentences(chunk: str) -> List[str]:
    """
    Turn one COCA chunk into a flat list of sentences.

    The chunk is first cut into turns with `split_by_speaker_and_other_tags`
    (default options); each turn is then cut into sentences with
    `split_turn_into_sentences`, and the results are concatenated in order.
    """
    return [
        sentence
        for turn in split_by_speaker_and_other_tags(chunk)
        for sentence in split_turn_into_sentences(turn)
    ]

# Run the full chunk -> sentences pipeline on one randomly chosen chunk.
chunk_number = random.randrange(len(example_chunks))
example_chunk = example_chunks[chunk_number]
example_sentences = split_chunk_into_sentences(example_chunk)
print(f'{chunk_number=}')
example_sentences

chunk_number=9


[" It 's a new baby , though .",
 " It 's a new baby and a new commercial about a lottery winner .",
 ' Take a look .',
 '',
 ' Come on .',
 " Oh , that 's not the one I thought we were going to show .",
 ' Oh .',
 " There 's a brand-new one with the new kid .",
 " That 's the old kid .",
 " Is n't that the cutest ?",
 " Who 's probably now 18 , I do n't know .",
 ' But yeah .',
 ' I love that kid .',
 ' All right .',
 ' By the way , there was a guy who did get really lucky with the lottery .',
 ' This is incredible .',
 ' This is crazy .',
 ' He went in to play the Powerball , he thought , OK.',
 ' He does it every day .',
 ' Every day .',
 ' Goes ...',
 ' Every day .',
 ' Cup of coffee , plays the Powerball.',
 ' Right .',
 ' For some reason , this time he bought the Mega Millions accidentally , OK ?',
 ' Yeah .']

In [352]:
# Print the chunk at index `chunk_number` of `example_chunks`.
print(example_chunks[chunk_number])

kid . And he had -- he was 16 for 16 up till that point . @!KOTB: So if you watched the game ... @!GIFFORD: Oh , it just -- nobody could believe it . @!KOTB: ... there was 10 -- 15 seconds left and all they had to do was hit that so that they could go into overtime and ... @!GIFFORD: And he was n't very far ... @!KOTB: ... and it did n't happen . @!GIFFORD: I know . I ... @!KOTB: You know what ... @!GIFFORD: And it was n't a bad snap , it was n't ... @!KOTB: Oh , oh , oh . @!GIFFORD: ... it was -- it was just ... @!KOTB: Oh . @!GIFFORD: You know , I -- and they had to , of course , fly back to Baltimore . @!KOTB: Yeah . @!GIFFORD: And it 's a long way from San Francisco to Baltimore , you know ? @!KOTB: Yeah . @!GIFFORD: And they get -- usually get on a plane right after the game , and I said to Frank , I said , ' What 's it like
