In [1]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from datasets import load_dataset, DatasetDict, Dataset, Features, Value

import pandas as pd

### Data loading

In [11]:
swda = load_dataset("swda")

  0%|          | 0/3 [00:00<?, ?it/s]

### Data cleaning

#### Filter text

In [12]:
import re

def clean_swda_utterance(text):
    """
    This function takes a string of SWDA utterance text and returns a cleaned version of the text.
    """
    # Remove square brackets and anything inside them
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove leading and trailing whitespaces
    text = text.strip()
    
    # Remove any remaining parentheses and their contents
    text = re.sub(r'\(.*?\)', '', text)
    
    # Remove any remaining non-alphanumeric characters except for spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Convert all letters to lowercase
    text = text.lower()
    
    # Remove any residual spaces
    text = ' '.join(text.split())
    
    return {"text": text}


swda = swda.map(lambda x : clean_swda_utterance(x['text']))


  0%|          | 0/213543 [00:00<?, ?ex/s]

  0%|          | 0/56729 [00:00<?, ?ex/s]

  0%|          | 0/4514 [00:00<?, ?ex/s]

#### Format dataset

In [13]:
def preprocess_data(utterance):
    
    convo_id = utterance['conversation_no']
    text = utterance['text']
    speaker = utterance['caller']
    label = utterance['damsl_act_tag']

    return {'convo_id': convo_id, 'text': text, 'speaker': speaker, 'label': label}

swda = swda.map(preprocess_data)

  0%|          | 0/213543 [00:00<?, ?ex/s]

  0%|          | 0/56729 [00:00<?, ?ex/s]

  0%|          | 0/4514 [00:00<?, ?ex/s]

#### Map speakers

In [14]:
def map_speaker_to_int(data):
    """
    This function maps the speaker information in an example to 0 or 1.
    """
    # Get the caller information from the example
    caller = data["speaker"]
    
    # Map the caller to 0 or 1
    if caller == "A":
        data["speaker"] = 0
    elif caller == "B":
        data["speaker"] = 1
    
    return data

swda = swda.map(map_speaker_to_int)

  0%|          | 0/213543 [00:00<?, ?ex/s]

  0%|          | 0/56729 [00:00<?, ?ex/s]

  0%|          | 0/4514 [00:00<?, ?ex/s]

#### Create dataframes

In [22]:
train_dataset = pd.DataFrame(swda['train'])
val_dataset = pd.DataFrame(swda['validation'])
test_dataset = pd.DataFrame(swda['test'])

In [23]:
train_dataset = train_dataset[['convo_id','speaker','text','label']]
val_dataset = val_dataset[['convo_id','speaker','text','label']]

In [24]:
train_dataset.head(8)

Unnamed: 0,convo_id,speaker,text,label
0,4325,0,okay,26
1,4325,0,d so,15
2,4325,1,i guess,36
3,4325,0,what kind of experience have then with child care,20
4,4325,1,i think f uh i wonder if that worked,20
5,4325,0,does it say something,2
6,4325,1,i think it usually does,4
7,4325,1,you might try f uh,0


#### Save cleaned data

In [25]:
train_dataset.to_csv('train.csv', index=False)
val_dataset.to_csv('validation.csv', index=False)
test_dataset.to_csv('test.csv', index=False)