### Code to Convert the Switchboard dataset into Convokit format

In [1]:
import os
os.chdir("../../") # import convokit
from convokit import Corpus, Speaker, Utterance
os.chdir("datasets/switchboard-corpus") # then come back for swda
from swda import Transcript
import glob

#### Create Speakers

Each caller is considered a user, and there are total of 440 different callers in this dataset. Each user is marked with a numerical id, and the metadata for each user includes the following information:

- Gender (str): MALE or FEMALE
- Education (int): 0, 1, 2, 3, 9
- Birth Year (int): YYYY
- Dialect Area (str): MIXED, NEW ENGLAND, NORTH MIDLAND, NORTHERN, NYC, SOUTH MIDLAND, SOUTHERN, UNK, WESTERN

In [2]:
files = glob.glob("./swda/*/sw_*.utt.csv") # Switchboard utterance files
user_meta = {}

for file in files:
    trans = Transcript(file, './swda/swda-metadata.csv')
    user_meta[str(trans.from_caller)] = {"sex": trans.from_caller_sex,
                                    "education": trans.from_caller_education,
                                    "birth_year": trans.from_caller_birth_year,
                                    "dialect_area": trans.from_caller_dialect_area}
    user_meta[str(trans.to_caller)] = {"sex": trans.to_caller_sex,
                                  "education": trans.to_caller_education,
                                  "birth_year": trans.to_caller_birth_year,
                                  "dialect_area": trans.to_caller_dialect_area}

Create a Speaker object for each unique user in the dataset

In [3]:
corpus_speakers = {k: Speaker(id = k, meta = v) for k,v in user_meta.items()}

Check number of users in the dataset

In [4]:
print("Number of users in the data = {}".format(len(corpus_speakers)))

Number of users in the data = 440


In [5]:
# Example metadata from user 1632
corpus_speakers['1632'].meta

{'sex': 'FEMALE',
 'education': 2,
 'birth_year': 1962,
 'dialect_area': 'WESTERN'}

#### Create Utterances

Utterances are found in the "text" field of each Transcript object. There are 221,616 utterances in total.

Each Utterance object has the following fields:

- id (str): the unique id of the utterance
- user (Speaker): the Speaker giving the utterance
- root (str): id of the root utterance of the conversation
- reply_to (str): id of the utterance this replies to
- timestamp: timestamp of the utterance (not applicable in Switchboard)
- text (str): text of the utterance
- metadata
    - tag (str): the DAMSL act-tag of the utterance
    - pos (str): the part-of-speech tagged portion of the utterance
    - trees (nltk Tree): parsed tree of the utterance

In [6]:
utterance_corpus = {}

# Iterate thru each transcript
for file in files:
    
    trans = Transcript(file, './swda/swda-metadata.csv')
    utts = trans.utterances
    root = str(trans.conversation_no) + "-0" # Get id of root utterance
    
    recent_A = None
    recent_B = None
    
    # Iterate thru each utterance in transcript
    last_speaker = ''
    cur_speaker = ''
    all_text = ''
    text_pos = ''
    text_tag_list = []
    counter = 0
    first_utt = True
    
    for i, utt in enumerate(utts):
        
        idx = str(utt.conversation_no) + "-" + str(counter)
        text = utt.text
        
        # Check which user is talking
        if 'A' in utt.caller:
            recent_A = idx;
            user = str(trans.from_caller)
            cur_speaker = user
        else:
            recent_B = idx;
            user = str(trans.to_caller)
            cur_speaker = user
        
        # Only add as an utterance if the user has finished talking
        if cur_speaker != last_speaker and i > 0:
                
            # Put act-tag and POS information into metadata
            meta = {'tag': text_tag_list,
                   }

            # For reply_to, find the most recent utterance from the other caller
            if first_utt:
                reply_to = None
                first_utt = False
            elif 'A' in utt.caller:
                reply_to = recent_B
            else:
                reply_to = recent_A
            
                
            utterance_corpus[idx] = Utterance(idx, corpus_speakers[user], root,
                                              reply_to, None, all_text, meta)

            # Update with the current utterance information
            # This is the first utterance of the next statement
            all_text = utt.text
            text_pos = utt.pos
            text_tag_list = [(utt.text, utt.act_tag)]
            
            counter += 1
            
        else:
            # Otherwise, combine all the text from the user
            all_text += utt.text
            text_pos += utt.pos
            text_tag_list.append((utt.text, utt.act_tag))
            
        last_speaker = cur_speaker
        last_speaker_idx = idx

In [7]:
utterance_list = [utterance for k,utterance in utterance_corpus.items()]

Check number of utterances in the dataset

In [8]:
print("Number of utterances in the data = {}".format(len(utterance_corpus)))

Number of utterances in the data = 122646


In [9]:
# Example utterance object
utterance_corpus['4325-2']

Utterance({'id': '4325-2', 'speaker': Speaker([('id', '1519')]), 'conversation_id': '4325-0', 'reply_to': '4325-1', 'timestamp': None, 'text': 'What kind of experience [ do you, + do you ] have, then with child care? /', 'meta': {'tag': [('What kind of experience [ do you, + do you ] have, then with child care? /', '+')]}})

#### Create corpus from list of utterances

In [10]:
switchboard_corpus = Corpus(utterances=utterance_list, version=1)
print("number of conversations in the dataset = {}".format(len(switchboard_corpus.get_conversation_ids())))

number of conversations in the dataset = 1155


### Create Conversations

In [11]:
# Set conversation Metadata
for i, c in enumerate(switchboard_corpus.conversations):
    trans = Transcript(files[i], './swda/swda-metadata.csv')
    idx = str(trans.conversation_no)
    
    convo = switchboard_corpus.conversations[c]
    
    convo.meta['filename'] = files[i]
    date = trans.talk_day
    convo_date = "%d-%d-%d" % (date.year, date.month, date.day)
    convo.meta['talk_day'] = convo_date
    convo.meta['topic_description'] = trans.topic_description
    convo.meta['length'] = trans.length
    convo.meta['prompt'] = trans.prompt
    convo.meta['from_caller'] = str(trans.from_caller)
    convo.meta['to_caller'] = str(trans.to_caller)

In [12]:
print(switchboard_corpus.conversations['4384-0'].meta)

{'filename': './swda/sw13utt/sw_1325_4384.utt.csv', 'talk_day': '1992-3-25', 'topic_description': 'CHILD CARE', 'length': 5, 'prompt': 'FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD USE IN SELECTING CHILD CARE SERVICES FOR A PRESCHOOLER.  IS IT EASY OR DIFFICULT TO FIND SUCH CARE?', 'from_caller': '1653', 'to_caller': '1646'}


#### Update corpus level metadata

In [13]:
switchboard_meta = {}
for file in files:
    trans = Transcript(file, './swda/swda-metadata.csv')
    idx = str(trans.conversation_no)
    switchboard_meta[idx] = {}
    
switchboard_corpus.meta['metadata'] = switchboard_meta
switchboard_corpus.meta['name'] = "The Switchboard Dialog Act Corpus"

In [14]:
switchboard_corpus.meta['metadata']['4325']

{}

#### Save created corpus

In [15]:
switchboard_corpus.dump("corpus", base_path = "./")

Check if available info from dataset can be viewed directly

In [16]:
from convokit import meta_index
meta_index(filename = "./corpus")

{'utterances-index': {'tag': "<class 'list'>"},
 'users-index': {'sex': "<class 'str'>",
  'education': "<class 'int'>",
  'birth_year': "<class 'int'>",
  'dialect_area': "<class 'str'>"},
 'conversations-index': {'filename': "<class 'str'>",
  'talk_day': "<class 'str'>",
  'topic_description': "<class 'str'>",
  'length': "<class 'int'>",
  'prompt': "<class 'str'>",
  'from_caller': "<class 'str'>",
  'to_caller': "<class 'str'>"},
 'overall-index': {'metadata': "<class 'dict'>", 'name': "<class 'str'>"},
 'version': 1}

In [17]:
switchboard_corpus = Corpus(filename = "./corpus")
switchboard_corpus.print_summary_stats()

Number of Speakers: 440
Number of Utterances: 122646
Number of Conversations: 1155
