In [1]:
# !pip install -U datasets -q
import pandas as pd
import re
import json
import numpy as np
import datasets

In [2]:
def get_turn_length(x):
    """Return the mean length of the turns in ``x``.

    Parameters
    ----------
    x : iterable
        The turns of one dialogue session. Each turn must support ``len()``;
        for string turns the length is the number of characters.

    Returns
    -------
    float
        Average of ``len(turn)`` over all turns, or ``nan`` when ``x`` holds
        no turns (pandas ``.mean()`` skips NaN, so empty sessions do not
        distort or crash the aggregate).
    """
    # Materialise the lengths first so any iterable (list, array, generator) works.
    lengths_turns = [len(turn) for turn in x]

    if not lengths_turns:
        # Previously this case raised ZeroDivisionError.
        return float("nan")

    return sum(lengths_turns) / len(lengths_turns)

# MSC Dataset

In [3]:
def create_df_from_txt(session_nr, data_dir="msc/msc_dialogue", split="train"):
    """Load one MSC session file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    session_nr : int
        Session number; selects ``{data_dir}/session_{session_nr}/{split}.txt``.
        For ``session_nr == 2`` the dialogue stored in ``previous_dialogs[0]``
        is additionally extracted as session 1.
    data_dir : str, optional
        Directory that contains the ``session_<n>`` folders.
    split : str, optional
        Stem of the file to read inside the session folder.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``dataID``,
        ``dialogue_session_{session_nr}``, ``speakers_{session_nr}`` and
        ``timeInterval_{session_nr}``. For session 2 the columns
        ``dialogue_session_1`` and ``timeInterval_1`` (always ``"Start"``) are
        included as well. The frame is empty if the file holds no records.
    """
    path = f"{data_dir}/session_{session_nr}/{split}.txt"

    # Explicit UTF-8 so decoding does not depend on the platform default;
    # blank lines are skipped because json.loads would fail on them.
    with open(path, 'r', encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]

    if data:  # an empty file has no first record to inspect
        print(f"keys in dictionary: {data[0].keys()}")

    df_rows = []

    for record in data:
        dialog = record["dialog"]
        dialogue_session = [turn["text"] for turn in dialog]
        speakers = [turn["id"] for turn in dialog]
        first_previous = record["previous_dialogs"][0]

        dataID = dialog[0]["convai2_id"]  # conversation ID, taken from the first turn

        # Drop the last three characters of time_back (presumably a trailing
        # "ago" - confirm against the raw data) and trim whitespace.
        # NOTE(review): this always reads previous_dialogs[0], also for
        # sessions 3 and 4 - verify that this is the interval that is wanted.
        timeInterval = first_previous["time_back"][:-3].strip()

        # Insertion order fixes the column order of the resulting frame.
        row = {"dataID": dataID}
        if session_nr == 2:
            row["dialogue_session_1"] = [turn["text"] for turn in first_previous["dialog"]]
        row[f"dialogue_session_{session_nr}"] = dialogue_session
        row[f"speakers_{session_nr}"] = speakers
        if session_nr == 2:
            row["timeInterval_1"] = "Start"
        row[f"timeInterval_{session_nr}"] = timeInterval

        df_rows.append(row)

    df = pd.DataFrame(df_rows)

    return df

In [4]:
# Session-2 frame; for session 2 the loader also adds the session-1 dialogue.
df_2 = create_df_from_txt(2)

keys in dictionary: dict_keys(['personas', 'dialog', 'metadata', 'previous_dialogs', 'init_personas'])


In [5]:
# Session-3 frame (one row per conversation, keyed by dataID).
df_3 = create_df_from_txt(3)

keys in dictionary: dict_keys(['personas', 'dialog', 'metadata', 'previous_dialogs', 'init_personas'])


In [6]:
# Session-4 frame (one row per conversation, keyed by dataID).
df_4 = create_df_from_txt(4)

keys in dictionary: dict_keys(['personas', 'dialog', 'metadata', 'previous_dialogs', 'init_personas'])


In [7]:
# Inner joins on dataID: only conversations present in the session-2,
# session-3 and session-4 frames survive.
df_merged = df_2.merge(df_3, how='inner', on=['dataID'])
df_merged_final = df_merged.merge(df_4, how='inner', on=['dataID'])

In [8]:
# Preview the first rows of the merged MSC frame.
df_merged_final.head()

Unnamed: 0,dataID,dialogue_session_1,dialogue_session_2,speakers_2,timeInterval_1,timeInterval_2,dialogue_session_3,speakers_3,timeInterval_3,dialogue_session_4,speakers_4,timeInterval_4
0,train:ordered_3537,[I need some advice on where to go on vacation...,"[Are you still in the military?, No, I have no...","[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",Start,2 days,[I've booked myself a week long vacation from ...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",5 days,[When do you think you will travel to New Orle...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",5 days 6 hours
1,train:ordered_1289,"[Howdy! How all doing today?, I'm doing great ...",[I was thinking about you living in New York. ...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",Start,7 days,"[Why is Wicked your favorite Broadway show? , ...","[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",7 days 6 hours,[I watched some YouTube videos of Wicked perfo...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",7 days 11 hours
2,train:ordered_3305,[Hey do you have any unique features? I got 3 ...,[I almost lost my sweet bird when I forgot she...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",Start,1 day,[My doctor just gave me a call back and actual...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",1 day 3 hours,[I just had my thumb removed! Recovery is goin...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",1 day 10 hours
3,train:ordered_2320,[Good evening! I'm so glad someone is able to ...,[I have to see a physiotherapist to assess whe...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",Start,4 days,[I saw my physiotherapist and he gave me great...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",10 days,[My doctor called and gave me great news! I sh...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",10 days 6 hours
4,train:ordered_2649,"[Hello how are you tonight?, Enjoying some goo...","[I put my cat outside so it won't bother you.,...","[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",Start,1 day,[How was your weekend? Did you stay home with ...,"[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",4 days,"[So, how many brothers and sisters do you have...","[Speaker 1, Speaker 2, Speaker 1, Speaker 2, S...",4 days 5 hours


In [9]:
# Number of conversations left after the inner merges.
len(df_merged_final)

1001

In [10]:
# Spot-check: session-1 turns of the conversation at index label 3.
df_merged_final["dialogue_session_1"][3]

["Good evening! I'm so glad someone is able to play with me.",
 'Hi there, it can be a pain sometimes, how are you tonight?',
 'Okay. Lonely, my cat and fish want to be alone instead of with me. You?',
 'Curled up with my cats, finally got my five kids to bed',
 '5? Whoa! Wish I could have kids, since the accident I am stuck in a bed.',
 'Believe me, five is like three too many, they are crazy, what happened?',
 'I severed my spinal cord sky diving, it was crazy. Luckily I can work from home now.',
 'I wish I could find a really good work from home job',
 'What do you do for work?',
 'I work with market research, those annoying phonecalls for surveys',
 'I bet you get some pretty mad people on the phone. No one likes those things.',
 'Exactly. Makes the job not so fun',
 'What kinds of things are you into?',
 'I am very craftsy, but I am a perfectionist which makes it hard']

In [11]:
# Spot-check: session-2 turns of the same conversation (index label 3).
df_merged_final["dialogue_session_2"][3]

["I have to see a physiotherapist to assess whether I'll be able to walk again. ",
 'Wow that must be nerve racking. Does the doctor have a good reputation?',
 'I am better now. Yes, my doctor is very caring and with all her support I am here. Why you not like your job? ',
 "My job is quite boring because I have to do the same thing over and over again. It's quite monotonous. I prefer work that is more fun",
 'I can understand! Do you have any other job in your mind which you want to do like related to craft or something?',
 "Yes if I could I would be a painter but I don't think I would make enough money to support my children",
 'you can start painting with your current job. when your business will do well, you can quit your job. you will get a fair idea how much you can earn by your painting. you might become millionaire. .',
 "Wow, that's a good idea! Do you enjoy painting?",
 'yes i love to do, but i am not a great artist like you. I like to do paint by number. I make me calm when 

In [12]:
# Spot-check: session-3 turns of the same conversation (index label 3).
df_merged_final["dialogue_session_3"][3]

['I saw my physiotherapist and he gave me great news!',
 "That's amazing! What did he tell you?",
 'That with biochem and a lot of hard work, I can probably walk again!',
 'That is so great to hear! Did they have timeline on how long they think it might take?',
 'Not really, things were touch and go. I am so happy right now! Anyway though. how are the kids?',
 "My kids are doing good (other than keeping me on my toes). My twins are about to turn 4 in a month! So there's been a lot planning. ",
 'Omg! The gift of life is so beautiful! And are you planning on anything for them semi annually? I feel like that kinda thing is more for the parents than the kids.',
 "Just the annual birthday parties. With 5 kids I do what I can. It's not easy being a single parent. But I don't like to complain, really. How have your pain levels been?",
 'You got it rough then! And I am on constant pain killers so manageable! What do you do fr you time?',
 'I like doing crafty things, like painting and cross s

In [13]:
# Mean turn length (len() of each turn) per conversation for the first MSC session.
# Note: the column is called "turn_len_session_0" although it is computed from
# "dialogue_session_1"; the same column name is used for the other datasets below.
df_merged_final["turn_len_session_0"] = df_merged_final["dialogue_session_1"].apply(get_turn_length)
# Average of the per-conversation means over the whole frame.
df_merged_final["turn_len_session_0"].mean()

np.float64(51.372839889048684)

The dataframe with sessions 1, 2, and 3 has 4000 entries; the dataframe with sessions 1, 2, 3, and 4 has 1001 entries.

# Locomo Dataset

In [14]:
# Read the LoCoMo file. The encoding is set explicitly so that decoding does
# not depend on the platform's default text encoding.
with open("locomo/locomo10.json", 'r', encoding="utf-8") as file:
    data = json.load(file) 

In [15]:
# Number of top-level entries in the loaded LoCoMo file.
len(data)

10

In [16]:
# Inspect which fields one conversation record provides.
data[1].keys()

dict_keys(['qa', 'conversation', 'event_summary', 'observation', 'session_summary', 'sample_id'])

In [17]:
# Flatten the LoCoMo records into one DataFrame row per conversation.
all_sessions = []    # per conversation: list of sessions, each a list of turn texts
all_speakers = []    # per conversation: list of sessions, each a list of speaker names
all_timestamps = []  # per conversation: list of session date-time strings
data_ids = []
df_rows = []

for sample in data:  # iterate over every conversation in the file instead of a fixed count
    dataID = sample["sample_id"]
    data_ids.append(dataID)
    conversation = sample["conversation"]

    # Count the session keys; the "session_<n>_date_time" keys also start with
    # "session" and must not be counted.
    num_sessions = len([key for key in conversation if key.startswith('session') and not key.endswith("date_time")])

    session_dialogue = []
    session_speakers = []
    session_timestamps = []

    # Sessions are looked up as session_1 .. session_<num_sessions>, i.e. the
    # numbering is assumed to be contiguous.
    for j in range(1, num_sessions + 1):
        session = conversation[f"session_{j}"]
        timeStamp = conversation[f'session_{j}_date_time']

        # Turn texts and their speakers, kept in the same order.
        session_dialogue.append([turn["text"] for turn in session])
        session_speakers.append([turn["speaker"] for turn in session])
        session_timestamps.append(timeStamp)

    all_sessions.append(session_dialogue)
    all_speakers.append(session_speakers)
    all_timestamps.append(session_timestamps)

for sessions, speakers, dataID, time_stamps in zip(all_sessions, all_speakers, data_ids, all_timestamps):
    my_dict = {"dataID": dataID}
    # Columns are numbered from 0, so source key "session_1" becomes column "session_0".
    for j, (sub_session, sub_speaker, timeStamp) in enumerate(zip(sessions, speakers, time_stamps)):
        my_dict.update({f"session_{j}": sub_session, f"speakers_{j}": sub_speaker, f"timestamp_{j}": timeStamp})
    df_rows.append(my_dict)

# Conversations with fewer sessions get missing values in the higher-numbered columns.
locomo_df = pd.DataFrame(df_rows)  

In [18]:
# Preview the first rows of the LoCoMo frame.
locomo_df.head()

Unnamed: 0,dataID,session_0,speakers_0,timestamp_0,session_1,speakers_1,timestamp_1,session_2,speakers_2,timestamp_2,...,timestamp_28,session_29,speakers_29,timestamp_29,session_30,speakers_30,timestamp_30,session_31,speakers_31,timestamp_31
0,conv-26,"[Hey Mel! Good to see you! How have you been?,...","[Caroline, Melanie, Caroline, Melanie, Carolin...","1:56 pm on 8 May, 2023","[Hey Caroline, since we last chatted, I've had...","[Melanie, Caroline, Melanie, Caroline, Melanie...","1:14 pm on 25 May, 2023",[Hey Melanie! How's it going? I wanted to tell...,"[Caroline, Melanie, Caroline, Melanie, Carolin...","7:55 pm on 9 June, 2023",...,,,,,,,,,,
1,conv-30,[Hey Jon! Good to see you. What's up? Anything...,"[Gina, Jon, Gina, Jon, Gina, Jon, Gina, Jon, G...","4:04 pm on 20 January, 2023",[Hey Jon! Long time no see! Things have been h...,"[Gina, Jon, Gina, Jon, Gina, Jon, Gina, Jon, G...","2:32 pm on 29 January, 2023","[Hey Gina, hope you're doing ok! Still followi...","[Jon, Gina, Jon, Gina, Jon, Gina, Jon, Gina, J...","12:48 am on 1 February, 2023",...,,,,,,,,,,
2,conv-41,"[Hey John! Long time no see! What's up?, Hey M...","[Maria, John, Maria, John, Maria, John, Maria,...","11:01 am on 17 December, 2022","[Hey John, been a few days since we chatted. I...","[Maria, John, Maria, John, Maria, John, Maria,...","6:10 pm on 22 December, 2022","[Hey Maria, great to chat again! I joined a se...","[John, Maria, John, Maria, John, Maria, John, ...","8:30 pm on 1 January, 2023",...,"8:06 pm on 9 August, 2023",[Hey John! Long time no talk! Guess what - I g...,"[Maria, John, Maria, John, Maria, John, Maria,...","12:10 am on 11 August, 2023","[Hi Maria, since we last chatted, I'm voluntee...","[John, Maria, John, Maria, John, Maria, John, ...","3:14 pm on 13 August, 2023",[Hey Maria! Guess what? I'm now part of the fi...,"[John, Maria, John, Maria, John, Maria, John, ...","11:08 am on 16 August, 2023"
3,conv-42,[Hey Joanna! Long time no see! What's up? Anyt...,"[Nate, Joanna, Nate, Joanna, Nate, Joanna, Nat...","7:31 pm on 21 January, 2022",[Hey Nate! Haven't talked in a few days. Crazy...,"[Joanna, Nate, Joanna, Nate, Joanna, Nate, Joa...","2:01 pm on 23 January, 2022","[Hey Nate, long time no see! The screenplay I ...","[Joanna, Nate, Joanna, Nate, Joanna, Nate, Joa...","9:27 am on 7 February, 2022",...,"12:06 am on 11 November, 2022",,,,,,,,,
4,conv-43,"[Hey Tim, nice to meet you! What's up? Anythin...","[John, Tim, John, Tim, John, Tim, John, Tim, J...","7:48 pm on 21 May, 2023",[Last night I joined a fantasy literature foru...,"[Tim, John, Tim, John, Tim, John, Tim, John, T...","5:08 pm on 15 June, 2023",[Hey Tim! Good to see you again. So much has h...,"[John, Tim, John, Tim, John, Tim, John, Tim, J...","4:21 pm on 16 July, 2023",...,"1:41 pm on 12 January, 2024",,,,,,,,,


In [19]:
# Mean turn length (len() of each turn) per conversation for the first LoCoMo session.
locomo_df["turn_len_session_0"] = locomo_df["session_0"].apply(get_turn_length)
# Average of the per-conversation means over all conversations.
locomo_df["turn_len_session_0"].mean()

np.float64(94.39995662528557)

In [20]:
# Spot-check: turns of the first session of the first conversation.
locomo_df["session_0"][0]

['Hey Mel! Good to see you! How have you been?',
 "Hey Caroline! Good to see you! I'm swamped with the kids & work. What's up with you? Anything new?",
 'I went to a LGBTQ support group yesterday and it was so powerful.',
 "Wow, that's cool, Caroline! What happened that was so awesome? Did you hear any inspiring stories?",
 'The transgender stories were so inspiring! I was so happy and thankful for all the support.',
 "Wow, love that painting! So cool you found such a helpful group. What's it done for you?",
 'The support group has made me feel accepted and given me courage to embrace myself.',
 "That's really cool. You've got guts. What now?",
 'Gonna continue my edu and check out career options, which is pretty exciting!',
 "Wow, Caroline! What kinda jobs are you thinkin' of? Anything that stands out?",
 "I'm keen on counseling or working in mental health - I'd love to support those with similar issues.",
 "You'd be a great counselor! Your empathy and understanding will really help t

In [21]:
# Spot-check: turns of the second session of the first conversation.
locomo_df["session_1"][0]

["Hey Caroline, since we last chatted, I've had a lot of things happening to me. I ran a charity race for mental health last Saturday – it was really rewarding. Really made me think about taking care of our minds.",
 "That charity race sounds great, Mel! Making a difference & raising awareness for mental health is super rewarding - I'm really proud of you for taking part!",
 "Thanks, Caroline! The event was really thought-provoking. I'm starting to realize that self-care is really important. It's a journey for me, but when I look after myself, I'm able to better look after my family.",
 "I totally agree, Melanie. Taking care of ourselves is so important - even if it's not always easy. Great that you're prioritizing self-care.",
 "Yeah, it's tough. So I'm carving out some me-time each day - running, reading, or playing my violin - which refreshes me and helps me stay present for my fam!",
 "That's great, Mel! Taking time for yourself is so important. You're doing an awesome job looking 

# Conversation Chronicles

In [27]:
# Load the ConversationChronicles dataset through the `datasets` library.
# NOTE(review): presumably fetched from the Hugging Face Hub and cached on first use - confirm.
cc = datasets.load_dataset("jihyoung/ConversationChronicles")

In [28]:
# Convert the 'train' split to a pandas DataFrame for analysis.
train_df = cc['train'].to_pandas()

In [29]:
# Preview the first rows of the ConversationChronicles training frame.
train_df.head()

Unnamed: 0,dataID,relationship,time_interval,summary,first_session_dialogue,first_session_speakers,second_session_dialogue,second_session_speakers,third_session_dialogue,third_session_speakers,fourth_session_dialogue,fourth_session_speakers,fifth_session_dialogue,fifth_session_speakers
0,episode-96974,Classmates,"[Start, A few weeks after, A couple of years a...",[Two classmates express appreciation for each ...,"[Hey, I just wanted to let you know that I rea...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey there, it's been a while since we talked....","[Classmates A, Classmates B, Classmates A, Cla...",[I am just so angry about the state of the wor...,"[Classmates A, Classmates B, Classmates A, Cla...","[, you know why I like spending time with you?...","[Classmates A, Classmates B, Classmates A, Cla...",[ I just feel so uncomfortable in my own skin ...,"[Classmates A, Classmates B, Classmates A, Cla..."
1,episode-105961,Classmates,"[Start, A few hours after, A few hours after, ...",[Two classmates express their feelings for eac...,"[Hey, do you have Sarah's number by any chance...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, do you have a phone number I can borrow?...","[Classmates A, Classmates B, Classmates A, Cla...",[I really enjoy spending time with your childr...,"[Classmates A, Classmates B, Classmates A, Cla...","[I felt really bad when your vase broke, B. I ...","[Classmates A, Classmates B, Classmates A, Cla...","[, I have to tell you something. I'm feeling s...","[Classmates B, Classmates A, Classmates B, Cla..."
2,episode-50365,Neighbors,"[Start, A few weeks after, A couple of years a...",[Neighbors A and B catch up on Friday night an...,"[Hey, Neighbors B! How's it going?, Not bad, h...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[You know, it's been tough lately. But I wake ...","[Neighbors A, Neighbors B, Neighbors A, Neighb...",[*looks away from Neighbors B and goes back to...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...",[I was thinking about adding a table with a la...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...","[I am having such a great time on vacation., T...","[Neighbors A, Neighbors B, Neighbors A, Neighb..."
3,episode-192703,Co-workers,"[Start, A few days after, A few weeks after, A...",[Co-worker A has a bad headache and sleeps poo...,"[Are you okay? You don't look so good. , I hav...","[Co-workers B, Co-workers A, Co-workers B, Co-...",[I can't believe I mispronounced my boss's nam...,"[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, have you traveled to any interesting pla...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, check out that guy over there with the b...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Ugh, my boss just asked me to do a report on ...","[Co-workers A, Co-workers B, Co-workers A, Co-..."
4,episode-153179,Classmates,"[Start, A couple of years after, A few weeks a...",[Two classmates discuss their final exams and ...,[Do you remember how hot it was on the day of ...,"[Classmates A, Classmates B, Classmates A, Cla...",[I can't believe how fulfilling it is to be a ...,"[Classmates A, Classmates B, Classmates A, Cla...","[, you won't believe what happened to me last ...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, did you hear about the robbery that's go...","[Classmates A, Classmates B, Classmates A, Cla...","[Thanks for the compliment, B. I just love thi...","[Classmates A, Classmates B, Classmates A, Cla..."


In [30]:
# Mean turn length (len() of each turn) per episode for the first session.
train_df["turn_len_session_0"] = train_df["first_session_dialogue"].apply(get_turn_length)
# Average of the per-episode means over the training split.
train_df["turn_len_session_0"].mean()

np.float64(92.13408442329889)