In [2]:
import numpy as np
import pandas as pd
import json
import srt
import re

In [2]:
# Load the Taskmaster TM-1-2019 self-dialogs file into memory.
# The encoding is spelled out (JSON files are exchanged as UTF-8) so the result
# does not depend on the platform's default text encoding.
with open('/mnt/e/Storage/masters-thesis-data/Taskmaster-master/Taskmaster-master/TM-1-2019/self-dialogs.json', encoding='utf-8') as fp:
    dat = json.load(fp)

In [3]:
# number of top-level entries in the loaded JSON; each entry is treated as one conversation below
len(dat)

7708

In [13]:
# Each conversation keeps its turns under 'utterances'. Every turn records the
# speaker and the text; the 'index' field is not needed for our purposes, and
# some turns additionally carry annotated 'segments'.
dat[0]['utterances']

[{'index': 0,
  'speaker': 'USER',
  'text': "Hi, I'm looking to book a table for Korean fod."},
 {'index': 1,
  'speaker': 'ASSISTANT',
  'text': 'Ok, what area are you thinking about?'},
 {'index': 2,
  'speaker': 'USER',
  'text': 'Somewhere in Southern NYC, maybe the East Village?',
  'segments': [{'start_index': 13,
    'end_index': 49,
    'text': 'Southern NYC, maybe the East Village',
    'annotations': [{'name': 'restaurant_reservation.location.restaurant.accept'}]},
   {'start_index': 13,
    'end_index': 25,
    'text': 'Southern NYC',
    'annotations': [{'name': 'restaurant_reservation.location.restaurant.accept'}]}]},
 {'index': 3,
  'speaker': 'ASSISTANT',
  'text': "Ok, great.  There's Thursday Kitchen, it has great reviews.",
  'segments': [{'start_index': 20,
    'end_index': 35,
    'text': 'Thursday Kitche',
    'annotations': [{'name': 'restaurant_reservation.name.restaurant.reject'}]}]},
 {'index': 4,
  'speaker': 'USER',
  'text': "That's great. So I need a table

From this I can assume we can take the information and store it with our own labels, as long as we sync up.

In [29]:
# Worked example on one conversation: split its turns into two parallel lists
# (who spoke, what was said) and place them side by side in a DataFrame.
first_dialog = dat[0]['utterances']
order = [turn['speaker'] for turn in first_dialog]
speech = [turn['text'] for turn in first_dialog]

print(order)
print(speech)
print(len(order), len(speech))

df = pd.DataFrame({"p": order, "u": speech})
print(df)

['USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT', 'USER', 'ASSISTANT']
["Hi, I'm looking to book a table for Korean fod.", 'Ok, what area are you thinking about?', 'Somewhere in Southern NYC, maybe the East Village?', "Ok, great.  There's Thursday Kitchen, it has great reviews.", "That's great. So I need a table for tonight at 7 pm for 8 people. We don't want to sit at the bar, but anywhere else is fine.", "They don't have any availability for 7 pm.", 'What times are available?', '5 or 8.', "Yikes, we can't do those times.", 'Ok, do you have a second choice?', 'Let me check.', 'Ok.', 'Lets try Boka, are they free for 8 people at 7?', 'Yes.', "Great, let's book that.", 'Ok great, are there any other requests?', "No, that's it, just book.", 'Great, should I use your account you have open with them?', 'Yes please.', 'Great. You will get a confirmation

In [33]:
# Same extraction over the whole dataset: walk every turn of every conversation
# in order and collect the speakers into p and the texts into u.
all_turns = [turn for dialog in dat for turn in dialog['utterances']]
p = [turn['speaker'] for turn in all_turns]
u = [turn['text'] for turn in all_turns]
df = pd.DataFrame({"p": p, "u": u})
print(df.head(5))

           p                                                  u
0       USER    Hi, I'm looking to book a table for Korean fod.
1  ASSISTANT              Ok, what area are you thinking about?
2       USER  Somewhere in Southern NYC, maybe the East Vill...
3  ASSISTANT  Ok, great.  There's Thursday Kitchen, it has g...
4       USER  That's great. So I need a table for tonight at...


In [36]:
# write the current df to tm1_data.csv, leaving out the row index
df.to_csv('tm1_data.csv',index=False)

Now that we have exported to CSV, we can spend time producing a labels column.

# SRT file processing

We decided to take some talkshow/interview show subtitles in SRT file format, since we believe those subtitles will most likely be more like the format we imagine a LE to be in.

In [3]:
# srt.parse yields subtitles lazily (a generator), so we turn it into a list
# while the file is still open; the with-block then closes the handle for us.
# The encoding is explicit because the subtitles contain non-ASCII characters.
with open("./srt-files/blackpink/Blackpink_Light_up_the_Sky.srt", "r", encoding="utf-8") as fp:
    subs_gen = srt.parse(fp)
    subs = list(subs_gen)

In [4]:
# every parsed subtitle exposes its words through the .content attribute
text = [sub.content for sub in subs]

In [5]:
# drop markup tags (e.g. the font tag) from every line, updating the list in place
tag_re = re.compile('<[^>]*>')
text[:] = [tag_re.sub('', line) for line in text]

# subtitles spanning several lines contain \n; fold them onto one line
# (joining a line without \n gives the same string back, so no guard is needed)
text[:] = [" ".join(line.split("\n")) for line in text]

In [6]:
#we can now have nice clean text to label
# show only the first lines as a preview; displaying the whole list floods the
# notebook output without adding information
text[:30]

["Today is the debut of YG's newest girl group in seven years.",
 'Introducing BLACKPINK!',
 '♪ BLACKPINK… ♪',
 'Please give it up for BLACKPINK!',
 "Hello, we're BLACKPINK!",
 'Our next guests are the best-charting',
 'female Korean group of all time.',
 '…wildly popular K-pop group…',
 'K-pop sensations, BLACKPINK!',
 '♪ Hit you with that ddu-du ddu-du du ♪',
 'BLACKPINK has become',
 'the highest charting K-pop girl group ever.',
 'They also made history as they became',
 'the first K-pop girl group to perform at Coachella.',
 'Performing it live for the first time ever on US television,',
 'here is BLACKPINK!',
 '- BLACKPINK! - BLACKPINK!',
 'Congratulations, BLACKPINK.',
 'BLACKPINK!',
 '♪ Hit you with that ddu-du ddu-du du ♪',
 "- I'm so hungry. - Are you?",
 '- Did I eat something today? - No.',
 "There's a dessert. It's like a croissant\xa0with injeolmi.",
 'Ah!',
 "You guys don't like injeolmi?",
 'It tastes really good.',
 'Says the Thai girl in the car.',
 "Every time a new 

In [7]:
# export the cleaned lines as a single-column CSV ("U") for manual labeling in Excel
# note: this rebinds df, which earlier in the notebook held the Taskmaster table
df = pd.DataFrame({"U":text})
df.to_csv('blackpink_data.csv',index=False)

# Important to note here that our file has multiple speakers that we cannot identify from the file itself (different from google taskmaster data) but it should not necessarily be a problem.

However, in this case, there might be a lot of bloat with audience reactions... We'll have to see how we want to deal with those later

# SRT file processing for manual labeling post meeting 9 discussions (check meeting notes in document 3 for further info)

In [1]:
#SRT-file processing for manual labeling
import numpy as np
import pandas as pd
import json
import srt
import re

#pull file
# The with-block closes the file once parsing is finished. srt.parse is consumed
# as a generator, so it has to be turned into a list before the handle is closed.
with open("./srt-files/Spider-Man.Into.the.Spider-Verse.2018.Alt-Universe.Cut.1080p.BluRay.REMUX.AVC.DTS-HD.MA.5.srt", "r") as fp:
    subs_gen = srt.parse(fp)
    subs = list(subs_gen)

# For each subtitle: take its .content, strip markup tags such as the font tag,
# then fold multi-line subtitles (embedded \n) onto a single line.
tag_re = re.compile('<[^>]*>')
text = [
    " ".join(tag_re.sub('', sub.content).split("\n"))
    for sub in subs
]

In [2]:
# preview the first cleaned lines instead of dumping the whole list into the output
text[:30]

["- Hi, everybody. I'm Phil Lord. - And I'm Chris Miller.",
 'This is "Alt Universe Mode."',
 'When we were making the movie, we tried a lot of different scenes,',
 "jokes, ideas, moments that didn't make it into the finished film.",
 'But in another universe...',
 'Who knows? We do. And now you do.',
 "There's gonna be some stuff that's not finished.",
 'And I promise you, from the very first frame,',
 'this will be a different experience.',
 '- Enjoy "Alt Universe Mode." - It\'s glorious.',
 'Yum, yum, yum, yum, yum.',
 "Don't worry, people. It's 100% beef.",
 "I'm lying.",
 "Nothing's gonna get in the way of me eating this hot dog.",
 'What am I, pulled pork?',
 'What... What?',
 "I've been pig-napped.",
 'So glad you could make it to my pig roast, Spider-Ham.',
 'Prepare to be honey "glasered."',
 'Do you have any last words?',
 'Why, yes, I do.',
 "- You won't get away with this! - Oh, yes, I will.",
 "- Oh, no, you won't. - Oh, yes, I will.",
 "- Oh, no, you won't. - Oh, yes, I w

In [39]:
# export the cleaned lines as a single-column CSV ("U") for manual labeling in Excel
# NOTE(review): the output name refers to graham.norton.s23e02, but the SRT file
# opened earlier in this notebook is the Spider-Verse one, and this cell's
# execution count is out of sequence with the cells above. Confirm which
# subtitles `text` holds before running, otherwise the CSV may be overwritten
# with the wrong content.
df = pd.DataFrame({"U":text})
df.to_csv('./data/graham.norton.s23e02.csv',index=False)