In [1]:
# Import Dependencies
import ast
from itertools import tee

import pandas as pd

In [2]:
# Load movie_lines.txt; the column names follow the field order given in README.txt.
# The separator is a regular expression (hence engine='python'). It is written as a raw
# string so that "\+" and "\$" reach the regex engine as-is instead of being parsed as
# (invalid) string escape sequences, which newer Python versions warn about.
movie_lines_fields = ["Line_ID", "Character_ID", "Movie_ID", "Character_Name", "Movie_Line"]
movie_lines = pd.read_csv("movie_lines.txt", sep=r"\+\+\+\$\+\+\+",
                          engine='python', names=movie_lines_fields)

In [3]:
# Preview the freshly loaded frame (the rendered output shows only the first and last rows).
movie_lines

Unnamed: 0,Line_ID,Character_ID,Movie_ID,Character_Name,Movie_Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
...,...,...,...,...,...
304708,L666371,u9030,m616,DURNFORD,Lord Chelmsford seems to want me to stay back...
304709,L666370,u9034,m616,VEREKER,I'm to take the Sikali with the main column t...
304710,L666369,u9030,m616,DURNFORD,"Your orders, Mr Vereker?"
304711,L666257,u9030,m616,DURNFORD,"Good ones, yes, Mr Vereker. Gentlemen who can..."


In [4]:
# Normalize whitespace in every field of movie_lines.
# ID fields: remove all whitespace (the regex separator leaves padding around each value).
for id_column in ('Line_ID', 'Character_ID', 'Movie_ID'):
    movie_lines[id_column] = movie_lines[id_column].str.split().str.join('')

# Character_Name: remove all whitespace, then capitalize (first letter upper, rest lower).
# NOTE(review): joining on '' also removes the spaces *between* words of a multi-word name,
# and capitalize() lower-cases everything after the first character -- confirm this is intended.
movie_lines['Character_Name'] = (
    movie_lines['Character_Name'].str.split().str.join('').str.capitalize()
)

# Movie_Line: collapse runs of whitespace to single spaces and trim both ends.
# Missing values are replaced with '' first; converting them with str() would turn a
# missing line into the literal text 'nan', which would then appear in the dialogue pairs.
movie_lines['Movie_Line'] = (
    movie_lines['Movie_Line'].fillna('').astype(str).str.split().str.join(' ')
)

In [5]:
# Spot-check one cleaned row by filtering on the Line_ID column with a boolean mask.
movie_lines.loc[movie_lines['Line_ID'] == 'L194']

Unnamed: 0,Line_ID,Character_ID,Movie_ID,Character_Name,Movie_Line
68,L194,u0,m0,Bianca,Can we make this quick? Roxanne Korrine and An...


In [6]:
# Use Line_ID as the index so that a line can be fetched by its ID with .loc.
# The guard makes this cell safe to re-run: once Line_ID is the index it is no longer a
# column, and calling set_index('Line_ID') a second time would raise KeyError.
# verify_integrity=True raises if any Line_ID occurs more than once; a duplicated label
# would make a scalar .loc lookup return several rows instead of a single line of text.
if movie_lines.index.name != 'Line_ID':
    movie_lines = movie_lines.set_index('Line_ID', verify_integrity=True)

In [7]:
# Display the frame again to confirm Line_ID is now the index.
movie_lines

Unnamed: 0_level_0,Character_ID,Movie_ID,Character_Name,Movie_Line
Line_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1045,u0,m0,Bianca,They do not!
L1044,u2,m0,Cameron,They do to!
L985,u0,m0,Bianca,I hope so.
L984,u2,m0,Cameron,She okay?
L925,u0,m0,Bianca,Let's go.
...,...,...,...,...
L666371,u9030,m616,Durnford,Lord Chelmsford seems to want me to stay back ...
L666370,u9034,m616,Vereker,I'm to take the Sikali with the main column to...
L666369,u9030,m616,Durnford,"Your orders, Mr Vereker?"
L666257,u9030,m616,Durnford,"Good ones, yes, Mr Vereker. Gentlemen who can ..."


In [8]:
# Load movie_conversations.txt; the column names follow the field order given in README.txt.
# The separator is a regular expression (hence engine='python'). It is written as a raw
# string so that "\+" and "\$" reach the regex engine as-is instead of being parsed as
# (invalid) string escape sequences, which newer Python versions warn about.
movie_conversations_fields = ["Character_ID_1", "Character_ID_2", "Movie_ID", "Movie_Line_Order"]
movie_conversations = pd.read_csv("movie_conversations.txt", sep=r"\+\+\+\$\+\+\+",
                                  engine='python', names=movie_conversations_fields)

In [9]:
# Preview the freshly loaded conversations; Movie_Line_Order is still plain text at this point.
movie_conversations

Unnamed: 0,Character_ID_1,Character_ID_2,Movie_ID,Movie_Line_Order
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"
...,...,...,...,...
83092,u9028,u9031,m616,"['L666324', 'L666325', 'L666326', 'L666327']"
83093,u9028,u9031,m616,"['L666575', 'L666576']"
83094,u9030,u9034,m616,"['L666256', 'L666257']"
83095,u9030,u9034,m616,"['L666369', 'L666370', 'L666371', 'L666372']"


In [10]:
# Remove all whitespace from the ID fields and turn Movie_Line_Order, which is read from
# the file as the text of a Python list (e.g. "['L194', 'L195']"), into a real list.
for id_column in ('Character_ID_1', 'Character_ID_2', 'Movie_ID'):
    movie_conversations[id_column] = movie_conversations[id_column].str.split().str.join('')

# ast.literal_eval only accepts Python literals, so unlike eval() it cannot execute code
# that happens to be present in the data file.
# Values that are already lists are passed through so the cell can be re-run safely.
movie_conversations['Movie_Line_Order'] = movie_conversations['Movie_Line_Order'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(' '.join(x.split()))
)

In [11]:
# Display the cleaned conversations; Movie_Line_Order now holds parsed lists.
movie_conversations

Unnamed: 0,Character_ID_1,Character_ID_2,Movie_ID,Movie_Line_Order
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"
...,...,...,...,...
83092,u9028,u9031,m616,"[L666324, L666325, L666326, L666327]"
83093,u9028,u9031,m616,"[L666575, L666576]"
83094,u9030,u9034,m616,"[L666256, L666257]"
83095,u9030,u9034,m616,"[L666369, L666370, L666371, L666372]"


In [12]:
# Inspect the first conversation's line IDs to confirm they are a list of strings.
movie_conversations['Movie_Line_Order'][0]

['L194', 'L195', 'L196', 'L197']

In [13]:
# Fetch the text of a single line by its Line_ID label (Line_ID is the index of movie_lines).
movie_lines.loc['L194', 'Movie_Line']

'Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.'

In [14]:
# Defining function to convert Movie_Line_Order into actual text
def grab_lines(movie_line_list, line_table=None):
    """Look up the text of each line ID and return the texts in the same order.

    Args:
        movie_line_list: Iterable of line IDs, e.g. one entry of
            movie_conversations['Movie_Line_Order'].
        line_table: DataFrame indexed by line ID with a 'Movie_Line' column.
            Defaults to the notebook-level ``movie_lines`` frame, so existing
            single-argument calls (including ``Series.apply(grab_lines)``) keep working.

    Returns:
        A list with one 'Movie_Line' value per ID in ``movie_line_list``
        (an empty list for empty input).

    Raises:
        KeyError: if an ID is not present in the index of ``line_table``.
    """
    if line_table is None:
        # Resolved at call time so the function always sees the current notebook frame.
        line_table = movie_lines
    # Select the column once instead of on every lookup.
    text_by_id = line_table['Movie_Line']
    return [text_by_id.loc[line_id] for line_id in movie_line_list]

In [15]:
# Build the Conversation column: apply grab_lines to each row's list of line IDs
line_orders = movie_conversations['Movie_Line_Order']
movie_conversations['Conversation'] = line_orders.apply(grab_lines)

In [16]:
# Display the frame with the new Conversation column.
movie_conversations

Unnamed: 0,Character_ID_1,Character_ID_2,Movie_ID,Movie_Line_Order,Conversation
0,u0,u2,m0,"[L194, L195, L196, L197]",[Can we make this quick? Roxanne Korrine and A...
1,u0,u2,m0,"[L198, L199]",[You're asking me out. That's so cute. What's ...
2,u0,u2,m0,"[L200, L201, L202, L203]","[No, no, it's my fault -- we didn't have a pro..."
3,u0,u2,m0,"[L204, L205, L206]","[Why?, Unsolved mystery. She used to be really..."
4,u0,u2,m0,"[L207, L208]","[Gosh, if only we could find Kat a boyfriend....."
...,...,...,...,...,...
83092,u9028,u9031,m616,"[L666324, L666325, L666326, L666327]",[Do you think she might be interested in someo...
83093,u9028,u9031,m616,"[L666575, L666576]",[Choose your targets men. That's right Watch t...
83094,u9030,u9034,m616,"[L666256, L666257]",[Colonel Durnford... William Vereker. I hear y...
83095,u9030,u9034,m616,"[L666369, L666370, L666371, L666372]","[Your orders, Mr Vereker?, I'm to take the Sik..."


In [17]:
# Defining functions to convert conversations into (comment, reply) pairs
def make_pairs(movie_line_list):
    """Pair every element with its successor.

    For input [x0, x1, x2, ...] the result yields (x0, x1), (x1, x2), ...
    Returns a zip iterator; inputs with fewer than two elements yield nothing.
    """
    leading, trailing = tee(movie_line_list, 2)
    # Advance the second copy by one element; the None default avoids StopIteration on empty input.
    next(trailing, None)
    return zip(leading, trailing)

def return_pairs(dialogue_pair_list):
    """Format each (a, b) pair as the text of a two-element list.

    Takes an iterable of 2-tuples (such as the result of make_pairs) and returns a
    list of strings of the form "['a', 'b']", one string per pair.
    """
    return ["['{}', '{}']".format(first, second) for first, second in dialogue_pair_list]

In [18]:
# Build the Dialogue_Pairs column: consecutive (comment, reply) line-ID pairs per conversation,
# produced by make_pairs and formatted by return_pairs
line_orders = movie_conversations['Movie_Line_Order']
movie_conversations['Dialogue_Pairs'] = line_orders.apply(
    lambda line_ids: return_pairs(make_pairs(line_ids))
)

In [19]:
# Display the frame with the new Dialogue_Pairs column.
movie_conversations

Unnamed: 0,Character_ID_1,Character_ID_2,Movie_ID,Movie_Line_Order,Conversation,Dialogue_Pairs
0,u0,u2,m0,"[L194, L195, L196, L197]",[Can we make this quick? Roxanne Korrine and A...,"[['L194', 'L195'], ['L195', 'L196'], ['L196', ..."
1,u0,u2,m0,"[L198, L199]",[You're asking me out. That's so cute. What's ...,"[['L198', 'L199']]"
2,u0,u2,m0,"[L200, L201, L202, L203]","[No, no, it's my fault -- we didn't have a pro...","[['L200', 'L201'], ['L201', 'L202'], ['L202', ..."
3,u0,u2,m0,"[L204, L205, L206]","[Why?, Unsolved mystery. She used to be really...","[['L204', 'L205'], ['L205', 'L206']]"
4,u0,u2,m0,"[L207, L208]","[Gosh, if only we could find Kat a boyfriend.....","[['L207', 'L208']]"
...,...,...,...,...,...,...
83092,u9028,u9031,m616,"[L666324, L666325, L666326, L666327]",[Do you think she might be interested in someo...,"[['L666324', 'L666325'], ['L666325', 'L666326'..."
83093,u9028,u9031,m616,"[L666575, L666576]",[Choose your targets men. That's right Watch t...,"[['L666575', 'L666576']]"
83094,u9030,u9034,m616,"[L666256, L666257]",[Colonel Durnford... William Vereker. I hear y...,"[['L666256', 'L666257']]"
83095,u9030,u9034,m616,"[L666369, L666370, L666371, L666372]","[Your orders, Mr Vereker?, I'm to take the Sik...","[['L666369', 'L666370'], ['L666370', 'L666371'..."


In [20]:
# Look at the Dialogue_Pairs column on its own.
movie_conversations['Dialogue_Pairs']

0        [['L194', 'L195'], ['L195', 'L196'], ['L196', ...
1                                       [['L198', 'L199']]
2        [['L200', 'L201'], ['L201', 'L202'], ['L202', ...
3                     [['L204', 'L205'], ['L205', 'L206']]
4                                       [['L207', 'L208']]
                               ...                        
83092    [['L666324', 'L666325'], ['L666325', 'L666326'...
83093                             [['L666575', 'L666576']]
83094                             [['L666256', 'L666257']]
83095    [['L666369', 'L666370'], ['L666370', 'L666371'...
83096     [['L666520', 'L666521'], ['L666521', 'L666522']]
Name: Dialogue_Pairs, Length: 83097, dtype: object

In [21]:
# Defining function to convert (comment, reply) pairs into (comment, reply) pair texts
def grab_pairs(dialogue_pairs_list):
    """Convert a list of line-ID pairs into a list of text pairs.

    Args:
        dialogue_pairs_list: Iterable of pairs. Each pair is either the string form
            of a two-element list (e.g. "['L194', 'L195']") or an already-parsed
            sequence of line IDs.

    Returns:
        A list with one entry per pair; each entry is whatever grab_lines returns
        for that pair's line IDs.

    Raises:
        ValueError, SyntaxError: if a string pair is not a valid Python literal.
    """
    dialogue_pair_text = []
    for pair in dialogue_pairs_list:
        if isinstance(pair, str):
            # literal_eval parses the list text without executing arbitrary code, unlike eval().
            pair = ast.literal_eval(pair)
        dialogue_pair_text.append(grab_lines(pair))
    return dialogue_pair_text

In [22]:
# Build the Dialogue_Pairs_Text column: apply grab_pairs to each row's list of line-ID pairs
id_pairs = movie_conversations['Dialogue_Pairs']
movie_conversations['Dialogue_Pairs_Text'] = id_pairs.apply(grab_pairs)

In [23]:
# Look at the Dialogue_Pairs_Text column on its own.
movie_conversations['Dialogue_Pairs_Text']

0        [[Can we make this quick? Roxanne Korrine and ...
1        [[You're asking me out. That's so cute. What's...
2        [[No, no, it's my fault -- we didn't have a pr...
3        [[Why?, Unsolved mystery. She used to be reall...
4        [[Gosh, if only we could find Kat a boyfriend....
                               ...                        
83092    [[Do you think she might be interested in some...
83093    [[Choose your targets men. That's right Watch ...
83094    [[Colonel Durnford... William Vereker. I hear ...
83095    [[Your orders, Mr Vereker?, I'm to take the Si...
83096    [[Well I assure you, Sir, I have no desire to ...
Name: Dialogue_Pairs_Text, Length: 83097, dtype: object

In [24]:
# Inspect the text pairs of the first conversation in full.
movie_conversations['Dialogue_Pairs_Text'][0]

[['Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part. Please.'],
 ['Not the hacking and gagging and spitting part. Please.',
  "Okay... then how 'bout we try out some French cuisine. Saturday? Night?"]]

In [25]:
# Flatten the per-conversation lists in Dialogue_Pairs_Text into one list of (comment, reply) pairs
individual_dialogue_pairs = [
    text_pair
    for conversation_pairs in movie_conversations['Dialogue_Pairs_Text']
    for text_pair in conversation_pairs
]

In [26]:
# Build a two-column frame from the flattened pairs: first element -> Comment, second -> Reply
columns = ['Comment', 'Reply']
individual_dialogue_pairs_df = pd.DataFrame(
    data=individual_dialogue_pairs,
    columns=columns,
)

In [27]:
# Display the final Comment/Reply frame.
individual_dialogue_pairs_df

Unnamed: 0,Comment,Reply
0,Can we make this quick? Roxanne Korrine and An...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's y...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.
...,...,...
221611,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...
221612,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...
221613,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...
221614,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob..."
