# Associate each line of text with its conversation

In [None]:
#@title Imports

# libraries for the files in google drive
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as df
import pandas as pd

In [None]:
#@title Mount google drive

# NOTE(review): presumably establishes the Colab user's Google credentials
# so the application-default credentials below resolve — confirm.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# This rebinds `drive`: the name imported from google.colab in the imports
# cell now refers to this GoogleDrive client instead.
# NOTE(review): no drive.mount(...) call is visible in this cell, yet the
# paths used later live under /content/drive — presumably the drive is
# mounted some other way; confirm.
drive = GoogleDrive(gauth)

In [None]:
#@title Text file paths

# Conversation index file; passed to load_conversations().
movie_conversations = "/content/drive/MyDrive/AAI520/movie_conversations.txt"
# Dialogue lines file; read through load_lines().
movie_lines = "/content/drive/MyDrive/AAI520/movie_lines.txt"

The following code defines a function called load_lines that loads individual lines of dialogue from a file specified by file_path. If line_ids is provided, only the specified lines will be returned; otherwise, all lines will be returned.

The function reads in the file using open(), and then iterates over each line using a for loop. Each line is stripped of leading and trailing whitespace and split on the field separator " +++$+++ "; the first field is the line ID and the last field is the actual line of dialogue. If line_ids is None or the current line ID is in line_ids, the line is added to a dictionary called lines with the ID as the key and the actual line of dialogue as the value.

If line_ids is not None and the number of lines in lines is equal to the length of line_ids, the function breaks out of the loop. Finally, the function either returns a list of the values in lines (if line_ids is None) or a list of the values in lines corresponding to the IDs in line_ids.

The function then returns the first 100 lines of dialogue using Python's slice notation [:100].

Note: we intentionally train on only 100 samples here.

In [None]:
#@title Load individual lines
# Reference: https://www.kaggle.com/code/suraj520/keras-dialogue-act-classif-with-description

def load_lines(file_path, line_ids=None, limit=100):
    """
    Load individual lines of dialogue from the given file path.

    Every record in the file is split on the ' +++$+++ ' separator; the
    first field is used as the line ID and the last field as the text.

    Args:
        file_path: Path of the lines file (decoded as ISO-8859-1).
        line_ids: Optional sequence of line IDs. When given, only those
            lines are returned, in the order of ``line_ids`` (repeated IDs
            yield repeated texts). When None, all lines are returned in
            file order.
        limit: Maximum number of entries to return. Defaults to 100, the
            sample size used throughout this notebook; pass None to
            disable truncation.

    Returns:
        A list of dialogue strings, truncated to ``limit`` entries.

    Raises:
        KeyError: If an ID in ``line_ids`` does not occur in the file.
    """
    # A set gives O(1) membership tests per file line and collapses
    # duplicates, so the early exit below still triggers when the caller
    # repeats an ID.
    wanted = None if line_ids is None else set(line_ids)

    lines = {}
    with open(file_path, 'r', encoding='iso-8859-1') as f:
        for raw_line in f:
            fields = raw_line.strip().split(' +++$+++ ')
            line_id = fields[0]
            if wanted is None or line_id in wanted:
                lines[line_id] = fields[-1]
                # Stop scanning once every requested ID has been found.
                if wanted is not None and len(lines) == len(wanted):
                    break

    if line_ids is None:
        result = list(lines.values())
    else:
        result = [lines[line_id] for line_id in line_ids]
    return result if limit is None else result[:limit]

In [None]:
# load_lines(movie_lines, line_ids=None)

The following code snippet defines a function called load_conversations that takes in a file path as input. The function reads the data from the file located at the given file path. The file contains a list of conversations, where each conversation is represented by a series of lines of dialogue.

The function creates an empty list called conversations to store the loaded conversations. It then loops through each line of the file and processes it. For each line, it extracts the list of line IDs that belong to a single conversation. It then loads the text for each of those lines using the load_lines function, and appends those lines to the conversation list. Once all lines of a conversation have been processed, the entire conversation is appended to the conversations list.

Note that the function stops after loading the first 100 conversations in the file, using the condition if len(conversations) >= 100: break, to limit the number of conversations to load.

The function returns the list of loaded conversations.

In [None]:
#@title Associate each line of text with its conversation
# Reference: https://www.kaggle.com/code/suraj520/keras-dialogue-act-classif-with-description

def load_conversations(file_path, lines_path=None, max_conversations=100):
    """
    Load conversation data from the given file path.

    Each record of the conversations file is split on ' +++$+++ '; its last
    field is a bracketed, quoted, comma-separated list of line IDs. The
    text of every referenced line is looked up through ``load_lines``.

    Args:
        file_path: Path of the conversations file (decoded as ISO-8859-1).
        lines_path: Path of the dialogue lines file. Defaults to the
            module-level ``movie_lines`` path when None.
        max_conversations: Maximum number of conversations to load.
            Defaults to 100; pass None to load every conversation.

    Returns:
        A list of conversations, each a list of dialogue strings in the
        order given by the conversations file.

    Raises:
        KeyError: If a referenced line ID is missing from the lines file
            (propagated from ``load_lines``).
    """
    if lines_path is None:
        lines_path = movie_lines

    # Pass 1: collect the line IDs of each conversation.
    conversation_ids = []
    with open(file_path, 'r', encoding='iso-8859-1') as f:
        for record in f:
            if (max_conversations is not None
                    and len(conversation_ids) >= max_conversations):
                break
            fields = record.strip().split(' +++$+++ ')
            line_ids = fields[-1][1:-1].replace("'", "").split(", ")
            conversation_ids.append(line_ids)

    # Pass 2: resolve the texts in batches instead of re-reading the whole
    # lines file once per line ID. load_lines returns at most 100 entries
    # per call by default, so batches are capped at 100 unique IDs to keep
    # each returned list aligned with the IDs that were requested.
    batch_size = 100
    unique_ids = list(dict.fromkeys(
        line_id for ids in conversation_ids for line_id in ids
    ))
    text_by_id = {}
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        text_by_id.update(zip(batch, load_lines(lines_path, batch)))

    return [[text_by_id[line_id] for line_id in ids]
            for ids in conversation_ids]

In [None]:
# Load the conversations (each a list of dialogue strings) from the conversations file.
convo = load_conversations(movie_conversations)

In [None]:
# One row per conversation, one column per turn; conversations shorter than
# the longest one are padded with missing values.
# Note: this rebinds `df`, which until now was the alias created by
# `import pandas as df` in the imports cell.
df = pd.DataFrame(convo)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,,,,,
1,You're asking me out. That's so cute. What's ...,Forget it.,,,,,,,
2,"No, no, it's my fault -- we didn't have a prop...",Cameron.,"The thing is, Cameron -- I'm at the mercy of a...",Seems like she could get a date easy enough...,,,,,
3,Why?,Unsolved mystery. She used to be really popul...,That's a shame.,,,,,,
4,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.,,,,,,,
...,...,...,...,...,...,...,...,...,...
95,What's the worst?,You get the girl.,,,,,,,
96,"The vintage look is over, Kat. Haven't you bee...","Yeah, and I noticed the only part of you featu...",They're running the rest of me next month.,,,,,,
97,Hey -- do you mind?,Not at all,,,,,,,
98,Where ya goin?,Away.,Your sister here?,,,,,,


In [None]:
#@title Normalize Text

def clean_dataset(dataset):
    """
    Return a normalized copy of a dataset: every non-missing cell is
    converted to a string, stripped of non-ASCII characters, and lowercased.

    Missing cells (None/NaN) are left as missing instead of being turned
    into the literal strings 'none'/'nan', so null counts are preserved.

    Args:
        dataset: pandas DataFrame to normalize. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``dataset``.
    """
    def clean_cell(value):
        # encode(..., 'ignore') silently drops every character that has no
        # ASCII representation; what remains decodes back losslessly.
        return str(value).encode('ascii', 'ignore').decode('ascii').lower()

    def clean_column(column):
        # na_action='ignore' skips missing values rather than passing them
        # to clean_cell, where str() would stringify them.
        return column.map(clean_cell, na_action='ignore')

    # DataFrame.apply builds a new frame, so no explicit copy is needed.
    return dataset.apply(clean_column)

In [None]:
# Displays the cleaned frame only: the result is not assigned, so `df`
# itself is left unchanged.
clean_dataset(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,can we make this quick? roxanne korrine and a...,"well, i thought we'd start with pronunciation,...",not the hacking and gagging and spitting part....,okay... then how 'bout we try out some french ...,none,none,none,none,none
1,you're asking me out. that's so cute. what's ...,forget it.,none,none,none,none,none,none,none
2,"no, no, it's my fault -- we didn't have a prop...",cameron.,"the thing is, cameron -- i'm at the mercy of a...",seems like she could get a date easy enough...,none,none,none,none,none
3,why?,unsolved mystery. she used to be really popul...,that's a shame.,none,none,none,none,none,none
4,"gosh, if only we could find kat a boyfriend...",let me see what i can do.,none,none,none,none,none,none,none
...,...,...,...,...,...,...,...,...,...
95,what's the worst?,you get the girl.,none,none,none,none,none,none,none
96,"the vintage look is over, kat. haven't you bee...","yeah, and i noticed the only part of you featu...",they're running the rest of me next month.,none,none,none,none,none,none
97,hey -- do you mind?,not at all,none,none,none,none,none,none,none
98,where ya goin?,away.,your sister here?,none,none,none,none,none,none


In [None]:
# Per-column non-null counts and dtypes of the raw (uncleaned) frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       100 non-null    object
 1   1       100 non-null    object
 2   2       47 non-null     object
 3   3       25 non-null     object
 4   4       14 non-null     object
 5   5       7 non-null      object
 6   6       4 non-null      object
 7   7       3 non-null      object
 8   8       1 non-null      object
dtypes: object(9)
memory usage: 7.2+ KB


In [None]:
# Number of missing cells per column, i.e. conversations with no turn at that position.
df.isnull().sum()

0     0
1     0
2    53
3    75
4    86
5    93
6    96
7    97
8    99
dtype: int64