In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import random
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
import os
import spacy
nlp = spacy.load('en_core_web_sm')
import matplotlib.pyplot as plt

In [18]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Configuration of data extraction

In [19]:
file_path = '/content/drive/MyDrive/NLP Project/movie_lines.txt'
separator = r'\s*\+\+\+\$\+\+\+\s*'
encod = 'iso-8859-1'
raw_data = pd.DataFrame(columns = ['Index', 'U', 'Movie', 'Char', 'Line'])
#raw_data.columns = ['Index', 'U', 'Movie', 'Char', 'Line']
raw_data = pd.read_csv(file_path, sep= separator, encoding= encod, header=None)

  raw_data = pd.read_csv(file_path, sep= separator, encoding= encod, header=None)


In [20]:
raw_data.columns = ['Line_ID', 'Char_ID', 'Movie', 'Char', 'Line'] #Name columns
df = raw_data.astype(str)          # Convert to string
df['Line_ID'] = df['Line_ID'].str.replace('L', '')  #Sort by Line ID
df['Line_ID'] = df['Line_ID'].astype(int)
df = df.sort_values(by='Line_ID')

Here I limit the dataframe to the first 5k rows since it showed to be a very heavy dataset to process.

In [21]:
df = df.iloc[0:50000] #Limiting DF
df['Line'] = df['Line'].apply(lambda x:x.lower()).astype(str)

df

Unnamed: 0,Line_ID,Char_ID,Movie,Char,Line
86,49,u0,m0,BIANCA,did you change your hair?
85,50,u3,m0,CHASTITY,no.
84,51,u0,m0,BIANCA,you might wanna think about it
648,59,u9,m0,PATRICK,i missed you.
647,60,u8,m0,MISS PERKY,it says here you exposed yourself to a group o...
...,...,...,...,...,...
20989,118953,u669,m42,STRASSER,for the time being.
20988,118954,u665,m42,LASZLO,good day.
20612,118967,u667,m42,RICK,i see the bus is in. i'll take my shipment wit...
20611,118968,u663,m42,FERRARI,no hurry. i'll have it sent over. have a drink...


In [22]:
def tokenize_text(text): #Might not need this tokenizing function
  doc = nlp(text)
  return [token.text for token in doc]

def remove_stopwords(doc):
    #doc = nlp(text)
    text_wo_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
    return text_wo_stopwords

def lemmas_tokens(doc):
  #doc = nlp(text)
  lemmas = ' '.join([token.lemma_ for token in doc])
  return lemmas

def remove_punct(text):
    doc = nlp(text)
    text_no_punct = ' '.join([token.text for token in doc if not token.is_punct])
    return text_no_punct

def NER(text):      #defined a function to apply NER to a whole column
  doc = nlp(text)
  entities = [(ent.text, ent.label_) for ent in doc.ents]
  return [entities]

def POS(text):      #defined a function to apply POS to a whole column
  doc = nlp(text)
  pos_tags = [(token.text, token.pos_) for token in doc]
  return [pos_tags]

In [23]:
#df['docs'] = df['Line'].apply(lambda x:nlp(x))
#Tokenizing straight with NLP function / did not work

In [24]:
#df['Line'] = df['Line'].apply(remove_punct)
#df['Line'] = df['Line'].apply(remove_stopwords)
#df['Line'] = df['Line'].apply(lemmas_tokens)

In [25]:
df['Line'] = df['Line'].apply(remove_punct)

In [26]:
df['NER'] = df['Line'].apply(NER)

In [27]:
df['POS'] = df['Line'].apply(POS)

At this time, we have the 'Line' Column with no punct all lower case. We decided to try and train the model with stopwords. If we have low accuracy then we will remove the stopwords. Lemmatizing still on the table for discussion.
NER and POS tagging have been performed, we'll see how we can feed the model with this information.

In [28]:
df

Unnamed: 0,Line_ID,Char_ID,Movie,Char,Line,NER,POS
86,49,u0,m0,BIANCA,did you change your hair,[[]],"[[(did, AUX), (you, PRON), (change, VERB), (yo..."
85,50,u3,m0,CHASTITY,no,[[]],"[[(no, INTJ)]]"
84,51,u0,m0,BIANCA,you might wanna think about it,[[]],"[[(you, PRON), (might, AUX), (wanna, VERB), (t..."
648,59,u9,m0,PATRICK,i missed you,[[]],"[[(i, PRON), (missed, VERB), (you, PRON)]]"
647,60,u8,m0,MISS PERKY,it says here you exposed yourself to a group o...,[[]],"[[(it, PRON), (says, VERB), (here, ADV), (you,..."
...,...,...,...,...,...,...,...
20989,118953,u669,m42,STRASSER,for the time being,[[]],"[[(for, ADP), (the, DET), (time, NOUN), (being..."
20988,118954,u665,m42,LASZLO,good day,[[]],"[[(good, ADJ), (day, NOUN)]]"
20612,118967,u667,m42,RICK,i see the bus is in i 'll take my shipment wit...,[[]],"[[(i, PRON), (see, VERB), (the, DET), (bus, NO..."
20611,118968,u663,m42,FERRARI,no hurry i 'll have it sent over have a drink ...,[[]],"[[(no, DET), (hurry, NOUN), (i, PRON), ('ll, A..."




This function, structures the dialogs to join the consecutive lines of each character. If a character says two or more consecutive lines, these are joined until the next character says something. Structured in a dictionary fashion for handling.

In [29]:
def structure_dialogues(df):
    dialogues = []
    current_character = None
    current_dialogue = ""

    for index, row in df.iterrows():
        character = row['Char']
        dialogue = row['Line']

        if character != current_character:
            # A new character's dialogue begins
            if current_character is not None:
                dialogues.append({"character": current_character, "dialogue": current_dialogue})
            current_character = character
            current_dialogue = dialogue
        else:
            # Continue the dialogue for the same character
            current_dialogue += " " + dialogue

    # Append the last character's dialogue
    if current_character is not None:
        dialogues.append({"character": current_character, "dialogue": current_dialogue})

    return dialogues



In [30]:
structured_data = structure_dialogues(df)
structured_data[0:50]
#csv_file_path = '/content/drive/MyDrive/NLP Project/structured_dialogues.csv'
#df.to_csv(csv_file_path, index=False)

[{'character': 'BIANCA', 'dialogue': 'did you change your hair'},
 {'character': 'CHASTITY', 'dialogue': 'no'},
 {'character': 'BIANCA', 'dialogue': 'you might wanna think about it'},
 {'character': 'PATRICK', 'dialogue': 'i missed you'},
 {'character': 'MISS PERKY',
  'dialogue': 'it says here you exposed yourself to a group of freshmen girls'},
 {'character': 'PATRICK',
  'dialogue': 'it was a bratwurst   i was eating lunch'},
 {'character': 'MISS PERKY', 'dialogue': 'with the teeth of your zipper'},
 {'character': 'MICHAEL', 'dialogue': 'you the new guy'},
 {'character': 'CAMERON', 'dialogue': 'so they tell me'},
 {'character': 'MICHAEL',
  'dialogue': "c'm on   i 'm supposed to give you the tour so which dakota you from"},
 {'character': 'CAMERON', 'dialogue': "north actually   how 'd you   "},
 {'character': 'MICHAEL',
  'dialogue': 'i was kidding people actually live there'},
 {'character': 'CAMERON',
  'dialogue': "yeah   a couple   we 're outnumbered by the cows though"},
 {'ch

Here we'll start our test playground for BERT

In [32]:
!pip install transformers
!pip install tensorflow

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
Insta

In [33]:
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [51]:
def BERT_tokenizer(text):
    # Tokenize the text
    B_Tokens = tokenizer.tokenize(text)
    # Convert tokens to their integer IDs
    token_ids = tokenizer.convert_tokens_to_ids(B_Tokens)
    return (token_ids, B_Tokens)



Applying BERT Tokenization to our DF

In [53]:
df['B_Tokens_IDs'], df['B_Tokens'] = zip(*df['Line'].apply(BERT_tokenizer))


In [54]:
df

Unnamed: 0,Line_ID,Char_ID,Movie,Char,Line,NER,POS,B_Tokens,B_Tokens_IDs
86,49,u0,m0,BIANCA,did you change your hair,[[]],"[[(did, AUX), (you, PRON), (change, VERB), (yo...","[did, you, change, your, hair]","[2106, 2017, 2689, 2115, 2606]"
85,50,u3,m0,CHASTITY,no,[[]],"[[(no, INTJ)]]",[no],[2053]
84,51,u0,m0,BIANCA,you might wanna think about it,[[]],"[[(you, PRON), (might, AUX), (wanna, VERB), (t...","[you, might, wanna, think, about, it]","[2017, 2453, 10587, 2228, 2055, 2009]"
648,59,u9,m0,PATRICK,i missed you,[[]],"[[(i, PRON), (missed, VERB), (you, PRON)]]","[i, missed, you]","[1045, 4771, 2017]"
647,60,u8,m0,MISS PERKY,it says here you exposed yourself to a group o...,[[]],"[[(it, PRON), (says, VERB), (here, ADV), (you,...","[it, says, here, you, exposed, yourself, to, a...","[2009, 2758, 2182, 2017, 6086, 4426, 2000, 103..."
...,...,...,...,...,...,...,...,...,...
20989,118953,u669,m42,STRASSER,for the time being,[[]],"[[(for, ADP), (the, DET), (time, NOUN), (being...","[for, the, time, being]","[2005, 1996, 2051, 2108]"
20988,118954,u665,m42,LASZLO,good day,[[]],"[[(good, ADJ), (day, NOUN)]]","[good, day]","[2204, 2154]"
20612,118967,u667,m42,RICK,i see the bus is in i 'll take my shipment wit...,[[]],"[[(i, PRON), (see, VERB), (the, DET), (bus, NO...","[i, see, the, bus, is, in, i, ', ll, take, my,...","[1045, 2156, 1996, 3902, 2003, 1999, 1045, 100..."
20611,118968,u663,m42,FERRARI,no hurry i 'll have it sent over have a drink ...,[[]],"[[(no, DET), (hurry, NOUN), (i, PRON), ('ll, A...","[no, hurry, i, ', ll, have, it, sent, over, ha...","[2053, 9241, 1045, 1005, 2222, 2031, 2009, 274..."


We shall create a DF with inputs and outputs for the Bert model.
My best guess is that we should train the inputs with the utterance of the first character in the conversation and the output hould be the reply of the next character. This shall be done in the BERT_df ['Outputs'] column.

In [60]:
BERT_df =pd.DataFrame()
BERT_df['Inputs'] = df['B_Tokens']
BERT_df['Outputs'] = df['B_Tokens']