In [21]:
# Import necessary libraries
import os  # Operating System module
import chardet  # Character encoding detection library
import re  # Regular expression library
import ast  # Abstract Syntax Trees module for parsing Python code
import pickle  # Module for serializing and deserializing Python objects
import itertools  # Module for efficient looping and iteration
import pandas as pd  # Pandas library for data manipulation and analysis
import numpy as np  # NumPy library for numerical operations
import matplotlib.pyplot as plt  # Matplotlib library for data visualization
from transformers import GPT2Tokenizer  # GPT-2 Tokenizer from Hugging Face's Transformers library
from pandarallel import pandarallel  # Parallel processing with pandas

# Set display option for Pandas to show entire column width
pd.set_option("display.max_colwidth", None)

# Initialize parallel processing with pandas
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [22]:
# Define the filename for the dataset
DATASET_FILENAME = "movie_conversations.pkl"

# Define the maximum token length for the dataset
MAX_TOKEN_LENGTH = 1024

# Flag to track whether the dataset has been generated or not
dataset_generated = False

# Check if the dataset file already exists
if os.path.exists(DATASET_FILENAME):
    # If the file exists, set the dataset_generated flag to True
    dataset_generated = True

In [23]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Get the current working directory
    cwd = os.getcwd()

    # Detect the encoding of the movie lines file
    with open(cwd + "/movie_lines/movie_lines.txt", "rb") as f:
        result = chardet.detect(f.read())
        movie_lines_encoding = result["encoding"]

    # Detect the encoding of the movie conversations file
    with open(cwd + "/movie_lines/movie_conversations.txt", "rb") as f:
        result = chardet.detect(f.read())
        movie_conversations_encoding = result["encoding"]

    # Print the detected encodings
    print(movie_lines_encoding)
    print(movie_conversations_encoding)

    # Note: The lines below are commented out. If needed, you can manually set encodings.
    # movie_lines_encoding = "Windows-1252"
    # movie_conversations_encoding = "ascii"

In [24]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Collect individual movie lines
    with open(cwd + "/movie_lines/movie_lines.txt", "r", encoding=movie_lines_encoding) as f:
        content = f.read()

    # Split the content into individual lines
    lines = content.split("\n")

    # Print the first 5 lines to verify content
    print(lines[:5])

    # Remove the last element of lines because it's an empty string
    lines = lines[:-1]

    # Print the length of the lines to verify correctness
    print(len(lines))

In [25]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Initialize containers for values to put in the dataframe
    line_numbers_dict = {}

    # Iterate through each line in the lines list
    for line in lines:
        # Split the line on whitespace
        split = line.split(" ")
        # Extract line number and character ID
        line_number = split[0]
        character_id = split[2]

        # Extract the text after the last "+" character using regular expression
        # This is a way to split the line on the last "+" in case there are other "+" characters in the text
        l = re.split(r'\+\s+(?=[^+]*$)', line)[-1]

        # Store the extracted values in the dictionary
        line_numbers_dict[line_number] = (character_id, l)

    # Print the first 10 entries of the dictionary (for verification purposes)
    print(dict(itertools.islice(line_numbers_dict.items(), 10)))

In [26]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Collect movie conversation lists
    with open(cwd + "/movie_lines/movie_conversations.txt", "r", encoding=movie_conversations_encoding) as f:
        content = f.read()

    # Split the content into individual lines
    lines = content.split("\n")

    # Print the first 5 lines to verify content
    print(lines[:5])

    # Remove the last element of lines because it's an empty string
    lines = lines[:-1]

    # Print the length of the lines to verify correctness
    print(len(lines))

In [27]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Initialize containers for values to put in the dataframe
    speaker1_ids = []
    speaker2_ids = []
    conversation_lines = []

    # Iterate through each line in the lines list
    for line in lines:
        # Split the line on whitespace
        split = line.split(" ")
        
        # Extract speaker IDs
        speaker1_ids.append(split[0])
        speaker2_ids.append(split[2])

        # Extract the text after the last "+" character using regular expression
        # This is a way to split the line on the last "+" in case there are other "+" characters in the text
        l = re.split(r'\+\s+(?=[^+]*$)', line)[-1]

        # Evaluate the literal expression to convert it into a Python object
        l = ast.literal_eval(l)
        
        # Append the conversation lines to the list
        conversation_lines.append(l)

    # Create a dataframe from the extracted values
    movie_conversations = pd.DataFrame(list(zip(speaker1_ids, speaker2_ids, conversation_lines)),
                                       columns=["speaker1_id", "speaker2_id", "conversation_lines"])

    # Display the first few rows of the dataframe
    display(movie_conversations.head())

    # Display information about the dataframe (e.g., data types, memory usage)
    movie_conversations.info()

In [28]:
# Function for turning movie lines into multi-turn conversations for training
def create_conversation_turns(row):
    # Extract speaker1 ID and conversation lines from the row
    speaker1 = row["speaker1_id"]
    conversation_list = row["conversation_lines"]
    
    # Initialize an empty string to store the conversation
    convo = ""
    
    # For each line in the conversation, add it to the conversation string with a role label
    for line_id in conversation_list:
        # Retrieve the movie line details from the preprocessed dictionary
        movie_line = line_numbers_dict[line_id]
        text = movie_line[1]

        # Determine the role label based on the speaker ID
        if movie_line[0] == speaker1:
            convo += f"<USER>: {text} \n"
        else:
            convo += f"<AGENT>: {text} \n"
    
    return convo

In [29]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Try the create_conversation_turns function on a sample for testing
    sample = movie_conversations.copy().iloc[:2]
    sample["conversation"] = sample.apply(create_conversation_turns, axis=1)
    display(sample)

    # Apply create_conversation_turns on every row of the dataframe
    movie_conversations["conversation"] = movie_conversations.apply(create_conversation_turns, axis=1)

    # Drop speaker ID columns because they aren't needed anymore
    movie_conversations.drop(columns=["speaker1_id", "speaker2_id"], inplace=True)

In [30]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Display the first few rows of the 'movie_conversations' DataFrame
    display(movie_conversations.head())

    # Print the conversation text of the first row for verification
    print(movie_conversations["conversation"].head(1).values[0])

In [31]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Initialize GPT-2 tokenizer from Hugging Face's Transformers library
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

    # Calculate the token count for each conversation to identify ones that are too long
    movie_conversations["token_count"] = movie_conversations["conversation"].apply(lambda x: len(tokenizer.encode(x)))

    # Display the first few rows of the 'movie_conversations' DataFrame with the new 'token_count' column
    movie_conversations.head()

In [32]:
# Check if the dataset has not been generated
if not dataset_generated:    
    # Get the conversation entries that are over the token limit
    over_token_limit = movie_conversations[movie_conversations["token_count"] > MAX_TOKEN_LENGTH].index
    print(over_token_limit)

In [33]:
# Check if the dataset has not been generated
if not dataset_generated:
    # NOTE: Be careful not to run this more than once without resetting the dataframe

    # Initialize a list to store new entries after splitting conversations
    new_entries = []

    # For each conversation that is too long, split it in half
    for idx in over_token_limit:
        lines_to_split = movie_conversations.iloc[idx]["conversation_lines"]
        split_idx = (len(lines_to_split)//2)
        first_half = lines_to_split[:split_idx]
        second_half = lines_to_split[split_idx:]

        first_convo = ""
        second_convo = ""

        # For each line in the first half, add it to the conversation
        for i, line_id in enumerate(first_half):
            movie_line = line_numbers_dict[line_id]
            text = movie_line[1]

            # Alternate between USER and AGENT roles
            if i % 2 == 0:
                first_convo += f"<USER>: {text} \n"
            else:
                first_convo += f"<AGENT>: {text} \n"

        # Append the new entry for the first half to the list
        new_entries.append({"conversation_lines": first_half, "conversation": first_convo})

        # For each line in the second half, add it to the conversation
        for i, line_id in enumerate(second_half):
            movie_line = line_numbers_dict[line_id]
            text = movie_line[1]

            # Alternate between USER and AGENT roles
            if i % 2 == 0:
                second_convo += f"<USER>: {text} \n"
            else:
                second_convo += f"<AGENT>: {text} \n"

        # Append the new entry for the second half to the list
        new_entries.append({"conversation_lines": second_half, "conversation": second_convo})

    # Add the new entries from splitting and drop the originals
    movie_conversations = pd.concat([movie_conversations, pd.DataFrame(new_entries)], axis=0, ignore_index=True)
    movie_conversations.drop(index=over_token_limit, inplace=True)

    # Reset the index of the dataframe
    movie_conversations.reset_index(inplace=True, drop=True)

    # Display information about the updated 'movie_conversations' DataFrame
    movie_conversations.info()

In [34]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Check that no conversations are over the token limit
    display(movie_conversations[movie_conversations["token_count"] > MAX_TOKEN_LENGTH])

In [35]:
# Check if the dataset has not been generated
if not dataset_generated:
    # Save the 'movie_conversations' DataFrame to a file using pickle
    with open(DATASET_FILENAME, "wb") as f:
        print("Writing dataframe to file")
        pickle.dump(movie_conversations, f)
else:
    # Load the 'movie_conversations' DataFrame from the saved file
    with open(DATASET_FILENAME, "rb") as f:
        print("Loading dataframe from file")
        movie_conversations = pickle.load(f)

# Display the first few rows of the 'movie_conversations' DataFrame
display(movie_conversations.head())

# Print the conversation text of the first row for verification
print(movie_conversations["conversation"].head(1).values[0])

Loading dataframe from file


Unnamed: 0,speaker1_id,speaker2_id,conversation_lines,conversation
0,u0,u2,"[L194, L195, L196, L197]","<USER>: Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. \n<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. \n<USER>: Not the hacking and gagging and spitting part. Please. \n<AGENT>: Okay... then how 'bout we try out some French cuisine. Saturday? Night? \n"
1,u0,u2,"[L198, L199]",<USER>: You're asking me out. That's so cute. What's your name again? \n<AGENT>: Forget it. \n
2,u0,u2,"[L200, L201, L202, L203]","<USER>: No, no, it's my fault -- we didn't have a proper introduction --- \n<AGENT>: Cameron. \n<USER>: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does. \n<AGENT>: Seems like she could get a date easy enough... \n"
3,u0,u2,"[L204, L205, L206]","<AGENT>: Why? \n<USER>: Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something. \n<AGENT>: That's a shame. \n"
4,u0,u2,"[L207, L208]","<USER>: Gosh, if only we could find Kat a boyfriend... \n<AGENT>: Let me see what I can do. \n"


<USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the hacking and gagging and spitting part.  Please. 
<AGENT>: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 



## ------------------------- Fine tune + Train -------------------------

In [36]:
! pip install torch transformers



In [37]:
# Convert pkl to txt
# import pickle
# import pandas as pd

# # Load data from pickle file
# with open('movie_conversations.pkl', 'rb') as f:
#     data = pickle.load(f)

# # Assuming your DataFrame has a 'text_column' containing text data
# if isinstance(data, pd.DataFrame) and 'conversation' in data.columns:
#     text_content = '\n'.join(data['conversation'].astype(str))
# else:
#     raise ValueError("Unexpected data format in the pickle file")

# # Save the text content to a text file
# with open('movie_convo.txt', 'w', encoding='utf-8') as f:
#     f.write(text_content)

In [41]:
! pip install accelerate -U



In [38]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [39]:
def fine_tune_gpt2(model_name, train_file, output_dir):
    # Load GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Load training dataset
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=128)

    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [40]:
fine_tune_gpt2(
    "gpt2-medium",
    "/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/movie_convo.txt",
    "/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt"
)

  4%|▍         | 500/12164 [12:15<5:31:31,  1.71s/it] 

{'loss': 2.6, 'learning_rate': 4.7944755014797766e-05, 'epoch': 0.04}


  8%|▊         | 1000/12164 [21:37<3:16:28,  1.06s/it]

{'loss': 2.5745, 'learning_rate': 4.588951002959553e-05, 'epoch': 0.08}


 12%|█▏        | 1500/12164 [30:46<3:13:56,  1.09s/it]

{'loss': 2.5416, 'learning_rate': 4.3834265044393294e-05, 'epoch': 0.12}


 16%|█▋        | 2000/12164 [39:52<3:05:44,  1.10s/it]

{'loss': 2.5363, 'learning_rate': 4.177902005919106e-05, 'epoch': 0.16}


 21%|██        | 2500/12164 [49:19<2:54:11,  1.08s/it]

{'loss': 2.5262, 'learning_rate': 3.972377507398882e-05, 'epoch': 0.21}


 25%|██▍       | 3000/12164 [1:32:22<4:49:47,  1.90s/it]   

{'loss': 2.5462, 'learning_rate': 3.7668530088786585e-05, 'epoch': 0.25}


 29%|██▉       | 3500/12164 [2:19:21<12:38:02,  5.25s/it]  

{'loss': 2.5143, 'learning_rate': 3.561328510358435e-05, 'epoch': 0.29}


 33%|███▎      | 4000/12164 [2:56:49<8:42:51,  3.84s/it] 

{'loss': 2.5121, 'learning_rate': 3.355804011838211e-05, 'epoch': 0.33}


 37%|███▋      | 4500/12164 [3:15:44<2:19:06,  1.09s/it] 

{'loss': 2.5256, 'learning_rate': 3.150279513317988e-05, 'epoch': 0.37}


 41%|████      | 5000/12164 [3:35:05<2:10:40,  1.09s/it]  

{'loss': 2.5009, 'learning_rate': 2.9447550147977644e-05, 'epoch': 0.41}


 45%|████▌     | 5500/12164 [4:09:54<1:59:47,  1.08s/it]  

{'loss': 2.4945, 'learning_rate': 2.7392305162775405e-05, 'epoch': 0.45}


 49%|████▉     | 6000/12164 [4:27:24<1:56:38,  1.14s/it]  

{'loss': 2.4885, 'learning_rate': 2.533706017757317e-05, 'epoch': 0.49}


 53%|█████▎    | 6500/12164 [4:38:19<1:57:37,  1.25s/it] 

{'loss': 2.5084, 'learning_rate': 2.3281815192370932e-05, 'epoch': 0.53}


 58%|█████▊    | 7000/12164 [4:47:53<1:34:55,  1.10s/it]

{'loss': 2.4806, 'learning_rate': 2.1226570207168696e-05, 'epoch': 0.58}


 62%|██████▏   | 7500/12164 [5:22:08<1:23:37,  1.08s/it]  

{'loss': 2.4713, 'learning_rate': 1.917132522196646e-05, 'epoch': 0.62}


 66%|██████▌   | 8000/12164 [6:14:58<1:42:04,  1.47s/it]  

{'loss': 2.464, 'learning_rate': 1.7116080236764224e-05, 'epoch': 0.66}


 70%|██████▉   | 8500/12164 [6:24:14<1:08:38,  1.12s/it]

{'loss': 2.4803, 'learning_rate': 1.5060835251561986e-05, 'epoch': 0.7}


 74%|███████▍  | 9000/12164 [6:34:42<59:22,  1.13s/it]   

{'loss': 2.4743, 'learning_rate': 1.3005590266359752e-05, 'epoch': 0.74}


 78%|███████▊  | 9500/12164 [6:46:37<54:54,  1.24s/it]   

{'loss': 2.4649, 'learning_rate': 1.0950345281157515e-05, 'epoch': 0.78}


 82%|████████▏ | 10000/12164 [7:12:56<7:47:29, 12.96s/it]  

{'loss': 2.4507, 'learning_rate': 8.895100295955277e-06, 'epoch': 0.82}


 86%|████████▋ | 10500/12164 [7:26:08<30:41,  1.11s/it]   

{'loss': 2.4498, 'learning_rate': 6.839855310753042e-06, 'epoch': 0.86}


 90%|█████████ | 11000/12164 [7:35:29<22:10,  1.14s/it]

{'loss': 2.4637, 'learning_rate': 4.784610325550805e-06, 'epoch': 0.9}


 95%|█████████▍| 11500/12164 [7:44:50<12:07,  1.10s/it]

{'loss': 2.4649, 'learning_rate': 2.7293653403485694e-06, 'epoch': 0.95}


 99%|█████████▊| 12000/12164 [7:54:04<03:03,  1.12s/it]

{'loss': 2.4701, 'learning_rate': 6.741203551463335e-07, 'epoch': 0.99}


100%|██████████| 12164/12164 [7:57:07<00:00,  2.35s/it]


{'train_runtime': 28627.4092, 'train_samples_per_second': 1.7, 'train_steps_per_second': 0.425, 'train_loss': 2.499623904217234, 'epoch': 1.0}


Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


In [44]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.save_pretrained("/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt")
tokenizer.save_pretrained("/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt")

Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


('/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt/tokenizer_config.json',
 '/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt/special_tokens_map.json',
 '/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt/vocab.json',
 '/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt/merges.txt',
 '/Users/maimuna/Desktop/AI Masters @ USD/AAI-520-02 - FALL23-Natural Language Processing /generative-chat-bot/NEWmovie.txt/added_tokens.json')

In [49]:
import torch

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_response(prompt, model, tokenizer, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
prompt = "Hello, how was your day?"
response = generate_response(prompt, model, tokenizer)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hello, how was your day?

I was fine. I was just tired.

What was your favorite part of the day?

I was just enjoying the day.

What was your worst part of the day?

