#### Final Model: Attempt 4

In [1]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
import pandas as pd
import torch
import re
import os
from convokit import Corpus, download
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Local save path for corpus & models
save_path = '/Users/halladaykinsey/Desktop/Conversational_Chatbot/conversational_data'
if not os.path.exists(save_path):
    os.makedirs(save_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Downloading & loading the corpus
corpus = Corpus(filename=download("movie-corpus"))

# Extracting conversations into a list
conversations = list(corpus.iter_conversations())

# Saving conversations to a file for later use
pd.DataFrame(conversations).to_csv(os.path.join(save_path, 'conversations.csv'), index=False)

# Printing a sample conversation for verification
for utt_id in conversations[0].get_utterance_ids():
    print(corpus.get_utterance(utt_id).text)

Downloading movie-corpus to /Users/halladaykinsey/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
They do not!
They do to!


In [3]:
# Function to clean text by removing brackets, punctuation, & spaces
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip() 
    return text.lower() 

# Cleaning text for all utterances
total_convos = len(conversations)
for i, convo in enumerate(conversations):
    for utt_id in convo.get_utterance_ids():
        utt = corpus.get_utterance(utt_id)
        utt.text = clean_text(utt.text)
    if i % 100 == 0:  # Printing progress every 100 conversations
        print(f"Cleaned {i}/{total_convos} conversations")

Cleaned 0/83097 conversations
Cleaned 100/83097 conversations
Cleaned 200/83097 conversations
Cleaned 300/83097 conversations
Cleaned 400/83097 conversations
Cleaned 500/83097 conversations
Cleaned 600/83097 conversations
Cleaned 700/83097 conversations
Cleaned 800/83097 conversations
Cleaned 900/83097 conversations
Cleaned 1000/83097 conversations
Cleaned 1100/83097 conversations
Cleaned 1200/83097 conversations
Cleaned 1300/83097 conversations
Cleaned 1400/83097 conversations
Cleaned 1500/83097 conversations
Cleaned 1600/83097 conversations
Cleaned 1700/83097 conversations
Cleaned 1800/83097 conversations
Cleaned 1900/83097 conversations
Cleaned 2000/83097 conversations
Cleaned 2100/83097 conversations
Cleaned 2200/83097 conversations
Cleaned 2300/83097 conversations
Cleaned 2400/83097 conversations
Cleaned 2500/83097 conversations
Cleaned 2600/83097 conversations
Cleaned 2700/83097 conversations
Cleaned 2800/83097 conversations
Cleaned 2900/83097 conversations
Cleaned 3000/83097 con

In [4]:
# Creating conversation data
conversation_data = []

# Looping through processed conversations & collecting text
for convo in conversations:
    for utt_id in convo.get_utterance_ids():
        utt = corpus.get_utterance(utt_id)
        processed_text = utt.text
        conversation_data.append({'text': processed_text})

# Converting conversation data to df
df = pd.DataFrame(conversation_data)

# Displaying first few rows of df
print(df.head())

          text
0  they do not
1   they do to
2    i hope so
3     she okay
4      lets go


In [5]:
# Creating dictionary to store conversation data
conversation_dfs = {}
conversation_dfs['all_conversations'] = df

# Taking a random sample of 10,000 rows 
subset_size = 10000 
df_subset = df.sample(n=subset_size, random_state=42)

print(df_subset.shape)
print(df_subset.head())

# Splitting the dataset into training and evaluation sets
train_df, eval_df = train_test_split(df_subset, test_size=0.1, random_state=42)

(10000, 1)
                                                     text
63127                                 my mom wasnt a goat
134803  i must protect my interests ms kyle and intere...
143713  you said bad things hurt places so maybe good ...
216087                             she hates all freshmen
165209                              what are you gonna do


In [6]:
# Initializing tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize input and response texts
def tokenize_text(text, max_length=128):
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=max_length)

# Tokenizing input and response text for training set
train_df['input_ids'] = train_df['text'].apply(lambda x: tokenize_text(x))
train_df['response_ids'] = train_df['text'].shift(-1).apply(lambda x: tokenize_text(x) if pd.notna(x) else [tokenizer.eos_token_id])

# Tokenizing input and response text for evaluation set
eval_df['input_ids'] = eval_df['text'].apply(lambda x: tokenize_text(x))
eval_df['response_ids'] = eval_df['text'].shift(-1).apply(lambda x: tokenize_text(x) if pd.notna(x) else [tokenizer.eos_token_id])



In [7]:
class MovieDialoguesDataset(Dataset):
    def __init__(self, dataframe):
        self.input_ids = dataframe['input_ids'].tolist()
        self.labels = dataframe['response_ids'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [8]:
# Filtering out rows where input_ids or response_ids are not of length 128
train_df = train_df[(train_df['input_ids'].apply(len) == 128) & (train_df['response_ids'].apply(len) == 128)]
eval_df = eval_df[(eval_df['input_ids'].apply(len) == 128) & (eval_df['response_ids'].apply(len) == 128)]

In [9]:
# Creating dataset instances
train_dataset = MovieDialoguesDataset(train_df)
eval_dataset = MovieDialoguesDataset(eval_df)

# Checking dataset instances
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Train dataset size: 8981
Eval dataset size: 995


In [10]:
# Checking input & response ID lengths
print("Input IDs lengths:", train_df['input_ids'].apply(len).value_counts())
print("Response IDs lengths:", train_df['response_ids'].apply(len).value_counts())

Input IDs lengths: input_ids
128    8981
Name: count, dtype: int64
Response IDs lengths: response_ids
128    8981
Name: count, dtype: int64


In [11]:
# Training function 
trainer = None

def train_model_for_conversation(dataset):
    global trainer

    # Loading GPT-2
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Defining training arguments for conversation
    training_args = TrainingArguments(
        output_dir=os.path.join(save_path, 'model_results'), 
        num_train_epochs=2, 
        per_device_train_batch_size=2,  
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(save_path, 'logs'),  
        logging_steps=10,
        evaluation_strategy="steps",
        save_strategy="steps",    
        load_best_model_at_end=True, 
    )

    # Creating trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=eval_dataset  
    )

    # Training model
    trainer.train()

    # Saving trained model to local path
    model.save_pretrained(os.path.join(save_path, 'trained_conversational_model'))
    tokenizer.save_pretrained(os.path.join(save_path, 'trained_conversational_model'))  # Save tokenizer

    return trainer


In [12]:
# Training model
train_model_for_conversation(train_dataset)

  0%|          | 10/8982 [00:11<2:26:50,  1.02it/s]

{'loss': 12.7561, 'grad_norm': 225.73989868164062, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                                   
  0%|          | 10/8982 [01:44<2:26:50,  1.02it/s]

{'eval_loss': 13.287164688110352, 'eval_runtime': 93.238, 'eval_samples_per_second': 10.672, 'eval_steps_per_second': 5.341, 'epoch': 0.0}


  0%|          | 20/8982 [01:56<5:13:33,  2.10s/it] 

{'loss': 11.004, 'grad_norm': 226.78887939453125, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                                   
  0%|          | 20/8982 [03:31<5:13:33,  2.10s/it]

{'eval_loss': 10.524092674255371, 'eval_runtime': 94.7719, 'eval_samples_per_second': 10.499, 'eval_steps_per_second': 5.255, 'epoch': 0.0}


  0%|          | 30/8982 [03:41<5:09:46,  2.08s/it] 

{'loss': 6.9438, 'grad_norm': 119.64160919189453, 'learning_rate': 3e-06, 'epoch': 0.01}


                                                   
  0%|          | 30/8982 [05:16<5:09:46,  2.08s/it]

{'eval_loss': 4.3112101554870605, 'eval_runtime': 95.8652, 'eval_samples_per_second': 10.379, 'eval_steps_per_second': 5.195, 'epoch': 0.01}


  0%|          | 40/8982 [05:26<5:07:26,  2.06s/it] 

{'loss': 2.8138, 'grad_norm': 38.805328369140625, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


                                                   
  0%|          | 40/8982 [06:58<5:07:26,  2.06s/it]

{'eval_loss': 1.8046867847442627, 'eval_runtime': 91.9933, 'eval_samples_per_second': 10.816, 'eval_steps_per_second': 5.413, 'epoch': 0.01}


  1%|          | 50/8982 [07:08<5:03:30,  2.04s/it] 

{'loss': 1.598, 'grad_norm': 35.55680847167969, 'learning_rate': 5e-06, 'epoch': 0.01}


                                                   
  1%|          | 50/8982 [08:42<5:03:30,  2.04s/it]

{'eval_loss': 1.4474492073059082, 'eval_runtime': 93.9426, 'eval_samples_per_second': 10.592, 'eval_steps_per_second': 5.301, 'epoch': 0.01}


  1%|          | 60/8982 [08:52<5:09:06,  2.08s/it] 

{'loss': 1.0077, 'grad_norm': 30.959524154663086, 'learning_rate': 6e-06, 'epoch': 0.01}


                                                   
  1%|          | 60/8982 [10:28<5:09:06,  2.08s/it]

{'eval_loss': 1.2322136163711548, 'eval_runtime': 96.5894, 'eval_samples_per_second': 10.301, 'eval_steps_per_second': 5.156, 'epoch': 0.01}


  1%|          | 70/8982 [10:40<5:22:13,  2.17s/it] 

{'loss': 1.5614, 'grad_norm': 13.015396118164062, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}


                                                   
  1%|          | 70/8982 [12:13<5:22:13,  2.17s/it]

{'eval_loss': 1.154893398284912, 'eval_runtime': 92.6999, 'eval_samples_per_second': 10.734, 'eval_steps_per_second': 5.372, 'epoch': 0.02}


  1%|          | 80/8982 [12:23<5:03:45,  2.05s/it] 

{'loss': 0.9707, 'grad_norm': 13.588808059692383, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}


                                                   
  1%|          | 80/8982 [13:56<5:03:45,  2.05s/it]

{'eval_loss': 1.0686204433441162, 'eval_runtime': 93.1474, 'eval_samples_per_second': 10.682, 'eval_steps_per_second': 5.346, 'epoch': 0.02}


  1%|          | 90/8982 [14:05<5:01:57,  2.04s/it] 

{'loss': 1.1687, 'grad_norm': 25.365427017211914, 'learning_rate': 9e-06, 'epoch': 0.02}


                                                   
  1%|          | 90/8982 [15:41<5:01:57,  2.04s/it]

{'eval_loss': 0.9753501415252686, 'eval_runtime': 95.8951, 'eval_samples_per_second': 10.376, 'eval_steps_per_second': 5.193, 'epoch': 0.02}


  1%|          | 100/8982 [15:52<5:36:02,  2.27s/it]

{'loss': 1.1953, 'grad_norm': 4.623264312744141, 'learning_rate': 1e-05, 'epoch': 0.02}


                                                    
  1%|          | 100/8982 [17:27<5:36:02,  2.27s/it]

{'eval_loss': 0.8940721154212952, 'eval_runtime': 94.9598, 'eval_samples_per_second': 10.478, 'eval_steps_per_second': 5.244, 'epoch': 0.02}


  1%|          | 110/8982 [17:36<5:04:41,  2.06s/it] 

{'loss': 1.0254, 'grad_norm': 15.71133804321289, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.02}


                                                    
  1%|          | 110/8982 [19:10<5:04:41,  2.06s/it]

{'eval_loss': 0.8520007133483887, 'eval_runtime': 93.5919, 'eval_samples_per_second': 10.631, 'eval_steps_per_second': 5.321, 'epoch': 0.02}


  1%|▏         | 120/8982 [19:20<5:10:06,  2.10s/it] 

{'loss': 0.6429, 'grad_norm': 12.167830467224121, 'learning_rate': 1.2e-05, 'epoch': 0.03}


                                                    
  1%|▏         | 120/8982 [20:52<5:10:06,  2.10s/it]

{'eval_loss': 0.7836592793464661, 'eval_runtime': 91.9585, 'eval_samples_per_second': 10.82, 'eval_steps_per_second': 5.415, 'epoch': 0.03}


  1%|▏         | 130/8982 [21:02<4:57:20,  2.02s/it] 

{'loss': 0.5077, 'grad_norm': 7.217987060546875, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}


                                                    
  1%|▏         | 130/8982 [22:39<4:57:20,  2.02s/it]

{'eval_loss': 0.768967866897583, 'eval_runtime': 97.4718, 'eval_samples_per_second': 10.208, 'eval_steps_per_second': 5.109, 'epoch': 0.03}


  2%|▏         | 140/8982 [22:49<5:12:21,  2.12s/it] 

{'loss': 0.8115, 'grad_norm': 3.7317817211151123, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.03}


                                                    
  2%|▏         | 140/8982 [24:21<5:12:21,  2.12s/it]

{'eval_loss': 0.765421986579895, 'eval_runtime': 92.1896, 'eval_samples_per_second': 10.793, 'eval_steps_per_second': 5.402, 'epoch': 0.03}


  2%|▏         | 150/8982 [24:31<4:58:10,  2.03s/it] 

{'loss': 0.9785, 'grad_norm': 32.19070816040039, 'learning_rate': 1.5e-05, 'epoch': 0.03}


                                                    
  2%|▏         | 150/8982 [26:08<4:58:10,  2.03s/it]

{'eval_loss': 0.7573814988136292, 'eval_runtime': 96.8191, 'eval_samples_per_second': 10.277, 'eval_steps_per_second': 5.144, 'epoch': 0.03}


  2%|▏         | 160/8982 [26:17<5:07:01,  2.09s/it] 

{'loss': 0.7549, 'grad_norm': 4.792784690856934, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 160/8982 [27:49<5:07:01,  2.09s/it]

{'eval_loss': 0.7633912563323975, 'eval_runtime': 91.8102, 'eval_samples_per_second': 10.838, 'eval_steps_per_second': 5.424, 'epoch': 0.04}


  2%|▏         | 170/8982 [27:59<4:58:11,  2.03s/it] 

{'loss': 0.6052, 'grad_norm': 11.91805648803711, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 170/8982 [29:30<4:58:11,  2.03s/it]

{'eval_loss': 0.782227098941803, 'eval_runtime': 91.7229, 'eval_samples_per_second': 10.848, 'eval_steps_per_second': 5.429, 'epoch': 0.04}


  2%|▏         | 180/8982 [29:40<4:58:33,  2.04s/it] 

{'loss': 0.8482, 'grad_norm': 15.787534713745117, 'learning_rate': 1.8e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 180/8982 [31:13<4:58:33,  2.04s/it]

{'eval_loss': 0.7484285831451416, 'eval_runtime': 92.8153, 'eval_samples_per_second': 10.72, 'eval_steps_per_second': 5.365, 'epoch': 0.04}


  2%|▏         | 190/8982 [31:22<4:57:43,  2.03s/it] 

{'loss': 0.6797, 'grad_norm': 9.558030128479004, 'learning_rate': 1.9e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 190/8982 [32:58<4:57:43,  2.03s/it]

{'eval_loss': 0.7376377582550049, 'eval_runtime': 96.0173, 'eval_samples_per_second': 10.363, 'eval_steps_per_second': 5.187, 'epoch': 0.04}


  2%|▏         | 200/8982 [33:08<5:02:09,  2.06s/it] 

{'loss': 0.5914, 'grad_norm': 6.9453301429748535, 'learning_rate': 2e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 200/8982 [34:46<5:02:09,  2.06s/it]

{'eval_loss': 0.7730563282966614, 'eval_runtime': 97.9523, 'eval_samples_per_second': 10.158, 'eval_steps_per_second': 5.084, 'epoch': 0.04}


  2%|▏         | 210/8982 [34:55<5:04:02,  2.08s/it] 

{'loss': 0.7969, 'grad_norm': 16.71982192993164, 'learning_rate': 2.1e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 210/8982 [36:30<5:04:02,  2.08s/it]

{'eval_loss': 0.7590102553367615, 'eval_runtime': 95.2236, 'eval_samples_per_second': 10.449, 'eval_steps_per_second': 5.23, 'epoch': 0.05}


  2%|▏         | 220/8982 [36:40<4:58:05,  2.04s/it] 

{'loss': 0.6488, 'grad_norm': 20.064926147460938, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 220/8982 [38:10<4:58:05,  2.04s/it]

{'eval_loss': 0.7452864646911621, 'eval_runtime': 90.3078, 'eval_samples_per_second': 11.018, 'eval_steps_per_second': 5.514, 'epoch': 0.05}


  3%|▎         | 230/8982 [38:20<4:51:14,  2.00s/it] 

{'loss': 1.3537, 'grad_norm': 10.655451774597168, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.05}


                                                    
  3%|▎         | 230/8982 [39:57<4:51:14,  2.00s/it]

{'eval_loss': 0.7939386963844299, 'eval_runtime': 97.8464, 'eval_samples_per_second': 10.169, 'eval_steps_per_second': 5.09, 'epoch': 0.05}


  3%|▎         | 240/8982 [40:07<5:01:12,  2.07s/it] 

{'loss': 0.6137, 'grad_norm': 3.812127113342285, 'learning_rate': 2.4e-05, 'epoch': 0.05}


                                                    
  3%|▎         | 240/8982 [41:43<5:01:12,  2.07s/it]

{'eval_loss': 0.7432315349578857, 'eval_runtime': 96.5353, 'eval_samples_per_second': 10.307, 'eval_steps_per_second': 5.159, 'epoch': 0.05}


  3%|▎         | 250/8982 [41:53<5:07:08,  2.11s/it] 

{'loss': 0.7728, 'grad_norm': 10.535128593444824, 'learning_rate': 2.5e-05, 'epoch': 0.06}


                                                    
  3%|▎         | 250/8982 [43:29<5:07:08,  2.11s/it]

{'eval_loss': 0.7861974835395813, 'eval_runtime': 96.1392, 'eval_samples_per_second': 10.35, 'eval_steps_per_second': 5.18, 'epoch': 0.06}


  3%|▎         | 260/8982 [43:39<5:06:26,  2.11s/it] 

{'loss': 0.8077, 'grad_norm': 9.04298210144043, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.06}


                                                    
  3%|▎         | 260/8982 [49:20<5:06:26,  2.11s/it]

{'eval_loss': 0.7289537191390991, 'eval_runtime': 340.9982, 'eval_samples_per_second': 2.918, 'eval_steps_per_second': 1.46, 'epoch': 0.06}


  3%|▎         | 270/8982 [49:31<12:23:07,  5.12s/it]  

{'loss': 0.7384, 'grad_norm': 10.024827003479004, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.06}


                                                     
  3%|▎         | 270/8982 [51:03<12:23:07,  5.12s/it]

{'eval_loss': 0.7296831607818604, 'eval_runtime': 91.9316, 'eval_samples_per_second': 10.823, 'eval_steps_per_second': 5.417, 'epoch': 0.06}


  3%|▎         | 280/8982 [51:13<5:03:33,  2.09s/it] 

{'loss': 0.6311, 'grad_norm': 43.645442962646484, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.06}


                                                    
  3%|▎         | 280/8982 [52:48<5:03:33,  2.09s/it]

{'eval_loss': 0.7777179479598999, 'eval_runtime': 94.9007, 'eval_samples_per_second': 10.485, 'eval_steps_per_second': 5.248, 'epoch': 0.06}


  3%|▎         | 290/8982 [52:57<5:01:47,  2.08s/it] 

{'loss': 0.4039, 'grad_norm': 5.0722784996032715, 'learning_rate': 2.9e-05, 'epoch': 0.06}


                                                    
  3%|▎         | 290/8982 [54:36<5:01:47,  2.08s/it]

{'eval_loss': 0.7596169114112854, 'eval_runtime': 99.1318, 'eval_samples_per_second': 10.037, 'eval_steps_per_second': 5.024, 'epoch': 0.06}


  3%|▎         | 300/8982 [54:46<5:07:17,  2.12s/it] 

{'loss': 1.1128, 'grad_norm': 11.951732635498047, 'learning_rate': 3e-05, 'epoch': 0.07}


                                                    
  3%|▎         | 300/8982 [56:18<5:07:17,  2.12s/it]

{'eval_loss': 0.7700353860855103, 'eval_runtime': 92.1221, 'eval_samples_per_second': 10.801, 'eval_steps_per_second': 5.406, 'epoch': 0.07}


  3%|▎         | 310/8982 [56:28<4:53:29,  2.03s/it] 

{'loss': 1.4994, 'grad_norm': 31.20751190185547, 'learning_rate': 3.1e-05, 'epoch': 0.07}


                                                    
  3%|▎         | 310/8982 [58:00<4:53:29,  2.03s/it]

{'eval_loss': 0.76714026927948, 'eval_runtime': 91.7883, 'eval_samples_per_second': 10.84, 'eval_steps_per_second': 5.426, 'epoch': 0.07}


  4%|▎         | 320/8982 [58:09<4:52:05,  2.02s/it] 

{'loss': 0.79, 'grad_norm': 3.9634838104248047, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.07}


                                                    
  4%|▎         | 320/8982 [1:01:48<4:52:05,  2.02s/it]

{'eval_loss': 0.7585521936416626, 'eval_runtime': 218.4296, 'eval_samples_per_second': 4.555, 'eval_steps_per_second': 2.28, 'epoch': 0.07}


  4%|▎         | 330/8982 [1:01:58<8:30:22,  3.54s/it]  

{'loss': 0.6143, 'grad_norm': 5.89286994934082, 'learning_rate': 3.3e-05, 'epoch': 0.07}


                                                      
  4%|▎         | 330/8982 [1:03:26<8:30:22,  3.54s/it]

{'eval_loss': 0.7334747910499573, 'eval_runtime': 88.5268, 'eval_samples_per_second': 11.24, 'eval_steps_per_second': 5.625, 'epoch': 0.07}


  4%|▍         | 340/8982 [1:03:36<4:51:10,  2.02s/it] 

{'loss': 0.8197, 'grad_norm': 4.248692989349365, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.08}


                                                      
  4%|▍         | 340/8982 [1:05:02<4:51:10,  2.02s/it]

{'eval_loss': 0.7188882231712341, 'eval_runtime': 86.3882, 'eval_samples_per_second': 11.518, 'eval_steps_per_second': 5.765, 'epoch': 0.08}


  4%|▍         | 350/8982 [1:05:12<4:41:24,  1.96s/it] 

{'loss': 0.7969, 'grad_norm': 10.747626304626465, 'learning_rate': 3.5e-05, 'epoch': 0.08}


                                                      
  4%|▍         | 350/8982 [1:06:44<4:41:24,  1.96s/it]

{'eval_loss': 0.7218369841575623, 'eval_runtime': 91.5087, 'eval_samples_per_second': 10.873, 'eval_steps_per_second': 5.442, 'epoch': 0.08}


  4%|▍         | 360/8982 [1:06:53<4:50:51,  2.02s/it] 

{'loss': 0.7067, 'grad_norm': 13.693099021911621, 'learning_rate': 3.6e-05, 'epoch': 0.08}


                                                      
  4%|▍         | 360/8982 [1:08:25<4:50:51,  2.02s/it]

{'eval_loss': 0.715442419052124, 'eval_runtime': 91.5553, 'eval_samples_per_second': 10.868, 'eval_steps_per_second': 5.439, 'epoch': 0.08}


  4%|▍         | 370/8982 [1:08:34<4:46:52,  2.00s/it] 

{'loss': 0.4952, 'grad_norm': 4.46554708480835, 'learning_rate': 3.7e-05, 'epoch': 0.08}


                                                      
  4%|▍         | 370/8982 [1:10:04<4:46:52,  2.00s/it]

{'eval_loss': 0.8134557008743286, 'eval_runtime': 89.3011, 'eval_samples_per_second': 11.142, 'eval_steps_per_second': 5.577, 'epoch': 0.08}


  4%|▍         | 380/8982 [1:10:13<4:41:26,  1.96s/it] 

{'loss': 0.824, 'grad_norm': 12.475844383239746, 'learning_rate': 3.8e-05, 'epoch': 0.08}


                                                      
  4%|▍         | 380/8982 [1:11:43<4:41:26,  1.96s/it]

{'eval_loss': 0.7219618558883667, 'eval_runtime': 89.7109, 'eval_samples_per_second': 11.091, 'eval_steps_per_second': 5.551, 'epoch': 0.08}


  4%|▍         | 390/8982 [1:11:52<4:50:53,  2.03s/it] 

{'loss': 1.0781, 'grad_norm': 19.894636154174805, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.09}


                                                      
  4%|▍         | 390/8982 [1:13:20<4:50:53,  2.03s/it]

{'eval_loss': 0.741669774055481, 'eval_runtime': 87.8866, 'eval_samples_per_second': 11.321, 'eval_steps_per_second': 5.666, 'epoch': 0.09}


  4%|▍         | 400/8982 [1:13:30<4:44:17,  1.99s/it] 

{'loss': 0.6871, 'grad_norm': 7.530923843383789, 'learning_rate': 4e-05, 'epoch': 0.09}


                                                      
  4%|▍         | 400/8982 [1:14:58<4:44:17,  1.99s/it]

{'eval_loss': 0.7725932598114014, 'eval_runtime': 88.5067, 'eval_samples_per_second': 11.242, 'eval_steps_per_second': 5.627, 'epoch': 0.09}


  5%|▍         | 410/8982 [1:15:08<4:41:19,  1.97s/it] 

{'loss': 0.6749, 'grad_norm': 12.001233100891113, 'learning_rate': 4.1e-05, 'epoch': 0.09}


                                                      
  5%|▍         | 410/8982 [1:16:39<4:41:19,  1.97s/it]

{'eval_loss': 0.7639809250831604, 'eval_runtime': 90.4922, 'eval_samples_per_second': 10.995, 'eval_steps_per_second': 5.503, 'epoch': 0.09}


  5%|▍         | 420/8982 [1:16:48<4:51:54,  2.05s/it] 

{'loss': 0.5663, 'grad_norm': 19.694570541381836, 'learning_rate': 4.2e-05, 'epoch': 0.09}


                                                      
  5%|▍         | 420/8982 [1:18:18<4:51:54,  2.05s/it]

{'eval_loss': 0.7200566530227661, 'eval_runtime': 90.2528, 'eval_samples_per_second': 11.025, 'eval_steps_per_second': 5.518, 'epoch': 0.09}


  5%|▍         | 430/8982 [1:18:28<4:44:03,  1.99s/it] 

{'loss': 0.718, 'grad_norm': 7.53505802154541, 'learning_rate': 4.3e-05, 'epoch': 0.1}


                                                      
  5%|▍         | 430/8982 [1:19:58<4:44:03,  1.99s/it]

{'eval_loss': 0.7132872343063354, 'eval_runtime': 90.2092, 'eval_samples_per_second': 11.03, 'eval_steps_per_second': 5.52, 'epoch': 0.1}


  5%|▍         | 440/8982 [1:20:08<4:49:54,  2.04s/it] 

{'loss': 1.0859, 'grad_norm': 3.3442037105560303, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.1}


                                                      
  5%|▍         | 440/8982 [1:21:40<4:49:54,  2.04s/it]

{'eval_loss': 0.7192938327789307, 'eval_runtime': 91.5253, 'eval_samples_per_second': 10.871, 'eval_steps_per_second': 5.441, 'epoch': 0.1}


  5%|▌         | 450/8982 [1:21:51<5:13:50,  2.21s/it] 

{'loss': 0.6907, 'grad_norm': 4.642205715179443, 'learning_rate': 4.5e-05, 'epoch': 0.1}


                                                      
  5%|▌         | 450/8982 [1:26:51<5:13:50,  2.21s/it]

{'eval_loss': 0.7263298630714417, 'eval_runtime': 300.5338, 'eval_samples_per_second': 3.311, 'eval_steps_per_second': 1.657, 'epoch': 0.1}


  5%|▌         | 460/8982 [1:27:02<11:37:36,  4.91s/it] 

{'loss': 0.6052, 'grad_norm': 7.136635780334473, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.1}


                                                       
  5%|▌         | 460/8982 [1:28:27<11:37:36,  4.91s/it]

{'eval_loss': 0.8042763471603394, 'eval_runtime': 84.8913, 'eval_samples_per_second': 11.721, 'eval_steps_per_second': 5.866, 'epoch': 0.1}


  5%|▌         | 470/8982 [1:28:36<4:32:45,  1.92s/it] 

{'loss': 0.9245, 'grad_norm': 3.0325064659118652, 'learning_rate': 4.7e-05, 'epoch': 0.1}


                                                      
  5%|▌         | 470/8982 [1:30:01<4:32:45,  1.92s/it]

{'eval_loss': 0.7295767664909363, 'eval_runtime': 84.6808, 'eval_samples_per_second': 11.75, 'eval_steps_per_second': 5.881, 'epoch': 0.1}


  5%|▌         | 480/8982 [1:30:09<4:23:17,  1.86s/it] 

{'loss': 0.5705, 'grad_norm': 6.795499324798584, 'learning_rate': 4.8e-05, 'epoch': 0.11}


                                                      
  5%|▌         | 480/8982 [1:31:36<4:23:17,  1.86s/it]

{'eval_loss': 0.7218373417854309, 'eval_runtime': 86.0463, 'eval_samples_per_second': 11.564, 'eval_steps_per_second': 5.788, 'epoch': 0.11}


  5%|▌         | 490/8982 [1:31:44<4:23:55,  1.86s/it] 

{'loss': 0.8097, 'grad_norm': 7.625492095947266, 'learning_rate': 4.9e-05, 'epoch': 0.11}


                                                      
  5%|▌         | 490/8982 [1:33:12<4:23:55,  1.86s/it]

{'eval_loss': 0.7263543605804443, 'eval_runtime': 87.4239, 'eval_samples_per_second': 11.381, 'eval_steps_per_second': 5.696, 'epoch': 0.11}


  6%|▌         | 500/8982 [1:33:21<4:25:37,  1.88s/it] 

{'loss': 0.6295, 'grad_norm': 7.837323188781738, 'learning_rate': 5e-05, 'epoch': 0.11}


                                                      
  6%|▌         | 500/8982 [1:34:53<4:25:37,  1.88s/it]

{'eval_loss': 0.7308434247970581, 'eval_runtime': 92.1516, 'eval_samples_per_second': 10.797, 'eval_steps_per_second': 5.404, 'epoch': 0.11}


  6%|▌         | 510/8982 [1:35:04<4:41:02,  1.99s/it] 

{'loss': 0.882, 'grad_norm': 2.5392556190490723, 'learning_rate': 4.994105163876445e-05, 'epoch': 0.11}


                                                      
  6%|▌         | 510/8982 [1:36:34<4:41:02,  1.99s/it]

{'eval_loss': 0.7174102067947388, 'eval_runtime': 90.5318, 'eval_samples_per_second': 10.991, 'eval_steps_per_second': 5.501, 'epoch': 0.11}


  6%|▌         | 520/8982 [1:36:43<4:34:26,  1.95s/it] 

{'loss': 0.6176, 'grad_norm': 2.4369993209838867, 'learning_rate': 4.9882103277528884e-05, 'epoch': 0.12}


                                                      
  6%|▌         | 520/8982 [1:38:14<4:34:26,  1.95s/it]

{'eval_loss': 0.7373520731925964, 'eval_runtime': 90.9649, 'eval_samples_per_second': 10.938, 'eval_steps_per_second': 5.475, 'epoch': 0.12}


  6%|▌         | 530/8982 [1:38:23<4:33:03,  1.94s/it] 

{'loss': 0.4594, 'grad_norm': 2.190375328063965, 'learning_rate': 4.982315491629333e-05, 'epoch': 0.12}


                                                      
  6%|▌         | 530/8982 [1:39:56<4:33:03,  1.94s/it]

{'eval_loss': 0.7759990096092224, 'eval_runtime': 93.7189, 'eval_samples_per_second': 10.617, 'eval_steps_per_second': 5.314, 'epoch': 0.12}


  6%|▌         | 540/8982 [1:40:05<4:36:39,  1.97s/it] 

{'loss': 0.5292, 'grad_norm': 4.201676845550537, 'learning_rate': 4.976420655505777e-05, 'epoch': 0.12}


                                                      
  6%|▌         | 540/8982 [1:43:14<4:36:39,  1.97s/it]

{'eval_loss': 0.7408835291862488, 'eval_runtime': 188.8116, 'eval_samples_per_second': 5.27, 'eval_steps_per_second': 2.638, 'epoch': 0.12}


  6%|▌         | 550/8982 [1:43:23<7:22:29,  3.15s/it]  

{'loss': 0.597, 'grad_norm': 3.916090488433838, 'learning_rate': 4.970525819382221e-05, 'epoch': 0.12}


                                                      
  6%|▌         | 550/8982 [1:44:52<7:22:29,  3.15s/it]

{'eval_loss': 0.7820221781730652, 'eval_runtime': 88.8805, 'eval_samples_per_second': 11.195, 'eval_steps_per_second': 5.603, 'epoch': 0.12}


  6%|▌         | 560/8982 [1:45:02<4:35:55,  1.97s/it] 

{'loss': 0.3935, 'grad_norm': 2.952568769454956, 'learning_rate': 4.9646309832586655e-05, 'epoch': 0.12}


                                                      
  6%|▌         | 560/8982 [1:46:32<4:35:55,  1.97s/it]

{'eval_loss': 0.7576313614845276, 'eval_runtime': 90.1143, 'eval_samples_per_second': 11.042, 'eval_steps_per_second': 5.526, 'epoch': 0.12}


  6%|▋         | 570/8982 [1:46:41<4:31:12,  1.93s/it] 

{'loss': 0.6958, 'grad_norm': 9.688631057739258, 'learning_rate': 4.95873614713511e-05, 'epoch': 0.13}


                                                      
  6%|▋         | 570/8982 [1:48:12<4:31:12,  1.93s/it]

{'eval_loss': 0.7609564065933228, 'eval_runtime': 91.5174, 'eval_samples_per_second': 10.872, 'eval_steps_per_second': 5.442, 'epoch': 0.13}


  6%|▋         | 580/8982 [1:48:21<4:36:28,  1.97s/it] 

{'loss': 0.8719, 'grad_norm': 6.546107769012451, 'learning_rate': 4.952841311011554e-05, 'epoch': 0.13}


                                                      
  6%|▋         | 580/8982 [1:49:48<4:36:28,  1.97s/it]

{'eval_loss': 0.7293756604194641, 'eval_runtime': 86.9699, 'eval_samples_per_second': 11.441, 'eval_steps_per_second': 5.726, 'epoch': 0.13}


  7%|▋         | 590/8982 [1:49:57<4:23:48,  1.89s/it] 

{'loss': 0.86, 'grad_norm': 23.958505630493164, 'learning_rate': 4.946946474887998e-05, 'epoch': 0.13}


                                                      
  7%|▋         | 590/8982 [1:51:28<4:23:48,  1.89s/it]

{'eval_loss': 0.7581551671028137, 'eval_runtime': 90.7497, 'eval_samples_per_second': 10.964, 'eval_steps_per_second': 5.488, 'epoch': 0.13}


  7%|▋         | 600/8982 [1:51:37<4:31:45,  1.95s/it] 

{'loss': 1.0699, 'grad_norm': 7.325691223144531, 'learning_rate': 4.9410516387644426e-05, 'epoch': 0.13}


                                                      
  7%|▋         | 600/8982 [2:02:32<4:31:45,  1.95s/it]

{'eval_loss': 0.7260693907737732, 'eval_runtime': 655.8013, 'eval_samples_per_second': 1.517, 'eval_steps_per_second': 0.759, 'epoch': 0.13}


  7%|▋         | 610/8982 [2:02:42<20:29:36,  8.81s/it]  

{'loss': 0.639, 'grad_norm': 1.2969063520431519, 'learning_rate': 4.935156802640887e-05, 'epoch': 0.14}


                                                       
  7%|▋         | 610/8982 [2:04:09<20:29:36,  8.81s/it]

{'eval_loss': 0.7679963111877441, 'eval_runtime': 87.1415, 'eval_samples_per_second': 11.418, 'eval_steps_per_second': 5.715, 'epoch': 0.14}


  7%|▋         | 620/8982 [2:04:18<4:50:58,  2.09s/it] 

{'loss': 1.0717, 'grad_norm': 2.810419797897339, 'learning_rate': 4.9292619665173315e-05, 'epoch': 0.14}


                                                      
  7%|▋         | 620/8982 [2:05:49<4:50:58,  2.09s/it]

{'eval_loss': 0.7204889059066772, 'eval_runtime': 91.1097, 'eval_samples_per_second': 10.921, 'eval_steps_per_second': 5.466, 'epoch': 0.14}


  7%|▋         | 630/8982 [2:05:58<4:33:08,  1.96s/it] 

{'loss': 0.7598, 'grad_norm': 10.782885551452637, 'learning_rate': 4.923367130393775e-05, 'epoch': 0.14}


                                                      
  7%|▋         | 630/8982 [2:07:29<4:33:08,  1.96s/it]

{'eval_loss': 0.7082875370979309, 'eval_runtime': 90.5231, 'eval_samples_per_second': 10.992, 'eval_steps_per_second': 5.501, 'epoch': 0.14}


  7%|▋         | 640/8982 [2:07:39<5:08:12,  2.22s/it] 

{'loss': 0.807, 'grad_norm': 4.408230304718018, 'learning_rate': 4.91747229427022e-05, 'epoch': 0.14}


                                                      
  7%|▋         | 640/8982 [2:09:06<5:08:12,  2.22s/it]

{'eval_loss': 0.7118021845817566, 'eval_runtime': 86.0712, 'eval_samples_per_second': 11.56, 'eval_steps_per_second': 5.786, 'epoch': 0.14}


  7%|▋         | 650/8982 [2:09:15<4:26:04,  1.92s/it] 

{'loss': 1.0928, 'grad_norm': 3.7372682094573975, 'learning_rate': 4.911577458146664e-05, 'epoch': 0.14}


                                                      
  7%|▋         | 650/8982 [2:10:44<4:26:04,  1.92s/it]

{'eval_loss': 0.7119852304458618, 'eval_runtime': 89.0386, 'eval_samples_per_second': 11.175, 'eval_steps_per_second': 5.593, 'epoch': 0.14}


  7%|▋         | 660/8982 [2:10:53<4:25:36,  1.92s/it] 

{'loss': 0.6392, 'grad_norm': 5.44580078125, 'learning_rate': 4.905682622023108e-05, 'epoch': 0.15}


                                                      
  7%|▋         | 660/8982 [2:12:20<4:25:36,  1.92s/it]

{'eval_loss': 0.7180932760238647, 'eval_runtime': 87.4665, 'eval_samples_per_second': 11.376, 'eval_steps_per_second': 5.694, 'epoch': 0.15}


  7%|▋         | 670/8982 [2:12:29<4:24:30,  1.91s/it] 

{'loss': 0.8921, 'grad_norm': 3.24893856048584, 'learning_rate': 4.8997877858995523e-05, 'epoch': 0.15}


                                                      
  7%|▋         | 670/8982 [3:12:20<4:24:30,  1.91s/it]

{'eval_loss': 0.7249559164047241, 'eval_runtime': 3591.0549, 'eval_samples_per_second': 0.277, 'eval_steps_per_second': 0.139, 'epoch': 0.15}


  8%|▊         | 680/8982 [3:59:19<516:02:30, 223.77s/it]  

{'loss': 1.026, 'grad_norm': 4.157061576843262, 'learning_rate': 4.893892949775997e-05, 'epoch': 0.15}


                                                         
  8%|▊         | 680/8982 [10:50:54<516:02:30, 223.77s/it]

{'eval_loss': 0.7660076022148132, 'eval_runtime': 24694.5579, 'eval_samples_per_second': 0.04, 'eval_steps_per_second': 0.02, 'epoch': 0.15}


  8%|▊         | 690/8982 [10:51:07<705:53:20, 306.46s/it]   

{'loss': 0.5782, 'grad_norm': 2.7078917026519775, 'learning_rate': 4.8879981136524405e-05, 'epoch': 0.15}


                                                          
  8%|▊         | 690/8982 [10:52:56<705:53:20, 306.46s/it]

{'eval_loss': 0.7605716586112976, 'eval_runtime': 108.9711, 'eval_samples_per_second': 9.131, 'eval_steps_per_second': 4.57, 'epoch': 0.15}


  8%|▊         | 700/8982 [10:53:08<25:19:10, 11.01s/it]  

{'loss': 1.4542, 'grad_norm': 4.265566825866699, 'learning_rate': 4.882103277528885e-05, 'epoch': 0.16}


                                                        
  8%|▊         | 700/8982 [10:54:59<25:19:10, 11.01s/it]

{'eval_loss': 0.7116566896438599, 'eval_runtime': 111.3937, 'eval_samples_per_second': 8.932, 'eval_steps_per_second': 4.471, 'epoch': 0.16}


  8%|▊         | 710/8982 [10:55:11<6:13:01,  2.71s/it] 

{'loss': 0.6493, 'grad_norm': 1.2409753799438477, 'learning_rate': 4.8762084414053294e-05, 'epoch': 0.16}


                                                       
  8%|▊         | 710/8982 [10:57:04<6:13:01,  2.71s/it]

{'eval_loss': 0.7099127173423767, 'eval_runtime': 112.8346, 'eval_samples_per_second': 8.818, 'eval_steps_per_second': 4.414, 'epoch': 0.16}


  8%|▊         | 719/8982 [10:57:14<7:03:32,  3.08s/it] 

In [13]:
# Evaluating the model
if trainer is not None:  
    eval_results = trainer.evaluate()

    # Printing metrics 
    print("Evaluation Metrics:")
    print("=====================")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")
else:
    print("Trainer is not defined.")

100%|██████████| 498/498 [01:34<00:00,  5.25it/s]

Evaluation Metrics:
eval_loss: 0.7101
eval_runtime: 95.4551
eval_samples_per_second: 10.4240
eval_steps_per_second: 5.2170
epoch: 3.0000





In [20]:
# Loading model from saved path
model_path = '/Users/halladaykinsey/Desktop/Conversational_Chatbot/trained_conversational_model'
model = GPT2LMHeadModel.from_pretrained(model_path)

model.eval()

def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = (input_ids != tokenizer.pad_token_id).long()  

    with torch.no_grad():
        output = model.generate(input_ids, 
                                attention_mask=attention_mask, 
                                max_length=150, 
                                num_return_sequences=1,
                                temperature=0.9,  
                                top_k=50,        
                                top_p=0.95)

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

conversation_history = ""

def chat():
    global conversation_history
    print("Welcome to the chatbot! Type 'exit' to end the conversation.")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        
        # Appending user input to conversation history
        conversation_history += f"You: {user_input}\n"
        
        # Generating response based on the updated conversation history
        response = generate_response(conversation_history)
        
        print(f"Bot: {response}")
        
        # Appending bot's response to conversation history
        conversation_history += f"Bot: {response}\n"

In [22]:
# Starting chat
chat()

Welcome to the chatbot! Type 'exit' to end the conversation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Bot: You: Hello
Bot: You: Hello

You: How are you?
Bot: You: Hello
Bot: You: Hello

You: How are you?

You: If I lived in Los Angeles, what movie would you recommend watching on a cold and stormy Saturday night?



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


ValueError: Input length of input_ids is 157, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.