## Build your own Sheldon Chatbot


In [39]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the pretrained DialoGPT tokenizer and weights from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# Pad on the left so that, if inputs are ever padded, generated tokens directly
# follow the real prompt tokens (the setting the generation warning asks for).
tokenizer.padding_side = "left"
# DialoGPT is a causal language model; AutoModelForCausalLM replaces the
# deprecated AutoModelWithLMHead loader.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

In [25]:
# Interactive demo: chat with the pretrained model for five turns, feeding the
# accumulated conversation back in as context on every turn.
chat_history_ids = None  # reset so re-running this cell starts a fresh conversation

for step in range(5):
    # encode user input and add eos_token
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # First turn: the prompt is just the user's input. Later turns: append the
    # new input to the running history. (This assignment used to sit under
    # `if step > 0`, which left bot_input_ids undefined on the first turn.)
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # generate a response; the result holds the prompt followed by the reply,
    # so it doubles as the conversation history for the next turn
    chat_history_ids = model.generate(
        bot_input_ids, max_length=1000,
        pad_token_id=tokenizer.eos_token_id
    )

    # decode only the tokens that come after the prompt, i.e. the new reply
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm rick


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm rick


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am morty


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am morty


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am morty


## Data Preprocessing

Data source: [The Big Bang Theory series transcript (Kaggle)](https://www.kaggle.com/datasets/mitramir5/the-big-bang-theory-series-transcript)

In [37]:
# Read the transcript CSV into a DataFrame and preview its first ten rows.
bbt = pd.read_csv(filepath_or_buffer='data/1_10_seasons_tbbt.csv')
bbt.head(n=10)

Unnamed: 0,episode_name,dialogue,person_scene
0,Series 01 Episode 01 – Pilot Episode,A corridor at a sperm bank.,Scene
1,Series 01 Episode 01 – Pilot Episode,So if a photon is directed through a plane wi...,Sheldon
2,Series 01 Episode 01 – Pilot Episode,"Agreed, what’s your point?",Leonard
3,Series 01 Episode 01 – Pilot Episode,"There’s no point, I just think it’s a good id...",Sheldon
4,Series 01 Episode 01 – Pilot Episode,Excuse me?,Leonard
5,Series 01 Episode 01 – Pilot Episode,Hang on.,Receptionist
6,Series 01 Episode 01 – Pilot Episode,"One across is Aegean, eight down is Nabakov, ...",Leonard
7,Series 01 Episode 01 – Pilot Episode,Can I help you?,Receptionist
8,Series 01 Episode 01 – Pilot Episode,"Yes. Um, is this the High IQ sperm bank?",Leonard
9,Series 01 Episode 01 – Pilot Episode,"If you have to ask, maybe you shouldn’t be here.",Receptionist


In [38]:
# Build one training example per Sheldon line: the line itself followed by the
# n rows that precede it in the transcript, ordered newest-first.
n = 7  # number of preceding rows kept as context

dialogue = bbt['dialogue']
speaker = bbt['person_scene']

# Start at row n so every example has a full window of n earlier rows.
# Rows are taken exactly as they appear in the file: the window is not
# filtered by speaker or episode.
contexted = [
    [dialogue[k] for k in range(idx, idx - n - 1, -1)]
    for idx in range(n, len(dialogue))
    if speaker[idx] == 'Sheldon'
]

# n + 1 column names to match the n + 1 values collected for each example.
columns = ['reponse', 'context'] + [f'context/{k}' for k in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)
df.head(3)

Unnamed: 0,reponse,context,context/0,context/1,context/2,context/3,context/4,context/5
0,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?",Can I help you?,"One across is Aegean, eight down is Nabakov, ...",Hang on.,Excuse me?,"There’s no point, I just think it’s a good id..."
1,"Leonard, I don’t think I can do this.","Oh, take your time. I’ll just finish my cross...",Thank-you. We’ll be right back.,Fill these out.,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?",Can I help you?
2,No. We are committing genetic fraud. There’s ...,"What, are you kidding? You’re a semi-pro.","Leonard, I don’t think I can do this.","Oh, take your time. I’ll just finish my cross...",Thank-you. We’ll be right back.,Fill these out.,I think this is the place.,"If you have to ask, maybe you shouldn’t be here."


In [34]:
# Number of (response, context) examples that were built.
df.shape[0]

11482

In [41]:
# Hold out 10% of the examples for validation. A fixed random_state makes the
# shuffle reproducible, so re-running the notebook yields the same split.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [43]:
# Write both splits to disk, leaving the DataFrame index out of the files.
for frame, path in (
    (train_df, "data/sheldon_train.csv"),
    (val_df, "data/sheldon_val.csv"),
):
    frame.to_csv(path, index=False)