In [1]:
import pandas as pd

In [2]:
# Load the dialogue dataset from a remote CSV. The rendered frame shows the
# columns Season, Episode, Character and Line, plus an "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved — confirm).
df = pd.read_csv("https://0x0.st/H5DC.csv")
df

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."
...,...,...,...,...
70891,9,14,Stan,I think you're pushing it.\n
70892,9,14,Randy,How about twenty?\n
70893,9,14,Stan,That's not disciprine.\n
70894,9,14,Randy,Right right. Does vodka count?\n


In [47]:
from transformers import AutoTokenizer, AutoModel
import torch

# Single source of truth for the checkpoint id so the tokenizer and the model
# are always loaded from the same pretrained weights.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [49]:
import re

# Matches any run of one or more whitespace characters (spaces, tabs, newlines).
REGEX_WHITESPACE = re.compile(r"\s+")


def clean_text(text):
    """Normalise the whitespace in a single string.

    Every run of whitespace is collapsed to one space, then leading and
    trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    single_spaced = REGEX_WHITESPACE.sub(" ", text)
    return single_spaced.strip()

In [50]:
# Store a whitespace-normalised copy of every dialogue line in a new column.
df["cleaned_text"] = df["Line"].map(clean_text)

In [51]:
# Select Stan's cleaned lines with one boolean-mask .loc lookup, then renumber
# from 0 so that positional results (e.g. an argmax) can index the Series directly.
stan = df.loc[df["Character"] == "Stan", "cleaned_text"].reset_index(drop=True)
stan

0                 You guys, you guys! Chef is going away.
1                                                Forever.
2       Chef said he's been bored, so he joining a gro...
3       Dude, how are we gonna go on? Chef was our fuh...
4                                            I'll get it.
                              ...                        
7675                                           All right!
7676           That's probably okay if you spread it out.
7677                           I think you're pushing it.
7678                               That's not disciprine.
7679                                                 Dad!
Name: cleaned_text, Length: 7680, dtype: object

In [48]:
import numpy as np


def get_embedding(text):
    """Embed text with the notebook-level ``tokenizer`` and ``model``.

    The embedding is the mean of the model's last hidden state over the
    sequence dimension, weighted by the attention mask so that padding
    positions do not contribute. For a single string nothing is padded and
    the result equals a plain mean over all tokens; for a list of strings of
    different lengths the mask keeps pad tokens out of the shorter texts'
    averages.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to at most 512 tokens.

    Returns:
        A 2-D NumPy array with one row per input text (a single string
        yields an array with one row).
    """
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension; it is 1 for
    # real tokens and 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts
    return embeddings.numpy()

In [53]:
# Embed every Stan line individually: get_embedding is called once per row,
# so this cell runs one tokenizer + model pass for each line.
# Each element of the resulting object Series is the array returned for that
# row (the rendered output shows a nested, 2-D array per line).
stan_embeddings = stan.apply(get_embedding)
stan_embeddings

0       [[0.049187645, -0.05694672, 0.07485127, 0.2308...
1       [[-0.40455797, 0.094735384, -0.069649436, 0.23...
2       [[-0.0007675134, -0.2751124, -0.22038087, 0.48...
3       [[0.0039247703, -0.062776096, -0.2656448, 0.14...
4       [[-0.638608, 0.11582565, -0.13687724, -0.17727...
                              ...                        
7675    [[-0.14151666, 0.0066320896, -0.32950217, -0.4...
7676    [[0.21162994, -0.13237353, -0.0614202, 0.16815...
7677    [[-0.013884425, -0.34359527, 0.058191538, -0.3...
7678    [[0.25847092, -0.42476812, -0.06599857, -0.138...
7679    [[-0.34460104, 0.13874263, -0.26019868, -0.173...
Name: cleaned_text, Length: 7680, dtype: object

In [61]:
import numpy as np

# Persist the per-line embeddings and the matching text so they can be reused
# without recomputing.
# NOTE(review): both Series display dtype=object, so .to_numpy() yields object
# arrays and np.save pickles them — presumably np.load(..., allow_pickle=True)
# is required to read these files back; confirm.
# NOTE(review): assumes the "data/" directory already exists.
np.save("data/stan_embeddings.npy", stan_embeddings.to_numpy())
np.save("data/stan.npy", stan.to_numpy())

In [54]:
from sklearn.metrics.pairwise import cosine_similarity


def respond(query):
    """Retrieve the stored Stan line most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared by cosine
    similarity against every embedding in the notebook-level
    ``stan_embeddings``; the best-scoring row selects the reply from ``stan``.

    Args:
        query: The text to find a reply for.

    Returns:
        A ``(line, score)`` tuple: the selected entry of ``stan`` and the
        matching row of the similarity matrix (an array, not a bare float —
        callers reduce it, e.g. with ``.max()``).
    """
    query_vector = get_embedding(query)
    # Stack the per-line arrays into one matrix with a row per stored line.
    corpus_matrix = np.vstack(stan_embeddings)
    scores = cosine_similarity(corpus_matrix, query_vector)
    # argmax works on the flattened matrix; with a single query column the
    # flat index is the row index.
    best_row = scores.argmax()
    best_line = stan[best_row]
    best_score = scores[best_row]
    return best_line, best_score

In [55]:
# Sample prompts used to probe the retrieval-based responder.
input_lines = [
    "What's your favorite thing to do with your friends after school?",
    "How do you feel about Wendy?",
    "What do you think about Eric Cartman?",
    "Do you like living in South Park?",
    "I need to leave now, bye!",
]

# Retrieved lines and their similarity scores, in the same order as input_lines.
responses, scores = [], []

for line in input_lines:
    response, score = respond(line)
    responses.append(response)
    scores.append(score)
    # respond returns the score as an array; .max() reduces it to one number for display.
    print(f"Q: {line}\nA: {response}\nScore: {score.max()}\n")

Q: What's your favorite thing to do with your friends after school?
A: So today we went to the amusement park with all our possible friends. It was a really fun time. We rode all the rides and everyone got along great.
Score: 0.471310019493103

Q: How do you feel about Wendy?
A: Wendy, why is it such a big deal?
Score: 0.760873019695282

Q: What do you think about Eric Cartman?
A: What the hell's wrong with Cartman?!
Score: 0.6942796111106873

Q: Do you like living in South Park?
A: What other crime in South Park?
Score: 0.5700172185897827

Q: I need to leave now, bye!
A: You're leaving already?
Score: 0.5960509181022644

