In [13]:
import pandas as pd

In [14]:
# Remote CSV of dialogue lines; columns are Season, Episode, Character, Line.
# NOTE(review): the file's saved row index arrives as an extra "Unnamed: 0"
# column -- passing index_col=0 would drop it; confirm nothing relies on it.
DATA_URL = "https://0x0.st/H5DC.csv"
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."
...,...,...,...,...
70891,9,14,Stan,I think you're pushing it.\n
70892,9,14,Randy,How about twenty?\n
70893,9,14,Stan,That's not disciprine.\n
70894,9,14,Randy,Right right. Does vodka count?\n


In [15]:
import re

# Matches any run of one or more whitespace characters (spaces, tabs, newlines, ...).
REGEX_WHITESPACE = re.compile(r"\s+")


def clean_text(text):
    """Collapse every whitespace run in ``text`` to one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an all-whitespace input yields ``""``.
    """
    return REGEX_WHITESPACE.sub(" ", text).strip()

In [16]:
# Store a whitespace-normalised copy of each line next to the raw "Line" column.
df["cleaned_text"] = df["Line"].map(clean_text)
df["cleaned_text"]

0                  You guys, you guys! Chef is going away.
1                                Going away? For how long?
2                                                 Forever.
3                                          I'm sorry boys.
4        Chef said he's been bored, so he joining a gro...
                               ...                        
70891                           I think you're pushing it.
70892                                    How about twenty?
70893                               That's not disciprine.
70894                       Right right. Does vodka count?
70895                                                 Dad!
Name: cleaned_text, Length: 70896, dtype: object

In [17]:
# Number of rows (spoken lines) per character; value_counts sorts the tally
# from most to fewest by default.
df.loc[:, "Character"].value_counts()

Character
Cartman          9774
Stan             7680
Kyle             7099
Butters          2602
Randy            2467
                 ... 
Reenactor 1         1
Confederate 2       1
Confederate 1       1
Army Doctor         1
Blind Man           1
Name: count, Length: 3950, dtype: int64

In [18]:
# Select Stan's cleaned lines in one .loc call (row mask + column label) and
# renumber them 0..n-1 so later positional lookups line up with the series.
stan = df.loc[df["Character"] == "Stan", "cleaned_text"].reset_index(drop=True)
stan

0                 You guys, you guys! Chef is going away.
1                                                Forever.
2       Chef said he's been bored, so he joining a gro...
3       Dude, how are we gonna go on? Chef was our fuh...
4                                            I'll get it.
                              ...                        
7675                                           All right!
7676           That's probably okay if you spread it out.
7677                           I think you're pushing it.
7678                               That's not disciprine.
7679                                                 Dad!
Name: cleaned_text, Length: 7680, dtype: object

In [19]:
%pip install -U spacy[cuda12x] --quiet

zsh:1: no matches found: spacy[cuda12x]
Note: you may need to restart the kernel to use updated packages.


In [20]:
import spacy

# Ask spaCy to use a GPU if it can, print the outcome, then load the English model.
spacy_gpu = spacy.prefer_gpu()
print(f"{spacy_gpu}")
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Reduce ``text`` to a space-separated string of content-word lemmas.

    The text is lower-cased and run through the loaded ``nlp`` pipeline; tokens
    flagged as stop words or punctuation are dropped and the lemma of every
    remaining token is kept, in order.

    Args:
        text: The raw string to process.

    Returns:
        The kept lemmas joined by single spaces (``""`` when nothing is kept).
    """
    kept_lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

True


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Run every Stan line through preprocess_text (one pipeline call per line).
processed_dataset = stan.map(preprocess_text)

# Learn the vocabulary and IDF weights from the processed lines and build the
# document-term matrix in the same pass; row i corresponds to stan position i.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_dataset)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity


def respond(query):
    """Return the line in ``stan`` most similar to ``query`` and its score.

    The query is passed through ``preprocess_text`` and vectorised with the
    already-fitted ``vectorizer``; cosine similarity is then computed against
    every row of ``tfidf_matrix``.

    Args:
        query: The user's input string.

    Returns:
        A tuple ``(line, score)``. ``line`` is the entry of ``stan`` at the
        best-scoring position. ``score`` is that position's row of the
        similarity matrix -- a one-element array, because there is a single
        query column (callers read it with ``score.max()``).

    Note:
        ``argmax`` returns the first maximum, so when all similarities are
        equal (e.g. the query shares no vocabulary with the corpus and every
        score is 0) the first line of ``stan`` is returned.
    """
    query_vector = vectorizer.transform([preprocess_text(query)])
    # Shape (n_lines, 1): one row per corpus line, one column for the query.
    similarity = cosine_similarity(tfidf_matrix, query_vector)
    most_similar_index = similarity.argmax()
    # argmax yields a position, so look it up positionally; plain stan[...] is
    # a label lookup and is only right while stan carries a 0..n-1 index.
    return stan.iloc[most_similar_index], similarity[most_similar_index]

In [25]:
# Sample prompts used to try out respond().
input_lines = [
    "What's your favorite thing to do with your friends after school?",
    "How do you feel about Wendy?",
    "What do you think about Eric Cartman?",
    "Do you like living in South Park?",
    "I need to leave now, bye!",
]

# Query once per prompt, then split the (line, score) pairs into parallel lists.
results = [respond(line) for line in input_lines]
responses = [matched_line for matched_line, _ in results]
scores = [score_row for _, score_row in results]

# Report each prompt with its best-matching line and similarity score.
for line, response, score in zip(input_lines, responses, scores):
    print(f"Q: {line}\nA: {response}\nScore: {score.max()}\n")

Q: What's your favorite thing to do with your friends after school?
A: Yeah, dude. That's my favorite toy.
Score: 0.4703949944239866

Q: How do you feel about Wendy?
A: How are you feeling?
Score: 0.735900739202231

Q: What do you think about Eric Cartman?
A: I didn't think so.
Score: 0.40745856183234935

Q: Do you like living in South Park?
A: What other crime in South Park?
Score: 0.6029543543540647

Q: I need to leave now, bye!
A: No, they're leaving.
Score: 0.5343109095614417

