In [24]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch every NLTK data package this notebook relies on, in a fixed order.
for resource in ('punkt', 'wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(resource)


# Read the question/answer pairs from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer='3k_conversations.csv')

# Show the column names so the 'question' / 'answer' fields used below can be verified.
print(data.columns)

# Shared preprocessing resources, built once and reused by preprocess_text().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized keywords.

    The input is stringified, lower-cased and tokenized with
    ``nltk.word_tokenize``. Tokens that are not purely alphabetic, or that
    appear in the module-level ``stop_words`` set, are dropped; the remaining
    tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (e.g. a missing CSV cell) are treated as
            empty text.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when the
        input is missing or no token survives filtering.
    """
    # Missing values must not be stringified: str(float('nan')) is "nan",
    # which passes the isalpha() filter and would be indexed as a real word.
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    tokens = nltk.word_tokenize(str(text).lower())
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(tokens)

# Normalize every stored question once and keep the result alongside the raw text.
cleaned_questions = data['question'].apply(preprocess_text)
data['cleaned_question'] = cleaned_questions

# Fit the TF-IDF vocabulary on the cleaned questions and vectorize them in one pass;
# each row of tfidf_matrix corresponds to the row at the same position in `data`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_questions)

# Chatbot response function
def chatbot_response(user_input, fallback="Sorry, I don't understand. Could you rephrase that?"):
    """Return the stored answer whose question is most similar to ``user_input``.

    The input is cleaned with ``preprocess_text``, vectorized with the fitted
    module-level ``vectorizer`` and compared against every row of
    ``tfidf_matrix`` using cosine similarity.

    Args:
        user_input: The user's message.
        fallback: Reply returned when no stored question shares any term with
            the cleaned input (best similarity is not positive).

    Returns:
        The ``answer`` value from ``data`` at the position of the best-matching
        question, or ``fallback`` when nothing matches.
    """
    user_input_clean = preprocess_text(user_input)
    user_vec = vectorizer.transform([user_input_clean])

    # A single query row is compared with all stored questions; flatten the
    # (1, n_questions) result so the argmax is an unambiguous row position.
    similarities = cosine_similarity(user_vec, tfidf_matrix).ravel()
    most_similar_index = int(np.argmax(similarities))

    # With all-zero scores argmax returns 0, which would silently answer with
    # the first row of the dataset. Treat "no overlap at all" as "no match".
    if similarities[most_similar_index] <= 0:
        return fallback

    return data['answer'].iloc[most_similar_index]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Index(['Unnamed: 0', 'question', 'answer'], dtype='object')


In [25]:
# Smoke test: a casual greeting.
user_message = "hi how are you"
response = chatbot_response(user_message)
print(response)

i'm fine. how about yourself?


In [27]:
# Smoke test: a question taken from the dataset.
user_message = "what are you standing on?"
response = chatbot_response(user_message)
print(response)

a couple of dictionaries and some textbooks.


In [26]:
# Print a few sample questions from the dataset.
# A fixed random_state makes the sample identical on every run, so the
# printed examples stay reproducible after a kernel restart.
print("Sample questions from the dataset:")
print(data['question'].sample(5, random_state=42).tolist())

Sample questions from the dataset:
['what are you standing on?', 'i think some fish have blue eyes.', 'you hit a white ball.', 'i forgot.', "they probably wouldn't like that."]
