In [22]:
import numpy as np
import pandas as pd

In [23]:
# Read the mental-health FAQ table from a CSV path relative to the working
# directory, then preview the first rows as the cell's output.
data = pd.read_csv('Mental_Health_FAQ.csv')
data.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [24]:
import spacy
from nltk.tokenize import word_tokenize
import string

# Load the spaCy "en_core_web_sm" pipeline once at module level; preprocess()
# below calls this shared `nlp` object for every question it tokenises.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Lowercase ``text`` and split it into spaCy tokens, dropping punctuation.

    Args:
        text: The raw question string. Non-string values (for example the
            ``NaN`` pandas yields for an empty CSV cell) are treated as
            empty input instead of raising on ``.lower()``.

    Returns:
        list[str]: Lowercased token texts in document order, excluding every
        token that consists solely of ``string.punctuation`` characters.
        An empty list is returned for non-string input.
    """
    # Guard before touching the pipeline: a float NaN has no .lower().
    if not isinstance(text, str):
        return []

    doc = nlp(text.lower())

    # Test membership character by character against a set. A plain
    # `token.text in string.punctuation` is a *substring* test, so
    # multi-character punctuation tokens such as "..." or "--" would be kept.
    punctuation = set(string.punctuation)
    return [
        token.text
        for token in doc
        if not all(char in punctuation for char in token.text)
    ]
# Tokenise every FAQ question with preprocess() and keep the resulting token
# lists in a new column alongside the original text.
data['Processed_Questions'] = data['Questions'].map(preprocess)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF model (English stop words removed) on the raw question text --
# note this uses data['Questions'], not the tokenised 'Processed_Questions'.
# `matrix` is the fitted representation that user queries are compared against.
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(data['Questions'])
def similar_ans(userinput):
    """Return the stored answer whose question is most similar to ``userinput``.

    The query is projected into the fitted TF-IDF space and compared with
    every FAQ question by cosine similarity; the answer paired with the
    best-scoring question is returned.

    Args:
        userinput: The user's question as a single string.

    Returns:
        The entry of ``data['Answers']`` at the row position of the best match.
        No minimum similarity is enforced: when the query shares no vocabulary
        with any question, every score is 0 and ``argmax`` selects row 0.
    """
    query_vector = vectorizer.transform([userinput])
    scores = cosine_similarity(query_vector, matrix)

    # One query row was passed in, so the flattened argmax is the *position*
    # of the best-matching question within `matrix`.
    best_position = scores.argmax()

    # Look up by position, not by index label: data['Answers'][i] is a label
    # lookup and returns the wrong row (or raises KeyError) whenever the
    # DataFrame index is not the default 0..n-1 range.
    return data['Answers'].iloc[best_position]





In [26]:
# Interactive chat loop: keep answering questions until the user types 'exit'.
while True:
    try:
        userinput = input("Ask me anything about mental health (type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat cleanly instead
        # of surfacing a traceback.
        print("Goodbye!")
        break

    # Ignore surrounding whitespace so inputs like " exit " still quit.
    userinput = userinput.strip()
    if userinput.lower() == 'exit':
        print("Goodbye!")
        break

    if not userinput:
        # A blank query scores 0 against every question, so the lookup would
        # just return the first FAQ answer; prompt again instead.
        continue

    response = similar_ans(userinput)
    print(f"MentalHealthBot: {response}")

Goodbye!


In [27]:
def get_most_similar_answer(user_input, threshold=0.2):
    """Return the best-matching FAQ answer, or a fallback when nothing is close.

    Args:
        user_input: The user's question as a single string. It is passed to
            the fitted TF-IDF vectorizer as-is; no extra preprocessing is
            applied here.
        threshold: Minimum cosine similarity the best match must reach.
            Defaults to 0.2.

    Returns:
        str: The entry of ``data['Answers']`` at the row position of the most
        similar question, or a fixed apology message when the highest
        similarity is below ``threshold``.
    """
    # Project the query into the fitted TF-IDF space.
    user_input_tfidf = vectorizer.transform([user_input])

    # Cosine similarity between the single query row and every dataset question.
    cosine_similarities = cosine_similarity(user_input_tfidf, matrix)

    # Position of the best-scoring question and its score (row 0 is the query).
    most_similar_index = cosine_similarities.argmax()
    highest_similarity = cosine_similarities[0, most_similar_index]

    # If similarity is below the threshold, return a default response.
    if highest_similarity < threshold:
        return "I'm sorry, I couldn't find a relevant answer. Could you please rephrase your question?"

    # Look up by position, not by index label: data['Answers'][i] is a label
    # lookup and returns the wrong row (or raises KeyError) whenever the
    # DataFrame index is not the default 0..n-1 range.
    return data['Answers'].iloc[most_similar_index]


In [28]:
import pickle

# Persist the fitted vectorizer and the TF-IDF matrix, one pickle file each,
# written in the same order as listed here.
for filename, artifact in (
    ('vectorizer.pkl', vectorizer),
    ('tfidf_matrix.pkl', matrix),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)
