# Text Preprocessing

In [37]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# Load the data
ai_df = pd.read_csv("C:\\Users\\dangk\\Desktop\\Projects\\Chatbot\\data\\ques_ai_data.csv")
database_df = pd.read_json("C:\\Users\\dangk\\Desktop\\Projects\\Chatbot\\data\\databases.json")
stats_df=pd.read_json("C:\\Users\\dangk\\Desktop\\Projects\\Chatbot\\data\\Stats.json")
ai2_df = pd.read_json("C:\\Users\\dangk\\Desktop\\Projects\\Chatbot\\data\\ai_data.json")

# Drop N/A from stats_df
stats_df.dropna(subset='question',inplace= True)

# Drop columns ID and rename so it can be merged
ai_df.drop(columns=['ID'],inplace=True)
ai_df.rename(columns={"Question":"question","Answer":"answer"},inplace=True)

# Concat df
df = pd.concat([ai_df,stats_df,database_df,ai2_df])

# Copy df to a new one
cleaned_df = df.copy()

# Function to preprocess text
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters and line breaks
    text = re.sub(r'([^\s\w_])+', ' ', text)
    text = re.sub(r'[\n\r]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    return text

# Apply preprocessing to 'Question' column
cleaned_df['question_processed'] = cleaned_df['question'].apply(preprocess_text)

# Function to tokenize text
def tokenize_text(text):
    text = text.lower()
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    words = tokenizer.tokenize(text)
    return words

# Function to remove stopwords
def remove_stopwords(words):
    stop = set(stopwords.words('english'))
    filtered_words = [w for w in words if w not in stop]
    return filtered_words

# Function to perform lemmatization
def lemmatize_text(words):
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in words]
    return lem_words

# Function to get Part of Speech tags
def get_part_of_speech_tags(word):
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

# Preprocess 'Question' column
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(tokenize_text)
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(remove_stopwords)
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(lemmatize_text)

# Convert tokens back to string
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(lambda x: ' '.join(x))

cleaned_df.head(4)


Unnamed: 0,question,answer,question_answer,question_processed
0,What is artificial intelligence?,Artificial intelligence (AI) refers to the sim...,,artificial intelligence
1,What are the two types of AI?,The two types of AI are narrow AI (also known ...,,two type ai
2,What is narrow AI?,Narrow AI is AI that is designed and trained f...,,narrow ai
3,What is general AI?,General AI is AI that has the ability to under...,,general ai


In [38]:
cleaned_df.drop(columns='question_answer',inplace=True)

# Training LDA model and getting dictionary

In [39]:
# Function to convert text to words
def text_to_words(texts):
    return [[word for word in simple_preprocess(str(text), deacc=True)]
            for text in texts]
    
text = cleaned_df.question_processed.values.tolist()
text_words = text_to_words(text)
dict_word = corpora.Dictionary(text_words)
corpus_vec = [dict_word.doc2bow(text) for text in text_words]

# Train LDA model
lda_model = gensim.models.LdaModel(corpus=corpus_vec, id2word=dict_word, num_topics=3, iterations=20)

lda_model.save('lda_model')


In [45]:
lda_model_1 = gensim.models.LdaModel.load('lda_model')
topics = lda_model_1.show_topic(2, topn=3)
print(topics)

[('ai', 0.12531477), ('difference', 0.043323364), ('data', 0.020845372)]


# Topic Name extraction

In [46]:
# Function to extract topics using LDA model and return topic numbers
def extract_topics(text):
    text_words = text_to_words([text])
    # Create corpus
    corpus_vec = [dict_word.doc2bow(words) for words in text_words]
    # Get topic distribution
    topics = lda_model.get_document_topics(corpus_vec[0])
    #print("Topics inside extract topics is **************", topics)
    # Extract most probable topic
    topic_num = max(topics, key=lambda x: x[1])[0]
    return topic_num


# Function to get topic names based on representative words
def infer_topic_names(lda_model, dict_word, num_words=3):
    topic_names = {}
    for i in range(lda_model.num_topics):
        #print("i is", i)
        words = lda_model.show_topic(i, topn=num_words)
        #print("words is", words)
        topic_names[i] = ', '.join([word for word, _ in words])
        #print("topic names ",topic_names)
    return topic_names

# Displaying Output for trained LDA model

In [47]:
# Get inferred topic names
topic_names = infer_topic_names(lda_model, dict_word)

# Apply topic extraction function to each question
cleaned_df['Topic_Num'] = cleaned_df['question_processed'].apply(extract_topics)

# Map topic numbers to inferred topic names
cleaned_df['Topic'] = cleaned_df['Topic_Num'].map(topic_names)

# Drop the 'Topic_Num' column if needed
cleaned_df = cleaned_df.drop(columns=['Topic_Num'])
pd.set_option('display.max_colwidth', 300)
cleaned_df.head(20)

Unnamed: 0,question,answer,question_processed,Topic
0,What is artificial intelligence?,Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and learn like humans.,artificial intelligence,"role, ai, learn"
1,What are the two types of AI?,The two types of AI are narrow AI (also known as weak AI) and general AI (also known as strong AI).,two type ai,"role, ai, learn"
2,What is narrow AI?,Narrow AI is AI that is designed and trained for a particular task or set of tasks. It operates within a limited context and cannot perform tasks outside.,narrow ai,"ai, difference, data"
3,What is general AI?,"General AI is AI that has the ability to understand, learn, and apply its intelligence across a wide range of tasks, similar to human intelligence.",general ai,"ai, difference, data"
4,What are some examples of narrow AI?,"Some examples of narrow AI include virtual personal assistants (e.g., Siri, Alexa), recommendation systems, and image recognition software.",example narrow ai,"ai, difference, data"
5,What are some challenges in AI?,"Some challenges in AI include ethical considerations, job displacement, and ensuring AI systems are transparent and accountable.",challenge ai,"role, ai, learn"
6,What is machine learning?,Machine learning is a subset of AI that allows computers to learn from data and improve over time without being explicitly programmed.,machine learn,"role, ai, learn"
7,What are some popular machine learning algorithms?,"Some popular machine learning algorithms include linear regression, logistic regression, decision trees, random forests, support vector machines (SVM), and neural networks.",popular machine learn algorithm,"role, ai, learn"
8,What is deep learning?,Deep learning is a subset of machine learning that uses neural networks with many layers to learn complex patterns in large amounts of data.,deep learn,"role, ai, learn"
9,What are some applications of AI?,"Some applications of AI include natural language processing (NLP), computer vision, autonomous vehicles, healthcare, and finance.",application ai,"concept, data, explain"


# Embed Processed Question and store in MongoDB

In [48]:
# Turn each question into a vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a list of cleaned question
question_documents = cleaned_df['question_processed'].values

# Declare rge tfidf Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1153)
tfidf_matrix = tfidf_vectorizer.fit_transform(question_documents)


In [60]:
import pickle
with open('dictionary.pkl', 'wb') as f:
    pickle.dump(dict_word, f)
    
# Save Tfidf Vectorizer model
with open('tfidf_vectorizer.pkl','wb') as f:
    pickle.dump(tfidf_vectorizer, f)

In [50]:
# Turn each matrix into array
question_vector = tfidf_matrix.toarray()

# Create list of answer
answer = cleaned_df['answer'].values

# Create list of topic words
topic_words = cleaned_df['Topic'].values

data_list = []
for i in range(0,len(answer)):
    data = {}
    data['embed_question']=question_vector[i].tolist()
    data['answer']=answer[i]
    data['topic_words']=topic_words[i] 
    data_list.append(data)

In [51]:
# Initiate MongoDB Instance
# %pip install pymongo
import pymongo

try:
    mongodb_client = pymongo.MongoClient("mongodb+srv://dbkhoatrann1998:GHbuiwMymRsTtcPC@dbkhoatrann1998.zbjantr.mongodb.net/?retryWrites=true&w=majority&appName=dbkhoatrann1998")
except:
    print('error occured')

In [53]:
# Create a new database and a new connection 
collection = mongodb_client['CIS8045']['data']

# Insert new data
insert_results = collection.insert_many(data_list)

# print result
print("Inserted document ID:", insert_results.inserted_ids)

Inserted document ID: [ObjectId('6628276d6ae49c603fc895bf'), ObjectId('6628276d6ae49c603fc895c0'), ObjectId('6628276d6ae49c603fc895c1'), ObjectId('6628276d6ae49c603fc895c2'), ObjectId('6628276d6ae49c603fc895c3'), ObjectId('6628276d6ae49c603fc895c4'), ObjectId('6628276d6ae49c603fc895c5'), ObjectId('6628276d6ae49c603fc895c6'), ObjectId('6628276d6ae49c603fc895c7'), ObjectId('6628276d6ae49c603fc895c8'), ObjectId('6628276d6ae49c603fc895c9'), ObjectId('6628276d6ae49c603fc895ca'), ObjectId('6628276d6ae49c603fc895cb'), ObjectId('6628276d6ae49c603fc895cc'), ObjectId('6628276d6ae49c603fc895cd'), ObjectId('6628276d6ae49c603fc895ce'), ObjectId('6628276d6ae49c603fc895cf'), ObjectId('6628276d6ae49c603fc895d0'), ObjectId('6628276d6ae49c603fc895d1'), ObjectId('6628276d6ae49c603fc895d2'), ObjectId('6628276d6ae49c603fc895d3'), ObjectId('6628276d6ae49c603fc895d4'), ObjectId('6628276d6ae49c603fc895d5'), ObjectId('6628276d6ae49c603fc895d6'), ObjectId('6628276d6ae49c603fc895d7'), ObjectId('6628276d6ae49c603

# Test

In [77]:
def getTopics(userQuestion):
    # Load the dictionary from the file
    lda_model = gensim.models.LdaModel.load(r'C:\Users\dangk\Desktop\Projects\Chatbot\model\lda_model')
    with open(r'C:\Users\dangk\Desktop\Projects\Chatbot\model\dictionary.pkl', 'rb') as f:
        dict_word = pickle.load(f)

    new_data = pd.DataFrame(data=[userQuestion], columns=['Question'])

    #new_data.head()
    new_data['Question_processed'] = new_data['Question'].apply(preprocess_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(tokenize_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(remove_stopwords)
    new_data['Question_processed'] = new_data['Question_processed'].apply(lemmatize_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(lambda x: ' '.join(x))

    new_text = new_data.Question_processed.values.tolist()
    new_text_words = text_to_words(new_text)
    new_corpus_vec = [dict_word.doc2bow(text) for text in new_text_words]

    topic_names = infer_topic_names(lda_model, dict_word)

    # Infer topics for the new data
    new_topics = [max(lda_model.get_document_topics(doc), key=lambda x: x[1])[0] for doc in new_corpus_vec]

    # Interpret the inferred topics for the new data
    new_data['Topic_Num'] = new_topics
    new_data['Topic'] = new_data['Topic_Num'].map(topic_names)

    user_output = new_data[['Question', 'Question_processed', 'Topic']]
    # Extract the 'Topic' column from user_output
    topics = user_output['Topic']

    topics_list = topics.astype(str).tolist()

    # Join the elements of the list into a single string
    topics_string = ' '.join(topics_list)

    return topics_string

In [80]:
topic = getTopics(question)
topic

'concept, data, explain'

In [82]:

# Create a random question
import pickle 

question = "data science"

# Load the model
with open(r'C:\Users\dangk\Desktop\Projects\Chatbot\model\tfidf_vectorizer.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Turn into vector using TFIDF vectorizer
vec_question = tfidf_vectorizer.transform([question]).toarray().tolist()
topic_words = getTopics(question)

# Make a query to see how Atlas search works
pipeline = [
        {
            '$vectorSearch': {
                'index': 'vector-search-question', 
                'path': 'embed_question', 
                'queryVector': vec_question[0],
                'filter': {
                  'topic_words':topic_words
                  },
                'numCandidates': 100, 
                'limit': 1
            }
        }, {
            '$project': {
                '_id': 0, 
                'answer': 1,
                'category': 1,
                'score': {
                    '$meta': 'vectorSearchScore'
                }
            }
        }
    ]

result = collection.aggregate(pipeline)

result_list = list(result)

print(result_list[0]['answer'])
print(result_list[0]['score'])

Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data.
1.0
