https://www.youtube.com/watch?v=JzPgeRJfNo4&ab_channel=Intellipaat

In [5]:
import numpy as np
import nltk
import string
import random
import wikipedia
import wikipediaapi
import json
import pickle
import sklearn

## Build dataset

In [6]:
search_topics = ['science', 'chemisry', 'physics', 'biology', 'sciences', 'cosmology', 'medicine', 'geology']

In [7]:
wiki_pages = []
for topic in search_topics:
    results = wikipedia.search(topic, results=500)
    for result in results:
        if result not in wiki_pages:
            wiki_pages.append(result)

In [10]:
wiki_wiki = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

In [11]:
#remove_str = ["\n\n", "\n", "\"", "\u200a", "\u2013", "\u00e4", "\u2014", "\u00e9", "\u00f6", "\u00e0"]
remove_str = ["\n\n", "\n", "\""]

In [12]:
wiki_data = {}
for page in wiki_pages:
    text = wiki_wiki.page(page).text
    for str in remove_str:
        text = text.replace(str, ' ')
    wiki_data[page] = text.lower()

In [14]:
with open('wiki_data_full.json', 'w') as f:
    json.dump(wiki_data, f, indent=4)

In [13]:
filehandler = open(b"wiki_data_full.pickle","wb")
pickle.dump(wiki_data,filehandler)

In [2]:
file = open("wiki_data.pickle",'rb')
wiki_data = pickle.load(file)

## Preprocessing

This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences. It must be trained on a large collection of plaintext in the target language before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for English.

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/katiez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

WordNet is just another NLTK corpus reader

Synset: a set of synonyms that share a common meaning.

In [4]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/katiez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/katiez/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
# tokenisation
sentence_tokens = nltk.sent_tokenize(wiki_data[list(wiki_data)[0]])
word_tokens = nltk.word_tokenize(wiki_data[list(wiki_data)[0]])

In [21]:
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punc_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punc_dict)))

In [32]:
LemNormalize('he loves eating leaves!')

['he', 'love', 'eating', 'leaf']

In [34]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('eating')

'eat'

## Define greeting functions

In [22]:
greet_inputs = ("hello", "hi", "hey", "whassup", "how are you?")
greet_responses = ("hello", "hi", "Hey", "Hi there!", "Hey there!")
def greet(sentence):
    for word in sentence.split():
        if word.lower() in greet_inputs:
            return random.choice(greet_responses)

In [49]:
a = ['1', '2']
b = ['3']
a.extend(b)
print(a)

['1', '2', '3']


## Response generation

Convert a collection of raw documents to a matrix of TF-IDF features. As tf–idf is very often used for text features, there is also another class called TfidfVectorizer that combines all the options of CountVectorizer and TfidfTransformer in a single model.

While the tf–idf normalization is often very useful, there might be cases where the binary occurrence markers might offer better features. This can be achieved by using the binary parameter of CountVectorizer. In particular, some estimators such as Bernoulli Naive Bayes explicitly model discrete boolean random variables. Also, very short texts are likely to have noisy tf–idf values while the binary occurrence info is more stable.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
def response(user_input):
    robot_response = ''
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    print(TfidfVec)
    tfidf = TfidfVec.fit_transform(sentence_tokens)
    print(sentence_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    # -1 is user_input
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if (req_tfidf == 0):
        robot_response += "Sorry, I can't understand you."
    else:
        robot_response += sentence_tokens[idx]
    return robot_response

In [44]:
sentence_tokens = nltk.sent_tokenize(wiki_data[list(wiki_data)[2]])
len(sentence_tokens)

183

In [45]:
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf = TfidfVec.fit_transform(sentence_tokens)
array_of_feature = tfidf.toarray()
array_of_feature.shape



(183, 1616)

## Define chatflow

In [36]:
flag = True
print("Hi, I'm science robot. You can ask me any scientific questions! Please say goodbye before leaving!")
while (flag == True):
    user_input = input()
    print(f"User: {user_input}")
    user_input = user_input.lower()
    if (user_input != 'goodbye' and user_input != 'bye'):
        if(user_input == 'thanks' or user_input == 'thank you'):
            flag = False
            print("Bot: You are welcome!")
        else:
            if(greet(user_input) != None):
                print(f"Bot: {greet(user_input)}")
            else:
                sentence_tokens.append(user_input)
                word_tokens += nltk.word_tokenize(user_input)
                final_words = list(set(word_tokens))
                print(f"Bot: {response(user_input)}")
                sentence_tokens.remove(user_input)
    else:
        flag = False
        print("Bot: Bye!")


Hi, I'm science robot. You can ask me any scientific questions! Please say goodbye before leaving!
User: who is the father of science?
TfidfVectorizer(stop_words='english',
                tokenizer=<function LemNormalize at 0x7fc3f6fa0550>)




  (0, 219)	0.1897389562744652
  (0, 6)	0.21745256910072722
  (0, 49)	0.21745256910072722
  (0, 1046)	0.21745256910072722
  (0, 564)	0.21745256910072722
  (0, 134)	0.1588250525543729
  (0, 341)	0.21745256910072722
  (0, 1077)	0.15310355286859859
  (0, 1262)	0.21745256910072722
  (0, 813)	0.2336639933635954
  (0, 1375)	0.2059503805373333
  (0, 1754)	0.21745256910072722
  (0, 552)	0.1970285899577288
  (0, 1687)	0.2336639933635954
  (0, 1266)	0.1897389562744652
  (0, 641)	0.1897389562744652
  (0, 1622)	0.2336639933635954
  (0, 690)	0.18357565881405188
  (0, 932)	0.1326795737256002
  (0, 1166)	0.2336639933635954
  (0, 257)	0.21745256910072722
  (0, 589)	0.2336639933635954
  (0, 1599)	0.17823676771107125
  (0, 1461)	0.16656518571797915
  (1, 286)	0.22887110233768793
  :	:
  (250, 746)	0.23256132423543932
  (250, 915)	0.23256132423543932
  (250, 623)	0.19119224345132585
  (250, 908)	0.24554971589040744
  (250, 780)	0.4145911087884596
  (250, 1461)	0.09404357511025137
  (251, 1739)	0.231258073