Credits/Course URL - https://www.udemy.com/course/the-ultimate-beginners-guide-to-natural-language-processing/

In [1]:
import nltk
import spacy
import en_core_web_sm
from goose3 import Goose
from spacy.matcher import PhraseMatcher
from IPython.display import HTML
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Keyword Search

In [2]:
string = 'Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.'

In [4]:
str_to_search = ['artificial', 'computer', 'language']

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
document = nlp(string)

In [7]:
tokens_list = [nlp(element) for element in str_to_search]
tokens_list

[artificial, computer, language]

In [8]:
matcher = PhraseMatcher(nlp.vocab)
matcher.add('SEARCH', None, *tokens_list)

In [9]:
match_results = matcher(document)

In [10]:
match_results

[(8661325627334373315, 1, 2),
 (8661325627334373315, 13, 14),
 (8661325627334373315, 17, 18),
 (8661325627334373315, 27, 28),
 (8661325627334373315, 43, 44)]

In [11]:
for tup in match_results:
    print(f'{document[tup[1]:tup[2]]} found at index {tup[1]}')

language found at index 1
computer found at index 13
artificial found at index 17
language found at index 27
language found at index 43


In [12]:
document[13-3:14+3]

of linguistics, computer science, and

In [13]:
goose_obj = Goose()
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
article = goose_obj.extract(url)

In [14]:
tot_words = 50
str_to_search_html = ', '.join(str_to_search)
str_to_search_html

'artificial, computer, language'

In [15]:
marked_text = ''
display(HTML(f'<h2>Search results for: {str_to_search_html.upper()}</h2>'))
doc = nlp(article.cleaned_text)
found_results = matcher(doc)
#print(found_results)
display(HTML(f"""<p><strong>Number of Matches:</strong> {len(found_results)}</p>"""))
for tup in found_results:
    print(f'{doc[tup[1]:tup[2]]} found at index {tup[1]}')

language found at index 1
computer found at index 13
artificial found at index 17
language found at index 27
language found at index 43
computer found at index 50
language found at index 67
language found at index 96
artificial found at index 149
language found at index 167
computer found at index 207
language found at index 210
language found at index 338
language found at index 350
computer found at index 468
language found at index 619
language found at index 683
language found at index 711
language found at index 721
language found at index 776
language found at index 937
language found at index 1088
language found at index 1119
language found at index 1125
language found at index 1171
language found at index 1578
computer found at index 1627
language found at index 1652
language found at index 1816
language found at index 1853
language found at index 2082
language found at index 2128
language found at index 2157
language found at index 2191
language found at index 2382
language fo

In [16]:
marked_text = ''
display(HTML(f'<h2>Search results for: {str_to_search_html.upper()}</h2>'))
doc = nlp(article.cleaned_text)
found_results = matcher(doc)
#print(found_results)
display(HTML(f"""<p><strong>Number of Matches:</strong> {len(found_results)}</p>"""))
for tup in found_results:
    #print(tup)
    start = tup[1] - tot_words
    if start<0:
        start = 0
    for j in range(len(tokens_list)):
        if doc[tup[1]:tup[2]].similarity(tokens_list[j])== 1.0:
            #print(j, tokens_list[j])
            search_text = str(tokens_list[j])
            marked_text += str(doc[start:tup[2]+tot_words]).replace(search_text, f'<mark style="background-color:yellow;">{search_text}</mark>')
            marked_text += "<br/><br/>"
            
display(HTML(f"""<blockquote>{marked_text}<blockquote>"""))

  if doc[tup[1]:tup[2]].similarity(tokens_list[j])== 1.0:


### Bag of words

In [17]:
sentences = ['This is a first document.', 
             'This document is the second document.', 
             'And this is the third one.',
             'Is this the first document?']

In [18]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

In [19]:
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [20]:
X.toarray()

array([[0, 1, 1, 1, 0, 0, 0, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [21]:
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index = sentences)

Unnamed: 0,and,document,first,is,one,second,the,third,this
This is a first document.,0,1,1,1,0,0,0,0,1
This document is the second document.,0,2,0,1,0,1,1,0,1
And this is the third one.,1,0,0,1,1,0,1,1,1
Is this the first document?,0,1,1,1,0,0,1,0,1


In [22]:
original_sentence = nltk.tokenize.sent_tokenize(article.cleaned_text)
original_sentence

['Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Natural language processing has its roots in the 1950s.',
 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.',
 'The proposed test includes a task that involves the 

In [23]:
vectorizer = CountVectorizer()
X_sentences = vectorizer.fit_transform(original_sentence)

In [24]:
print(vectorizer.get_feature_names_out())

['10' '100' '11' '12' '13' '14' '15' '16' '17' '18' '19' '1950' '1950s'
 '1954' '1960s' '1964' '1966' '1970s' '1975' '1976' '1977' '1978' '1979'
 '1980s' '1981' '1990s' '1999' '2000s' '2002' '2003' '2006' '2007' '2009'
 '2010s' '2012' '2015' '2017' '2018' '2020' '40' '41' '42' '43' '44' '45'
 '46' '47' '48' '49' '50' '51' '60' 'abandoned' 'able' 'about' 'above'
 'accidentally' 'accurate' 'accurately' 'achieve' 'acl' 'acquiring' 'act'
 'action' 'additional' 'addressed' 'advanced' 'advantage' 'advantages'
 'after' 'age' 'ai' 'aid' 'alan' 'algorithm' 'algorithms' 'alignment'
 'alike' 'all' 'almost' 'along' 'alpac' 'already' 'also' 'although'
 'ambiguous' 'among' 'amount' 'amounts' 'an' 'analysis' 'analyze'
 'analyzed' 'and' 'annotated' 'annotation' 'annotations' 'another'
 'answering' 'answers' 'apertium' 'apparent' 'applications' 'applied'
 'apply' 'applying' 'approach' 'approaches' 'arabic' 'are' 'area' 'areas'
 'art' 'article' 'articulated' 'artificial' 'as' 'aspects' 'assign' 'at'
 'a

In [25]:
print(len(vectorizer.get_feature_names_out()))

859


In [26]:
X_sentences.toarray()[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### TF-IDF (Term Frequency - Inverse Document Frequency)

TF = Number of times T appears in the document / number of terms in the document 

IDF = 1 + log (total number of documents / number of documents term T appeared)

In [27]:
sentences

['This is a first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [28]:
tfidf_vectorizer = TfidfVectorizer()

In [29]:
X = tfidf_vectorizer.fit_transform(sentences)

In [30]:
print(tfidf_vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [31]:
np.round(X.toarray(),2)

array([[0.  , 0.51, 0.63, 0.42, 0.  , 0.  , 0.  , 0.  , 0.42],
       [0.  , 0.67, 0.  , 0.28, 0.  , 0.53, 0.34, 0.  , 0.28],
       [0.5 , 0.  , 0.  , 0.26, 0.5 , 0.  , 0.32, 0.5 , 0.26],
       [0.  , 0.45, 0.56, 0.37, 0.  , 0.  , 0.45, 0.  , 0.37]])

In [32]:
pd.DataFrame(np.round(X.toarray(),2), columns=tfidf_vectorizer.get_feature_names_out(), index = sentences)

Unnamed: 0,and,document,first,is,one,second,the,third,this
This is a first document.,0.0,0.51,0.63,0.42,0.0,0.0,0.0,0.0,0.42
This document is the second document.,0.0,0.67,0.0,0.28,0.0,0.53,0.34,0.0,0.28
And this is the third one.,0.5,0.0,0.0,0.26,0.5,0.0,0.32,0.5,0.26
Is this the first document?,0.0,0.45,0.56,0.37,0.0,0.0,0.45,0.0,0.37


In [33]:
tfidf_vectorizer = TfidfVectorizer()
X_sentences_tfidf = tfidf_vectorizer.fit_transform(original_sentence)

In [34]:
print(tfidf_vectorizer.get_feature_names_out())

['10' '100' '11' '12' '13' '14' '15' '16' '17' '18' '19' '1950' '1950s'
 '1954' '1960s' '1964' '1966' '1970s' '1975' '1976' '1977' '1978' '1979'
 '1980s' '1981' '1990s' '1999' '2000s' '2002' '2003' '2006' '2007' '2009'
 '2010s' '2012' '2015' '2017' '2018' '2020' '40' '41' '42' '43' '44' '45'
 '46' '47' '48' '49' '50' '51' '60' 'abandoned' 'able' 'about' 'above'
 'accidentally' 'accurate' 'accurately' 'achieve' 'acl' 'acquiring' 'act'
 'action' 'additional' 'addressed' 'advanced' 'advantage' 'advantages'
 'after' 'age' 'ai' 'aid' 'alan' 'algorithm' 'algorithms' 'alignment'
 'alike' 'all' 'almost' 'along' 'alpac' 'already' 'also' 'although'
 'ambiguous' 'among' 'amount' 'amounts' 'an' 'analysis' 'analyze'
 'analyzed' 'and' 'annotated' 'annotation' 'annotations' 'another'
 'answering' 'answers' 'apertium' 'apparent' 'applications' 'applied'
 'apply' 'applying' 'approach' 'approaches' 'arabic' 'are' 'area' 'areas'
 'art' 'article' 'articulated' 'artificial' 'as' 'aspects' 'assign' 'at'
 'a

In [35]:
X_sentences_tfidf.toarray()[1]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

### Cosine Similarity

In [36]:
X_sentences.shape

(88, 859)

In [37]:
X_sentences_tfidf.shape

(88, 859)

In [38]:
X_test_similarity = X_sentences_tfidf[0:3].toarray()

In [39]:
X_test_similarity = np.concatenate((X_test_similarity, X_test_similarity[0].reshape(1,-1)), axis = 0)

In [40]:
X_test_similarity

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
X_test_similarity.shape

(4, 859)

In [42]:
cosine_similarity(X = X_test_similarity[0].reshape(1,-1), Y = X_test_similarity[1].reshape(1,-1) )

array([[0.13238531]])

In [43]:
cosine_similarity(X = X_test_similarity[0].reshape(1,-1), Y = X_test_similarity[2].reshape(1,-1) )

array([[0.05924402]])

In [44]:
cosine_similarity(X = X_test_similarity[0].reshape(1,-1), Y = X_test_similarity[3].reshape(1,-1) )

array([[1.]])

In [45]:
cosine_similarity(X = X_test_similarity[0].reshape(1,-1), Y = X_test_similarity)

array([[1.        , 0.13238531, 0.05924402, 1.        ]])

### Simulation of a simple chatbot

In [46]:
welcome_input = ['hi', 'hello', 'hey', 'hii']
welcome_output = ['Hi, How may I help you?', 'Hello there!', 'How are you doing today? :)', 'Welcome to the Chatbot!', 'Please tell us what do you need help with?']

In [47]:
def welcome_message(message):
    words = message.lower().split()
    
    for word in words:
        if word in welcome_input:
            return random.choice(welcome_output)
    

In [48]:
welcome_message('hii')

'Please tell us what do you need help with?'

In [50]:
def preprocess(sentence):
    
    sentence = sentence.lower()
    sentence = sentence.replace('.', '')
    sentence = sentence.replace('[', '')
    sentence = sentence.replace(']', '')
    
    tokens = [token.text for token in nlp(sentence) if not (token.is_stop or token.like_num or token.is_punct or token.is_space)]
    
    tokens = ' '.join(tokens)
    return tokens

In [68]:
def answers(user_text, threshold = 0.3):
    cleaned_sentences = []
    for sentence in original_sentence:
        cleaned_sentences.append(preprocess(sentence))
    #print(cleaned_sentences)
    
    chatbot_answer = ''
    user_text = preprocess(user_text)
    #print(user_text)
    cleaned_sentences.append(user_text)
    
    tfidf_vectorizer = TfidfVectorizer()
    X_sentences = tfidf_vectorizer.fit_transform(cleaned_sentences)
    
    similarity = cosine_similarity(X_sentences[-1], X_sentences)
    sentence_index = similarity.argsort()[0][-2]
    #print(sentence_index)
    
    if similarity[0][sentence_index] < threshold:
        chatbot_answer += 'Sorry, no answer found for the mentioned threshold!'
    else:
        chatbot_answer += original_sentence[sentence_index]
    
    return chatbot_answer

In [69]:
answers('what is Natural Language processing')

'Natural language processing has its roots in the 1950s.'

In [70]:
answers('who is alan turing')

'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.'

In [74]:
cont = True
print("Hello, I am Chatbot! Ask me anything related to Natural Language Processing (NLP)")

while cont == True:
    user_text = input().lower()
    if user_text != 'quit':
        if welcome_message(user_text) != None:
            print('Chatbot: '+ welcome_message(user_text))
        else:
            print('Chatbot: ')
            print(answers(user_text))
    else:
        cont = False
        print('Chatbot: Bye! Come again! :)')

Hello, I am Chatbot! Ask me anything related to Natural Language Processing (NLP)
Hello
Chatbot: Welcome to the Chatbot!
NLP
Chatbot: 
[50] Likewise, ideas of cognitive NLP are inherent to neural models multimodal NLP (although rarely made explicit).
Alan Turing
Chatbot: 
Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.
Quit
Chatbot: Bye! Come again! :)
