### Creating BoW Model

In [2]:
import nltk  
import numpy as np  
import random  
import string

import bs4 as bs  
import urllib.request  
import re  

##### Web Scrapping / Data Acquisition

In [3]:
from bs4 import BeautifulSoup
import requests

# Use urllib.request library to open any wikipedia page of your liking
# url = urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing')

url = "https://en.wikipedia.org/wiki/Natural_language_processing"

# Read html page
raw_html = requests.get(url)

# Use beautifulSoup to parse using lxml parser
article_html = BeautifulSoup(raw_html.text, "html.parser")


# Find only the paragraphs in the entire text
article_paragraphs = article_html.find_all("p")

article_text = ''

for para in article_paragraphs:  
    article_text += para.text

article_text

'Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics. It is primarily concerned with giving computers the ability to support and manipulate speech. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\nChallenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.\nNatural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and 

In [4]:
# In the cleaning process, make sure not to remove "." because it has to perform sentence tokenization later.
# So, we will remove everything which is not a-z or A-Z or ".". That's why there is a "." after Z in next line.
cleaned_text = re.sub(r'[^a-zA-Z.\s]', '', article_text) 
cleaned_text = re.sub(r'\n', '', cleaned_text)
print(cleaned_text)

Natural language processing NLP is an interdisciplinary subfield of computer science and linguistics. It is primarily concerned with giving computers the ability to support and manipulate speech. It involves processing natural language datasets such as text corpora or speech corpora using either rulebased or probabilistic i.e. statistical and most recently neural networkbased machine learning approaches. The goal is a computer capable of understanding the contents of documents including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.Challenges in natural language processing frequently involve speech recognition naturallanguage understanding and naturallanguage generation.Natural language processing has its roots in the s. Already in  Alan Turing published an article titled Computing Machinery and Intelligence which proposed what

##### Tokenization

In [5]:
# Use Sentence Tokenizer
corpus = cleaned_text

In [6]:
from nltk.tokenize import sent_tokenize
corpus = sent_tokenize(corpus)
print(corpus)

['Natural language processing NLP is an interdisciplinary subfield of computer science and linguistics.', 'It is primarily concerned with giving computers the ability to support and manipulate speech.', 'It involves processing natural language datasets such as text corpora or speech corpora using either rulebased or probabilistic i.e.', 'statistical and most recently neural networkbased machine learning approaches.', 'The goal is a computer capable of understanding the contents of documents including the contextual nuances of the language within them.', 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.Challenges in natural language processing frequently involve speech recognition naturallanguage understanding and naturallanguage generation.Natural language processing has its roots in the s. Already in  Alan Turing published an article titled Computing Machinery and Intelligence wh

In [7]:
for i in range(len(corpus )):
    corpus [i] = corpus [i].lower()
    corpus [i] = re.sub(r'\W',' ',corpus [i]) 
    # \W indicates non-word characters. This line replaces these characters with blank.
    corpus [i] = re.sub(r'\s+',' ',corpus [i])
    # \s+ indicates more than 1 white space. This line replaces those spaces with single space.

In [8]:
corpus

['natural language processing nlp is an interdisciplinary subfield of computer science and linguistics ',
 'it is primarily concerned with giving computers the ability to support and manipulate speech ',
 'it involves processing natural language datasets such as text corpora or speech corpora using either rulebased or probabilistic i e ',
 'statistical and most recently neural networkbased machine learning approaches ',
 'the goal is a computer capable of understanding the contents of documents including the contextual nuances of the language within them ',
 'the technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves challenges in natural language processing frequently involve speech recognition naturallanguage understanding and naturallanguage generation natural language processing has its roots in the s already in alan turing published an article titled computing machinery and intelligence

In [9]:
print(corpus[20])

neural machine translation based on thennewlyinvented sequencetosequence transformations made obsolete the intermediate steps such as word alignment previously necessary for statistical machine translation the following is a list of some of the most commonly researched tasks in natural language processing 


##### Finding Word Frequency

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
# Count how many times the same word is repeating
wordfreq = {}
for sentence in corpus:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token in wordfreq:
            count += 1
        else:
            count = 1
        wordfreq[token] = count

In [12]:
wordfreq

{'natural': 2,
 'language': 2,
 'processing': 4,
 'nlp': 2,
 'is': 4,
 'an': 3,
 'interdisciplinary': 6,
 'subfield': 1,
 'of': 3,
 'computer': 3,
 'science': 4,
 'and': 2,
 'linguistics': 4,
 'it': 2,
 'primarily': 1,
 'concerned': 1,
 'with': 3,
 'giving': 1,
 'computers': 1,
 'the': 2,
 'ability': 1,
 'to': 2,
 'support': 1,
 'manipulate': 1,
 'speech': 2,
 'involves': 3,
 'datasets': 1,
 'such': 2,
 'as': 2,
 'text': 3,
 'corpora': 3,
 'or': 2,
 'using': 2,
 'either': 1,
 'rulebased': 5,
 'probabilistic': 1,
 'i': 4,
 'e': 2,
 'statistical': 4,
 'most': 4,
 'recently': 3,
 'neural': 3,
 'networkbased': 1,
 'machine': 4,
 'learning': 3,
 'approaches': 4,
 'goal': 1,
 'a': 2,
 'capable': 1,
 'understanding': 3,
 'contents': 1,
 'documents': 3,
 'including': 1,
 'contextual': 1,
 'nuances': 1,
 'within': 1,
 'them': 1,
 'technology': 3,
 'can': 3,
 'then': 2,
 'accurately': 1,
 'extract': 1,
 'information': 1,
 'insights': 1,
 'contained': 1,
 'in': 2,
 'well': 1,
 'categorize': 1,
 '

##### Removing the least frequent words

In [13]:
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)
# This will find 200 most frequent words

In [14]:
most_freq

['tasks',
 'modelling',
 'shared',
 'following',
 'some',
 'conll',
 'through',
 'interdisciplinary',
 'algorithms',
 'layer',
 'achieve',
 'symbols',
 'tagging',
 'been',
 'are',
 'among',
 'rulebased',
 'test',
 'was',
 'hidden',
 'coauthors',
 'manipulating',
 'have',
 'partofspeech',
 'translation',
 'trends',
 'processing',
 'is',
 'science',
 'linguistics',
 'i',
 'statistical',
 'most',
 'machine',
 'approaches',
 'involve',
 'naturallanguage',
 's',
 'intelligence',
 'proposed',
 'time',
 'not',
 'other',
 'see',
 'approach',
 'word',
 'single',
 'context',
 'develop',
 'methods',
 'be',
 'ai',
 'networks',
 'still',
 'old',
 'models',
 'intermediate',
 'longstanding',
 'cognitive',
 'an',
 'of',
 'computer',
 'with',
 'involves',
 'text',
 'corpora',
 'recently',
 'neural',
 'learning',
 'understanding',
 'documents',
 'technology',
 'can',
 'generation',
 'its',
 'turing',
 'though',
 'artificial',
 'symbolic',
 'chinese',
 'given',
 'g',
 'up',
 'were',
 'late',
 'introducti

##### Creating Bag of Words Model

In [15]:
sentence_vectors = []
for sentence in corpus:
    sentence_tokens = word_tokenize(sentence)
    sent_vec = []
    for token in most_freq:
        if token in sentence_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

##### Converting it to numpy array

In [16]:
sentence_vectors = np.asarray(sentence_vectors)

In [17]:
sentence_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##### Using pandas dataframe for visualization

In [18]:
import pandas as pd
pd.DataFrame(sentence_vectors, columns=most_freq)

Unnamed: 0,tasks,modelling,shared,following,some,conll,through,interdisciplinary,algorithms,layer,...,includes,task,automated,interpretation,premise,wellsummarized,john,searles,room,experiment
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Computing IDF Values

In [19]:
# IDF = log(e) [Total documents in corpus / Number of documents with the word]
import math
word_idf_values = {}
for token in most_freq:
    total_docs = len(sentence)
    doc_containing_word = 0
    for document in corpus:
        if token in sentence:
            doc_containing_word += 1
        idf_value = math.log(total_docs/(doc_containing_word+1))
        word_idf_values[token] = idf_value
        

In [20]:
# print(word_idf_values)

##### Computing TF Values

In [21]:
# TF = Number of occurences of term in document / Total number of terms in the document
word_tf_values = {}
for token in most_freq:
    sent_tf_vector = []
    total_occurance = 0
    for document in corpus:
        words = word_tokenize(document)
        total_words = len(words)
        if token in document:
            total_occurance += 1
        else:
            total_occurance += 0
        tf_value = total_occurance / total_words
        word_tf_values[token] = tf_value

In [22]:
# print(word_tf_values)

##### Finding TF-IDF Values

In [23]:
sentence_vectors = []
for sentence in corpus:
    sentence_tokens = word_tokenize(sentence)
    sent_vec = []
    for token in most_freq:
        if token in sentence_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

In [24]:
tfidf_values = []
for token in word_tf_values.keys():
    tfidf_sentences = []
    for tf_sentence in word_tf_values:
        tf_value = word_tf_values[tf_sentence]
        idf_value = word_idf_values[tf_sentence]
        tfidf = tf_value * idf_value      
        tfidf_sentences.append(tfidf)
    tfidf_values.append(tfidf_sentences)
 

In [25]:
tf_idf_model = np.asarray(tfidf_values)

In [26]:
tf_idf_model = np.transpose(tf_idf_model)

### Using sklearn

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

bow_tfidf = tfidf.fit_transform(corpus)

# Printing IDF values
print(tfidf.idf_) 

[3.83321334 3.83321334 3.83321334 3.42774824 3.83321334 3.83321334
 3.83321334 3.83321334 3.83321334 3.83321334 3.83321334 3.83321334
 3.83321334 3.14006616 3.83321334 3.83321334 3.83321334 3.42774824
 3.83321334 3.83321334 3.83321334 3.42774824 3.42774824 2.73460106
 3.83321334 1.48183809 3.83321334 3.83321334 3.83321334 3.83321334
 3.42774824 3.42774824 3.83321334 2.58045038 2.73460106 3.14006616
 3.83321334 3.83321334 3.83321334 3.42774824 2.22377543 3.42774824
 3.83321334 3.42774824 3.83321334 3.14006616 3.14006616 3.83321334
 3.83321334 3.14006616 3.83321334 3.42774824 3.83321334 3.83321334
 3.83321334 3.14006616 3.83321334 3.83321334 3.83321334 3.83321334
 3.83321334 2.44691898 3.83321334 2.91692261 3.83321334 3.83321334
 3.83321334 3.83321334 3.83321334 3.83321334 3.83321334 3.42774824
 3.83321334 3.83321334 3.83321334 3.83321334 3.83321334 3.83321334
 2.32913595 3.83321334 3.83321334 3.14006616 3.83321334 3.83321334
 3.14006616 3.14006616 3.83321334 3.83321334 3.83321334 3.8332

In [32]:
feature_names = tfidf.get_feature_names_out()

In [40]:
corpus_index = [index for index, _ in enumerate(corpus)]
rows, cols = bow_tfidf.nonzero()
for row, col in zip(rows, cols):
    print((feature_names[col], corpus_index[row]), bow_tfidf[row, col])


('linguistics', 0) 0.3014482542053966
('and', 0) 0.16335015428621036
('science', 0) 0.3461446272794799
('computer', 0) 0.3461446272794799
('of', 0) 0.13980685185403866
('subfield', 0) 0.42255358173208235
('interdisciplinary', 0) 0.3461446272794799
('an', 0) 0.3014482542053966
('is', 0) 0.22503929975279424
('nlp', 0) 0.21621580630664108
('processing', 0) 0.23463098320433481
('language', 0) 0.18664377412088265
('natural', 0) 0.21621580630664108
('speech', 1) 0.26626414273124616
('manipulate', 1) 0.32504005069264963
('support', 1) 0.32504005069264963
('to', 1) 0.15418491659129402
('ability', 1) 0.32504005069264963
('the', 1) 0.12565351407553976
('computers', 1) 0.32504005069264963
('giving', 1) 0.32504005069264963
('with', 1) 0.20748823476984268
('concerned', 1) 0.32504005069264963
('primarily', 1) 0.32504005069264963
('it', 1) 0.2473425266559566
('and', 1) 0.12565351407553976
('is', 1) 0.1731065326665836
('probabilistic', 2) 0.2589937460060895
('rulebased', 2) 0.21216077097610725
('eithe

In [42]:
import pandas as pd
# pd.DataFrame(tf_idf_model, columns=most_freq)

df = pd.DataFrame(bow_tfidf.T.todense(), index=feature_names, columns=feature_names[corpus_index])
print(df)

            ability    above  accurately  achieve  acl  acquiring  action  \
ability         0.0  0.32504         0.0      0.0  0.0   0.000000     0.0   
above           0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
accurately      0.0  0.00000         0.0      0.0  0.0   0.111597     0.0   
achieve         0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
acl             0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
...             ...      ...         ...      ...  ...        ...     ...   
wordvec         0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
would           0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
writing         0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
years           0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   
yoshua          0.0  0.00000         0.0      0.0  0.0   0.000000     0.0   

            actr  addressed  advance  ...   an   analyze  and  announced  \