## Scrape Wikipedia

In [1]:
import bs4 as bs
import urllib.request 

# Download the raw HTML of the Wikipedia article. The context manager closes
# the HTTP response as soon as the body has been read, and `html` now only
# ever holds the page bytes (it used to hold the response object first).
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    html = response.read()

# Parse the page and collect every <p> element.
article = bs.BeautifulSoup(html, 'lxml')
paragraphs = article.find_all('p')

# Concatenate the text of all paragraphs into one string; join avoids
# rebuilding the string on every iteration.
text = ''.join(paragraph.text for paragraph in paragraphs)

## Tokenize to sentences and Preprocess

In [2]:
import nltk
import re

# Split the scraped text into sentences and normalise each one in a single
# pass: lower-case it, replace every non-word character with a space, then
# collapse each run of whitespace into a single space.
sentences = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', raw_sentence.lower()))
    for raw_sentence in nltk.sent_tokenize(text)
]

print('No. of sentences: ', len(sentences))
print('21st sentence: ', sentences[20])

No. of sentences:  47
21st sentence:  the cache language models upon which many speech recognition systems now rely are examples of such statistical models 


## Tokenize to words, remove stopwords, and create a dictionary of word frequency

In [3]:
from nltk.corpus import stopwords

# Maps each non-stopword token to the number of times it occurs across all
# sentences. Kept as a plain dict so insertion order and the printed
# representation stay the same.
wordfreq = {}

# Build the stopword collection once, as a set. Previously
# stopwords.words('english') was evaluated again for every token and the
# membership test scanned the returned word list.
english_stopwords = set(stopwords.words('english'))

for sentence in sentences:
    for token in nltk.word_tokenize(sentence): # tokenize to words
        if token in english_stopwords: # remove stopwords
            continue
        wordfreq[token] = wordfreq.get(token, 0) + 1

print('Word frequencies: ', wordfreq)
print('No. of words: ', len(wordfreq))

Word frequencies:  {'natural': 19, 'language': 24, 'processing': 16, 'nlp': 3, 'subfield': 1, 'linguistics': 3, 'computer': 3, 'science': 1, 'information': 3, 'engineering': 1, 'artificial': 2, 'intelligence': 4, 'concerned': 1, 'interactions': 1, 'computers': 2, 'human': 4, 'languages': 2, 'particular': 1, 'program': 1, 'process': 1, 'analyze': 1, 'large': 3, 'amounts': 2, 'data': 10, 'challenges': 1, 'frequently': 2, 'involve': 1, 'speech': 4, 'recognition': 2, 'understanding': 1, 'generation': 1, 'history': 1, 'generally': 3, 'started': 1, '1950s': 1, 'although': 1, 'work': 3, 'found': 2, 'earlier': 1, 'periods': 1, '1950': 1, 'alan': 1, 'turing': 2, 'published': 3, 'article': 1, 'titled': 1, 'computing': 1, 'machinery': 1, 'proposed': 1, 'called': 2, 'test': 1, 'criterion': 1, 'clarification': 1, 'needed': 1, 'georgetown': 1, 'experiment': 1, '1954': 1, 'involved': 1, 'fully': 1, 'automatic': 1, 'translation': 10, 'sixty': 2, 'russian': 1, 'sentences': 1, 'english': 1, 'authors': 1

## Fetch 200 most frequent words

In [4]:
import heapq

# Keep (at most) the 200 words with the highest counts in wordfreq,
# ordered from most to least frequent.
mostfreq = heapq.nlargest(200, wordfreq.keys(), key=lambda word: wordfreq[word])

print(mostfreq)

['language', 'natural', 'processing', 'machine', 'systems', 'learning', 'data', 'translation', 'many', 'statistical', 'rules', 'research', 'algorithms', 'models', 'tasks', 'real', 'input', 'however', 'based', 'results', 'world', 'hand', 'used', 'intelligence', 'human', 'speech', '1980s', 'developed', 'written', 'produced', 'annotated', 'nlp', 'linguistics', 'computer', 'information', 'large', 'generally', 'work', 'published', 'much', 'late', 'system', 'eliza', 'using', 'examples', 'due', 'e', 'g', 'part', 'increasingly', 'focused', 'make', 'given', 'larger', 'corpora', 'learn', 'deep', 'neural', 'set', 'artificial', 'computers', 'languages', 'amounts', 'frequently', 'recognition', 'found', 'turing', 'called', 'sixty', '1966', 'first', 'restricted', 'example', 'head', 'hurts', '1978', 'lehnert', 'time', 'including', 'revolution', 'increase', 'corpus', 'earliest', 'decision', 'trees', 'hard', 'similar', 'existing', 'tagging', 'use', 'soft', 'probabilistic', 'decisions', 'attaching', 'val

## Find the IDF values for the words

In [5]:
import numpy as np

# Maps each of the most frequent words to its IDF value, where every
# sentence is treated as one document.
wordidf = {}

# Tokenize each sentence once up front instead of once per (word, sentence)
# pair; a set per sentence makes the membership test constant-time.
sentence_vocabularies = [set(nltk.word_tokenize(document)) for document in sentences]

for word in mostfreq:
    # number of documents in which the word appears
    indocs = sum(word in vocabulary for vocabulary in sentence_vocabularies)
    # The +1 keeps the denominator non-zero; as a consequence a word that
    # occurs in every sentence gets a slightly negative IDF.
    wordidf[word] = np.log(len(sentences)/(1 + indocs))

print('No. of idf values found: ', len(wordidf))
print(wordidf)

No. of idf values found:  200
{'language': 0.9597758438138939, 'natural': 1.2110902720948, 'processing': 1.0775588794702773, 'machine': 1.2110902720948, 'systems': 1.3652409519220583, 'learning': 1.2110902720948, 'data': 1.547562508716013, 'translation': 1.7707060600302227, 'many': 1.547562508716013, 'statistical': 1.547562508716013, 'rules': 1.9042374526547452, 'research': 1.6529230243738393, 'algorithms': 1.6529230243738393, 'models': 1.9042374526547452, 'tasks': 1.7707060600302227, 'real': 1.7707060600302227, 'input': 2.0583881324820035, 'however': 1.9042374526547452, 'based': 1.9042374526547452, 'results': 2.0583881324820035, 'world': 2.0583881324820035, 'hand': 2.0583881324820035, 'used': 2.0583881324820035, 'intelligence': 2.463853240590168, 'human': 2.463853240590168, 'speech': 2.2407096892759584, '1980s': 2.2407096892759584, 'developed': 2.2407096892759584, 'written': 2.2407096892759584, 'produced': 2.2407096892759584, 'annotated': 2.751535313041949, 'nlp': 2.463853240590168, '

## Find the TF values for the words

In [6]:
# Maps each of the most frequent words to a list of term frequencies, one
# entry per sentence, in the same order as `sentences`.
wordtf = {}

# Tokenize each sentence once; previously every sentence was tokenized
# twice for every word.
tokenized_sentences = [nltk.word_tokenize(document) for document in sentences]

for word in mostfreq:
    tf_vector = []
    for tokens in tokenized_sentences:
        if tokens:
            # occurrences of the word divided by the sentence length in tokens
            tf = tokens.count(word)/len(tokens)
        else:
            # A sentence that yields no tokens would otherwise raise
            # ZeroDivisionError; no word occurs in it, so its TF is 0.
            tf = 0.0
        tf_vector.append(tf)
    wordtf[word] = tf_vector

print(wordtf)

{'language': [0.047619047619047616, 0.1875, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04878048780487805, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05555555555555555, 0.08333333333333333, 0.0, 0.05, 0.0, 0.023255813953488372, 0.05555555555555555, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842, 0.0, 0.02857142857142857, 0.021739130434782608, 0.03333333333333333, 0.038461538461538464, 0.0, 0.06666666666666667, 0.0, 0.0, 0.0, 0.0, 0.03333333333333333, 0.0, 0.0625, 0.0, 0.0], 'natural': [0.07142857142857142, 0.1875, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04878048780487805, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05555555555555555, 0.041666666666666664, 0.0, 0.0, 0.0, 0.023255813953488372, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03508771929824561, 0.0, 0.02857142857142857, 0.0, 0.0, 0.038461538461538464, 0.0, 0.06666666666666667, 0.0, 0.0, 0.0, 0.0, 0.03333333333333333, 0.0, 0.0625, 0.0, 0.0], 'processing': [0.023809523809523808, 0.0625, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.024390243902439025, 0.0, 0.0, 0.0,

In [7]:
# One inner list per word: each per-sentence TF value scaled by that
# word's IDF.
wordtfidf = [
    [sentence_tf * wordidf[word] for sentence_tf in tf_values]
    for word, tf_values in wordtf.items()
]

# Transpose so that rows represent the TF-IDF vectors of the sentences.
TFIDF = np.asarray(wordtfidf).T

print('(sentences, words) =', TFIDF.shape)
print(TFIDF)

(sentences, words) = (47, 200)
[[0.04570361 0.08650645 0.02565616 ... 0.         0.         0.        ]
 [0.17995797 0.22707943 0.06734743 ... 0.         0.         0.        ]
 [0.04798879 0.06055451 0.05387794 ... 0.         0.         0.        ]
 ...
 [0.05998599 0.07569314 0.06734743 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
