## Scrape Wikipedia

In [1]:
import bs4 as bs
import urllib.request 

# Download the article. The context manager closes the HTTP response as soon
# as the body has been read, even if read() raises.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    html = response.read()  # raw page bytes

# Parse the page and collect every <p> element.
article = bs.BeautifulSoup(html, 'lxml')
paragraphs = article.find_all('p')

# Concatenate the paragraph texts (no separator) into one string.
text = ''.join(paragraph.text for paragraph in paragraphs)

## Tokenize into sentences and preprocess

In [2]:
import nltk
import re

# Split the scraped text into sentences and normalise each one in a single pass:
#   1. lower-case it,
#   2. replace every non-word character (punctuation etc.) with a space,
#   3. collapse each run of whitespace into one space.
sentences = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sentence.lower()))
    for sentence in nltk.sent_tokenize(text)
]

print('No. of sentences: ', len(sentences))
print('21st sentence: ', sentences[20])  # index 20 is the 21st sentence

No. of sentences:  47
21st sentence:  the cache language models upon which many speech recognition systems now rely are examples of such statistical models 


## Tokenize into words, remove stopwords, and create a dictionary of word frequencies

In [3]:
from nltk.corpus import stopwords

# Build the stopword lookup once, as a set. Calling stopwords.words('english')
# inside the per-token filter invoked it again for every token and then did a
# linear scan of the returned list; a set built up front gives the same
# filtering result with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Maps each non-stopword token to the number of times it occurs across all
# sentences. Keys appear in first-seen order.
wordfreq = {}

for sentence in sentences:
    for token in nltk.word_tokenize(sentence):  # tokenize to words
        if token in english_stopwords:  # skip stopwords
            continue
        wordfreq[token] = wordfreq.get(token, 0) + 1

print('Word frequencies: ', wordfreq)
print('No. of words: ', len(wordfreq))

Word frequencies:  {'natural': 19, 'language': 24, 'processing': 16, 'nlp': 3, 'subfield': 1, 'linguistics': 3, 'computer': 3, 'science': 1, 'information': 3, 'engineering': 1, 'artificial': 2, 'intelligence': 4, 'concerned': 1, 'interactions': 1, 'computers': 2, 'human': 4, 'languages': 2, 'particular': 1, 'program': 1, 'process': 1, 'analyze': 1, 'large': 3, 'amounts': 2, 'data': 10, 'challenges': 1, 'frequently': 2, 'involve': 1, 'speech': 4, 'recognition': 2, 'understanding': 1, 'generation': 1, 'history': 1, 'generally': 3, 'started': 1, '1950s': 1, 'although': 1, 'work': 3, 'found': 2, 'earlier': 1, 'periods': 1, '1950': 1, 'alan': 1, 'turing': 2, 'published': 3, 'article': 1, 'titled': 1, 'computing': 1, 'machinery': 1, 'proposed': 1, 'called': 2, 'test': 1, 'criterion': 1, 'clarification': 1, 'needed': 1, 'georgetown': 1, 'experiment': 1, '1954': 1, 'involved': 1, 'fully': 1, 'automatic': 1, 'translation': 10, 'sixty': 2, 'russian': 1, 'sentences': 1, 'english': 1, 'authors': 1

## Fetch the 200 most frequent words

In [4]:
import heapq

# Vocabulary size: how many of the highest-count words to keep.
top_n = 200

# Rank the words by their count in wordfreq and keep the top_n largest,
# ordered from most to least frequent.
mostfreq = heapq.nlargest(top_n, wordfreq, key=lambda word: wordfreq[word])

print(mostfreq)

['language', 'natural', 'processing', 'machine', 'systems', 'learning', 'data', 'translation', 'many', 'statistical', 'rules', 'research', 'algorithms', 'models', 'tasks', 'real', 'input', 'however', 'based', 'results', 'world', 'hand', 'used', 'intelligence', 'human', 'speech', '1980s', 'developed', 'written', 'produced', 'annotated', 'nlp', 'linguistics', 'computer', 'information', 'large', 'generally', 'work', 'published', 'much', 'late', 'system', 'eliza', 'using', 'examples', 'due', 'e', 'g', 'part', 'increasingly', 'focused', 'make', 'given', 'larger', 'corpora', 'learn', 'deep', 'neural', 'set', 'artificial', 'computers', 'languages', 'amounts', 'frequently', 'recognition', 'found', 'turing', 'called', 'sixty', '1966', 'first', 'restricted', 'example', 'head', 'hurts', '1978', 'lehnert', 'time', 'including', 'revolution', 'increase', 'corpus', 'earliest', 'decision', 'trees', 'hard', 'similar', 'existing', 'tagging', 'use', 'soft', 'probabilistic', 'decisions', 'attaching', 'val

## Vectorize sentences and create a bag of words

In [5]:
import numpy as np

# Build one binary row per sentence: column j is 1 when the j-th word of
# mostfreq occurs in that sentence, otherwise 0.
sentence_vectors = []
for sentence in sentences:
    # A set makes each of the len(mostfreq) membership tests constant-time
    # instead of a scan over the sentence's token list.
    present = set(nltk.word_tokenize(sentence))
    sentence_vectors.append([1 if word in present else 0 for word in mostfreq])

BOW = np.asarray(sentence_vectors)  # list of rows -> 2-D matrix

print('(sentences, words) =', BOW.shape)
print(BOW)

(sentences, words) = (47, 200)
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
