In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Text Data Cleaning and Preprocessing Assignment

In [2]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
import feedparser
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup

<IPython.core.display.Javascript object>

In [3]:
def html_to_text(html):
    TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "li"]

    soup = BeautifulSoup(html, "lxml")
    text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
    text = " ".join(text_list)
    return text

<IPython.core.display.Javascript object>

In [4]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print(
        "Avg chars per word: " + str(round(len(corpus.raw()) / len(corpus.words()), 1))
    )
    print(
        "Avg words per sentence: "
        + str(round(len(corpus.words()) / len(corpus.sents()), 1))
    )

<IPython.core.display.Javascript object>

### Read the O'Reilly RSS plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [5]:
!mkdir oreilly_rss

A subdirectory or file oreilly_rss already exists.


<IPython.core.display.Javascript object>

In [6]:
feed = "http://feeds.feedburner.com/oreilly/radar/atom"

parsed = feedparser.parse(feed)
posts = parsed.entries

path = "./oreilly_rss/"

for index, post in enumerate(posts):
    content = posts[index]["content"][0]["value"]
    text = html_to_text(content)
    with open(path + "article_" + str(index) + ".txt", "wb") as f:
        f.write(text.encode())

<IPython.core.display.Javascript object>

In [7]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

DOC_PATTERN = r".*\.txt"
news_corpus = PlaintextCorpusReader("oreilly_rss", DOC_PATTERN)

<IPython.core.display.Javascript object>

In [8]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jlim7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<IPython.core.display.Javascript object>

In [9]:
corpus_stats(news_corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 61
Number of sentences: 2348
Number of words: 57620
Vocabulary: 6461
Avg chars per word: 5.1
Avg words per sentence: 24.5


<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [10]:
docs = [news_corpus.raw(file_id) for file_id in news_corpus.fileids()]

<IPython.core.display.Javascript object>

### Sentence tokenize each document in the list of documents.

In [11]:
for doc in docs:
    sentence_token = sent_tokenize(doc)
    print(sentence_token)

['Mutation Testing — in this paper, we semi-automatically learn error-inducing patterns from a corpus of common Java coding errors and from changes that caused operational anomalies at Facebook specifically.', 'We combine the mutations with instrumentation that measures which tests exactly visited the mutated piece of code.', 'Results on more than 15,000 generated mutants show that more than half of the generated mutants survive Facebook’s rigorous test suite of unit, integration, and system tests.', 'Causal Reasoning in Probability Trees — A Colab notebook tutorial that is the companion tutorial for the paper “Algorithms for Causal Reasoning in Probability trees” by Genewein T. et al.', '(2020).', 'Probability trees are one of the simplest models of causal generative processes.They possess clean semantics and are strictly more general than causal Bayesian networks, being able to e.g.', 'represent causal relations that causal Bayesian networks can’t.', 'Even so, they have received litt

<IPython.core.display.Javascript object>

### Word tokenize each sentence within each document.

You should end up with a nested list structure where the outer list contains all the sentences in each document and the inner list contains the tokenized sentences.

In [12]:
for doc in docs:
    word_token = word_tokenize(doc)
    print(word_token)

['Mutation', 'Testing', '—', 'in', 'this', 'paper', ',', 'we', 'semi-automatically', 'learn', 'error-inducing', 'patterns', 'from', 'a', 'corpus', 'of', 'common', 'Java', 'coding', 'errors', 'and', 'from', 'changes', 'that', 'caused', 'operational', 'anomalies', 'at', 'Facebook', 'specifically', '.', 'We', 'combine', 'the', 'mutations', 'with', 'instrumentation', 'that', 'measures', 'which', 'tests', 'exactly', 'visited', 'the', 'mutated', 'piece', 'of', 'code', '.', 'Results', 'on', 'more', 'than', '15,000', 'generated', 'mutants', 'show', 'that', 'more', 'than', 'half', 'of', 'the', 'generated', 'mutants', 'survive', 'Facebook', '’', 's', 'rigorous', 'test', 'suite', 'of', 'unit', ',', 'integration', ',', 'and', 'system', 'tests', '.', 'Causal', 'Reasoning', 'in', 'Probability', 'Trees', '—', 'A', 'Colab', 'notebook', 'tutorial', 'that', 'is', 'the', 'companion', 'tutorial', 'for', 'the', 'paper', '“', 'Algorithms', 'for', 'Causal', 'Reasoning', 'in', 'Probability', 'trees', '”', 'by

['VSCode', 'Debug', 'Visualizer', '—', 'A', 'VS', 'Code', 'extension', 'for', 'visualizing', 'data', 'structures', 'while', 'debugging', '.', 'Like', 'the', 'VS', 'Code', '’', 's', 'watch', 'view', ',', 'but', 'with', 'rich', 'visualizations', 'of', 'the', 'watched', 'value', '.', 'The', 'screencast', 'is', 'wow', '.', 'Userland', '—', 'an', 'integrated', 'dataflow', 'environment', 'for', 'end-users', '.', 'It', 'allows', 'users', 'to', 'interact', 'with', 'modules', 'that', 'implement', 'functionality', 'for', 'different', 'domains', 'from', 'a', 'single', 'user', 'interface', 'and', 'combine', 'these', 'modules', 'in', 'creative', 'ways', '.', 'The', 'talk', 'shows', 'it', 'in', 'action', '.', 'It', '’', 's', 'a', 'spreadsheet', 'and', 'cells', 'can', 'be', 'like', 'a', 'spreadsheet', ',', 'or', 'can', 'be', 'like', 'a', 'Unix', 'shell', ',', 'or', 'can', 'be', 'an', 'audio', 'synthesizer', '(', '!', ')', '.', 'Minglr', '—', 'Open', 'source', 'software', '(', 'built', 'on', 'Jitsi', 

['Microservices', 'seem', 'to', 'be', 'everywhere', '.', 'Scratch', 'that', ':', 'talk', 'about', 'microservices', 'seems', 'to', 'be', 'everywhere', '.', 'And', 'that', '’', 's', 'the', 'problem', '.', 'Thinkers', 'as', 'dissimilar', 'as', 'Plato', ',', 'Robert', 'Boyle', ',', 'and', 'Keith', 'Richards', 'tend', 'to', 'agree', 'about', 'one', 'thing', ':', 'Talk', 'is', 'cheap', '.', 'So', 'we', 'wanted', 'to', 'determine', 'to', 'what', 'extent', ',', 'and', 'how', ',', 'O', '’', 'Reilly', 'subscribers', 'are', 'empirically', 'using', 'microservices', '.', 'In', 'other', 'words', ',', 'how', 'long', 'have', 'people', 'been', 'using', 'them', '?', 'What', 'are', 'they', 'using', 'them', 'for', '?', 'Are', 'they', 'having', 'success', '?', 'If', 'so', ',', 'what', 'kinds', 'of', 'benefits', 'are', 'they', 'seeing', '?', 'What', 'can', 'we', 'learn', 'from', 'their', 'failures', '?', 'So', 'we', 'did', 'what', 'we', 'usually', 'do', ':', 'we', 'ran', 'a', 'survey', '.', 'The', 'survey',

<IPython.core.display.Javascript object>

### Tag each token with its part of speech.

In [13]:
import nltk

nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jlim7\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

<IPython.core.display.Javascript object>

In [14]:
tag = pos_tag(word_token)
print(tag)

[('The', 'DT'), ('release', 'NN'), ('of', 'IN'), ('GPT-3', 'NNP'), ('has', 'VBZ'), ('reinvigorated', 'VBN'), ('a', 'DT'), ('discussion', 'NN'), ('of', 'IN'), ('creativity', 'NN'), ('and', 'CC'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('.', '.'), ('That', 'DT'), ('’', 'VBZ'), ('s', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('discussion', 'NN'), ('to', 'TO'), ('have', 'VB'), (',', ','), ('primarily', 'RB'), ('because', 'IN'), ('it', 'PRP'), ('forces', 'VBZ'), ('us', 'PRP'), ('to', 'TO'), ('think', 'VB'), ('carefully', 'RB'), ('about', 'IN'), ('what', 'WP'), ('we', 'PRP'), ('mean', 'VBP'), ('when', 'WRB'), ('we', 'PRP'), ('use', 'VBP'), ('words', 'NNS'), ('like', 'IN'), ('“', 'NNP'), ('creativity', 'NN'), ('”', 'NNP'), ('and', 'CC'), ('“', 'NNP'), ('art.', 'RB'), ('”', 'VBD'), ('As', 'IN'), ('I', 'PRP'), ('’', 'VBP'), ('ve', 'JJ'), ('argued', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('past', 'JJ'), (',', ','), ('each', 'DT'), ('time', 'NN'), ('we', 'PRP'), ('have', 'VBP'), ('this', 'DT'), (

<IPython.core.display.Javascript object>

### Word tokenize the raw text of each document and remove stop words.

In [15]:
for doc in docs:
    sents = sent_tokenize(doc)
    tokenized = [word_tokenize(sent) for sent in sents]
    tagged = [pos_tag(tokens) for tokens in tokenized]

<IPython.core.display.Javascript object>

In [16]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jlim7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<IPython.core.display.Javascript object>

In [17]:
no_stop = [
    token.lower()
    for token in word_tokenize(doc)
    if not token.lower() in stopwords.words("english")
]

<IPython.core.display.Javascript object>

### For every document, stem all the words in the document.

In [18]:
# removing punctuation
no_punct = [token.lower() for token in word_tokenize(doc) if token.isalpha() == True]

<IPython.core.display.Javascript object>

In [19]:
# stemming the words
stemmer = SnowballStemmer("english")
stemmed = [stemmer.stem(token) for token in no_punct]
print(stemmed)

['the', 'releas', 'of', 'has', 'reinvigor', 'a', 'discuss', 'of', 'creativ', 'and', 'artifici', 'intellig', 'that', 's', 'a', 'good', 'discuss', 'to', 'have', 'primarili', 'becaus', 'it', 'forc', 'us', 'to', 'think', 'care', 'about', 'what', 'we', 'mean', 'when', 'we', 'use', 'word', 'like', 'creativ', 'and', 'as', 'i', 've', 'argu', 'in', 'the', 'past', 'each', 'time', 'we', 'have', 'this', 'discuss', 'we', 'end', 'up', 'rais', 'the', 'bar', 'each', 'time', 'an', 'ai', 'system', 'doe', 'someth', 'that', 'look', 'intellig', 'or', 'creativ', 'we', 'end', 'up', 'decid', 'that', 's', 'not', 'what', 'intellig', 'realli', 'is', 'and', 'that', 's', 'a', 'good', 'thing', 'ai', 'is', 'like', 'to', 'teach', 'us', 'more', 'about', 'what', 'intellig', 'and', 'creativ', 'are', 'not', 'than', 'about', 'what', 'they', 'are', 'i', 'm', 'not', 'terribl', 'interest', 'in', 'whether', 'ai', 'can', 'imit', 'human', 'creativ', 'can', 'an', 'ai', 'creat', 'a', 'new', 'poem', 'that', 'read', 'as', 'if', 'it

<IPython.core.display.Javascript object>

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity