In [0]:
import requests
from bs4 import BeautifulSoup
import os
import bs4
import nltk
import codecs
import string
from readability.readability import Document

In [0]:
# Download the article. The timeout stops a stalled connection from hanging
# the kernel forever, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as the article.
r  = requests.get("https://en.wikipedia.org/wiki/Artificial_neural_network", timeout=30)
r.raise_for_status()

In [0]:
# Body of the HTTP response, decoded to text by requests.
data = r.text

In [0]:
# Parse the downloaded page into a navigable tree using the html5lib parser.
soup = BeautifulSoup(data , 'html5lib')

In [0]:
# Store only the main content div of the page, after removing every
# <semantics> element (and its contents), so the preprocessing steps see
# less markup noise.
# The pipeline works with two folders: 'html' (raw pages) and 'text'
# (tagged corpus). Create them here if they do not exist yet.
for folder in ('html', 'text'):
    if not os.path.isdir(folder):
        os.makedirs(folder)

# Prepare the content first so that a failure here does not leave an empty,
# half-written file behind.
maindiv = soup.find_all('div', 'mw-parser-output')[0]
for sem in maindiv('semantics'):
    sem.decompose()

# The context manager closes the file even if writing fails.
with open('html/temp.html', 'w') as f:
    f.write(str(maindiv))

In [0]:
def evaluation(textdir):
    """
    Print summary statistics for the tagged corpus stored in ``textdir``.

    Every file under ``textdir`` whose name matches the pattern ``.*txt`` is
    loaded with nltk.corpus.TaggedCorpusReader. The printed report covers:

    * total token count and vocabulary size (over all tokens),
    * number of sentences,
    * the ratio token count / vocabulary size (labelled "lexical diversity"),
    * the 5 most common tags,
    * the 10 most common unigrams and bigrams after stopword removal,
    * the number of tokens whose tag starts with 'N'.

    Returns None; all results are written to standard output.
    """

    # Construct the corpus
    corpus = nltk.corpus.TaggedCorpusReader(textdir, r'.*txt')
    sents = len(corpus.sents())

    # Construct the stopword set (a set gives constant-time membership tests)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(string.punctuation)                  # Remove punctuation
    stopwords.update(["''", '``', "'s", "n't", "'ll"])    # Custom stopwords

    # Lowercase BEFORE filtering: the stopword list is lowercase, so testing
    # the raw token would let capitalised stopwords ("The", "And") through
    # and they would end up in both the unigram and the bigram counts.
    words = [
        lowered
        for lowered in (token.lower() for token in corpus.words())
        if lowered not in stopwords
    ]

    # Count the words and tags
    tokens = nltk.FreqDist(corpus.words())
    unigrams = nltk.FreqDist(words)
    bigrams = nltk.FreqDist(nltk.bigrams(words))
    tags = nltk.FreqDist(tag for word, tag in corpus.tagged_words())

    # Enumerate the vocabulary and word count
    vocab = len(tokens)             # The number of unique tokens
    count = sum(tokens.values())    # The word count for the entire corpus

    # An empty corpus has no vocabulary; report 0 instead of dividing by zero
    diversity = float(count) / float(vocab) if vocab else 0.0

    # Answer the evaluation questions
    print("This corpus contains %i words with a vocabulary of %i tokens." % (count, vocab))
    print("This corpus contains %i sentences." % (sents))
    print("The lexical diversity is %0.3f" % diversity)

    print("The 5 most common tags are:")
    for rank, (item, freq) in enumerate(tags.most_common(5), 1):
        print("    %i. %s (%i samples)" % (rank, item, freq))

    print("\nThe 10 most common unigrams are:")
    for rank, (item, freq) in enumerate(unigrams.most_common(10), 1):
        print("    %i. %s (%i samples)" % (rank, item, freq))

    print("\nThe 10 most common bigrams are:")
    for rank, (item, freq) in enumerate(bigrams.most_common(10), 1):
        print("    %i. %s (%i samples)" % (rank, item, freq))

    # `key and ...` guards against a missing (None) tag, which the reader may
    # produce for a token that carries no "/TAG" suffix.
    nouns = sum(val for key, val in tags.items() if key and key.startswith('N'))
    print("\nThere are %i nouns in the corpus" % nouns)



In [0]:
# Names of the HTML tags whose elements preprocess() extracts as paragraphs.
# NOTE(review): 'h7' is not a standard HTML heading level — confirm it is intentional.
TAGS = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

def preprocess(path):
    """
    Read one HTML file and yield its paragraphs as tokenized, tagged sentences.

    Steps:
    1. Summarize the document with readability
    2. Collect every element whose tag name is listed in TAGS
    3. Segment each element's text into sentences with nltk.sent_tokenize
    4. Tokenize each sentence with nltk.word_tokenize
    5. Tag each token list with nltk.pos_tag

    This is a generator: one item is yielded per element that contains at
    least one sentence. Each item is a list of sentences, and each sentence
    is a list of (token, tag) tuples.
    """

    # Read the whole file up front so the handle is closed before the first
    # yield; otherwise it would stay open for as long as the consumer keeps
    # the generator alive.
    with open(path, 'r') as f:
        raw = f.read()

    # Transform the document into a readability paper summary
    html = Document(raw).summary()

    # Parse the HTML using BeautifulSoup
    soup = bs4.BeautifulSoup(html, "lxml")

    # Extract the paragraph delimiting elements
    for tag in soup.find_all(TAGS):

        # Sentence-tokenize the node text, then word-tokenize and POS-tag
        # every sentence.
        sentences = [
            nltk.pos_tag(nltk.word_tokenize(sentence))
            for sentence in nltk.sent_tokenize(tag.get_text())
        ]

        # Elements without any text produce no sentences; skip them rather
        # than emit empty paragraphs.
        if sentences:
            yield sentences


def transform(htmldir, textdir):
    """
    Convert every file in ``htmldir`` into a tagged text file in ``textdir``.

    For each regular file ``<name>.<ext>`` in ``htmldir`` a UTF-8 file
    ``<name>.txt`` is written to ``textdir``. Tokens are written as
    ``word/TAG`` pairs separated by spaces (Brown corpus style), one sentence
    per line, with a blank line after each paragraph.

    ``textdir`` is created if it does not exist. Entries of ``htmldir`` that
    are not regular files (e.g. sub-directories) are skipped.
    """
    # Create the output directory on demand so a fresh working directory works
    if not os.path.isdir(textdir):
        os.makedirs(textdir)

    # List the target HTML directory
    for name in os.listdir(htmldir):

        # Determine the path of the file to transform; skip non-files
        inpath = os.path.join(htmldir, name)
        if not os.path.isfile(inpath):
            continue

        # Same base name as the input, with a .txt extension
        outpath = os.path.join(textdir, os.path.splitext(name)[0] + ".txt")

        # Open the output file for writing UTF-8 (it is never read back here,
        # so plain write mode is sufficient)
        with codecs.open(outpath, 'w', encoding='utf-8') as f:

            # Write paragraphs double newline separated and sentences
            # separated by a single newline. Also write token/tag pairs.
            for paragraph in preprocess(inpath):
                for sentence in paragraph:
                    f.write(" ".join("%s/%s" % (word, tag) for word, tag in sentence))
                    f.write("\n")
                f.write("\n")



In [0]:
# Convert every file in html/ into a token/tag text file under text/.
transform('html', 'text')

In [0]:
# Print the corpus statistics for the tagged files written to text/.
evaluation('text/')

This corpus contains 22837 words with a vocabulary of 4586 tokens.
This corpus contains 1733 sentences.
The lexical diversity is 4.980
The 5 most common tags are:
    1. NNP (3574 samples)
    2. NN (3182 samples)
    3. JJ (1968 samples)
    4. . (1574 samples)
    5. IN (1482 samples)

The 10 most common unigrams are:
    1. neural (246 samples)
    2. networks (191 samples)
    3. learning (174 samples)
    4. \displaystyle (125 samples)
    5. network (110 samples)
    6. deep (88 samples)
    7. x (77 samples)
    8. j (64 samples)
    9. edit (62 samples)
    10. function (60 samples)

The 10 most common bigrams are:
    1. (u'neural', u'networks') (125 samples)
    2. (u'neural', u'network') (61 samples)
    3. (u'\\displaystyle', u'\\textstyle') (57 samples)
    4. (u'artificial', u'neural') (37 samples)
    5. (u'x', u'\\displaystyle') (24 samples)
    6. (u'deep', u'learning') (22 samples)
    7. (u'j', u'\\displaystyle') (22 samples)
    8. (u'machine', u'learning') (20 sam