In [5]:
!pip install readability-lxml

import os
import string
import pickle
import requests
import bs4
import nltk
import codecs
from nltk.corpus import stopwords
from readability.readability import Document

nltk.download('punkt')
nltk.download('averaged_perception_tagger')
nltk.download('tagsets')

nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/henry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index
[nltk_data] Downloading package tagsets to /Users/henry/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/henry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
os.getcwd()

'/Users/henry/Desktop/digital_humanities'

In [7]:
BASE=os.getcwd()
HTML_CORPUS=os.path.join(BASE, 'data', 'raw', 'corpus2')

def fetch(url):
    res = requests.get(url)
    return res.text

def save_webpath(url, category=None, encoding='utf-8'):
    if category:
        dpath = os.path.join(HTML_CORPUS, category)
    else:
        dpath = HTML_CORPUS
        
    if not os.path.exists(dpath):
        os.makedirs(dpath)
        
    file_name = os.path.basename(url)
    path= os.path.join(dpath, file_name + '.html')
    
    if os.path.exists(path):
        raise ValueError('warning, file with that name aleady exists')
    else:
        with open (path, 'w', encoding = encoding) as f:
            f.write(fetch(url))

In [10]:
url="https://www.theguardian.com/commentisfree/2018/dec/20/government-stupid-labour-brexit-referendum-jeremy-corbyn"

In [11]:
save_webpath(url)

In [12]:
BASE=os.getcwd()
DATA = os.path.join(BASE, 'data')
NEW_CORPUS = os.path.join(DATA,'interim', 'new_corpus')
CORPUS = os.path.join(DATA, 'raw', 'corpus2')

TAGS=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

def preprocess(path):
    with open(path, 'r') as f:
        html = Document(f.read()).summary()
        soup = bs4.BeautifulSoup(html, features = 'lxml')
        for tag in soup.find_all(TAGS):
            paragraph = tag.get_text()
            
            yield[
                nltk.pos_tag(nltk.wordpunct_tokenize(sentence))
                for sentence in nltk.sent_tokenize(paragraph)
            ]
            
def transform(htmldir, textdir):
    if not os.path.exists(textdir):
        os.makedirs(textdir)
        
    for name in os.listdir(htmldir):
        try:
            inpath = os.path.join(htmldir, name)
            outpath = os.path.join(textdir, os.path.splitext(name)[0] + '.txt')
            
            if os.path.isfile(inpath):
                with open (outpath, 'w+', encoding = 'utf-8') as f:
                    for paragraph in preprocess(inpath):
                        for sentence in paragraph:
                            f.write(" ".join(f"{word}/{tag}" for word, tag in sentence))
                            f.write("\n")
                        f.write("\n")
        except UnicodeDecodeError as e:
            print(f'could not parse HTML: {e} ')
            continue
            
            
if __name__ == '__main__':
    transform(CORPUS, NEW_CORPUS)
                    

In [13]:
NEW_CORPUS = os.path.join(DATA, 'interim', 'new_corpus')
corpus = nltk.corpus.TaggedCorpusReader(NEW_CORPUS, r".*\.txt")

words=corpus.words()
tagged_words= corpus.tagged_words()

from collections import Counter

def get_counts(words):
    counts = Counter()
    tokens = Counter()
    for word in words:
        counts['words'] += 1
        tokens[word] += 1
    #return{'There are '+str(counts['words'])+' words and '+ str(len(tokens))+' unique tokens in the corpus.'+' The lexical diversity is '+ str(counts['words'] / len(tokens))  }
    print('There are '+str(counts['words'])+' words and '+ str(len(tokens))+' unique tokens in the corpus.'+' The lexical diversity is '+ str(counts['words'] / len(tokens)))

def five_most_common_word_classes(tagged_words):
    counts = Counter()
    for word, tag in tagged_words:
        counts[tag]+=1
    counts5=counts.most_common(5)
    print("\n Here are the five most common word classes:")
    for n, tag in enumerate(counts5):
        print(f" {n + 1}. {tag} ")

from nltk import ngrams
def ten_most_common_unigrams(words_for_ngrams):
    words_for_ngrams=[]
    for word in words:
        word=word.lower()
        if word not in stop_words:
            if word.isalpha():
                words_for_ngrams.append(word)
              
    unigrams=Counter()
    unigrams = ngrams(words_for_ngrams,1)
    bigrams = ngrams(words_for_ngrams,2)
    unigrams2=Counter(unigrams)
    bigrams2=Counter(bigrams)
    
    print("\n Here are the ten most common unigrams:")
    for n, word in enumerate(unigrams2.most_common(10)):
        print(f"    {n + 1}. {word}")
        
    print("\n Here are the ten most common bigrams:")
    for n, word in enumerate(bigrams2.most_common(10)):
        print(f"    {n + 1}. {word}" )
        
        
def noun_to_verb_ratio(corpus):
    tag_counts=Counter()
    for word, tag in tagged_words:
        if tag.startswith('N'):
            tag_counts['nouns'] += 1
        if tag.startswith('V'):
            tag_counts['verbs']+=1
    number_of_nouns=tag_counts['nouns']
    number_of_verbs=tag_counts['verbs']
    n_to_v_ratio=number_of_nouns/(number_of_nouns+number_of_verbs)
    #return{'There are '+str(number_of_nouns)+' nouns and '+str(number_of_verbs)+" verbs in the corpus. The noun-to-verb ratio is "+str(n_to_v_ratio)}
    print('There are '+str(number_of_nouns)+' nouns and '+str(number_of_verbs)+" verbs in the corpus. The noun-to-verb ratio is "+str(n_to_v_ratio))

In [14]:
get_counts(words)
five_most_common_word_classes(tagged_words)
ten_most_common_unigrams(words)
noun_to_verb_ratio(tagged_words)

There are 1122 words and 553 unique tokens in the corpus. The lexical diversity is 2.0289330922242317

 Here are the five most common word classes:
 1. ('NN', 200) 
 2. ('IN', 118) 
 3. ('DT', 114) 
 4. ('NNP', 89) 
 5. ('JJ', 78) 

 Here are the ten most common unigrams:
    1. (('stupid',), 10)
    2. (('labour',), 10)
    3. (('corbyn',), 6)
    4. (('government',), 5)
    5. (('tory',), 5)
    6. (('brexit',), 5)
    7. (('cabinet',), 5)
    8. (('may',), 5)
    9. (('would',), 4)
    10. (('referendum',), 4)

 Here are the ten most common bigrams:
    1. (('leave', 'voters'), 3)
    2. (('panic', 'buying'), 2)
    3. (('three', 'day'), 2)
    4. (('day', 'week'), 2)
    5. (('voters', 'election'), 2)
    6. (('liberal', 'democrats'), 2)
    7. (('would', 'rise'), 2)
    8. (('support', 'among'), 2)
    9. (('britain', 'stupidest'), 1)
    10. (('stupidest', 'hour'), 1)
There are 340 nouns and 148 verbs in the corpus. The noun-to-verb ratio is 0.6967213114754098
