In [1]:
from bs4 import BeautifulSoup
import requests
import PyPDF2
import os
from pdfminer import high_level
import re
from pdfminer.high_level import extract_pages
import shutil
import textstat
import wikipedia

def remove_brackets(text):
    newtext = re.sub(r'\([^)]*\)', '',text)
    return  ' '.join((''.join(newtext.splitlines())).split())

def remove_single_newline(text):
    return re.sub('(?<![\r\n])(\r?\n|\r)(?![\r\n])', ' ', text)

def get_grade(text):
    grade_str = textstat.text_standard(text)
    num_str = re.findall(r'\d+', grade_str)
    grade = int(num_str[0])
    return grade

In [2]:
url = "https://www.scmp.com/yp/discover/your-voice/letters-editorial"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
soup

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><link href="/yp/assets/png/favicon.png" rel="shortcut icon" type="image/x-icon"/><link href="/yp/assets/png/appicon.png" rel="apple-touch-icon"/><title>Letters to the Editors - YP | South China Morning Post</title><link href="https://www.scmp.com/yp/discover/your-voice/letters-editorial" rel="canonical"/><meta content="" name="description"/><meta content="googleanalytics" name="amp-google-client-id-api"/><meta content="Young Post" property="og:site_name"/><meta content="https://www.scmp.com/yp/discover/your-voice/letters-editorial" property="og:url"/><meta content="Letters to the Editors - YP | South China Morning Post" property="og:title"/><meta content="" property="og:description"/><meta content="https://www.scmp.com/yp/assets/png/ogImage.png" property="og:image"/><meta content="700" property="og:image:width"/><meta content="400" property="og:image:height"/><meta content="116313905072708" property="fb:pages"/><meta content="282931841

In [224]:
# Lingua

def scrape_lingua(redownload_pdfs=False):
    directory = './texts/Lingua'
    pdf_directory = os.path.join(directory, 'pdfs')

    url = "https://lingua.com/english/reading/"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    pdf_urls = []
    for a in soup.find_all('a',target="_blank"):
        pdf_urls.append('https://lingua.com/'+a['href'][6:])

    # Download the pdfs
    if redownload_pdfs:
        for url in pdf_urls:
            response = requests.get(url)
            with open(os.path.join(pdf_directory,url.split('/')[-1]), 'wb') as f:
                f.write(response.content)

    end_sentence = 'Did you understand the text?'
    for url in pdf_urls:
        try:
            local_pdf_filename = os.path.join(pdf_directory,url.split('/')[-1])
            number_of_pages = len(list(extract_pages(local_pdf_filename)))
            pages = list(range(number_of_pages))
            extracted_text = high_level.extract_text(local_pdf_filename, "", pages)
            title = remove_brackets(extracted_text.split('\n')[0]).strip()
            print('Title: ' + title)
            text_title_removed = '\n\n'.join(extracted_text.split('\n\n')[1:])
            content_end_idx = re.search(end_sentence, text_title_removed).start()
            content = text_title_removed[:content_end_idx].strip('\n')   
            content = remove_single_newline(content)
            content = re.sub('[0-9]/[0-9]', '', content)
            f = open(os.path.join(directory,title+".txt"), "w")
            f.write(content)
            f.close()
        except:
            print("Error for " + url.split('/')[-1])

def scrape_wikipedia(title,scrape_to_txt=True,to_remove_brackets=True,long=True):
    directory = r'.\texts\Wiki_simple_en'
    wikipedia.set_lang('simple')
    dt = wikipedia.page(title)
    if not long:
        text = dt.summary
    else:
        text = dt.content
        try:
            text_end_idx = re.search('== References',text).start()
            text = text[:text_end_idx]
        except:
            pass
#         try:
#             titles = ["=="+max(re.findall(r'[A-Za-z ]*',title),key=len).strip()+"==" for title in re.findall(r"==.{1,100}==", text)]
#             text = re.sub(r"==.{1,100}==",'\n\n'+title+'\n\n',text)
#         except:
#             pass
    if to_remove_brackets:
        text = remove_brackets(text)
    if scrape_to_txt:
        with open(os.path.join(directory, title+'.txt'),'w') as f:
            f.write(text)
    return text

def scrape_pearson_readers(folder):
    directory = os.path.join(r'.\texts\Pearson',folder)
    pdf_files = []
    for file in os.listdir(directory):
        if file.endswith(".pdf"):
            pdf_files.append(os.path.join(directory,file))
    for pdf in pdf_files:
        number_of_pages = len(list(extract_pages(pdf)))
        pages = list(range(number_of_pages))
        extracted_text = high_level.extract_text(local_pdf_filename, "", pages)
        title = remove_brackets(extracted_text.split('\n')[0]).strip()
        print('Title: ' + title)
        text_title_removed = '\n\n'.join(extracted_text.split('\n\n')[1:])
        content_end_idx = re.search(end_sentence, text_title_removed).start()
        content = text_title_removed[:content_end_idx].strip('\n')   
        content = remove_single_newline(content)
        content = re.sub('[0-9]/[0-9]', '', content)
        f = open(os.path.join(directory,title+".txt"), "w")
        f.write(content)
        f.close()
    

In [150]:
directory = os.path.join(r'.\texts\Pearson',"Level 1")
files = []
for file in os.listdir(directory):
    if file.endswith('.txt'):
        files.append(file)
files_with_dir = [os.path.join(directory,file) for file in files]
        
for file in files_with_dir:
    with open(file,'r',encoding='utf-8') as f:
        text = f.read()
        text = remove_single_newline(text)
        newfilename = file.split('\\')[-1].split('.')[0]+'-2.txt'
        with open(os.path.join(directory, newfilename),'w', encoding='utf-8') as g:
            g.write(text)
    
        

In [151]:
import textstat

# directory = r'.\texts\Lingua'
directory = os.path.join(r'.\texts\Pearson',"Level 1")
txt_files = []
for file in os.listdir(directory):
    if file.endswith(".txt"):
        txt_files.append(os.path.join(directory, file))

for file in txt_files:
    with open(file, 'r',encoding='utf-8') as f:
        title = file.split('\\')[-1].split('.')[0]
        text = f.read()
        grade = get_grade(text)
        print(title + ': grade ' + str(grade))
        





Money Talks: grade 4
Soapy's Winter Home: grade 4
The Adventures of Tom Sawyer - Chapter 1: grade 5
The Adventures of Tom Sawyer - Chapter 10: grade 4
The Adventures of Tom Sawyer - Chapter 11: grade 5
The Adventures of Tom Sawyer - Chapter 12: grade 4
The Adventures of Tom Sawyer - Chapter 13: grade 4
The Adventures of Tom Sawyer - Chapter 2: grade 6
The Adventures of Tom Sawyer - Chapter 3: grade 5
The Adventures of Tom Sawyer - Chapter 4: grade 5
The Adventures of Tom Sawyer - Chapter 5: grade 6
The Adventures of Tom Sawyer - Chapter 6: grade 6
The Adventures of Tom Sawyer - Chapter 7: grade 5
The Adventures of Tom Sawyer - Chapter 8: grade 6
The Adventures of Tom Sawyer - Chapter 9: grade 3
The Art Game: grade 4
The Gift of the Magi: grade 3
The Troubadour: grade 3


In [225]:
text = scrape_wikipedia('British Hong Kong')

In [226]:
text

"British Hong Kong was a colony and British Dependent Territory of the United Kingdom. It was Hong Kong when it was controlled as part of the British Empire. The British governed Hong Kong from 1841 to 1997, except for a small amount of time when the Japanese took over Hong Kong during World War II. During the First Opium War in 1841, the British took over Hong Kong Island. The British took over the Kowloon Peninsula in 1860 and the New Territories in 1898.== History ==Hong Kong was given by the Qing Dynasty to the British Empire in 1841 under the Treaty of Nanking. Kowloon was added to the area of Hong Kong in 1860 under the Convention of Peking, and New Territories was added in 1898 under the Second Convention of Peking. In 1941–1945, Hong Kong was ruled by the Japanese Empire during World War II. In 1984, People's Republic of China and the United Kingdom agreed under Sino-British Joint Declaration on the Question of Hong Kong that Hong Kong was to be returned to China on 1 July 1997

In [229]:
titles = ["=="+max(re.findall(r'[A-Za-z ]*',title),key=len).strip()+"==" for title in re.findall(r"==.{1,100}==", text)]
newtext = text
for title in titles:
    newtext = re.sub(r"==.{1,100}==",'\n\n'+title+'\n\n',newtext)
newtext 


"British Hong Kong was a colony and British Dependent Territory of the United Kingdom. It was Hong Kong when it was controlled as part of the British Empire. The British governed Hong Kong from 1841 to 1997, except for a small amount of time when the Japanese took over Hong Kong during World War II. During the First Opium War in 1841, the British took over Hong Kong Island. The British took over the Kowloon Peninsula in 1860 and the New Territories in 1898.\n\n\n\n\n\n\n\n==Independent Commission Against Corruption==\n\n\n\n\n\n\n\nHong Kong was given by the Qing Dynasty to the British Empire in 1841 under the Treaty of Nanking. Kowloon was added to the area of Hong Kong in 1860 under the Convention of Peking, and New Territories was added in 1898 under the Second Convention of Peking. In 1941–1945, Hong Kong was ruled by the Japanese Empire during World War II. In 1984, People's Republic of China and the United Kingdom agreed under Sino-British Joint Declaration on the Question of Hon