In [1]:
# Purpose: Fetch the Project Gutenberg index page and collect, for every bolded
# element on it, the anchor tag (title text + href) that the later cells clean up.

import os, sys, time
import requests  # deliberately NOT aliased as `re`: that name belongs to the stdlib regex module
from bs4 import BeautifulSoup as bs

INDEX_URL = 'https://www.gutenberg.org/files/58383/58383-h/58383-h'
REQUEST_TIMEOUT_SEC = 30  # without a timeout a stalled server would hang the cell forever

# Send a GET request to the page
response = requests.get(INDEX_URL, timeout=REQUEST_TIMEOUT_SEC)

# This pauses the programme for a certain delay.
# When we scrape, it is important not to "hammer" the servers hosting the data
# we are scraping. Delays of this kind are essential to perform "ethical scraping".
sec_pause = 2
time.sleep(sec_pause)

# The response object contains information about the request and the response:
print("query URL:", response.request.url)

# The HTTP status code 200 means "OK", while 404 means "page not found"
print("status code:", response.status_code)

# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()

# Store the html document of that page.
content = response.content

# Parse the HTML document so it can be traversed to extract specific content.
htmlDoc = bs(content, "html.parser")

# From the parsed HTML document, extract all the bolded elements that contain
# the novel titles and urls on the page.
titles = htmlDoc.find_all('b')

# Keep the first <a> tag of each bolded element. Elements without a link yield
# None; those entries are kept on purpose because the cleaning step works on
# str(Titles) and strips the literal text "None" itself.
Titles = [bold.a for bold in titles]


query URL: https://www.gutenberg.org/files/58383/58383-h/58383-h
status code: 200


In [2]:
# Clean the data from Project Gutenberg.
#
# `Titles` is a list of BeautifulSoup tags (and None placeholders). It is
# flattened to its string repr and tidied with regexes so that each remaining
# comma separates exactly one '<a href="...">TITLE</a>' fragment.

import re  # stdlib regex module

# All patterns are raw strings: sequences such as "\s", "\d" and "\[" are
# invalid escapes in ordinary string literals and raise a SyntaxWarning on
# recent Python versions.

# Remove carriage returns, a newline followed by nine whitespace characters, and "None".
Titles_one = re.sub(r'\r|\n\s{9}|None', '', str(Titles))

# Remove unnecessary commas and the list brackets "[" and "]".
Titles_two = re.sub(r'\r| ,|\[,|\]', '', Titles_one)

# Point each link at the plain-text file: ".../NNN-h/NNN-h.htm" -> ".../NNN.txt"
# (ids of three to five digits).
Titles_three = re.sub(r'-h/\d{3,5}-h\.htm', '.txt', Titles_two)

# Remove a comma that directly follows a word character (i.e. a comma inside a
# title) so that it is not mistaken for a separator when splitting below.
Cleaned_Titles = re.sub(r'\b,', '', Titles_three)

# Convert string to list for manipulation in Pandas.
Cleaned_Data = Cleaned_Titles.split(",")

Cleaned_Data

[' <a href="http://www.gutenberg.org/files/619/619.txt">THE WARDEN</a>',
 ' <a href="http://www.gutenberg.org/files/814/814.txt">HUNTING SKETCHES</a>',
 ' <a href="http://www.gutenberg.org/files/1865/1865.txt">NORTH AMERICA VOLUME I</a>',
 ' <a href="http://www.gutenberg.org/files/1866/1866.txt">NORTH AMERICA VOLUME II</a>',
 ' <a href="http://www.gutenberg.org/files/2158/2158.txt">THE PRIME MINISTER</a>',
 ' <a href="http://www.gutenberg.org/files/2860/2860.txt">FRAMLEY PARSONAGE</a>',
 ' <a href="http://www.gutenberg.org/files/3045/3045.txt">THE LAST CHRONICLE OF BARSET</a>',
 ' <a href="http://www.gutenberg.org/files/3166/3166.txt">DOCTOR THORNE</a>',
 ' <a href="http://www.gutenberg.org/files/3409/3409.txt">BARCHESTER TOWERS</a>',
 ' <a href="http://www.gutenberg.org/files/3622/3622.txt">THE DUKE\'S CHILDREN</a>',
 ' <a href="http://www.gutenberg.org/files/4599/4599.txt">THE SMALL HOUSE AT ALLINGTON</a>',
 ' <a href="http://www.gutenberg.org/files/4917/4917.txt">THE KELLYS AND THE 

In [3]:
# Convert to dataframe, split source data, and add a column for titles.
#
# Input : Cleaned_Data - list of '<a href="...">TITLE</a>' strings.
# Output: df_source    - columns URLs_Titles, URLS (rows without a URL dropped)
#         df_source2   - independent copy of df_source plus a Titles column.

import pandas as pd

# One row per cleaned fragment, stored as strings.
df_source = pd.DataFrame({'URLs_Titles': Cleaned_Data}).astype(str)

# Regex to extract the urls.
# NOTE(review): the "&amp;" inside the character classes looks like an
# HTML-escaping artifact of a plain "&"; it is kept unchanged here so the set
# of matched URLs stays exactly the same.
pattern = r'(http:\/\/[\w-][\w.,@?^=%&amp;:\/~+#-]*[\w@?^=%&amp;\/~+#-])'

# Extract URLs into a separate column (expand=False -> a Series, not a one-column frame).
df_source['URLS'] = df_source['URLs_Titles'].str.extract(pattern, expand=False)

# Drop rows without a URL to avoid downstream urllib errors.
df_source = df_source.dropna(subset=['URLS'])

# Work on an explicit copy so that adding columns below cannot write through
# to df_source.
df_source2 = df_source.copy()

# Regex to extract the titles: everything between the first ">" and the next "<".
pattern2 = r'\>([^\<]*)'

# Extract titles into a separate column.
df_source2['Titles'] = df_source2['URLs_Titles'].str.extract(pattern2, expand=False)


In [4]:
# Test the urls in the dataframe

# Import the submodules explicitly: a bare `import urllib` does not guarantee
# that urllib.request / urllib.error are loaded.
import urllib.error
import urllib.request


def _probe_url(url, title, fail_label):
    """Try to open `url` once and report the outcome on stdout.

    Parameters
    ----------
    url : str
        Address to open.
    title : object
        Title printed next to the URL in the report line.
    fail_label : str
        Prefix of the report line on failure (e.g. 'Failed!!').

    Returns
    -------
    int or None
        The HTTP status code when the server answered with an HTTP error;
        None on success and on non-HTTP failures (e.g. connection refused).
    """
    try:
        # The context manager closes the connection; only reachability matters here.
        with urllib.request.urlopen(url):
            pass
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print(f'{fail_label} {title} {url} HTTPError: {e.code}')
        return e.code
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print(f'{fail_label} {title} {url} URLError: {e.reason}')
        return None
    else:
        # 200
        print(f'Success with {title} {url}')
        return None


df_source2['Error_codes'] = 0  # Create a new df column to hold generated error codes

# First pass: test every url and record HTTP error codes.
for idx, row in df_source2.iterrows():
    code = _probe_url(row['URLS'], row['Titles'], 'Failed!!')
    if code is not None:
        df_source2.at[idx, 'Error_codes'] = code

# Insert '-0' before the four-character extension of the urls that returned 404
# (".../4917.txt" -> ".../4917-0.txt").
not_found = df_source2['Error_codes'] == 404
urls_404 = df_source2.loc[not_found, 'URLS']
df_source2.loc[not_found, 'URLS'] = urls_404.str[:-4] + '-0' + urls_404.str[-4:]

# Rerun failed tests.
# NOTE(review): a successful retry leaves the first-pass code in Error_codes
# (original behaviour, kept unchanged); only a new HTTP error overwrites it.
for idx, row in df_source2[df_source2['Error_codes'] != 0].iterrows():
    code = _probe_url(row['URLS'], row['Titles'], 'Failed again!!')
    if code is not None:
        df_source2.at[idx, 'Error_codes'] = code

print ('Test complete!!')

Success with THE WARDEN http://www.gutenberg.org/files/619/619.txt
Success with HUNTING SKETCHES http://www.gutenberg.org/files/814/814.txt
Success with NORTH AMERICA VOLUME I http://www.gutenberg.org/files/1865/1865.txt
Success with NORTH AMERICA VOLUME II http://www.gutenberg.org/files/1866/1866.txt
Success with THE PRIME MINISTER http://www.gutenberg.org/files/2158/2158.txt
Success with FRAMLEY PARSONAGE http://www.gutenberg.org/files/2860/2860.txt
Success with THE LAST CHRONICLE OF BARSET http://www.gutenberg.org/files/3045/3045.txt
Success with DOCTOR THORNE http://www.gutenberg.org/files/3166/3166.txt
Success with BARCHESTER TOWERS http://www.gutenberg.org/files/3409/3409.txt
Success with THE DUKE'S CHILDREN http://www.gutenberg.org/files/3622/3622.txt
Success with THE SMALL HOUSE AT ALLINGTON http://www.gutenberg.org/files/4599/4599.txt
Failed!! THE KELLYS AND THE O'KELLYS http://www.gutenberg.org/files/4917/4917.txt HTTPError: 404
Success with THE BELTON ESTATE http://www.guten

In [5]:
# Persist the tested URL/title table as CSV, leaving the DataFrame index out of the file.
csv_target = 'acquired_gutenberg_data.csv'
df_source2.to_csv(csv_target, index=False)

In [6]:
# Download the files listed in acquired_gutenberg_data.csv

import os, csv
import urllib.request

SOURCE_CSV = "acquired_gutenberg_data.csv"
myPath = "../sources/gutenberg_source_files/"  # destination folder for the .txt files

# Create the destination folder if needed; urlretrieve does not create it.
os.makedirs(myPath, exist_ok=True)

# Open the import file and read it row by row as dicts keyed by column name.
with open(SOURCE_CSV, newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        url = row['URLS']
        title = row['Titles']
        # A path separator inside a title would otherwise be read as a sub-directory.
        safe_title = title.replace('/', '_').replace('\\', '_')
        fullfilename = os.path.join(myPath, safe_title + '.txt')
        print(f"Fetching {title} {url}")
        urllib.request.urlretrieve(url, fullfilename)

print('Downloads Completed!!!')

Fetching THE WARDEN http://www.gutenberg.org/files/619/619.txt
Fetching HUNTING SKETCHES http://www.gutenberg.org/files/814/814.txt
Fetching NORTH AMERICA VOLUME I http://www.gutenberg.org/files/1865/1865.txt
Fetching NORTH AMERICA VOLUME II http://www.gutenberg.org/files/1866/1866.txt
Fetching THE PRIME MINISTER http://www.gutenberg.org/files/2158/2158.txt
Fetching FRAMLEY PARSONAGE http://www.gutenberg.org/files/2860/2860.txt
Fetching THE LAST CHRONICLE OF BARSET http://www.gutenberg.org/files/3045/3045.txt
Fetching DOCTOR THORNE http://www.gutenberg.org/files/3166/3166.txt
Fetching BARCHESTER TOWERS http://www.gutenberg.org/files/3409/3409.txt
Fetching THE DUKE'S CHILDREN http://www.gutenberg.org/files/3622/3622.txt
Fetching THE SMALL HOUSE AT ALLINGTON http://www.gutenberg.org/files/4599/4599.txt
Fetching THE KELLYS AND THE O'KELLYS http://www.gutenberg.org/files/4917/4917-0.txt
Fetching THE BELTON ESTATE http://www.gutenberg.org/files/4969/4969.txt
Fetching THE AMERICAN SENATOR ht