In [5]:
import csv 
import re,os
# ! pip install gutenbergpy
import gutenbergpy.textget

# Download every Project Gutenberg text listed for each novelist in the CSV and
# save a lightly normalised copy to novels/<author>/<gutenberg_id>.txt.
# newline='' is how the csv module expects its input file to be opened.
with open('english_novelists.csv', 'r', newline='') as csv_handle:
    csv_file = csv.reader(csv_handle)
    next(csv_file)  # skip header
    for line in csv_file:
        name = line[1]  # column 1 is used as the author's folder name
        author_dir = f'novels/{name}'
        # Creates the parent 'novels' folder too, and is a no-op when the
        # author folder already exists (e.g. on a re-run).
        os.makedirs(author_dir, exist_ok=True)
        # Column 8 is searched for "{'gutenberg_id': '<digits>'," fragments;
        # each captured run of digits is treated as one text to fetch.
        text_ids = re.findall(r"\{'gutenberg_id': '(\d+)',", line[8])
        for text_id in text_ids:
            book = gutenbergpy.textget.get_text_by_id(text_id)
            book = gutenbergpy.textget.strip_headers(book)  # remove headers
            book = book.decode("utf-8", errors='ignore')  # turn bytes into string
            book = re.sub(r'\n', '  ', book)
            # Hyphens and en dashes become spaces so joined words are split.
            book = re.sub(r'-|–', ' ', book)
            # Lower-case, then keep only ASCII letters, whitespace and apostrophes.
            book = re.sub(r"[^a-zA-Z\s\']", '', book.lower())
            # Collapse whitespace runs: each book ends up as one long line.
            book = re.sub(r'\s+', ' ', book)
            # A distinct handle name avoids rebinding the CSV handle above.
            with open(f'{author_dir}/{text_id}.txt', 'w', encoding='utf-8') as out_file:
                out_file.write(book.strip())

In [6]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# Shared lemmatizer instance, created once and reused by standardize().
lemmatizer = WordNetLemmatizer()


def standardize(line):
    """Normalise a piece of text into a reduced, lemmatised form.

    The same normalisation is applied to the cliche list and to the novels so
    that the two can be compared on equal terms.

    Steps, in order:
      1. lower-case, turn hyphens / en dashes into spaces;
      2. drop "don't" / "do not", the markers "(a)" / "(the)", a leading
         "a " and the standalone word "i";
      3. remove every character that is not an ASCII letter, whitespace,
         an apostrophe or a parenthesis;
      4. tokenise and POS-tag, dropping tokens whose tag contains
         PRP, DT, POS or CC;
      5. lemmatise the remaining tokens;
      6. collapse verb forms: be-forms -> "be", have-forms -> "have",
         should/would/could -> "must", do-forms -> "do",
         'll / shall -> "will", and strip "n't";
      7. collapse runs of whitespace to a single space.

    Args:
        line: the text to normalise (a cliche, or a whole novel on one line).

    Returns:
        The normalised text as a single string. Leading or trailing spaces
        are not stripped.
    """
    line = line.strip('\n').lower()
    line = re.sub(r'-|–',' ',line)
    line = re.sub("don't|do not",'',line)
    line = re.sub(r'\(a\)|\(the\)|^a |\bi\b','',line)
    # remove all punctuation and special characters except apostrophes
    # (parentheses are kept by this character class as well)
    line = re.sub(r"[^a-zA-Z\s\'\)\()]",'', line)
    # part of speech tagging & tokenization
    tagged = pos_tag(word_tokenize(line))
    # remove pronouns, determiners, conjunctions and possessive markers
    line = [token[0] for token in tagged if not re.search(r'PRP\$|DT|POS|CC|PRP',token[1])]
    line = [lemmatizer.lemmatize(token) for token in line]
    # based on van Cranenburgh's rules for cleaning
    # NOTE(review): "wa", "ha" and "doe" look like the lemmatizer's output for
    # "was", "has" and "does" — confirm against the installed WordNet data.
    line = re.sub(r"'s|\bwa\b|\bwas\b|\bis\b|\bam\b|\bare\b|'re|'m|\bwere\b|\bbeen\b", 'be',' '.join(line))
    line = re.sub(r"'ve|\bha\b|\bhas\b|\bhad\b","have",line)
    line = re.sub(r"\bshould\b|\bwould\b|\bcould\b","must",line)
    line = re.sub("n't","",line)
    # Raw string is essential here: in a normal string "\b" is a backspace
    # character, so this pattern previously never matched anything.
    line = re.sub(r"\bdoe\b|\bdid\b|\bdone\b",'do',line)
    # Word boundaries keep words such as "shallow" from becoming "willow".
    line = re.sub(r"'ll|\bshall\b","will",line)
    line = re.sub(r'\s+',' ',line)
    return line

In [None]:
'''
Have already edited Reilly's original cliches.txt file to include some more expressions, 
change spellings to British English, and remove extraneous and unnecessary parenthetical material

Also added in these 19th century idioms: 
https://en.wikipedia.org/wiki/List_of_English-language_idioms_of_the_19th_century 
'''
# Print the standardised form of every entry in the cliche list, one per line,
# so the effect of standardize() can be inspected by eye.
filename = 'cliches.txt'
with open(filename, 'r') as cliche_file:
    for raw_cliche in cliche_file:
        print(standardize(raw_cliche))

In [17]:
import os
# Run every downloaded novel through standardize() and mirror the result into
# novels_clean/<author>/<same file name>.
for author in os.listdir('novels'):
    author_dir = f'novels/{author}'
    if not os.path.isdir(author_dir):
        # Stray files (e.g. OS metadata) are not author folders; skip them
        # instead of crashing in the listdir below.
        continue
    clean_dir = f'novels_clean/{author}'
    # Creates 'novels_clean' itself on a first run and is a no-op on re-runs,
    # so the output tree no longer has to exist (or be rescanned) beforehand.
    os.makedirs(clean_dir, exist_ok=True)
    for text in os.listdir(author_dir):
        with open(f'{author_dir}/{text}', 'r', encoding='utf-8') as infile:
            # read() rather than readline(): the download step writes each
            # book as a single line, but any file that does contain line
            # breaks would otherwise be truncated after its first line.
            raw_text = infile.read()
        # Standardise before opening the output so that a failure here does
        # not leave an empty, truncated file behind.
        cleaned_text = standardize(raw_text)
        with open(f'{clean_dir}/{text}', 'w', encoding='utf-8') as outfile:
            outfile.write(cleaned_text)
    print(f'{author}: done')

H.B. Marriott Watson: done
William Harrison Ainsworth: done
Clemence Housman: done
James Payn: done
Georgiana Fullerton: done
Emma Leslie: done
Frederick Marryat: done
Leonard Merrick: done
Emily Bronte: done
William Makepeace Thackeray: done
Florence Marryat: done
Israel Zangwill: done
Hall Caine: done
Morley Roberts: done
Ann Radcliffe: done
H.G. Wells: done
E. Phillips Oppenheim: done
R.D. Blackmore: done
Thomas Hughes: done
Thomas Love Peacock: done
Edgar Jepson: done
Ouida: done
Maurice Hewlett: done
Thomas Hardy: done
Walter Besant: done
G.A. Henty: done
Bernard Capes: done
Charles Dickens: done
Frederick Rolfe: done
Hugh Conway: done
Maria Edgeworth: done
Elizabeth Gaskell: done
William Godwin: done
Walter Pater: done
Grace Aguilar: done
Charles Kingsley: done
Charlotte Bronte: done
George Grossmith: done
Anthony Trollope: done
Flora Annie Steel: done
Amelia Opie: done
Henry Kingsley: done
Nicholas Wiseman: done
Regina Maria Roche: done
E.F. Benson: done
Benjamin Disraeli: done
