In [2]:
import os
import re
import random
import chardet
from shutil import copyfile

In [5]:
# Bare file names (no directory prefix) found in the input directory.
files = os.listdir('source')
# Bare file names currently present in the gutenberg/ directory.
clean_files = os.listdir('gutenberg')

In [28]:
# Directory prefixes. They are concatenated directly with file names
# (e.g. s+fname), which is why each one ends with a slash.
s = 'source/'
g = 'gutenberg/'

## Print

In [7]:
def printlines(lines):
    """Print the strings in `lines` as one block, joined by newlines."""
    text = '\n'.join(lines)
    print(text)

In [8]:
def printsep():
    """Print a 40-dash rule with one empty line before and after it."""
    rule = '-' * 40
    print('', rule, '', sep='\n')

In [9]:
def print_header_footer(lines, fname):
    """Print the detected header, a separator, then the detected footer.

    `lines` is the file content as a list of lines; `fname` is only passed on
    to the detection helpers for their error messages.
    """
    header_end = end_of_header(lines, fname)
    printlines(lines[:header_end])
    printsep()
    # The footer is looked up only after the header has been printed, so a
    # missing footer still leaves the header visible before the error.
    footer_start = start_of_footer(lines, fname)
    printlines(lines[footer_start:])

In [10]:
def print_footer(lines, fname):
    """Print everything from the detected footer start to the end of `lines`."""
    printlines(lines[start_of_footer(lines, fname):])

In [11]:
def print_excerpt(lines, l=100):
    """Print the first and the last `l` entries of `lines`, separated by a rule.

    Parameters:
        lines: list of strings to sample.
        l: how many entries to show at each end (default 100).
    """
    printlines(lines[:l])
    printsep()
    # lines[-0:] is the whole list, so l == 0 needs an explicit empty tail.
    tail = lines[-l:] if l else lines[:0]
    printlines(tail)

In [12]:
def print_start(lines, l=100):
    """Print the first `l` entries of `lines` (100 by default)."""
    head = lines[:l]
    printlines(head)

In [13]:
def print_end(lines, l=100):
    """Print the last `l` entries of `lines` (100 by default).

    With l == 0 nothing from `lines` is printed; a plain `lines[-0:]` slice
    would have selected the whole list instead.
    """
    tail = lines[-l:] if l else lines[:0]
    printlines(tail)

In [14]:
def random_file(mydir):
    """Open a randomly chosen file from directory `mydir`.

    Parameters:
        mydir: directory path, with or without a trailing slash.

    Returns:
        (lines, fname): the result of openfile() for the chosen file and the
        file's bare name.

    Raises:
        ValueError: if the directory contains no entries.
    """
    names = os.listdir(mydir)
    if not names:
        raise ValueError(f'no files to choose from in {mydir!r}')
    fname = random.choice(names)
    # os.path.join gives the same path as mydir+fname when mydir ends with a
    # slash, and a correct one when it does not.
    return openfile(os.path.join(mydir, fname)), fname

---

## Utils

In [15]:
def f_it(files):
    """Return a fresh iterator over `files`.

    Uses the built-in iter() rather than calling __iter__ directly, so it also
    accepts objects that only implement the sequence protocol (__getitem__).
    """
    return iter(files)

In [25]:
def randfile(files):
    """Return one element of the sequence `files`, picked at a random index."""
    last_index = len(files) - 1
    pick = random.randint(0, last_index)
    return files[pick]

In [162]:
def openfileraw(fname):
    """Read a text file into a single string.

    The file is decoded as UTF-8; if that fails it is re-read as latin-1. The
    text is then passed through reencoderaw() before being returned.

    Parameters:
        fname: path of the file to read.

    Returns:
        The file content as one str.

    Raises:
        OSError (e.g. FileNotFoundError): if the file cannot be opened.
    """
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            txt = f.read()
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; missing files and
        # keyboard interrupts propagate instead of being retried.
        with open(fname, 'r', encoding='latin-1') as f:
            txt = f.read()
    return reencoderaw(txt, fname)

In [18]:
def openfile(fname):
    """Read a text file into a list of lines.

    The file is decoded as UTF-8; if that fails it is re-read as latin-1. The
    lines are then passed through reencode() before being returned.

    Parameters:
        fname: path of the file to read.

    Returns:
        List of lines as produced by readlines() (line endings kept).

    Raises:
        OSError (e.g. FileNotFoundError): if the file cannot be opened.
    """
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; missing files and
        # keyboard interrupts propagate instead of being retried.
        with open(fname, 'r', encoding='latin-1') as f:
            lines = f.readlines()
    return reencode(lines, fname)

In [183]:
def reencoderaw(txt, fname):
    """Undo a latin-1 misreading of UTF-8 text held in one string.

    If `txt` contains the three characters that a UTF-8 byte-order mark turns
    into when decoded as latin-1, the text is encoded back to latin-1 bytes and
    decoded as UTF-8. Bytes that are not valid UTF-8 are dropped and a message
    naming `fname` is printed.

    Parameters:
        txt: the text to repair.
        fname: file name, used only in the printed message.

    Returns:
        The repaired text, or `txt` unchanged when the marker is absent or the
        text contains characters that latin-1 cannot represent.
    """
    bom_read_as_latin1 = '\xef\xbb\xbf'  # displays as 'ï»¿'
    if bom_read_as_latin1 not in txt:
        return txt
    try:
        raw = txt.encode('latin-1')
    except UnicodeEncodeError:
        # Characters outside latin-1 cannot come from a latin-1 read, so there
        # is nothing to undo (and retrying the encode would fail again).
        return txt
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError as e:
        print(f'ignoring error in file {fname}: {e}')
        return raw.decode('utf-8', 'ignore')

In [19]:
def reencode(lines, fname):
    """Undo a latin-1 misreading of UTF-8 text held as a list of lines.

    The repair runs only when the first line contains the three characters that
    a UTF-8 byte-order mark turns into when decoded as latin-1. Each line is
    then encoded back to latin-1 bytes and decoded as UTF-8; bytes that are not
    valid UTF-8 are dropped and the affected line is reported.

    Parameters:
        lines: list of text lines (may be empty).
        fname: file name, used only in the printed messages.

    Returns:
        A new list of repaired lines, or `lines` itself when no repair is
        needed (including the empty list).
    """
    bom_read_as_latin1 = '\xef\xbb\xbf'  # displays as 'ï»¿'
    # An empty file has no first line to inspect.
    if not lines or bom_read_as_latin1 not in lines[0]:
        return lines

    newlines = []
    for i, line in enumerate(lines):
        try:
            raw = line.encode('latin-1')
        except UnicodeEncodeError:
            # Not representable in latin-1, so not a latin-1 misread: keep it.
            newlines.append(line)
            continue
        try:
            newlines.append(raw.decode('utf-8'))
        except UnicodeDecodeError:
            newlines.append(raw.decode('utf-8', 'ignore'))
            print(f'ignoring error in file {fname}, line: {i}:')
            print(f'before: {line}')
            print(f'after: {newlines[-1]}')
    return newlines

In [20]:
def file_is_empty(fname):
    """Return True if `fname` is not an existing regular file or has size 0."""
    if not os.path.isfile(fname):
        return True
    return os.stat(fname).st_size == 0

In [21]:
def writefile(fname, lines, encoding='utf-8'):
    """Write a list of lines to `fname`, replacing any existing file.

    Parameters:
        fname: destination path.
        lines: iterable of strings; written as-is, no line endings are added.
        encoding: text encoding of the output. Defaults to UTF-8 so the result
            does not depend on the platform's locale encoding.
    """
    with open(fname, 'w', encoding=encoding) as wr:
        wr.writelines(lines)

In [386]:
def writefileraw(fname, txt, encoding='utf-8'):
    """Write the string `txt` to `fname`, replacing any existing file.

    Parameters:
        fname: destination path.
        txt: the complete text to write.
        encoding: text encoding of the output. Defaults to UTF-8 so the result
            does not depend on the platform's locale encoding.
    """
    with open(fname, 'w', encoding=encoding) as wr:
        wr.write(txt)

---

## Mechanics: locating the header and the footer

In [22]:
def _paragraph_end(lines, idx):
    """Return the index of the first blank line at or after `idx`, or len(lines)."""
    # A blank line is one that consists of a bare newline only.
    while idx < len(lines) and lines[idx] != '\n':
        idx += 1
    return idx


def end_of_header(lines, fname):
    """Return the index in `lines` where the header ends and the text begins.

    The position is refined in three steps:
      1. find a '*** START OF TH...' marker within the first 200 lines and move
         to the end of its paragraph (required);
      2. if a line starting with 'Produced' or 'Distributed' follows within the
         next 50 lines, move to the end of that paragraph;
      3. if an all-uppercase line follows within the next 50 lines, move to it.

    Parameters:
        lines: file content as a list of lines with their line endings.
        fname: file name, used only in the error message.

    Returns:
        Index of the first line to keep.

    Raises:
        AssertionError: if no start marker is found.
    """
    start = 0

    # Step 1: the '*** START OF TH...' marker.
    start_marker = re.compile(r'^ *\*\*\* *start of th', re.IGNORECASE)
    for i, line in enumerate(lines[:200]):
        if start_marker.match(line):
            start = _paragraph_end(lines, i)
            break
    assert start != 0, f'Header not found: {fname}'

    # Step 2: the credits paragraph. enumerate() is offset by `start` so that
    # `i` is an index into `lines`, not into the 50-line slice.
    credits = re.compile(r'^(produced|distributed)', re.IGNORECASE)
    for i, line in enumerate(lines[start:start+50], start):
        if credits.match(line):
            start = _paragraph_end(lines, i)
            break

    # Step 3: an all-uppercase line, taken to be the title and kept.
    for i, line in enumerate(lines[start:start+50], start):
        if line.isupper():
            start = i
            break

    return start

In [23]:
def start_of_footer(lines, fname):
    """Return the index of the first line matching the end-of-text marker.

    A marker line optionally starts with groups of three asterisks, followed by
    'end of th...' and later 'project gutenberg' (case-insensitive).

    Raises:
        AssertionError: if no line matches, or the only match is at index 0.
    """
    footer_marker = re.compile(r'^(\*\*\*)* *end of th.*project gutenberg', re.IGNORECASE)
    matches = (idx for idx, line in enumerate(lines) if footer_marker.match(line))
    # 0 doubles as the "not found" value, as the assertion below expects.
    start = next(matches, 0)
    assert start != 0, f'Footer not found: {fname}, start: {start}'
    return start

---

## Split into folders

In [882]:
# Destination prefixes for the split below: files where end_of_header()
# succeeds are copied under h, the others under nh.
h = 'header/'
nh = 'noheader/'

In [878]:
# Peek at the first file name.
files[0]

'10053-8.txt'

In [890]:
# Copy every file from gutenberg/ into header/ or noheader/, depending on
# whether end_of_header() can locate a header in it.
total = len(files)
n_h = 0
n_nh = 0 
for i, fname in enumerate(files): 
    # '\r' keeps the progress message on a single, overwritten line.
    print(f'file {i:4}/{total}, name: {fname}', end='\r')
    try:
        f = openfile(g+fname)
        # Only called for its AssertionError; the returned index is not used.
        eoh = end_of_header(f, fname)
        copyfile(g+fname, h+fname)
        n_h += 1
    except Exception as e:
        # Any failure in the try block (not only a missing header) sends the
        # file to noheader/.
        print('-'*40)
        print(fname)
        print(e)
        copyfile(g+fname, nh+fname)
        n_nh += 1
print()
print(f'header found in {n_h} files, not found in {n_nh}.')

----------------------------------------
1910.txt
Header not found: 1910.txt
----------------------------------------
3456-8.txt
Header not found: 3456-8.txt
----------------------------------------
3644-8.txt
Header not found: 3644-8.txt
----------------------------------------
4563-8.txt
Header not found: 4563-8.txt
----------------------------------------
4564-8.txt
Header not found: 4564-8.txt
----------------------------------------
4565-8.txt
Header not found: 4565-8.txt
----------------------------------------
4566-8.txt
Header not found: 4566-8.txt
----------------------------------------
4567-8.txt
Header not found: 4567-8.txt
----------------------------------------
4568-8.txt
Header not found: 4568-8.txt
----------------------------------------
4647.txt
Header not found: 4647.txt
----------------------------------------
4648.txt
Header not found: 4648.txt
----------------------------------------
4649.txt
Header not found: 4649.txt
----------------------------------------
465

---

## Bulk of the work.

In [967]:
def transfer(files):
    """Strip header and footer from each named file.

    Every name in `files` is read from the `s` directory, cut down to the lines
    between end_of_header() and start_of_footer(), and written under the same
    name in the `g` directory. A one-line progress message is printed per file.
    """
    n_files = len(files)
    for position, name in enumerate(files):
        print(f'file {position:4}/{n_files}, name: {name}', end='\r')
        content = openfile(s + name)
        # The header index is evaluated before the footer index.
        body = content[end_of_header(content, name):start_of_footer(content, name)]
        writefile(g + name, body)

In [941]:
# From here on `files` holds the names found in header/ (this rebinds the
# earlier listing of source/).
files = os.listdir('header')

In [968]:
# Strip header and footer from every listed file (reads from s, writes to g).
transfer(files)

ignoring error in file gutenberg/54035-0.txt, line: 3438:
before: © en femme, by FranÃ§ois-TimolÃ©on de Choisy

after:  en femme, by François-Timoléon de Choisy

file 2982/2983, name: 9976-8.txtt

---

## Random tests

In [930]:
# Spot-check footer detection on a randomly chosen file from the s directory.
f, fname = random_file(s)
print_footer(f, fname)

End of Project Gutenberg's Le livre des masques, by Remy de Gourmont





In [343]:
# randfile() requires the list to draw from; calling it without an argument
# raises TypeError.
fname = randfile(files)
fname

'7809-8.txt'

In [344]:
# NOTE(review): fname carries no directory prefix here, so this only works if
# the file is reachable from the working directory - confirm.
f = openfile(fname)

In [689]:
# randfile() requires the list to draw from; the file is opened under g, so
# the name is drawn from the listing of that directory.
fname = randfile(clean_files)
with open(g+fname, 'r', encoding='latin-1') as x:
    f = x.read()

In [676]:
# Show the detected header and footer of one specific file.
# NOTE(review): fname carries no directory prefix - confirm the working directory.
fname = '5147-8.txt'
f = openfile(fname)
# printlines(f[:1000])
print_header_footer(f, fname)

The Project Gutenberg EBook of Le Jardin d'Épicure, by Anatole France

#8 in our series by Anatole France



Copyright laws are changing all over the world. Be sure to check the

copyright laws for your country before downloading or redistributing

this or any other Project Gutenberg eBook.



This header should be the first thing seen when viewing this Project

Gutenberg file.  Please do not remove it.  Do not change or edit the

header without written permission.



Please read the "legal small print," and other information about the

eBook and Project Gutenberg at the bottom of this file.  Included is

important information about your specific rights and restrictions in

how the file may be used.  You can also find out about how to make a

donation to Project Gutenberg, and how to get involved.





**Welcome To The World of Free Plain Vanilla Electronic Texts**



**eBooks Readable By Both Humans and By Computers, Since 1971**



*****These eBooks Were Prepared By Thousands of Volu

In [688]:
# Inspect one of the files recorded in `empties` (built in the
# "Empty results" section of this notebook).
fname = empties[4]
f = openfile(fname)
print_header_footer(f, fname)

The Project Gutenberg EBook of Les adevineaux amoureux, by Anonymous



This eBook is for the use of anyone anywhere in the United States and

most other parts of the world at no cost and with almost no restrictions

whatsoever.  You may copy it, give it away or re-use it under the terms

of the Project Gutenberg License included with this eBook or online at

www.gutenberg.org.  If you are not located in the United States, you'll

have to check the laws of the country where you are located before using

this ebook.







Title: Les adevineaux amoureux



Author: Anonymous



Editor: Colard Mansion



Release Date: August 18, 2018 [EBook #57719]



Language: French



Character set encoding: ISO-8859-1



*** START OF THIS PROJECT GUTENBERG EBOOK LES ADEVINEAUX AMOUREUX ***









Produced by Laurent Vogel and the Online Distributed

Proofreading Team at http://www.pgdp.net (This file was

produced from images generously made available by the

Bibliothèque nationale de France (BnF/Ga

In [678]:
# Show the detected header and footer of one specific file.
# NOTE(review): fname carries no directory prefix - confirm the working directory.
fname = '10061-8.txt'
f = openfile(fname)
print_header_footer(f, fname)

The Project Gutenberg EBook of Les Heures Claires, by Emile Verhaeren



This eBook is for the use of anyone anywhere at no cost and with

almost no restrictions whatsoever.  You may copy it, give it away or

re-use it under the terms of the Project Gutenberg License included

with this eBook or online at www.gutenberg.net





Title: Les Heures Claires



Author: Emile Verhaeren



Release Date: November 12, 2003 [EBook #10061]



Language: French



Character set encoding: ISO-8859-1



*** START OF THIS PROJECT GUTENBERG EBOOK LES HEURES CLAIRES ***









Produced by Christine De Ryck and PG Distributed Proofreaders.

This file was produced from images generously made available

by the Bibliothèque nationale de France (BnF/Gallica) at

http://gallica.bnf.fr.









Em. Verhaeren



Les

heures claires



1896













O la splendeur de notre joie,

Tissée en or dans l'air de soie!



Voici la maison douce et son pignon léger,

Et le jardin et le verger.



Voici le banc, so

In [664]:
# Footer index for the lines currently held in f; the file name is only used
# in the error message.
start_of_footer(f, '5147-8.txt')

4316

In [655]:
# randfile() requires the list to draw from; calling it without an argument
# raises TypeError.
fname = randfile(files)
# NOTE(review): fname carries no directory prefix here, so openfile() only
# finds it if the file is reachable from the working directory - confirm.
lines = openfile(fname)
print_header_footer(lines, fname)

The Project Gutenberg EBook of Le Jardin d'Épicure, by Anatole France

#8 in our series by Anatole France



Copyright laws are changing all over the world. Be sure to check the

copyright laws for your country before downloading or redistributing

this or any other Project Gutenberg eBook.



This header should be the first thing seen when viewing this Project

Gutenberg file.  Please do not remove it.  Do not change or edit the

header without written permission.



Please read the "legal small print," and other information about the

eBook and Project Gutenberg at the bottom of this file.  Included is

important information about your specific rights and restrictions in

how the file may be used.  You can also find out about how to make a


----------------------------------------



AssertionError: Footer not found: 5147-8.txt

---
## Test for titles

Sadly, there are quite a few irregularities: not every file has a `Title:` line (see `1910.txt` below).

In [740]:
txt = openfileraw(s+'1910.txt')
# The lookbehind keeps only what follows a "title: " that comes directly after
# a newline (case-insensitive); re.search returns None when there is no match.
title = re.search(r'(?<=\ntitle: ).*\n', txt, re.IGNORECASE)
print(txt[:1000])

Project Gutenberg Etext La Tulipe Noire (abridged),
by Alexandre Dumas, Pere
#6 in our series by Alexandre Dumas, Pere

This Etext is in French, the English version is Etext #965

Copyright laws are changing all over the world, be sure to check the
copyright laws for your country before posting these files!!

Please take a look at the important information in this header.  We
encourage you to keep this file on your own disk, keeping an
electronic path open for the next readers.  Do not remove this.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans and By Computers, Since 1971**

*These Etexts Prepared By Hundreds of Volunteers and Donations*

Information on contacting Project Gutenberg to get Etexts, and
further information is included below.  We need your donations.


La Tulipe Noire [abridged]

by Alexandre Dumas, Pere

September, 1999  [Etext #1910]


Project Gutenberg Etext La Tulipe Noire, by Alexandre Dumas, Pere
*******This file 

In [722]:
# Look for a title line in every file; stops with AssertionError at the first
# file that has none.
# NOTE(review): fname is passed to openfileraw() without a directory prefix - confirm.
total = len(files) 
for i, fname in enumerate(files):
    txt = openfileraw(fname)
    title = re.search(r'(?<=\ntitle: ).*\n', txt, re.IGNORECASE)
    assert title, f'Title not found: {fname}'    
    # '\r' overwrites the same line; leftovers of a longer earlier title stay visible.
    print(f'file {i:4}/{total}, name: {fname}, title: {title.group(0).strip()}', end='\r')    

file  757/2999, name: 19075-8.txt, title: Traduction nouvelle, Tome Istoire de mon temps (Tome 6)3729lice Générale poste aux lettres, et des messageries, sous une seule administrationl'Afrique intérieure.

AssertionError: Title not found: 1910.txt

In [726]:
# NOTE(review): re.search needs a single string, so f has to hold raw text
# here (e.g. from x.read()), not the list of lines returned by openfile().
print(re.search(r'(?<=\ntitle: ).*\n', f, re.IGNORECASE).group(0))

Lettre relative à l'organisation des postes et relais



---

## Dealing with encoding

In [554]:
# Read one of the problem files as latin-1 so that every byte decodes.
# NOTE(review): `errors` is not defined anywhere in the visible notebook.
with open(s+errors[6], 'r', encoding='latin-1') as x:
    f = x.readlines()

In [590]:
# Re-interpret the latin-1 text of one line as UTF-8, dropping invalid bytes.
f[3438].encode('latin-1').decode('utf-8', 'ignore')

' en femme, by François-Timoléon de Choisy\n'

In [558]:
# The second argument is only used in reencode()'s messages. The stray unary
# plus on a string raised TypeError; the path matches the open() call that
# produced f.
fr = reencode(f, s+errors[6])

line: 3438: © en femme, by FranÃ§ois-TimolÃ©on de Choisy



In [620]:
# Retry the problem files and collect the names that still fail.
# NOTE(review): `errors` is not defined in the visible notebook, and
# writefile() receives the bare name, so output lands in the working directory.
othererrors = []
for fname in errors:
    print(f'file: {fname}', end='\r')
    try:
        f = openfile(s+fname)
        eoh = end_of_header(f, fname)
        sof = start_of_footer(f, fname)        
        writefile(fname, f[eoh:sof])
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the
        # kernel still stops the loop; AssertionError is an Exception subclass.
        othererrors.append(fname)

file: 6966.txttxt

---

## Empty results

In [848]:
# Collect the names of all zero-byte files in directory `o`.
# NOTE(review): `o` is not defined anywhere in the visible notebook; it is
# concatenated with the file name, so it presumably ends with a slash - confirm.
empties = []
for fname in os.listdir(o):
    if os.stat(o+fname).st_size == 0:
        print(f'{fname} is empty')
        empties.append(fname)

--- 

## Encoding tests

In [776]:
# Draw the name from the list that is actually indexed. The index range used
# to come from a different list (`sauberfiles`), which is not defined in the
# visible notebook and need not have the same length.
fname = randfile(clean_files)
f = openfile(g+fname)
print_excerpt(f, l=20)

                    LE TOUR DU MONDE









                         PARIS

                IMPRIMERIE FERNAND SCHMIDT

                  20, rue du Dragon, 20









                NOUVELLE SÉRIE--11e ANNÉE

                       2e SEMESTRE









                    LE TOUR DU MONDE




----------------------------------------



  Détail de la chapelle de San Fernando. (D'après une

    photographie.)                                                 620



  Vue extérieure de la Mosquée de Cordoue, avec l'église

    catholique élevée en 1523, malgré les protestations des

    Cordouans. (D'après une photographie.)                         621



  Statue de Gonzalve de Cordoue. (D'après une photographie.)       622



  Statue de doña Maria Manrique, femme de Gonzalve de Cordoue.

    (D'après une photographie.)                                    623



  Détail d'une porte de la mosquée. (D'après une photographie.)    624















In [736]:
# List the files whose first line still shows a UTF-8 byte-order mark that was
# read as latin-1.
# NOTE(review): `sauberfiles` is not defined in the visible notebook, and the
# files are opened under g - confirm both.
total = len(sauberfiles)
toreencode = []
for i, fname in enumerate(sauberfiles):
    f = openfile(g+fname)
    # `f and` guards against empty files, which have no first line.
    if f and 'ï»¿' in f[0]:
        # Was `torrencode.append(...)`, a NameError: the list is `toreencode`.
        toreencode.append(fname)
        print(f'file {i:4}/{total}, name: {fname}, reencode: YES', end='\r')
    else:
        print(f'file {i:4}/{total}, name: {fname}, reencode: no', end='\r')

file 2991/2992, name: 9976-8.txt, reencode: noo

---

## More cleaning

Illustrations:

In [389]:
def test_illustration(fname):
    f = openfileraw(g+fname)    
    p = re.compile('\[Illustration[^[]*?\]', re.IGNORECASE | re.DOTALL)
    r = set(re.findall(p, f))
    if r: print(*r,sep='\n')
    else: print(f'not found in {fname}')    

In [375]:
# List the illustration placeholders of one randomly chosen cleaned file.
fname = randfile(clean_files)
print(f'file: {fname}')
print('-'*30)
test_illustration(fname)

file: 36786-8.txt
------------------------------
[Illustration: La femme la plus riche du monde: Mrs. Hetty Green. (Douze
millions de revenus annuels.)]
[Illustration: L'escalier d'honneur de l'hôtel Hirsch.]
[Illustration: La rue Sagasta à Algésiras: un rassemblement.]
[Illustration: Un geyser jailli le 13 novembre dernier. Dans le fond,
«cône» d'un ancien geyser.]
[Illustration: La salle des délibérations du conseil municipal.]
[Illustration: Un puits dans l'église de Notre-Dame de l'Epine.]
[Illustration: Le «Sudrophone» correspondant au tuba.]
[Illustration: UNE GRANDE PREMIÈRE A LA COMÉDIE-FRANÇAISE.--Mme Bartet
au deuxième acte du «Réveil», de M. Paul Hervieu.]
[Illustration: Miss Alice Roosevelt.]
[Illustration: Cheminée du hall.]
[Illustration: L'inscription des plis et paquets chargés ou
recommandés.]
[Illustration: La basse ordinaire ou «tuba».]
[Illustration: Arrivée des colis postaux à la gare.]
[Illustration: Algésiras vu de la mer.]
[Illustration: «Ronde d'enfants», peint

In [390]:
# Same check on one specific file; its placeholders span several lines.
fname = '60594-0.txt'
test_illustration(fname)

[Illustration: GRAVURE DE A. HERVIEU

(Extrait de _Paris and the Parisians_, par Mrs. Trollope)]
[Illustration: UNE SOIRÉE

(Par A. Hervieu) (Extr. de _Paris and the Parisians_, by Mrs. Trollope)]
[Illustration: CAUSERIES DU SOIR, PAR E. LAMI

(Bibl. Nationale)]
[Illustration: (V. Adam del.) (Collection J. B.)]
[Illustration: L’ANGLAISE

(Par Guérin) (Coll. J. B.)]
[Illustration: «CE SOIR A LA PORTE SAINT-MARTIN!--J’Y SERAI!»

(Grav. de A. Hervieu) (Extr. de _Paris and the Parisians_, by Mrs.
Trollope)]
[Illustration: STATUETTE DE VICTOR HUGO

(Par Dantan) (Extr. du _Musée Dantan_)]
[Illustration: LAMENNAIS

«Galerie de la Presse» (Bibl. nat.)]
[Illustration: VUE DU JARDIN DES TUILERIES

(Par Arnout) (Coll. J. Boulenger)]
[Illustration: «MARCHANDES DE MODES»

(Par Gavarni) (Bibl. nat.)]
[Illustration: AU LOUVRE]
[Illustration: L’ERMITAGE DE JEAN-JACQUES A MONTMORENCY

(Par A. Pollet) (Coll. J. B.)]
[Illustration: DILIGENCE]
[Illustration: UN AGENT DE POLICE]
[Illustration: GEORGE SAND 

In [387]:
# Remove every '[Illustration ...]' placeholder from the files under g and
# write the result under the same name to sauberberg/.
# Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
p = re.compile(r'\[Illustration[^[]*?\]', re.IGNORECASE | re.DOTALL)
total = len(clean_files)
for i, fname in enumerate(clean_files):
    print(f'{i+1:4}/{total} | filename: {fname}', end=' '*10+'\r')
    f = openfileraw(g+fname)
    # A single pass only removes placeholders that contain no '[' themselves;
    # repeating until nothing matches also removes the enclosing placeholder
    # of a nested pair. Each pass shortens f, so the loop terminates.
    while p.search(f):
        f = p.sub('', f)
    writefileraw('sauberberg/'+fname, f)

2998/2999 | filename: 9976-8.txt                                                             

A better way, which also works for nested `[Illustration ...]` blocks: repeat the substitution until nothing matches.

In [3]:
# Demonstration on a nested placeholder: the inner '[Illustration]' is removed
# in the first pass, which lets the outer one match in the second.
# Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
p = re.compile(r'\[Illustration[^[]*?\]', re.DOTALL)
st = """Un example [Illustration: L'avocat en chef.--Phot. du
Sénégal.
[Illustration]--_Phot. Kline._] de ce que je veux faire"""
while p.search(st):
    st = p.sub('', st)
print(st)

Un example  de ce que je veux faire


In [4]:
# strip() removes the leading and trailing newlines and spaces but keeps the
# newlines between the lines; encoding to bytes makes that visible in the output.
st = """\n\n\n\nblah\nand something else\nand yet another thing   \n\n\n"""
print(st.strip().encode('utf-8'))

b'blah\nand something else\nand yet another thing'
