# Splitting EU Texts into Articles

In [1]:
from bs4 import BeautifulSoup
import os 
import codecs
import re

## Functions and preparation

In [3]:
### structural components of legal texts
# These lists are used for exact-match membership tests against the text of
# each paragraph. They are generated rather than typed out so that a missing
# comma or a copy/paste slip cannot silently drop or fuse an entry.

# 'Article 1' ... 'Article 349'
articles_enumerated = ['Article {}'.format(i) for i in range(1, 350)]

# 'Section 1..7' / 'SECTION 1..7', each with and without surrounding newlines.
sections = [template.format(word, number)
            for template in ('\n{} {}\n', '{} {}')
            for word in ('Section', 'SECTION')
            for number in range(1, 8)]

# 'CHAPTER I..VII' and 'CHAPTER 1..7', each with and without surrounding newlines.
chapters = [template.format(label)
            for template in ('CHAPTER {}', '\nCHAPTER {}\n')
            for label in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII']
                         + [str(number) for number in range(1, 8)]]

# 'TITLE I..VIII' and 'TITLE 1..8'.
# (Previously a missing comma merged 'TITLE VIII' and 'TITLE 1' into one
# bogus entry, so neither heading was ever recognised.)
titles = ['TITLE {}'.format(label)
          for label in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']
                       + [str(number) for number in range(1, 9)]]

### Function to split the laws
The function 'process_text' takes an HTML file of an EU law, as can be downloaded from EUR-Lex (https://eur-lex.europa.eu/homepage.html), and splits it into articles. It assumes that the files are stored in the directory 'texts'. Refer to download_searches.py for downloading EU laws.

In [2]:
## get all EU texts
# Only HTML files are kept: process_text() derives the output folder name by
# stripping the '.html' extension, so stray entries (e.g. '.DS_Store' or
# sub-directories) would otherwise crash the run or yield mangled names.
# Sorting makes the processing order reproducible across platforms.
texts = sorted(name for name in os.listdir('texts')
               if name.lower().endswith('.html'))

In [4]:
def process_text(text):
    """Split one EUR-Lex HTML law into per-article text files.

    Reads ``../texts/<text>`` (relative to the current working directory),
    collects its ``<p>`` elements and writes their text into a folder named
    after the file (file name without extension), created in the current
    working directory:

    * ``<name>_front.txt`` -- paragraphs before ``Whereas:`` or the first
      article heading,
    * ``<name>_Whereas.txt`` -- paragraphs from ``Whereas:`` up to the first
      article heading,
    * ``<name>_Title_<j>_Chapter_<k>_Section_<l>_Article_<nnn>.txt`` -- one
      file per paragraph whose text is in ``articles_enumerated``.

    Paragraphs matching ``titles``, ``chapters`` or ``sections`` only bump
    the respective counter; they and the paragraph that follows them
    (presumably the heading's caption) are not written. Processing stops at
    the closing formula ('For the European Parliament' / 'Done at ...').

    Parameters
    ----------
    text : str
        File name of the law inside the ``texts`` directory
        (expected to end in '.html').
    """
    # Folder name and file-name prefix: the file name without its extension.
    # For '*.html' inputs this is identical to the former ``text[:-5]``.
    name = os.path.splitext(text)[0]

    def out_path(suffix):
        """Return '<name>/<name>_<suffix>.txt'."""
        return os.path.join(name, name + '_' + suffix + '.txt')

    ## read and parse the legal text; the handle is closed even if parsing fails
    with codecs.open("../texts/{}".format(text), 'r', 'utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    ## only use body text; fall back to the whole document if there is no <body>
    body = soup.find('body')
    if body is None:
        body = soup
    paragraphs = body.find_all('p')

    article_no = 0
    title_no = 0
    chapter_no = 0
    # NOTE(review): the section counter is never reset when a new chapter or
    # title starts (only the chapter counter is reset on a new title), so it
    # counts sections cumulatively over the whole law -- confirm this is
    # intended before changing it, since it determines the output file names.
    section_no = 0

    # Paragraph texts that mark the end of the enacting terms.
    closing_prefixes = ('Done at Luxembourg', 'Done at Brussels', 'Done at Strasbourg')

    ## make sure the output folder for this text exists
    os.makedirs(name, exist_ok=True)

    ## explicit iterator so that the paragraph after a heading can be skipped.
    ## The first three <p> elements are dropped -- presumably page boilerplate
    ## of the EUR-Lex export; TODO confirm.
    paragraphs_iter = iter(paragraphs[3:])

    ## everything before 'Whereas:' / the first article goes to the front file
    file = open(out_path('front'), "w", encoding='utf-8')
    try:
        for paragraph in paragraphs_iter:
            # Non-breaking spaces are normalised for matching only; the
            # original paragraph text is what gets written.
            string = paragraph.text.replace(u'\xa0', u' ')

            ## recitals start: switch output file ('Whereas:' itself is
            ## written to the new file further below)
            if string == 'Whereas:':
                file.close()
                file = open(out_path('Whereas'), "w", encoding='utf-8')

            ## structural headings: count them and skip the following
            ## paragraph. The default of next() avoids a StopIteration when
            ## a heading happens to be the very last paragraph.
            if string in titles:
                title_no += 1
                chapter_no = 0  # chapters restart within each title
                next(paragraphs_iter, None)
                continue

            if string in chapters:
                chapter_no += 1
                next(paragraphs_iter, None)
                continue

            if string in sections:
                section_no += 1
                next(paragraphs_iter, None)
                continue

            ## closing formula: stop, signatures and annexes are not exported
            if string == 'For the European Parliament' or string.startswith(closing_prefixes):
                break

            ## article heading: start a new file named after the current
            ## position in the title/chapter/section hierarchy
            if string in articles_enumerated:
                file.close()
                article_no += 1
                file = open(
                    out_path('Title_{}_Chapter_{}_Section_{}_Article_{}'.format(
                        title_no, chapter_no, section_no, str(article_no).zfill(3))),
                    "w", encoding='utf-8')

            file.write(paragraph.text + '\n')
    finally:
        # Closing an already closed file is a no-op, so this is safe on
        # every exit path, including errors while opening the next file.
        file.close()

## Processing the text
This part executes the function to split the laws into articles and saves them in a folder 'processed'. All articles for each law are stored in a separate folder that is labeled with the respective CELEX number.

In [5]:
# process_text() reads from '../texts' and writes relative to the current
# directory, so the working directory must be 'processed' next to 'texts'.
# The guard keeps this cell idempotent: re-running it must not descend into
# (or fail on) 'processed/processed'. The folder is created if missing.
if os.path.basename(os.getcwd()) != 'processed':
    os.makedirs('processed', exist_ok=True)
    os.chdir('processed')

In [6]:
# Split every listed law; each call creates/fills one folder named after the
# file (name without the '.html' extension) in the current working directory.
for text in texts:
    process_text(text)