In [62]:
import pandas as pd
import os
import json
import re
from difflib import SequenceMatcher

In [63]:
# import excluded titles
excluded_titles = pd.read_csv('../data/utility_data/excluded_titles.csv')

#inport settings
sample = True

#set input and output dir
if sample:
    input_dir = '../data/sample/1_json'
    output_dir = '../data/sample/2_clean_json'
else:
    input_dir = '../data/result/1_json'
    output_dir = '../data/result/2_clean_json'
    
#set ouput path if not exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [64]:
total_files_loaded = 0
processed_files_count = 0
total_paragraphs_count = 0


In [65]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [66]:
#total files in input dir
total_files_loaded = len(os.listdir(input_dir))

#iterate over all files
for filename in os.listdir(input_dir):
    
    #open document if .json
    if filename.endswith('.json'):  
        input_file_path = os.path.join(input_dir, filename)
        with open(input_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            
        # skipp documents with faulty meta data or insufficient size
        if data['title'] is None or \
            len(data['authors']) < 1 or \
            any(auth['last_name'] is None for auth in data['authors']) or \
            data['pub_year'] is None or data['pub_year'] < 1950 or data['pub_year'] > 2024 or \
            len(data['references']) < 5 or \
            len(data['sections']) < 3 or\
            data['lang'] != 'en':
                continue
            
        #check whether title is in excluded titles
        title_is_dublicate = False
        for title in excluded_titles['title']:
            s = SequenceMatcher(None, data['title'], title)
            if s.ratio() > 0.80:
                print(data['title'], title)
                title_is_dublicate = True
                break
        if title_is_dublicate:
            continue
        
        # Filter Paragraphs and count the cumulative count
        # delete all paragraphs that have no refs
        filtered_sections = []
        targets = [ref['id'] for ref in data['references']]
        for section in data.get('sections', []):
            pars = ''.join(section['paragraphs'])
            if re.search(r'<ref.*?<\/ref>', pars) is not None:
                filtered_sections.append(section)                                   
                total_paragraphs_count += len(section['paragraphs'])
            else:
                continue
            
        data['sections'] = filtered_sections
        
        #check for insufficient length after removing references and paragraphs
        if len(data['references']) < 5 or \
           len(data['sections']) < 3:
            continue

           
        # save file
        output_file_path = os.path.join(output_dir, filename)     
        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile, indent=4, ensure_ascii=False)
        processed_files_count += 1


Europarl: A Parallel Corpus for Statistical Machine Translation europarl a parallel corpus for statistical machine translation


In [67]:
print(f"Total loaded files: {total_files_loaded}")
print(f"Total processed files: {processed_files_count}")
print(f"Total paragraphs count: {total_paragraphs_count}")

Total loaded files: 80
Total processed files: 73
Total paragraphs count: 2258
