## Format data

In [50]:
from os import walk, path
import fitz

# Working directories, resolved to absolute paths from the current working directory.
# Input folder: its immediate sub-directories are scanned for .pdf files.
INPUT_PDF_PATH = path.abspath('./data/input/pdfs')
# Receives one '<sub-directory>.txt' file holding the extracted text.
OUTPUT_TXT_PATH = path.abspath('./data/output/txt')
# Receives one '<name>.json' word-count file per processed text file.
OUTPUT_JSON_PATH = path.abspath('./data/output/json')

def pdfToText(path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.
            Inside this function the name shadows the module-level
            ``os.path`` import; it is kept so existing callers still work.

    Returns:
        A single string made of each page's ``get_text('text')`` output,
        joined with no separator.
    """
    # The context manager closes the document even if extraction fails.
    # Iterating the document yields its pages in order, which also avoids
    # the deprecated camelCase ``pageCount`` attribute.
    with fitz.open(path) as doc:
        return ''.join(page.get_text('text') for page in doc)

from os import makedirs

# Create the destination folder up front so the first open() below cannot
# fail with FileNotFoundError on a fresh checkout.
makedirs(OUTPUT_TXT_PATH, exist_ok=True)

# Immediate sub-directories of the input folder; each one becomes one text file.
# The default tuple keeps this at [] when the input folder itself is missing.
dirnames = sorted(next(walk(INPUT_PDF_PATH), (None, [], None))[1])

for dirname in dirnames:
    input_dirpath = path.join(INPUT_PDF_PATH, dirname)
    filenames = next(walk(input_dirpath), (None, None, []))[2]  # [] if no file
    # Sorted so the PDFs are always concatenated in the same order,
    # independent of the order in which the filesystem lists them.
    pdf_filenames = sorted(filename for filename in filenames if filename.endswith('.pdf'))

    output_filepath = path.join(OUTPUT_TXT_PATH, f'{dirname}.txt')
    # NOTE(review): the file is written with the platform default text
    # encoding, which is also what the cell reading these files back uses.
    # Switch both sides to an explicit encoding together, never just one.
    with open(output_filepath, 'w') as file:
        for filename in pdf_filenames:
            text = pdfToText(path.join(input_dirpath, filename))
            file.write(text)

# Keep only the generated .txt files: stray entries in the output folder
# (e.g. '.DS_Store', '.gitkeep') must not be treated as extracted text.
filenames = next(walk(OUTPUT_TXT_PATH), (None, None, []))[2]  # [] if no file
filepaths = [
    path.join(OUTPUT_TXT_PATH, filename)
    for filename in sorted(filenames)
    if filename.endswith('.txt')
]


In [51]:
# Maps each text file's base name (without the '.txt' suffix) to its full content.
raw_contents = {}

for filepath in filepaths:
    # basename() honours the platform path separator; splitting on '/' returns
    # the whole absolute path on Windows, which would later redirect the JSON
    # output to the wrong directory.
    filename = path.basename(filepath)
    # Strip only a trailing '.txt', not every occurrence inside the name.
    if filename.endswith('.txt'):
        filename = filename[:-len('.txt')]
    # NOTE(review): read with the platform default encoding, the same way the
    # files are written by the extraction cell; change both sides together.
    with open(filepath, 'r') as file:
        raw_contents[filename] = file.read()

# One stop word per line. The list is a checked-in data file rather than
# something this notebook generates, so it is decoded explicitly instead of
# depending on the locale -- presumably it is stored as UTF-8; verify.
with open('./data/stop-words/fr.txt', encoding='utf-8') as file:
    stop_words = file.read().splitlines()

# print([content[0:100] for filename, content in raw_contents.items()])


In [52]:
import re

# Any run of the listed punctuation marks, whitespace or digits separates words.
# Written as a raw string: \-, \., \s and \d are regex escapes, not valid
# Python string escapes, and a plain literal triggers an invalid-escape warning.
# Compiled once here instead of being looked up again for every file.
separator_pattern = re.compile(r"[!,’:%«»•())“/\-\.\s\d]+")

# Maps each file name to its lower-cased words, in order of appearance.
file_words = {}

for filename, raw_content in raw_contents.items():
    words = separator_pattern.sub("\n", raw_content).lower().split()
    file_words[filename] = words

# print([words[0:10] for filename, words in file_words.items()])

In [53]:
# Built once: set membership is constant-time, whereas testing against the
# stop-word list rescans the whole list for every single word.
stop_word_set = set(stop_words)

# Maps each file name to its words with the stop words removed (order preserved).
file_filtered_words = {}

for filename, words in file_words.items():
    filtered_words = [word for word in words if word not in stop_word_set]
    file_filtered_words[filename] = filtered_words

# print([filtered_words[0:10] for filename, filtered_words in file_filtered_words.items()])



In [54]:
from collections import Counter

# Maps each file name to a {word: number of occurrences} dictionary.
file_count_dicts = {}

for filename, filtered_words in file_filtered_words.items():
    # Counter tallies every word in a single pass (the previous version
    # rescanned the full word list once per distinct word) and keeps keys in
    # first-occurrence order; dict() keeps the stored value a plain dict.
    count_dict = dict(Counter(filtered_words))
    file_count_dicts[filename] = count_dict

# print([list(count_dict.keys())[0:10] for filename, count_dict in file_count_dicts.items()])


In [55]:
# For every file, rebuild its word-count mapping ordered from the most to the
# least frequent word; the sort is stable, so equal counts keep their order.
results = {
    name: dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
    for name, counts in file_count_dicts.items()
}

# print([list(result.keys())[0:10] for filename, result in results.items()])


In [56]:
import json

from os import makedirs

# Create the destination folder up front so open() below cannot fail with
# FileNotFoundError when the output tree does not exist yet.
makedirs(OUTPUT_JSON_PATH, exist_ok=True)

# Write one '<name>.json' file per entry of results.
for filename, result in results.items():
    output_filepath = path.join(OUTPUT_JSON_PATH, f'{filename}.json')

    # ensure_ascii=False writes accented characters as-is rather than as
    # \uXXXX escapes, hence the explicit UTF-8 encoding on the file.
    with open(output_filepath, 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=2)