## Base Stats

In [1]:
# Install third-party packages by running pip as a module of the interpreter
# behind this kernel (sys.executable), rather than whichever `pip` happens to be
# first on the shell PATH.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases.
# NOTE(review): pymupdf is not imported in the cells visible here — presumably
# used by another step of the pipeline; confirm it is still needed.
import sys
!{sys.executable} -m pip install pymupdf
!{sys.executable} -m pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [2]:
# Load the French stop-word list: one stop word per line, UTF-8 encoded.
# `StopWords` is a list of the file's lines without their line terminators.
# (The previous `StopWord = []` initialiser was a misspelling of `StopWords`
# and was never read, so it has been removed.)
with open('./data/stop-words/fr.txt', encoding="utf-8") as file:
    StopWords = file.read().splitlines()

In [3]:
from os import walk, path, listdir
import glob
import re
from unidecode import unidecode

# Root folder holding the extracted text, laid out as
# <INPUT_PATH>/<html|pdf>/<candidate>/*.txt
INPUT_PATH = path.abspath('./data/generated/input/')

# Any run of these punctuation marks, whitespace or digits separates two words.
# Written as a raw string so that \-, \s and \d reach the regex engine as-is
# (in a normal string they are invalid escape sequences), and compiled once
# instead of once per file.
WORD_SEPARATOR_RE = re.compile(r"[!,’:%«»•()“/\-.\s\d]+")

# Maps each candidate folder name to the lower-cased words of all its text
# files, with the 'html' and 'pdf' sources merged into one list.
words_per_candidate = {}
for type_path in ['html', 'pdf']:
    for candidate in listdir(path.join(INPUT_PATH, type_path)):
        # Reuse the list if the candidate was already seen under another source.
        candidate_words = words_per_candidate.setdefault(candidate, [])

        text_files_path = path.join(INPUT_PATH, type_path, candidate, "*.txt")
        for filepath in glob.iglob(text_files_path):
            with open(filepath, mode='r', encoding='utf-8') as file:
                file_text = file.read()

            if type_path == 'pdf':
                # Drop the page-break marker present in the PDF text files.
                file_text = file_text.replace("---PAGE---", "")

            words = WORD_SEPARATOR_RE.sub("\n", file_text).lower().split()
            # Extend in place: `a = a + b` would copy the whole list per file.
            candidate_words.extend(words)


In [4]:
# Per-candidate statistics, keyed by candidate name:
#   nb_words        - number of words before stop-word removal
#   nb_without_stop - number of words left after stop-word removal
#   ratio_stop      - fraction of words that were stop words (0.0 if no words)
#   word_map        - occurrence count of each remaining (ASCII-folded) word
infos_per_candidate = {}

# A set gives constant-time membership tests; checking against the list would
# scan every stop word for every single word of the corpus.
stop_word_set = set(StopWords)

for candidate, candidate_words in words_per_candidate.items():
    # Filter on the original spelling first, then strip accents with unidecode.
    without_stop_words = [unidecode(word) for word in candidate_words if word not in stop_word_set]

    word_dict = {}
    for current_word in without_stop_words:
        word_dict[current_word] = word_dict.get(current_word, 0) + 1

    nb_words = len(candidate_words)
    nb_without_stop = len(without_stop_words)
    print("{:15s}: {:6d} -> {:6d}".format(candidate, nb_words, nb_without_stop))
    infos_per_candidate[candidate] = {
        'nb_words': nb_words,
        'nb_without_stop': nb_without_stop,
        # Guard against candidates without any word (e.g. a folder with no
        # .txt file), which would otherwise raise ZeroDivisionError.
        'ratio_stop': (nb_words - nb_without_stop) / nb_words if nb_words else 0.0,
        'word_map': word_dict
    }

poutou         :  11637 ->   5564
macron         :  91226 ->  43997
pecresse       :  36384 ->  18754
hidalgo        :  14870 ->   7196
lassalle       :   5216 ->   2536
jadot          :  40048 ->  19154
zemmour        :  25505 ->  11689
lepen          :  76474 ->  35822
roussel        :  29165 ->  13999
dupont-aignan  :  84029 ->  40408
melenchon      : 226778 -> 107495


In [7]:
# Save infos
import json
import os
# Write the per-candidate statistics to <output dir>/infos.json.
OUTPUT_DIR_PATH = os.path.abspath('./data/generated/output/')
output_filepath = os.path.join(OUTPUT_DIR_PATH, 'infos.json')

# exist_ok=True creates the directory (and missing parents) when needed and is
# a no-op when it already exists, avoiding a separate check-then-create step.
os.makedirs(OUTPUT_DIR_PATH, exist_ok=True)
with open(output_filepath, 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(infos_per_candidate, file, ensure_ascii=False, indent=2)