## Base Stats

In [16]:
# Install the third-party packages used by this notebook.
# `sys.executable -m pip` runs pip with the interpreter of the running kernel,
# so the packages are installed into the environment this notebook uses.
import sys
!{sys.executable} -m pip install pymupdf
!{sys.executable} -m pip install unidecode



In [17]:
# Load stop words
# The file is read as UTF-8 with one entry per line; `StopWords` ends up as a
# list of those lines (without their line terminators).
with open('./data/stop-words/fr.txt', encoding="utf-8") as file:
    StopWords = file.read().splitlines()

In [18]:
from os import walk, path, listdir
import glob
import re
from unidecode import unidecode

INPUT_PATH = path.abspath('./data/generated/input/')

# Any run of the listed punctuation characters, whitespace or digits is a word
# separator. Raw string: `\-`, `\.`, `\s` and `\d` must reach the regex engine
# as written instead of being parsed as (invalid) Python string escapes.
# Compiled once rather than for every file.
WORD_SEPARATOR_PATTERN = re.compile(r"[!,’:%«»•())“/\-\.\s\d]+")

# Maps each candidate (name of the sub-directory found under html/ and pdf/)
# to the flat list of lowercased words read from all of its *.txt files.
words_per_candidate = {}
for type_path in ['html', 'pdf']:
    for candidate in listdir(path.join(INPUT_PATH, type_path)):
        # The same candidate can appear under both html/ and pdf/: reuse its list.
        collected_words = words_per_candidate.setdefault(candidate, [])

        text_files_path = path.join(INPUT_PATH, type_path, candidate, "*.txt")
        for filepath in glob.iglob(text_files_path):
            with open(filepath, mode='r', encoding='utf-8') as file:
                file_text = file.read()

            if type_path == 'pdf':
                # Strip the "---PAGE---" markers from the PDF-derived text
                # (presumably page separators written by the extraction step).
                file_text = file_text.replace("---PAGE---", "")

            words_in_document = WORD_SEPARATOR_PATTERN.sub("\n", file_text).lower().split()

            # Extend in place instead of building a new concatenated list per
            # file, which copied every previously collected word each time.
            collected_words.extend(words_in_document)


In [19]:
# Load model
# Reads a previously saved gensim Word2Vec model from disk; the statistics
# cell uses it to keep only the words the model knows.
# NOTE(review): the path suggests a model trained on French Wikipedia — confirm.
from gensim.models import Word2Vec
filename = "./data/model/frwiki/frwiki.gensim"
word2vecModel = Word2Vec.load(filename)

In [20]:
from itertools import chain
import math

def calculate_tfidf(corpus):
    """Compute one TF-IDF score per distinct word of a tokenized corpus.

    For every word:
      * TF  = occurrences of the word in the whole corpus / total number of
        words in the whole corpus (a corpus-level frequency, not per document);
      * IDF = log(number of documents / number of documents containing the word).

    Args:
        corpus: a sized, re-iterable collection of documents, each document
            being an iterable of hashable words (e.g. a list of lists of str).

    Returns:
        dict mapping each distinct word to ``TF * IDF``. An empty corpus (or a
        corpus of empty documents) yields an empty dict. Words present in
        every document get a score of 0.0.
    """
    # Local import: keeps this definition self-contained within the notebook.
    from collections import Counter

    number_of_documents = len(corpus)

    # Single pass over all words instead of rescanning the full word list for
    # every distinct word (which was quadratic in the corpus size).
    occurrences = Counter(chain.from_iterable(corpus))
    total_words = sum(occurrences.values())

    # Number of documents each word appears in; set() so that a word repeated
    # inside one document is counted once for that document.
    documents_with_word = Counter()
    for doc in corpus:
        documents_with_word.update(set(doc))

    tf_idfs = {}
    for current_word, count in occurrences.items():
        idf = math.log(float(number_of_documents) / documents_with_word[current_word])
        tf = count / total_words
        tf_idfs[current_word] = tf * idf

    return tf_idfs

infos_per_candidate = {}

# Number of consecutive words grouped into one pseudo "page"; these fixed-size
# chunks are the documents handed to calculate_tfidf for each candidate.
fixed_page_word_length = 275

# Set for constant-time membership tests (StopWords is a list).
stop_words = set(StopWords)

# Membership is tested on the model's KeyedVectors (`.wv`): `word in model`
# itself is deprecated in gensim 3.x and announced for removal in gensim 4.0.
known_vocabulary = word2vecModel.wv

for candidate, candidate_words in words_per_candidate.items():
    # Remove stop words and words unknown to the word2vec model
    without_stop_words = [
        word for word in candidate_words
        if word not in stop_words and word in known_vocabulary
    ]

    # Cut the remaining words into fixed-size pseudo pages and score them
    page_chunks = [
        without_stop_words[start:start + fixed_page_word_length]
        for start in range(0, len(without_stop_words), fixed_page_word_length)
    ]
    tf_idfs = calculate_tfidf(page_chunks)

    # Occurrence count of every kept word
    word_counts = {}
    for current_word in without_stop_words:
        word_counts[current_word] = word_counts.get(current_word, 0) + 1

    # Highest value first; dicts keep insertion order, so the ordering is
    # preserved in the stored dictionaries.
    word_dict = dict(sorted(word_counts.items(), key=lambda item: item[1], reverse=True))
    tf_idfs = dict(sorted(tf_idfs.items(), key=lambda item: item[1], reverse=True))

    nb_words = len(candidate_words)
    nb_without_stop = len(without_stop_words)

    infos_per_candidate[candidate] = {
        'nb_words': nb_words,
        'nb_without_stop': nb_without_stop,
        # Share of words dropped by the filter above. A candidate without any
        # word gets 0.0 instead of raising ZeroDivisionError.
        'ratio_stop': float(nb_words - nb_without_stop) / nb_words if nb_words else 0.0,
        'word_map': word_dict,
        'tf_idfs': tf_idfs
    }
    print("{:15s}: {:6d} -> {:6d}".format(candidate, nb_words, nb_without_stop))

  without_stop_words = [word for word in candidate_words if word not in StopWords and word in word2vecModel ]


macron         :  63067 ->  28215
pecresse       :  33768 ->  15888
poutou         :  11454 ->   5048
dupont-aignan  :  84029 ->  38635
hidalgo        :  14870 ->   6600
jadot          :  40048 ->  17855
lassalle       :   5216 ->   2463
lepen          :  76474 ->  32675
melenchon      : 226778 ->  98686
roussel        :  29165 ->  12790
zemmour        :  25505 ->  11516


In [21]:
# Compute the global word map
# Step 1: for every candidate, turn the raw word counts into frequencies
# relative to the candidate's number of kept words, and collect, per word, the
# list of frequencies observed across candidates.
all_words_frequencies = {}
for candidate, candidate_infos in infos_per_candidate.items():
    nb_word_candidat = candidate_infos['nb_without_stop']
    candidate_frequencies = {}
    infos_per_candidate[candidate]['freq'] = candidate_frequencies

    for word, word_count in candidate_infos['word_map'].items():
        word_frequency_for_candidat = float(word_count) / nb_word_candidat
        all_words_frequencies.setdefault(word, []).append(word_frequency_for_candidat)
        candidate_frequencies[word] = word_frequency_for_candidat

# Step 2: one global weight per word, computed from the sum of its
# per-candidate frequencies.
# NOTE(review): 12 is hard-coded and the denominator is a sum of frequencies,
# not a count of candidates using the word — confirm both are intended.
all_words_idfs = {
    word: math.log(12 / sum(frequencies))
    for word, frequencies in all_words_frequencies.items()
}

# Step 3: per candidate, multiply each word's frequency by its global weight.
for candidate, candidate_infos in infos_per_candidate.items():
    candidate_frequencies = candidate_infos['freq']
    infos_per_candidate[candidate]['tf_idfs_global'] = {
        word: candidate_frequencies[word] * all_words_idfs[word]
        for word in candidate_infos['word_map']
    }


In [22]:
# Save infos
# Writes the per-candidate statistics as JSON to <output dir>/infos.json.
import json
import os

OUTPUT_DIR_PATH = os.path.abspath('./data/generated/output/')
output_filepath = os.path.join(OUTPUT_DIR_PATH, 'infos.json')

# exist_ok=True creates the directory (and missing parents) only when needed,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR_PATH, exist_ok=True)
with open(output_filepath, 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(infos_per_candidate, file, ensure_ascii=False, indent=2)