# Du modèle d'AGODA à un topic modelling adapté à mon corpus

## Conversion des pdf en txt

In [15]:
import os
from PyPDF2 import PdfReader

def extract_text_from_pdfs(root_folder, destination_folder):
    for root, dirs, files in os.walk(root_folder):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                pdf = PdfReader(file_path)
                text = ""
                for page in range(len(pdf.pages)):
                    text += pdf.pages[page].extract_text()
                text_file_path = os.path.join(destination_folder, file.replace(".pdf", ".txt"))
                with open(text_file_path, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

# Utilisez la fonction comme suit :
extract_text_from_pdfs("/Volumes/Elements/JO_débats_1956-1958/", "/Volumes/Elements/JO_débats_1956-1958/débats_texte/")

## Entraînement du modèle

## Exécution du code adapté de Marie Puren

In [4]:
pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git

Collecting git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
  Cloning https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git to /private/var/folders/py/k9l9n4nj5l966__zkxgrfsj00000gn/T/pip-req-build-rilpsybh
  Running command git clone --filter=blob:none --quiet https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git /private/var/folders/py/k9l9n4nj5l966__zkxgrfsj00000gn/T/pip-req-build-rilpsybh
  Resolved https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git to commit bc0ebd0135a6cc78f48ddf184069b4c0b9c017d8
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: FrenchLefffLemmatizer
  Building wheel for FrenchLefffLemmatizer (setup.py) ... [?25ldone
[?25h  Created wheel for FrenchLefffLemmatizer: filename=FrenchLefffLemmatizer-0.3-py3-none-any.whl size=3533513 sha256=bed0659c54558f8b025743e34e6c16b3a589ce9515d9528266d7bc32517734a5
  Stored in directory: /private/var/folders/py/k9l9n4nj5l966__zkxgrfsj00000gn/T/pip-e

Suit le code de Marie Puren, modifié pour mon corpus. Comme je n'ai pas de modèle entraîné, je ne récupère pas tous les résultats proposés. 

In [38]:
import os
import pickle  # librairie pour save des modèles de machine learning
import re
from typing import List, TextIO

import numpy as np
import pandas as pd
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer as FLF
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

lemmatizer = FLF()

path_data = "/Volumes/Elements/JO_débats_1956-1958/débats_texte/"
path_to_model = os.path.join(path_data, "model_ML")

stop_words = set(list(open(os.path.join(path_data, "french_stopwords.txt"), "r", encoding="utf-8").read().split("\n"))
                 + stopwords.words("french"))


def table_ocr_1956_1958():
        print("récupération des fichiers")
        fichiers = os.listdir(os.path.join(path_data))
        data = pd.DataFrame(columns=["date", "text"])
        for name in fichiers:
            print(name)
            if name.split("-")[0][:2] == '19':
                file: TextIO = open(os.path.join(path_data, name), encoding="utf-8")
                text: str = file.read()
                file.close()
                data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
        return data

        
def build_corpus(df: pd.DataFrame):
    print("nettoyage du texte")
    data: List = []
    for i in range(df.shape[0]):
        print(f"{i}/{df.shape[0]}")
        text = " ".join(re.findall("[A-Za-zâêûîôäëüïöùàçéèÉ\-\.]+", df.text[i]))
        text = re.sub("([a-z])- ", r"\1", text)
        text = re.sub("\-", " ", text)
        text = re.sub("[M]+\. ([A-Z]+[a-zâêûîôäëüïöùàçéè]+(?:\s[A-Z]+[a-zâêûîôäëüïöùàçéè]+)?)", " ", text)
        text = re.sub("\.", " ", text)
        bag_of_words: List[str] = word_tokenize(text.lower(), language="french")
        bag_of_words = [w for w in bag_of_words if 1 <= len(w) < 22]
        bag_of_words = [lemmatizer.lemmatize(w).lower() for w in bag_of_words if w not in stop_words]
        data.append(bag_of_words)
    df.loc[:, "bag_of_words"] = data
    return df


def count_vectorizer(df: pd.DataFrame, p: int):
    print("compte des coccurences")
    data = [" ".join(w) for w in df.bag_of_words]
    vectorizer = CountVectorizer(max_features=p)
    X = vectorizer.fit_transform(data)
    word_frequency_matrix = pd.DataFrame(data=X.toarray(), index=df.date, columns=vectorizer.get_feature_names())
    word_frequency_matrix = word_frequency_matrix.sort_index()
    word_frequency_matrix.to_csv(os.path.join(path_data, "word_frequency_56-58.csv"),
                                 sep=";", encoding="utf-8", index=False)
    return word_frequency_matrix


def build_model(model_file):
    nb_topics: int = 50
    words_per_topic: int = 20
    if os.path.exists(os.path.join(path_to_model, model_file)):
        print("il existe déjà un modèle avec ce nom")
    else:
        word_frequency_matrix = pd.read_csv(os.path.join(path_data, "word_frequency_56-58.csv"), sep=";", encoding="utf-8",
                                            index_col=0)
        clefs: List[str] = list(word_frequency_matrix.columns)
        print(clefs[:10])
        blocs: List[str] = list(word_frequency_matrix.index)
        print(blocs[:10])
        print("Topic modeling")
        lda = LatentDirichletAllocation(n_components=nb_topics)
        topic_to_text = lda.fit_transform(word_frequency_matrix.values)
        pkl_filename = os.path.join(path_to_model, model_file)
        with open(pkl_filename, 'wb') as file:
            pickle.dump(lda, file)
    all_topics: pd.DataFrame = pd.DataFrame({f"Topic{i}": [clefs[w] for w in top.argsort()[-words_per_topic:]]
                                             for i, top in enumerate(lda.components_)})
    table_topics_to_texts: pd.DataFrame = pd.DataFrame(np.vectorize(lambda z: f"{z:.3f}")(topic_to_text),
                                                       columns=range(nb_topics), index=blocs)
    all_topics.to_excel(os.path.join(path_data, "topics.xlsx"), encoding="utf-8", index=False)
    table_topics_to_texts.to_excel(os.path.join(path_data, "corpus_topics.xlsx"), encoding="utf-8", index=True)


def load_model(model_file):
    word_freq = pd.read_csv(os.path.join(path_data, "word_frequency_56-58.csv"), sep=";", encoding="utf-8", index_col=0)
    clefs: List[str] = list(word_freq.columns)
    blocs: List[str] = list(word_freq.index)
    pkl_filename = os.path.join(path_to_model, model_file)
    with open(pkl_filename, 'rb') as file:
        lda = pickle.load(file)
    nb_topics: int = lda.n_components
    words_per_topic: int = 20
    topic_to_text = lda.transform(word_freq.values)
    all_topics: pd.DataFrame = pd.DataFrame({f"Topic{i}": [clefs[w] for w in top.argsort()[-words_per_topic:]]
                                             for i, top in enumerate(lda.components_)})
    table_topics_to_texts: pd.DataFrame = pd.DataFrame(np.vectorize(lambda z: f"{z:.3f}")(topic_to_text),
                                                       columns=range(nb_topics), index=blocs)
    all_topics.to_excel(os.path.join(path_data, "topics.xlsx"), encoding="utf-8", index=False)
    table_topics_to_texts.to_excel(os.path.join(path_data, "corpus_topics.xlsx"), encoding="utf-8", index=True)
    return topic_to_text, all_topics, table_topics_to_texts


def get_parameter(adr: str):
    pkl_filename = os.path.join(path_to_model, adr)
    with open(pkl_filename, 'rb') as file:
        lda: LatentDirichletAllocation = pickle.load(file)
    print(lda.get_params())


if __name__ == "__main__":

    df = table_ocr_1956_1958()  # collecte dans un dataframe, les textes des années 56-58 avec la date comme référence
    df = build_corpus(df)  # calcul les bag of words pour chaque document, puis les ajoute dans le dataframe précédent
    word_frequency = count_vectorizer(df, 10000)  # à partir des bag of words, calcul la matrice de fréquence par doc
    build_model('lda_model.pkl')  # lance un modèle de LDA. Il ne fera rien si un modèle du même nom existe déjà
    text_topics, topics, table_text_topics = load_model("lda_model.pkl")
    get_parameter("lda_model.pkl")

récupération des fichiers
.DS_Store
1956_i100.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i101.txt
1956_i102.txt
1956_i103.txt
1956_i104.txt
1956_i105.txt
1956_i106.txt
1956_i107.txt
1956_i108.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i109.txt
1956_i110.txt
1956_i111.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i112.txt
1956_i113.txt
1956_i114.txt
1956_i115.txt
1956_i116.txt
1956_i117.txt
1956_i118.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i119.txt
1956_i120.txt
1956_i121.txt
1956_i122.txt
1956_i123.txt
1956_i124.txt
1956_i125.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i126.txt
1956_i127.txt
1956_i128.txt
1956_i129.txt
1956_i130.txt
1956_i131.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i132.txt
1956_i133.txt
1956_i134.txt
1956_i135.txt
1956_i136.txt
1956_i137.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i138.txt
1956_i139.txt
1956_i140.txt
1956_i141.txt
1956_i142.txt
1956_i143.txt
1956_i144.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956_i145.txt
1956_i146.txt
1956_i147.txt
1956_i95.txt
1956_i96.txt
1956_i97.txt
1956_i98.txt
1956_i99.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance100.txt
1956seance101.txt
1956seance103.txt
1956seance104.txt
1956seance105.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance107.txt
1956seance109.txt
1956seance110.txt
1956seance111.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance113.txt
1956seance115.txt
1956seance116.txt
1956seance117.txt
1956seance118.txt
1956seance119.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance120.txt
1956seance121.txt
1956seance122.txt
1956seance124.txt
1956seance125.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance126.txt
1956seance128.txt
1956seance129.txt
1956seance131.txt
1956seance133.txt
1956seance135.txt
1956seance137.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance139.txt
1956seance141.txt
1956seance143.txt
1956seance146.txt
1956seance148.txt
1956seance150.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance152.txt
1956seance155.txt
1956seance157.txt
1956seance160.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance162.txt
1956seance165.txt
1956seance166.txt
1956seance168.txt
1956seance169.txt
1956seance171.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance172.txt
1956seance174.txt
1956seance177.txt
1956seance178.txt
1956seance180.txt
1956seance181.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance182.txt
1956seance186.txt
1956seance187.txt
1956seance188.txt
1956seance189.txt
1956seance190.txt
1956seance52_19_01.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance53_20_01.txt
1956seance54_.txt
1956seance55.txt
1956seance56.txt
1956seance57.txt
1956seance58.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance59.txt
1956seance60.txt
1956seance61.txt
1956seance62.txt
1956seance63.txt
1956seance64.txt
1956seance65.txt
1956seance66.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance67.txt
1956seance69.txt
1956seance70.txt
1956seance72.txt
1956seance73.txt
1956seance75.txt
1956seance76.txt
1956seance77.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance78.txt
1956seance80.txt
1956seance81.txt
1956seance83.txt
1956seance85.txt
1956seance86.txt
1956seance87.txt
1956seance88.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1956seance89.txt
1956seance91.txt
1956seance92.txt
1956seance95.txt
1956seance97.txt
1957_i1.txt

  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)



1957_i10.txt
1957_i100.txt
1957_i101.txt
1957_i102.txt
1957_i103.txt
1957_i104.txt
1957_i105.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i106.txt
1957_i107.txt
1957_i108.txt
1957_i109.txt
1957_i11.txt
1957_i110.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i111.txt
1957_i112.txt
1957_i113.txt
1957_i114.txt
1957_i115.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i116.txt
1957_i117.txt
1957_i118.txt
1957_i119.txt
1957_i12.txt
1957_i120.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i121.txt
1957_i122.txt
1957_i13.txt
1957_i14.txt
1957_i15.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i16.txt
1957_i17.txt
1957_i18.txt
1957_i19.txt
1957_i2.txt
1957_i20.txt
1957_i21.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i22.txt
1957_i23.txt
1957_i24.txt
1957_i25.txt
1957_i26.txt
1957_i27.txt
1957_i28.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i29.txt
1957_i3.txt
1957_i30.txt
1957_i31.txt
1957_i32.txt
1957_i33.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i34.txt
1957_i35.txt
1957_i36.txt
1957_i37.txt
1957_i38.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i39.txt
1957_i4.txt
1957_i40.txt
1957_i41.txt
1957_i42.txt
1957_i43.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i44.txt
1957_i45.txt
1957_i46.txt
1957_i47.txt
1957_i48.txt
1957_i49.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i5.txt
1957_i50.txt
1957_i51.txt
1957_i52.txt
1957_i53.txt
1957_i54.txt
1957_i55.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i56.txt
1957_i57.txt
1957_i58.txt
1957_i59.txt
1957_i6.txt
1957_i60.txt
1957_i61.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i62.txt
1957_i63.txt
1957_i64.txt
1957_i65.txt
1957_i66.txt
1957_i67.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i68.txt
1957_i69.txt
1957_i7.txt
1957_i70.txt
1957_i71.txt
1957_i72.txt
1957_i73.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i74.txt
1957_i75.txt
1957_i76.txt
1957_i77.txt
1957_i78.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i79.txt
1957_i8.txt
1957_i80.txt
1957_i81.txt
1957_i82.txt
1957_i83.txt
1957_i84.txt
1957_i85.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i86.txt
1957_i87.txt
1957_i88.txt
1957_i89.txt
1957_i9.txt
1957_i90.txt
1957_i91.txt
1957_i92.txt
1957_i93.txt
1957_i94.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1957_i95.txt
1957_i96.txt
1957_i97.txt
1957_i98.txt
1957_i99.txt
1958_i1.txt
1958_i10.txt
1958_i11.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i12.txt
1958_i13.txt
1958_i14.txt
1958_i15.txt
1958_i16.txt
1958_i17.txt
1958_i18.txt
1958_i19.txt
1958_i2.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i20.txt
1958_i21.txt
1958_i22.txt
1958_i23.txt
1958_i24.txt
1958_i25.txt
1958_i26.txt
1958_i27.txt
1958_i28.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i29.txt
1958_i3.txt
1958_i30.txt
1958_i31.txt
1958_i32.txt
1958_i33.txt
1958_i34.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i35.txt
1958_i36.txt
1958_i37.txt
1958_i38.txt
1958_i39.txt
1958_i4.txt
1958_i40.txt
1958_i41.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i42.txt
1958_i43.txt
1958_i44.txt
1958_i45.txt
1958_i46.txt
1958_i47.txt
1958_i48.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i49.txt
1958_i5.txt
1958_i50.txt
1958_i51.txt
1958_i52.txt
1958_i53.txt
1958_i54.txt
1958_i55.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i56.txt
1958_i57.txt
1958_i58.txt
1958_i59.txt
1958_i6.txt
1958_i60.txt


  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)
  data = data.append({'date': name.split('.')[0], "text": text}, ignore_index=True)


1958_i62.txt
1958_i7.txt
1958_i8.txt
1958_i9.txt
french_stopwords.txt
reponses_29_septembre_1956.txt
reponses_4_septembre_1956.txt
word_frequency_80.csv
nettoyage du texte
0/327
1/327
2/327
3/327
4/327
5/327
6/327
7/327
8/327
9/327
10/327
11/327
12/327
13/327
14/327
15/327
16/327
17/327
18/327
19/327
20/327
21/327
22/327
23/327
24/327
25/327
26/327
27/327
28/327
29/327
30/327
31/327
32/327
33/327
34/327
35/327
36/327
37/327
38/327
39/327
40/327
41/327
42/327
43/327
44/327
45/327
46/327
47/327
48/327
49/327
50/327
51/327
52/327
53/327
54/327
55/327
56/327
57/327
58/327
59/327
60/327
61/327
62/327
63/327
64/327
65/327
66/327
67/327
68/327
69/327
70/327
71/327
72/327
73/327
74/327
75/327
76/327
77/327
78/327
79/327
80/327
81/327
82/327
83/327
84/327
85/327
86/327
87/327
88/327
89/327
90/327
91/327
92/327
93/327
94/327
95/327
96/327
97/327
98/327
99/327
100/327
101/327
102/327
103/327
104/327
105/327
106/327
107/327
108/327
109/327
110/327
111/327
112/327
113/327
114/327
115/327
116/327
11



['abaissement', 'abaisser', 'abandon', 'abandonne', 'abandonner', 'abandonné', 'abattage', 'abattement', 'abattoir', 'abattre']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Topic modeling


FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/Elements/JO_débats_1956-1958/débats_texte/model_ML/lda_model.pkl'

On récupère un DataFrame de 327 lignes (chaque fichier pdf transformé en fichier texte), avec trois colonnes : le nom du fichier, le texte entier, et la liste des mots du texte. 

In [27]:
df 

Unnamed: 0,date,text,bag_of_words
0,1956_i100,/ \n* Année 1956. — N° 100 A. N. ue Numéro ...,"[année, ue, numéro, franc, jeudi, octobre, jou..."
1,1956_i101,* AÂnée M. N° 101 A.^. Lie Numéro: 15 fra...,"[née, lie, numéro, franc, tt, mercredi, octobr..."
2,1956_i102,* Année 1956. N° 102 A. N. Le Numéro: 15 ...,"[année, numéro, franc, jeudi, octobre, journal..."
3,1956_i103,j \n* Année 1956. — N» 103 A. N. Le Numéro ...,"[année, numéro, franc, vendredi, octobre, jour..."
4,1956_i104,Y \ni i f l\ \n* Année 1958. — N° 104 A. N....,"[année, cy, numéro, franc, samedi, octobre, jo..."
...,...,...,...
322,1958_i60,% Année «88. — W 60 A. N. Le Numéro: 50 ...,"[année, numéro, franc, samedi, octobre, républ..."
323,1958_i62,",r* Année 1958. — N» 62 A. N. \n- i• Le Numé...","[année, numéro, franc, samedi, novembre, fe, j..."
324,1958_i7,Année 1958. — N» T A. N. Le Numéro: 50 fr...,"[année, numéro, franc, mercredi, janvier, jour..."
325,1958_i8,* Année 1958. - N° 8 A. N. Le Numéro: 50 f...,"[année, numéro, franc, jeudi, janvier, journal..."


In [54]:
df.to_csv(path_data + 'Mots_Débats.csv', index=False)

In [36]:
# Affiche la première ligne
print(df.iloc[0])

date                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [37]:
cell = df.iloc[0, 2]  # première ligne, deuxième colonne
print(cell)

['année', 'ue', 'numéro', 'franc', 'jeudi', 'octobre', 'journal', 'officiel', 'république', 'fran', 'aise', 'débat', 'parlementaire', 'assemblee', 'nationale', 'compte', 'rendu', 'in', 'extenso', 'séance', 'question', 'écrites', 'reponses', 'ministre', 'question', 'abonnement', 'édition', 'débat', 'assemblée', 'nationale', 'aréststopolls', 'ex', 'frakce', 'xj', 'otjxzr', 'mei', 'fr', 'étranger', 'oo', 'fr', 'compte', 'chèque', 'postal', 'pari', 'pri', 're', 'joindre', 'derni', 're', 'bande', 'renouvellement', 'réclamation', 'direction', 'rédaction', 'administration', 'quai', 'voltaire', 'iv', 'pari', 'changement', 'adresse', 'ajouter', 'franc', 'session', 'legislature', 'compte', 'rendu', 'in', 'extenso', 'seance', 'séance', 'mercredi', 'octobre', 'sommaire', 'ipraeès', 'verbal', 'excuse', 'congé', 'désaccord', 'urgence', 'discussion', 'désignation', 'ipersonnes', 'contrainte', 'travail', 'pays', 'ennemi', 'suite', 'ja', 'discussion', 'lecture', 'proposition', 'loi', 'discussion', 'gén

In [55]:
mots_débats = df['bag_of_words']
mots_débats

0                                                                                                     [année, ue, numéro, franc, jeudi, octobre, journal, officiel, république, fran, aise, débat, parlementaire, assemblee, nationale, compte, rendu, in, extenso, séance, question, écrites, reponses, ministre, question, abonnement, édition, débat, assemblée, nationale, aréststopolls, ex, frakce, xj, otjxzr, mei, fr, étranger, oo, fr, compte, chèque, postal, pari, pri, re, joindre, derni, re, bande, renouvellement, réclamation, direction, rédaction, administration, quai, voltaire, iv, pari, changement, adresse, ajouter, franc, session, legislature, compte, rendu, in, extenso, seance, séance, mercredi, octobre, sommaire, ipraeès, verbal, excuse, congé, désaccord, urgence, discussion, désignation, ipersonnes, contrainte, travail, pays, ennemi, suite, ja, discussion, lecture, proposition, loi, discussion, générale, suite, mérigonde, chevigny, clôture, article, ...]
1                            