Imports de base

In [30]:

import pandas as pd

from Corpus import Corpus
from Document import Document
from SearchEngine import SearchEngine


Chargement du fichier

In [31]:

try:
    # Try with tab delimiter first
    df = pd.read_csv("discours_US.csv", sep="\t", on_bad_lines="skip", engine='python')
except Exception as e:
    print(f"Error with tab delimiter: {e}")
    try:
        # Try with comma delimiter
        df = pd.read_csv("discours_US.csv", on_bad_lines="skip", engine='python')
    except Exception as e2:
        print(f"Error with comma delimiter: {e2}")
        # Try with encoding
        df = pd.read_csv("discours_US.csv", sep="\t", on_bad_lines="skip", encoding='utf-8')

df.head()



Unnamed: 0,speaker,text,date,descr,link
0,CLINTON,": I'm getting ready for a lot of things, a lot...","April 12, 2015",Video Remarks Announcing Candidacy for President,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,CLINTON,"[ ] : I'll be graduating in May, and on gradua...","April 14, 2015",Remarks in a Question and Answer Session at Ki...,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,CLINTON,"So, congratulations on this new poll number in...","October 16, 2015",Interview with Jake tapper of CNN,http://www.presidency.ucsb.edu/ws/index.php?pi...
3,CLINTON,"Thank you, Madam Secretary. This is a big inte...","January 5, 2016",Interview with Chris Matthews of MSNBC,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,CLINTON,Wow! What a night. An unbelievable night. What...,"February 1, 2016",Remarks in Des Moines Following the Iowa Caucus,http://www.presidency.ucsb.edu/ws/index.php?pi...


Distribution des auteurs (value_counts)

In [32]:

# Check available columns
print("Available columns:", df.columns.tolist())

# Try to get author info, handling different column names
author_col = None
for col in ['author', 'Author', 'auteur', 'Auteur', 'speaker']:
    if col in df.columns:
        author_col = col
        break

if author_col:
    df[author_col].value_counts().head(20)
else:
    print(f"No author column found. Available columns: {df.columns.tolist()}")



Available columns: ['speaker', 'text', 'date', 'descr', 'link']


onstruction du Corpus à partir du DataFrame

In [33]:

corpus_discours = Corpus("DiscoursUS")

for _, row in df.iterrows():
    doc = Document(
        row.get("descr", "(Sans titre)"),
        row.get("speaker", "Inconnu"),
        row.get("date", ""),
        row.get("link", ""),
        row.get("text", ""),
    )
    corpus_discours.add_document(doc)

print(corpus_discours)



Corpus('DiscoursUS', ndoc=75, naut=2)


Test de la fonction search

In [34]:

corpus_discours.search("america")[:5]


["ething, too. I'm running for president. Americans have fought their way back from tough",
 ' in favor of those at the top. Everyday Americans need a champion and I want to be that',
 "head, because when families are strong, America is strong. So I'm hitting the road to e",
 "'m here today. I think we all know that Americans have come back from some tough econom",
 'our country are much better off because American families have basically done whatever ']

Test de la fonction concorde

In [35]:

concorde = corpus_discours.concorde("freedom")
concorde[:10]


Unnamed: 0,contexte_gauche,motif_trouve,contexte_droit
0,ent. This happens every time there is a,Freedom,of Information Act request. If somethin
1,r with reforms that give you choice and,freedom,and control in healthcare – at a much l
2,very disadvantaged child in America the,freedom,"to choose the private, public, magnet o"
3,ds will include religious and political,freedom,for the Cuban people. Let's also talk a
4,"Here, in America, I will push to expand",freedom,for all of our people. I am going to lo
5,"betrayed our workers, our borders, our",freedom,"s, and our sovereign rights as a nation."
6,so many lives were given in service to,freedom,. Amazing place. President Lincoln serve
7,iefs about equality and opportunity and,freedom,and common decency. That's something to


Initialisation du moteur de recherche

In [36]:
engine = SearchEngine(corpus_discours)
engine


=== Construction de l'index (TD7) ===
- Nombre de documents : 75
- Taille du vocabulaire : 5011
=== Index construit ===
- Nombre de documents : 75
- Taille du vocabulaire : 5011
=== Index construit ===


<SearchEngine.SearchEngine at 0x1226c91be80>

Tests simples du moteur de recherche

In [37]:

import importlib, SearchEngine
importlib.reload(SearchEngine)

# then recreate the engine with the reloaded class
engine = SearchEngine.SearchEngine(corpus_discours)
engine.search("america security", 5)




=== Construction de l'index (TD7) ===
- Nombre de documents : 75
- Taille du vocabulaire : 5011
=== Index construit ===


Construction des résultats: 100%|██████████| 5/5 [00:00<00:00, 4999.17it/s]


Unnamed: 0,doc_id,score,titre,auteur,date,type,url
0,72,0.04231,Remarks at the Kent State Student Recreation C...,CLINTON,"October 31, 2016",generic,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,47,0.04231,Remarks at the Kent State Student Recreation C...,CLINTON,"October 31, 2016",generic,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,22,0.04231,Remarks at the Kent State Student Recreation C...,CLINTON,"October 31, 2016",generic,http://www.presidency.ucsb.edu/ws/index.php?pi...
3,45,0.038462,Remarks on Proposals for the First 100 Days in...,TRUMP,"October 22, 2016",generic,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,70,0.038462,Remarks on Proposals for the First 100 Days in...,TRUMP,"October 22, 2016",generic,http://www.presidency.ucsb.edu/ws/index.php?pi...


Préparation pour tqdm (barre de progression)

In [38]:
from tqdm.notebook import tqdm


Imports pour l’interface graphique (widgets)

In [39]:
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
except ImportError:
    widgets = None
    display = clear_output = None
    print("ipywidgets non disponible ? UI d?sactiv?e.")


Création des widgets de base

In [40]:
if widgets is None:
    print("Interface widgets d?sactiv?e (ipywidgets manquant).")
else:
    label_titre = widgets.Label("Moteur de recherche")
    
    input_query = widgets.Text(
        description="Mots clés :",
        placeholder="mot1 mot2 mot3"
    )
    
    slider_n = widgets.IntSlider(
        value=10,
        min=1,
        max=50,
        step=1,
        description="Nombre d'articles :"
    )
    
    button_search = widgets.Button(
        description="Rechercher",
        button_style="primary"
    )
    
    output_results = widgets.Output()

Mise en page de l’interface

In [41]:
if widgets is None:
    print("Interface widgets d?sactiv?e (ipywidgets manquant).")
else:
    
    ligne_haut = widgets.HBox([input_query, slider_n, button_search])
    ui = widgets.VBox([label_titre, ligne_haut, output_results])
    
    display(ui)

VBox(children=(Label(value='Moteur de recherche'), HBox(children=(Text(value='', description='Mots clés :', pl…

Fonction clique_bouton (lance la recherche)

In [42]:
if widgets is None:
    print("Interface widgets d?sactiv?e (ipywidgets manquant).")
else:
    
    def clique_bouton(b):
        query = input_query.value.strip()
        n = slider_n.value
    
        with output_results:
            clear_output()
    
            if not query:
                print("⚠️ Veuillez entrer au moins un mot-clé.")
                return
    
            results = engine.search(query, n)
    
            if results.empty:
                print("Aucun document trouvé.")
            else:
                cols = ["doc_id", "score", "titre", "auteur"]
                display(results[cols])
    
    button_search.on_click(clique_bouton)


Ajout d’un filtre sur l’auteur

In [43]:
if widgets is None:
    print("Interface widgets d?sactiv?e (ipywidgets manquant).")
else:
    
    
    input_author = widgets.Text(
        description="Filtrer auteur :",
        placeholder="nom (facultatif)"
    )
    
    # on met à jour l'UI pour ajouter le champ auteur
    ligne_haut = widgets.HBox([input_query, slider_n, input_author, button_search])
    ui = widgets.VBox([label_titre, ligne_haut, output_results])
    
    display(ui)
    
    def clique_bouton_with_filter(b):
        query = input_query.value.strip()
        n = slider_n.value
        auteur_filtre = input_author.value.strip().lower()
    
        with output_results:
            clear_output()
    
            if not query:
                print("⚠️ Veuillez entrer au moins un mot-clé.")
                return
    
            results = engine.search(query, n)
    
            if auteur_filtre:
                results = results[results["auteur"].str.lower().str.contains(auteur_filtre)]
    
            if results.empty:
                print("Aucun document trouvé.")
            else:
                cols = ["doc_id", "score", "auteur", "titre"]
                display(results[cols])
    
    button_search.on_click(clique_bouton_with_filter)
    


VBox(children=(Label(value='Moteur de recherche'), HBox(children=(Text(value='', description='Mots clés :', pl…