In [29]:
import pandas as pd
import re
from datetime import datetime

# Widgets 
import ipywidgets as widgets
from IPython.display import display, clear_output

# Barre de progression
from tqdm.notebook import tqdm
from Corpus import Corpus
from Document import Document
from SearchEngine import SearchEngine


In [30]:
# Load discours_US.csv (TAB-separated, parsed with the Python engine).
# Malformed lines are skipped rather than raising.
csv_options = {
    "sep": "\t",
    "engine": "python",
    "quotechar": '"',
    "on_bad_lines": "skip",
}
df = pd.read_csv("discours_US.csv", **csv_options)

# Quick sanity check: dimensions and column names, then the first rows
print(df.shape, df.columns, sep="\n")
df.head()


(25, 5)
Index(['speaker', 'text', 'date', 'descr', 'link'], dtype='object')


Unnamed: 0,speaker,text,date,descr,link
0,CLINTON,": I'm getting ready for a lot of things, a lot...","April 12, 2015",Video Remarks Announcing Candidacy for President,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,CLINTON,"[ ] : I'll be graduating in May, and on gradua...","April 14, 2015",Remarks in a Question and Answer Session at Ki...,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,CLINTON,"So, congratulations on this new poll number in...","October 16, 2015",Interview with Jake tapper of CNN,http://www.presidency.ucsb.edu/ws/index.php?pi...
3,CLINTON,"Thank you, Madam Secretary. This is a big inte...","January 5, 2016",Interview with Chris Matthews of MSNBC,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,CLINTON,Wow! What a night. An unbelievable night. What...,"February 1, 2016",Remarks in Des Moines Following the Iowa Caucus,http://www.presidency.ucsb.edu/ws/index.php?pi...


In [31]:
# Number of speeches per speaker
speaker_counts = df["speaker"].value_counts()
speaker_counts


CLINTON    13
TRUMP      12
Name: speaker, dtype: int64

In [32]:
# Build the corpus: one Document per sentence of every speech

# Sentences shorter than this many characters are ignored
MIN_PHRASE_LENGTH = 20

corpus = Corpus("US_Speeches")

doc_id = 0

for _, row in df.iterrows():
    auteur = row["speaker"]
    texte = str(row["text"])

    # Use the speech's own date so that rebuilding the corpus gives the same
    # documents on every run. Values are parsed one row at a time; a missing
    # or unparseable date falls back to the current time.
    date_discours = pd.to_datetime(row.get("date"), errors="coerce")
    if pd.isna(date_discours):
        date_discours = datetime.now()
    else:
        date_discours = date_discours.to_pydatetime()

    # Naive sentence split on '.', '!' and '?'
    phrases = re.split(r"[.!?]", texte)

    for phrase in phrases:
        phrase = phrase.strip()

        # Skip fragments that are too short (this also drops the empty
        # strings produced by consecutive punctuation marks)
        if len(phrase) < MIN_PHRASE_LENGTH:
            continue

        # One document per sentence; all sentences of a speech share its
        # speaker and date
        doc = Document(
            titre=f"Speech_{doc_id}",
            auteur=auteur,
            date=date_discours,
            url="discours_US",
            texte=phrase
        )

        corpus.add_document(doc)
        doc_id += 1

print("Nombre de documents dans le corpus :", corpus.taille())


Nombre de documents dans le corpus : 3686


In [33]:
# Quick check of corpus.search: first 10 results for "freedom"

freedom_hits = corpus.search("freedom")
freedom_hits[:10]


['freedom',
 'freedom',
 'freedom',
 'freedom',
 'freedom',
 'freedom',
 'freedom',
 'freedom']

In [34]:
# ===============================
# TD6 check: concorde
# ===============================
concordance = corpus.concorde("freedom", context=30)
concordance.head()


Unnamed: 0,contexte_gauche,mot,contexte_droit
0,happens every time there is a,freedom,of information act request if
1,orms that give you choice and,freedom,and control in healthcare – a
2,vantaged child in america the,freedom,"to choose the private, public"
3,clude religious and political,freedom,for the cuban people let's al
4,"merica, i will push to expand",freedom,for all of our people i am go


In [35]:
# Build the search engine on top of the corpus
engine = SearchEngine(corpus)

# Smoke test: five best-scoring documents for a two-keyword query
sample_query = "freedom democracy"
engine.search(sample_query, top_n=5)


Unnamed: 0,document,type,score
0,Speech_2880,Generic,0.345115
1,Speech_2308,Generic,0.297088
2,Speech_580,Generic,0.291329
3,Speech_180,Generic,0.277485
4,Speech_2303,Generic,0.26649


In [36]:
# ===============================
# Widgets
# ===============================
# User-facing labels are written with their intended characters
# (they were previously mis-encoded).
title = widgets.Label(value="🔎 Moteur de recherche – Discours US")

# Free-text box for the search keywords
query_input = widgets.Text(
    description="Mots-clés :",
    placeholder="ex: freedom democracy"
)

# Number of results to return (1 to 20, default 5)
slider = widgets.IntSlider(
    value=5,
    min=1,
    max=20,
    description="Résultats :"
)

# Button that launches the search
button = widgets.Button(
    description="Rechercher",
    button_style="primary"
)

# Output area that receives the search results
output = widgets.Output()
output

Output()

In [37]:
# Callback triggered by the search button

def clique_bouton(b):
    """Run the search typed in ``query_input`` and show it in ``output``.

    The number of results is read from ``slider``. When the query is empty
    or only whitespace, a prompt is printed instead of calling the engine.

    Parameters
    ----------
    b
        Argument supplied by the click handler; not used.
    """
    with output:
        # wait=True delays the clearing until new content is ready,
        # which avoids a visible flicker between two searches
        clear_output(wait=True)

        # Strip once so the engine never receives leading/trailing blanks
        query = query_input.value.strip()
        if not query:
            print("Veuillez entrer des mots-clés.")
            return

        display(engine.search(query, top_n=slider.value))


In [38]:
# Attach the click handler to the button
button.on_click(clique_bouton)

# Layout: title on top, then the controls side by side, then the results area
controls_row = widgets.HBox([query_input, slider, button])
interface = widgets.VBox([title, controls_row, output])

display(interface)


VBox(children=(Label(value='🔎 Moteur de recherche – Discours US'), HBox(children=(Text(value='', description='…