In [5]:
# Part 3
# Complete TD8 interface
import pandas as pd
from IPython.display import display, clear_output
import ipywidgets as widgets
import re
from Corpus import Corpus
from Document import Document
from SearchEngine import SearchEngine

# -------------------------
# Charger corpus Discours US
# -------------------------
# Load the tab-separated US speeches table (one row per speech).
df_discours = pd.read_csv("data_corpus/discours_us.csv", sep="\t")

# Index every sentence of every speech as its own Document so that the
# search engine returns sentence-level hits.
corpus_discours = Corpus("Corpus_US")
for _, row in df_discours.iterrows():
    # A missing text would otherwise be stringified to the literal "nan"
    # and indexed as a fake one-word document.
    if pd.isna(row['text']):
        continue

    # Split after sentence-ending punctuation; drop blank fragments (e.g. the
    # empty tail produced when the speech ends with trailing whitespace).
    phrases = [
        phrase
        for phrase in re.split(r'(?<=[.!?])\s+', str(row['text']))
        if phrase.strip()
    ]

    for numero, phrase in enumerate(phrases, start=1):
        corpus_discours.add_document(
            Document(
                titre=f"{row['speaker']} - phrase {numero}",
                auteur=row['speaker'],
                date=row['date'],
                url=row.get('link', ''),  # the 'link' column is optional
                texte=phrase
            )
        )

# Search engine built over the sentence-level corpus.
search_engine_discours = SearchEngine(corpus_discours)

# -------------------------
# Widgets TD8
# -------------------------
# Width shared by every caption label so that the controls line up.
LABEL_WIDTH = '250px'


def _pleine_largeur():
    """Return a new layout that stretches a control over the full row width."""
    return widgets.Layout(width='100%')


def _ligne_etiquetee(intitule, controle):
    """Place a fixed-width caption label and a control side by side."""
    etiquette = widgets.Label(intitule, layout=widgets.Layout(width=LABEL_WIDTH))
    return widgets.HBox([etiquette, controle])


# --- Controls ---------------------------------------------------------------
label_td8 = widgets.Label(" Recherche dans Discours US (TD8)")

text_requete = widgets.Text(
    value='',
    placeholder='Entrez mots clés',
    layout=_pleine_largeur()
)

slider_topk = widgets.IntSlider(
    value=5,
    min=1,
    max=20,
    continuous_update=False,  # only fire once the handle is released
    layout=_pleine_largeur()
)

# Author filter: every distinct speaker, preceded by the "no filter" entry.
auteurs = sorted(df_discours['speaker'].dropna().unique())
dropdown_auteur = widgets.Dropdown(
    options=['Tous', *auteurs],
    value='Tous',
    layout=_pleine_largeur()
)

# Year filter: every distinct year found in the 'date' column.
annees = sorted(
    df_discours['date']
    .dropna()
    .apply(lambda valeur: pd.to_datetime(valeur).year)
    .unique()
)
dropdown_annee = widgets.Dropdown(
    options=['Toutes', *annees],
    value='Toutes',
    layout=_pleine_largeur()
)

btn_search_discours = widgets.Button(description="Rechercher", button_style='success')

# Scrollable, fixed-height area receiving the search results.
output_td8 = widgets.Output(
    layout={'border': '1px solid black', 'height': '300px', 'overflow_y': 'auto'}
)

# --- Rows (caption + control) -----------------------------------------------
ligne_mots_cles = _ligne_etiquetee("Mots clés :", text_requete)
ligne_topk = _ligne_etiquetee("Nombre d'articles à extraire :", slider_topk)
ligne_auteur = _ligne_etiquetee("Auteur :", dropdown_auteur)
ligne_annee = _ligne_etiquetee("Année :", dropdown_annee)

# -------------------------
# Fonction de recherche
# -------------------------
def search_discours(b):
    """Run the keyword search and show the filtered results in ``output_td8``.

    Click handler of ``btn_search_discours``. Reads the query, the number of
    results and the author/year filters from the widgets, queries
    ``search_engine_discours`` and displays the resulting table (or a message
    when the query is empty or nothing matches).

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    def _annee(valeur):
        # Year of a date value, or None when the value is missing or cannot be
        # parsed, so that a single bad date no longer aborts the whole search.
        horodatage = pd.to_datetime(valeur, errors='coerce')
        return None if pd.isna(horodatage) else horodatage.year

    with output_td8:
        clear_output(wait=True)

        requete_str = text_requete.value.strip()
        if not requete_str:
            print("Veuillez entrer des mots clés.")
            return

        resultats = search_engine_discours.search(requete_str, top_k=slider_topk.value)

        # NOTE(review): the filters below run AFTER the engine has truncated
        # the hits to top_k, so a filtered search may show fewer than top_k
        # rows even when more matching documents exist — confirm whether
        # SearchEngine can over-fetch or filter before ranking.

        # Filter by author.
        auteur_choisi = dropdown_auteur.value
        if auteur_choisi != 'Tous':
            resultats = resultats[resultats['Auteur'] == auteur_choisi]

        # Filter by year; rows without a usable date never match.
        annee_choisie = dropdown_annee.value
        if annee_choisie != 'Toutes':
            annees_resultats = resultats['Date'].apply(_annee)
            resultats = resultats[annees_resultats == annee_choisie]

        if resultats.empty:
            print("Aucun résultat trouvé.")
        else:
            display(resultats)

# Hook the search handler to the button before the interface is rendered.
btn_search_discours.on_click(search_discours)

# -------------------------
# Final interface: title, the four input rows, the button and the result area
# -------------------------
display(
    widgets.VBox(
        children=[
            label_td8,
            ligne_mots_cles,
            ligne_topk,
            ligne_auteur,
            ligne_annee,
            btn_search_discours,
            output_td8,
        ]
    )
)


VBox(children=(Label(value=' Recherche dans Discours US (TD8)'), HBox(children=(Label(value='Mots clés :', lay…

In [6]:
#TD9_10


In [16]:
# Interface TD9/10
import pandas as pd
from IPython.display import display, clear_output
import ipywidgets as widgets
import re
import matplotlib.pyplot as plt
from Corpus import Corpus
from CompareCorpus import CompareCorpus

# -------------------------
# Stopwords FR + EN
# -------------------------
# Words ignored when building vocabularies (French + English).
# The tokenizer used in this notebook extracts \w+ runs, so an elided form such
# as "d'un" is split into "d" and "un": the apostrophe never reaches the token.
# The bare elided letters are therefore listed explicitly; "d'" is kept only
# for backward compatibility with code that may test for it.
stopwords = {
    # French
    "le", "la", "les", "un", "une", "des", "du", "de", "d'", "et", "à", "au", "aux",
    "ce", "ces", "cet", "cette", "dans", "en", "pour", "par", "sur", "avec", "sans",
    "sous", "chez", "entre", "mais", "ou", "où", "donc", "or", "ni", "car", "ne", "pas",
    "que", "qui", "quoi", "dont", "lorsque", "comme", "si", "être",
    # French elided fragments as produced by the \w+ tokenizer
    "l", "d", "j", "n", "s", "c", "m", "t", "qu",
    # English
    "the", "a", "an", "and", "but", "if", "then", "for", "on", "in", "at", "with",
    "without", "by", "of", "to", "from", "up", "down", "over", "under", "between",
    "into", "about", "as", "is", "are", "was", "were", "be", "been", "being",
    "he", "she", "it", "they", "them", "his", "her", "its", "their",
    "i", "you", "we", "me", "him", "us",
}

# -------------------------
# Charger corpus
# -------------------------
# Build one Corpus object per source and fill each from its CSV file.
# NOTE(review): Corpus.load presumably parses the CSV into Document objects
# (the corpora are later read through `id2doc`) — confirm in Corpus.py.
# The construct/load order is kept as is: whether Corpus instances share any
# state is not visible from this notebook.
reddit = Corpus("Reddit")
reddit.load("data_corpus/reddit_fr.csv")
arxiv = Corpus("Arxiv")
arxiv.load("data_corpus/arxiv_fr.csv")
# Comparator over both corpora; used further down for the TF / IDF / BM25 figures.
compare = CompareCorpus(reddit, arxiv)

# The same two files are also read as plain DataFrames: the widgets below
# filter on their 'texte', 'auteur' and 'date' columns.
df_reddit = pd.read_csv("data_corpus/reddit_fr.csv")
df_arxiv = pd.read_csv("data_corpus/arxiv_fr.csv")

# -------------------------
# Préparer vocabulaire et mots communs/spécifiques
# -------------------------
def get_vocab(df, stop_words=None):
    """Count the non-stop-word tokens of a corpus table.

    All values of the ``'texte'`` column are lowercased and tokenized into
    ``\\w+`` runs. Missing texts are skipped instead of being stringified,
    which previously injected spurious "nan"/"none" tokens into the vocabulary.

    Args:
        df: DataFrame with a ``'texte'`` column.
        stop_words: Optional collection of lowercase tokens to ignore.
            Defaults to the module-level ``stopwords`` set.

    Returns:
        dict mapping each kept token to its number of occurrences, in order of
        first appearance.
    """
    if stop_words is None:
        stop_words = stopwords

    textes = df['texte'].dropna().astype(str)
    all_text = ' '.join(textes).lower()

    vocab = {}
    for w in re.findall(r'\b\w+\b', all_text):
        if w not in stop_words:
            vocab[w] = vocab.get(w, 0) + 1
    return vocab

# Token counts of each corpus.
vocab_r, vocab_a = get_vocab(df_reddit), get_vocab(df_arxiv)

# Words present in both corpora, with their combined number of occurrences.
mots_communs = {
    mot: nb_reddit + vocab_a[mot]
    for mot, nb_reddit in vocab_r.items()
    if mot in vocab_a
}

# Words found in exactly one corpus, keyed by the name of that corpus.
mots_spec = {
    "Reddit": {mot: nb for mot, nb in vocab_r.items() if mot not in vocab_a},
    "Arxiv": {mot: nb for mot, nb in vocab_a.items() if mot not in vocab_r},
}

# -------------------------
# Widgets interface
# -------------------------
# Title shown at the top of the comparison interface.
label_titre = widgets.Label("Comparaison Reddit / Arxiv")

# Kind of word to analyse; nothing is pre-selected.
type_mot = widgets.RadioButtons(
    description="Type :",
    options=["Mot commun", "Mot spécifique"],
    value=None,
)

# Corpus the word list is taken from.
source = widgets.Dropdown(
    description="Source :",
    options=["Reddit", "Arxiv", "Tous"],
)

# These three dropdowns start empty; update_liste fills their options.
liste_mots = widgets.Dropdown(description="Mot :")
filtre_auteur = widgets.Dropdown(description="Auteur :")
filtre_annee = widgets.Dropdown(description="Année :")

btn_analyser = widgets.Button(
    description="Analyser le mot",
    button_style="success",
)

# Scrollable, fixed-height area receiving tables, text and the chart.
output = widgets.Output(
    layout={'border': '1px solid black', 'height': '700px', 'overflow_y': 'auto'}
)

# -------------------------
# Mettre à jour listes mots / auteurs / années
# -------------------------
def update_liste(*args):
    """Refresh the word, author and year dropdowns from the current selection.

    Used both as a direct call and as a widget observer, hence ``*args`` (the
    change notification is ignored). The function assigns widget options and
    values itself, so it can be re-entered through its own observers while it
    runs.

    Steps:
      1. Choose the word list (common words, or words specific to the selected
         source) and adapt the ``source`` dropdown accordingly.
      2. Select the documents of each corpus containing the selected word.
      3. Fill the author filter from those documents.
      4. Fill the year filter from those documents.
    """
    # =========================
    # 1. Word list
    # =========================
    if type_mot.value == "Mot commun":
        mots = sorted(mots_communs.keys())
        # Common words belong to both corpora: lock the source on "Tous".
        source.options = ["Tous"]
        source.value = "Tous"
        source.disabled = True
    else:
        source.options = ["Reddit", "Arxiv"]
        source.disabled = False
        # Any value other than the two corpus names yields an empty list.
        mots = sorted(mots_spec.get(source.value, {}).keys())

    liste_mots.options = mots

    # Only force a selection when the current one is not in the new list.
    if mots and liste_mots.value not in mots:
        liste_mots.value = mots[0]

    # No word available: reset the filters to their neutral entry and stop.
    if not liste_mots.value:
        filtre_auteur.options = ["Tous"]
        filtre_annee.options = ["Toutes"]
        return

    mot_sel = liste_mots.value

    # =========================
    # 2. Documents containing the word (whole word, case-insensitive)
    # =========================
    motif = rf"\b{re.escape(mot_sel)}\b"

    def docs_avec_mot(df):
        # na=False: a missing text counts as "does not contain the word".
        return df[df['texte'].str.contains(motif, case=False, regex=True, na=False)]

    df_r = docs_avec_mot(df_reddit)
    df_a = docs_avec_mot(df_arxiv)

    # =========================
    # 3. Authors
    # =========================
    if source.value == "Reddit":
        auteurs = sorted(df_r['auteur'].dropna().unique())
    elif source.value == "Arxiv":
        auteurs = sorted(df_a['auteur'].dropna().unique())
    else:
        auteurs = sorted(
            set(df_r['auteur'].dropna()) | set(df_a['auteur'].dropna())
        )

    filtre_auteur.options = ["Tous"] + auteurs
    filtre_auteur.value = "Tous"

    # =========================
    # 4. Years
    # =========================
    def get_years(df):
        if df.empty:
            return set()
        years = pd.to_datetime(df['date'], errors='coerce').dt.year.dropna()
        # With at least one unparseable date the year column is float, which
        # used to produce labels such as "2020.0" that int() cannot parse
        # later on. Cast back to int once the NaN rows are dropped.
        return set(years.astype(int))

    if source.value == "Reddit":
        annees = sorted(get_years(df_r))
    elif source.value == "Arxiv":
        annees = sorted(get_years(df_a))
    else:
        annees = sorted(get_years(df_r) | get_years(df_a))

    filtre_annee.options = ["Toutes"] + [str(a) for a in annees]
    filtre_annee.value = "Toutes"


# Any change of the word type, the source or the selected word refreshes
# the dependent dropdowns.
for _widget_observe in (type_mot, source, liste_mots):
    _widget_observe.observe(update_liste, names='value')

# -------------------------
# Comptage occurrences par année
# -------------------------
def count_occurrences(docs, mot):
    """Count the whole-word occurrences of ``mot`` per publication year.

    Matching is case-insensitive: both the word and the document text are
    lowercased before matching. Documents whose date is missing or cannot be
    parsed are skipped instead of raising or creating a NaN year.

    Args:
        docs: Iterable of objects exposing ``date`` and ``texte`` attributes.
        mot: Word to look for.

    Returns:
        dict mapping each year (int) to the total number of occurrences found
        in the documents of that year; years without any occurrence are absent.
    """
    # Compiled once: the pattern does not depend on the document.
    motif = re.compile(rf"\b{re.escape(str(mot).lower())}\b")

    counts = {}
    for doc in docs:
        nb = len(motif.findall(str(doc.texte).lower()))
        if nb == 0:
            continue

        # Only parse the date of documents that actually contribute.
        horodatage = pd.to_datetime(doc.date, errors='coerce')
        if pd.isna(horodatage):
            continue

        counts[horodatage.year] = counts.get(horodatage.year, 0) + nb
    return counts

# -------------------------
# Analyse mot
# -------------------------
def analyser_mot(b):
    """Analyse the selected word and render the result in ``output``.

    Click handler of ``btn_analyser``. It:
      * selects, in each corpus table, the documents containing the word and
        applies the optional author / year filters;
      * attaches the TF, IDF and BM25 values returned by
        ``compare.analyser_mot`` and displays the relevant table(s);
      * prints which corpus has the higher BM25 value;
      * draws the yearly number of occurrences in both corpora (computed on
        the full corpora, independently of the author / year filters).

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    colonnes = ['titre', 'auteur', 'date', 'texte', 'TF', 'IDF', 'BM25']

    with output:
        clear_output()
        mot_sel = liste_mots.value
        if not mot_sel:
            print("Veuillez sélectionner un mot.")
            return

        auteur_val = None if filtre_auteur.value == "Tous" else filtre_auteur.value
        if filtre_annee.value in (None, "Toutes"):
            annee_val = None
        else:
            # int(float(...)) also accepts labels such as "2020.0".
            annee_val = int(float(filtre_annee.value))

        motif = rf"\b{re.escape(mot_sel)}\b"

        def filtrer(df):
            # na=False: rows with a missing text would otherwise put NaN in
            # the mask, and boolean indexing rejects such a mask.
            selection = df[df['texte'].str.contains(motif, case=False, regex=True, na=False)]
            if auteur_val and not selection.empty:
                selection = selection[selection['auteur'] == auteur_val]
            if annee_val and not selection.empty:
                # Unparseable dates become NaT and simply never match.
                annees_docs = pd.to_datetime(selection['date'], errors='coerce').dt.year
                selection = selection[annees_docs == annee_val]
            return selection

        df_r = filtrer(df_reddit)
        df_a = filtrer(df_arxiv)

        if df_r.empty and df_a.empty:
            print(f"Aucun document trouvé pour le mot '{mot_sel}'.")
            return

        res_r, res_a = compare.analyser_mot(mot_sel)

        def annoter(df, res):
            # The scores are per-corpus scalars: repeat them on every row.
            if df.empty:
                return df
            return df.assign(TF=res['TF'], IDF=res['IDF'], BM25=res['BM25'])

        df_r = annoter(df_r, res_r)
        df_a = annoter(df_a, res_a)

        if type_mot.value == "Mot commun":
            if not df_r.empty:
                print(f" Tableau Reddit filtré pour le mot commun '{mot_sel}' :")
                display(df_r[colonnes])
            if not df_a.empty:
                print(f" Tableau Arxiv filtré pour le mot commun '{mot_sel}' :")
                display(df_a[colonnes])
        else:
            if source.value == "Reddit" and not df_r.empty:
                print(f" Tableau Reddit filtré pour le mot spécifique '{mot_sel}' :")
                display(df_r[colonnes])
            elif source.value == "Arxiv" and not df_a.empty:
                print(f" Tableau Arxiv filtré pour le mot spécifique '{mot_sel}' :")
                display(df_a[colonnes])

        print("\n Interprétation BM25 :")
        print(f"Reddit : TF={res_r['TF']} | IDF={res_r['IDF']:.3f} | BM25={res_r['BM25']:.3f}")
        print(f"Arxiv  : TF={res_a['TF']} | IDF={res_a['IDF']:.3f} | BM25={res_a['BM25']:.3f}")
        if res_r['BM25'] > res_a['BM25']:
            print("Le mot est plus important dans Reddit")
        elif res_r['BM25'] < res_a['BM25']:
            print("Le mot est plus important dans Arxiv")
        else:
            print("Le mot a la même importance dans les deux corpus")

        counts_r = count_occurrences(reddit.id2doc.values(), mot_sel)
        counts_a = count_occurrences(arxiv.id2doc.values(), mot_sel)

        # Both series are drawn on the same sorted list of years. A string
        # axis keeps categories in order of first appearance, so plotting each
        # corpus with its own years could give a non-chronological x-axis.
        annees_graphe = sorted(
            y for y in set(counts_r) | set(counts_a) if pd.notna(y)
        )
        etiquettes = [str(y) for y in annees_graphe]

        fig, ax = plt.subplots(figsize=(8, 4))
        if counts_r:
            ax.bar(etiquettes, [counts_r.get(y, 0) for y in annees_graphe], alpha=0.6, label='Reddit')
        if counts_a:
            ax.bar(etiquettes, [counts_a.get(y, 0) for y in annees_graphe], alpha=0.6, label='Arxiv')
        ax.set_xlabel("Année")
        ax.set_ylabel("Occurrences")
        ax.set_title(f"Évolution temporelle du mot '{mot_sel}'")
        # A legend without any labelled artist only triggers a warning.
        if counts_r or counts_a:
            ax.legend()
        plt.show()

# -------------------------
# Affichage interface
# -------------------------
# Hook the analysis handler to the button, then render the interface.
btn_analyser.on_click(analyser_mot)
display(
    widgets.VBox(
        children=[
            label_titre,
            type_mot,
            source,
            liste_mots,
            filtre_auteur,
            filtre_annee,
            btn_analyser,
            output,
        ]
    )
)
# Fill the dropdowns once so the interface is usable before any interaction.
update_liste()


VBox(children=(Label(value='Comparaison Reddit / Arxiv'), RadioButtons(description='Type :', options=('Mot com…