<a href="https://colab.research.google.com/github/iswat-portefolio/projet_nlp/blob/main/Untitled24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas

In [57]:
import zipfile
import pandas as pd
import re
import spacy

def charger_et_afficher_tableau(chemin_zip, limite_par_cat=150):
    donnees = []
    categories_cibles = ['business', 'entertainment', 'tech']
    compteurs = {cat: 0 for cat in categories_cibles}

    with zipfile.ZipFile(chemin_zip, 'r') as z:
        for info in z.infolist():
            parties = info.filename.split('/')
            if info.filename.endswith(".txt") and len(parties) > 1:
                cat = parties[-2]
                if cat in categories_cibles and compteurs[cat] < limite_par_cat:
                    with z.open(info.filename) as f:
                        texte_brut = f.read().decode('latin-1')
                        # On garde un aperçu du texte pour le tableau
                        aperçu = texte_brut[:75].replace('\n', ' ') + "..."
                        donnees.append({
                            'Thématique': cat,
                            'Fichier': parties[-1],
                            'Texte Brut (Aperçu)': aperçu,
                            'Contenu_Complet': texte_brut # Pour le pipeline plus tard
                        })
                        compteurs[cat] += 1

            if all(count >= limite_par_cat for count in compteurs.values()):
                break

    # Création du tableau avec Pandas
    df = pd.DataFrame(donnees)

    # Affichage des statistiques
    print("\n--- STATISTIQUES DU CORPUS ---")
    print(df['Thématique'].value_counts())

    # Affichage des 10 premières lignes du tableau
    print("\n--- APERÇU DES DONNÉES ---")
    return df

# Appel de la fonction
df_bbc = charger_et_afficher_tableau('bbc-fulltext.zip')
display(df_bbc.head(10)) # Si vous êtes dans un Notebook


--- STATISTIQUES DU CORPUS ---
Thématique
business         150
entertainment    150
tech             150
Name: count, dtype: int64

--- APERÇU DES DONNÉES ---


Unnamed: 0,Thématique,Fichier,Texte Brut (Aperçu),Contenu_Complet
0,business,001.txt,Ad sales boost Time Warner profit Quarterly p...,Ad sales boost Time Warner profit\n\nQuarterly...
1,business,002.txt,Dollar gains on Greenspan speech The dollar h...,Dollar gains on Greenspan speech\n\nThe dollar...
2,business,003.txt,Yukos unit buyer faces loan claim The owners ...,Yukos unit buyer faces loan claim\n\nThe owner...
3,business,004.txt,High fuel prices hit BA's profits British Air...,High fuel prices hit BA's profits\n\nBritish A...
4,business,005.txt,Pernod takeover talk lifts Domecq Shares in U...,Pernod takeover talk lifts Domecq\n\nShares in...
5,business,006.txt,Japan narrowly escapes recession Japan's econ...,Japan narrowly escapes recession\n\nJapan's ec...
6,business,007.txt,Jobs growth still slow in the US The US creat...,Jobs growth still slow in the US\n\nThe US cre...
7,business,008.txt,"India calls for fair trade rules India, which...","India calls for fair trade rules\n\nIndia, whi..."
8,business,009.txt,Ethiopia's crop production up 24% Ethiopia pr...,Ethiopia's crop production up 24%\n\nEthiopia ...
9,business,010.txt,Court rejects $280bn tobacco case A US govern...,Court rejects $280bn tobacco case\n\nA US gove...


In [58]:
# Chargement du modèle SpaCy
nlp = spacy.load("en_core_web_sm")

def pipeline_nlp(texte):
    # --- NETTOYAGE REGEX ---
    # Suppression URLs, HTML et caractères spéciaux
    texte = re.sub(r'https?://\S+|www\.\S+', '', texte)
    texte = re.sub(r'<.*?>', '', texte)
    texte = re.sub(r'[^a-zA-Z\s]', ' ', texte) # On garde les lettres et espaces
    texte = re.sub(r'\s+', ' ', texte).strip()

    # --- TRAITEMENT SPACY ---
    doc = nlp(texte)

    # Tokenisation, Lemmatisation et Stopwords
    tokens_nettoyes = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]

    # POS-tagging et Dépendances
    # On stocke l'analyse sous forme de liste de dictionnaires
    analyse = [
        {"mot": t.text, "pos": t.pos_, "dep": t.dep_}
        for t in doc if not t.is_space
    ]

    return " ".join(tokens_nettoyes), analyse

# Application sur le tableau
df_bbc['Texte_Nettoye'], df_bbc['Analyse_Syntaxique'] = zip(*df_bbc['Contenu_Complet'].apply(pipeline_nlp))

In [59]:
display(df_bbc.head(10)) # Si vous êtes dans un Notebook

Unnamed: 0,Thématique,Fichier,Texte Brut (Aperçu),Contenu_Complet,Texte_Nettoye,Analyse_Syntaxique
0,business,001.txt,Ad sales boost Time Warner profit Quarterly p...,Ad sales boost Time Warner profit\n\nQuarterly...,ad sale boost time warner profit quarterly pro...,"[{'mot': 'Ad', 'pos': 'NOUN', 'dep': 'compound..."
1,business,002.txt,Dollar gains on Greenspan speech The dollar h...,Dollar gains on Greenspan speech\n\nThe dollar...,dollar gain greenspan speech dollar hit high l...,"[{'mot': 'Dollar', 'pos': 'NOUN', 'dep': 'comp..."
2,business,003.txt,Yukos unit buyer faces loan claim The owners ...,Yukos unit buyer faces loan claim\n\nThe owner...,yukos unit buyer face loan claim owner embattl...,"[{'mot': 'Yukos', 'pos': 'PROPN', 'dep': 'comp..."
3,business,004.txt,High fuel prices hit BA's profits British Air...,High fuel prices hit BA's profits\n\nBritish A...,high fuel price hit ba s profit british airway...,"[{'mot': 'High', 'pos': 'ADJ', 'dep': 'amod'},..."
4,business,005.txt,Pernod takeover talk lifts Domecq Shares in U...,Pernod takeover talk lifts Domecq\n\nShares in...,pernod takeover talk lift domecq shares uk dri...,"[{'mot': 'Pernod', 'pos': 'NOUN', 'dep': 'comp..."
5,business,006.txt,Japan narrowly escapes recession Japan's econ...,Japan narrowly escapes recession\n\nJapan's ec...,japan narrowly escape recession japan s econom...,"[{'mot': 'Japan', 'pos': 'PROPN', 'dep': 'nsub..."
6,business,007.txt,Jobs growth still slow in the US The US creat...,Jobs growth still slow in the US\n\nThe US cre...,job growth slow create few job expect january ...,"[{'mot': 'Jobs', 'pos': 'NOUN', 'dep': 'compou..."
7,business,008.txt,"India calls for fair trade rules India, which...","India calls for fair trade rules\n\nIndia, whi...",india call fair trade rule india attend g meet...,"[{'mot': 'India', 'pos': 'PROPN', 'dep': 'nsub..."
8,business,009.txt,Ethiopia's crop production up 24% Ethiopia pr...,Ethiopia's crop production up 24%\n\nEthiopia ...,ethiopia s crop production ethiopia produce mi...,"[{'mot': 'Ethiopia', 'pos': 'PROPN', 'dep': 'p..."
9,business,010.txt,Court rejects $280bn tobacco case A US govern...,Court rejects $280bn tobacco case\n\nA US gove...,court reject bn tobacco case government claim ...,"[{'mot': 'Court', 'pos': 'NOUN', 'dep': 'nsubj..."
