<a href="https://colab.research.google.com/github/jpbeaud/language/blob/main/incident_LAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer

# Simple preprocessing
def preprocess(text):
    """Lowercase *text* and remove every character that is not a letter or whitespace.

    Letters are matched Unicode-aware, so accented characters are preserved
    instead of being silently deleted (an ASCII-only ``a-z`` filter would turn
    "réseau" into "rseau"). For pure-ASCII input the result is identical to
    an ASCII-only letter filter.

    Args:
        text: The raw string to normalize.

    Returns:
        The lowercased string with punctuation, symbols, digits and
        underscores removed. Whitespace is left untouched.
    """
    text = text.lower()
    # `[^\w\s]` removes punctuation/symbols; `[\d_]` removes the remaining
    # non-letter word characters (digits and underscore).
    return re.sub(r"[^\w\s]|[\d_]", "", text)

# Load the data. With sep=None and the python engine, pandas detects the
# delimiter itself.
df = pd.read_csv("incident 1.txt", sep=None, engine="python")
# Use the first column whose name contains "Description".
desc_col = [name for name in df.columns if "Description" in name][0]
# Drop missing descriptions, then normalize each remaining one.
documents = [preprocess(raw_text) for raw_text in df[desc_col].dropna()]

# Vectorization: build the bag-of-words matrix and the vocabulary,
# with English stop words excluded.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()
# Two-way lookup between vocabulary words and their column indices.
word2id = dict(zip(vocab, range(len(vocab))))
id2word = dict(enumerate(vocab))

# LDA parameters
# K: number of topics; alpha: smoothing added to the document-topic counts;
# beta: smoothing added to the topic-word counts.
K, alpha, beta = 5, 0.1, 0.01
# D: number of documents; V: vocabulary size.
D, V = len(documents), len(vocab)

# Initialization
# Keep only in-vocabulary tokens. The topic assignments in Z[d] are stored
# one per in-vocabulary token, so doc_words[d] must contain exactly those
# tokens for position i in doc_words[d] to match Z[d][i]. Without this
# filter, any token missing from the vocabulary (stop words, one-letter
# tokens) shifts the indices and the sampler reads the wrong assignment or
# raises IndexError.
doc_words = [[word for word in doc.split() if word in word2id] for doc in documents]
Z = []  # Z[d][i] is the topic currently assigned to token i of document d
# The count tables are pre-loaded with their smoothing terms, so the
# sampling step below can use them directly.
doc_topic_counts = np.zeros((D, K)) + alpha
topic_word_counts = np.zeros((K, V)) + beta
topic_counts = np.zeros(K) + V * beta

# Assign a uniformly random topic to every token and record it in the counts.
for d, doc in enumerate(doc_words):
    current_doc = []
    for word in doc:
        topic = np.random.randint(K)
        word_id = word2id[word]
        doc_topic_counts[d, topic] += 1
        topic_word_counts[topic, word_id] += 1
        topic_counts[topic] += 1
        current_doc.append(topic)
    Z.append(current_doc)

# Gibbs sampling
n_iter = 100
for it in range(n_iter):
    for d, doc in enumerate(doc_words):
        for i, word in enumerate(doc):
            word_id = word2id[word]
            topic = Z[d][i]

            # Decrement: remove this token's current assignment from the counts.
            doc_topic_counts[d, topic] -= 1
            topic_word_counts[topic, word_id] -= 1
            topic_counts[topic] -= 1

            # Sampling: the unnormalized weight of topic k is
            # doc_topic_counts[d, k] * topic_word_counts[k, w] / topic_counts[k].
            p_z = (doc_topic_counts[d] * topic_word_counts[:, word_id]) / topic_counts
            new_topic = np.random.choice(K, p=p_z / p_z.sum())

            # Increment: record the newly sampled assignment.
            Z[d][i] = new_topic
            doc_topic_counts[d, new_topic] += 1
            topic_word_counts[new_topic, word_id] += 1
            topic_counts[new_topic] += 1

# Display the topics: for each topic, print its highest-count words.
n_top_words = 10
for k in range(K):
    # Sort ascending, reverse, then keep the first n_top_words indices.
    top_word_ids = np.argsort(topic_word_counts[k])[::-1][:n_top_words]
    top_words = list(map(id2word.__getitem__, top_word_ids))
    print(f"Sujet {k + 1}: {', '.join(top_words)}")


                                                  precision    recall  f1-score   support

     COV-APP-APPLI-Mobile-SanFrancisco-Levallois       0.24      0.32      0.28        25
                             COV-APP-CCED ED-GMF       0.00      0.00      0.00         1
                                 COV-APP-CICERON       0.00      0.00      0.00         3
                       COV-APP-COMPTE CLIENT-GMF       1.00      0.32      0.48        47
                       COV-APP-COMPTE-CLIENT-LEV       1.00      1.00      1.00         1
                             COV-APP-CRM MKT-GMF       0.33      0.11      0.17         9
                             COV-APP-CRM OPE-GMF       0.77      0.94      0.84      1315
                                COV-APP-DATA HUB       0.00      0.00      0.00         5
                            COV-APP-DATA HUB SII       0.00      0.00      0.00         7
    COV-APP-DECISIONNEL DATA Mainframe-LEVALLOIS       0.00      0.00      0.00         1
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Mount Google Drive at /content/drive using the Colab helper module.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')