# TP2 : Apprentissage multi-label sur des données textuelles

*Jordan Dutel et Ariane Paradan*

---
---

## I. Import des librairies

In [None]:
import importlib
import function_TAA as taa
importlib.reload(taa)

## II. Load data

In [None]:
# Load the comma-separated PubMed multi-label file through the project helper.
# NOTE(review): presumably returns a pandas DataFrame — confirm in function_TAA.
df = taa.load_data('PubMed-multi-label-dataset.csv', sep=',')

## III. Pre-processing

### A. Data infos

In [None]:
# Inspect the loaded data; what is reported is defined by taa.get_info (not visible here).
taa.get_info(df)

### B. Data cleaning

L'exécution de cette fonction est longue (environ 4 minutes).

In [27]:
# Clean the 'abstractText' column via the project helper; df is rebound to the result.
df = taa.clean_text(df, 'abstractText')

### C. Features and labels split

In [None]:
# Names of the 14 label columns passed to the feature/label split.
target_column = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
# Separate features (X) from the label columns (y) in multi-label mode.
X, y = taa.split_feature_label(df, target_column, multilabel=True)

### D. Split train test

In [None]:
# Split into train and test sets with test_size=0.5 and a fixed random_state.
# NOTE(review): presumably forwards to sklearn's train_test_split — confirm in function_TAA.
X_train, X_test, y_train, y_test = taa.split_train_test(X, y, test_size=0.5, random_state=42)

In [None]:
# Display the training features as the cell output.
X_train

In [None]:
# Vectorise the train and test texts through the project TF-IDF helper (max_features=2000).
# NOTE(review): presumably the vectoriser is fitted on X_train only — confirm in function_TAA.
X_train_vec, X_test_vec = taa.tfidf_vectorize(X_train, X_test, max_features=2000)

In [None]:
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, zero_one_loss, classification_report

def runMOC(X_train, X_test, y_train, y_test):
    """
    Fit a MultiOutputClassifier built on logistic regression and predict.

    A default ``LogisticRegression`` is wrapped in a
    ``MultiOutputClassifier``, trained on ``(X_train, y_train)`` and used to
    predict ``X_test``. ``y_test`` is accepted but not used here.

    Returns:
        The predictions for ``X_test``.
    """
    classifier = MultiOutputClassifier(LogisticRegression())
    return classifier.fit(X_train, y_train).predict(X_test)

def runECC(X_train, X_test, y_train, y_test):
    """
    Train three randomly ordered k-NN classifier chains and combine them.

    Every chain is fitted on ``(X_train, y_train)`` and predicts ``X_test``.
    The per-chain predictions are averaged and a label is kept when its mean
    is strictly above 0.5. ``y_test`` is accepted but not used here.

    Returns:
        Integer 0/1 array holding the combined prediction.
    """
    knn = KNeighborsClassifier(n_neighbors=5)  # number of neighbours is tunable
    ensemble = [
        ClassifierChain(knn, order='random', random_state=seed)
        for seed in range(3)  # e.g. 3 chains
    ]

    # Fit each chain in turn and collect its predictions for the test set.
    votes = [chain.fit(X_train, y_train).predict(X_test) for chain in ensemble]

    # Average the chain outputs, then binarise with a 0.5 threshold.
    mean_vote = sum(votes) / len(votes)
    return (mean_vote > 0.5).astype(int)

def evaluate(y_test, y_pred):
    """
    Score multi-label predictions and print a classification report.

    Args:
        y_test: ground-truth label indicator matrix.
        y_pred: predicted label indicator matrix.

    Returns:
        dict with the keys "micro-F1", "macro-F1" and "zero-one-loss".
    """
    # zero_division=0 yields the same values as the default ("warn" also
    # scores undefined cases as 0) but without an UndefinedMetricWarning for
    # labels that have no true or no predicted samples.
    metrics = {
        "micro-F1": f1_score(y_test, y_pred, average='micro', zero_division=0),
        "macro-F1": f1_score(y_test, y_pred, average='macro', zero_division=0),
        "zero-one-loss": zero_one_loss(y_test, y_pred)
    }
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    return metrics

def run_models(X_train, X_test, y_train, y_test):
    """
    Run both models, evaluate each one and print a summary.

    The MultiOutputClassifier is run and evaluated first, then the classifier
    chains. Progress messages are printed before each step.

    Returns:
        dict mapping "MOC" and "ECC" to the metrics returned by ``evaluate``.
    """
    scores = {}

    print("Running MultiOutputClassifier...")
    predictions = runMOC(X_train, X_test, y_train, y_test)
    print("Evaluating MultiOutputClassifier...")
    scores["MOC"] = evaluate(y_test, predictions)

    print("\nRunning ClassifierChain...")
    predictions = runECC(X_train, X_test, y_train, y_test)
    print("Evaluating ClassifierChain...")
    scores["ECC"] = evaluate(y_test, predictions)

    print("\nResults Summary:")
    print("MultiOutputClassifier Metrics:", scores["MOC"])
    print("ClassifierChain Metrics:", scores["ECC"])
    return scores

# Assumes X_train_vec, X_test_vec, y_train and y_test are already defined.
results = run_models(X_train_vec, X_test_vec, y_train, y_test)



In [None]:
# Assumes X_train_vec, X_test_vec, y_train and y_test are already defined.
results = run_models(X_train_vec, X_test_vec, y_train, y_test)


In [None]:
# Base logistic-regression estimator; runECC in this cell passes it to every ClassifierChain.
base_lr = LogisticRegression()

# Run MultiOutputClassifier
def runMOC(X_train, y_train, X_test):
    """
    Fit a MultiOutputClassifier built on logistic regression and predict.

    Args:
        X_train: training features.
        y_train: training label matrix.
        X_test: features to predict.

    Returns:
        The predictions for ``X_test``.
    """
    # Use the arguments rather than notebook globals, and hand the
    # predictions back to the caller.
    clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
    return clf.predict(X_test)


def runECC(X_train, y_train, X_test, model_name=None, Y_test=None):
    """
    Train an ensemble of 10 randomly ordered classifier chains and predict.

    Every chain wraps the module-level ``base_lr`` estimator. The chains'
    predicted probabilities are averaged and thresholded at 0.5.

    Args:
        X_train: training features.
        y_train: training label matrix.
        X_test: features to predict.
        model_name: kept for backward compatibility; currently unused.
        Y_test: optional ground-truth labels for ``X_test``. When given, the
            sample-averaged Jaccard score of each chain and of the ensemble
            is printed.

    Returns:
        Integer 0/1 array holding the ensemble prediction for ``X_test``.
    """
    import numpy as np  # local import: numpy is only imported in a later cell

    chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)]
    for chain in chains:
        chain.fit(X_train, y_train)

    # Shape: (n_chains, n_samples, n_labels) of predicted probabilities.
    Y_pred_chains = np.array([chain.predict_proba(X_test) for chain in chains])
    Y_pred_ensemble = Y_pred_chains.mean(axis=0) >= 0.5

    # Scoring needs ground truth, which the original signature never received.
    if Y_test is not None:
        from sklearn.metrics import jaccard_score

        chain_jaccard_scores = [
            jaccard_score(Y_test, Y_pred_chain >= 0.5, average="samples")
            for Y_pred_chain in Y_pred_chains
        ]
        ensemble_jaccard_score = jaccard_score(
            Y_test, Y_pred_ensemble, average="samples"
        )
        print(f"Per-chain Jaccard scores: {chain_jaccard_scores}")
        print(f"Ensemble Jaccard score: {ensemble_jaccard_score}")

    return Y_pred_ensemble.astype(int)

def Evaluate(PRED, y_test):
    """
    Placeholder for the evaluation step; not implemented yet.

    Both arguments are accepted and ignored, and the function returns None.
    """
    return None

def run_modele(X_train, y_train, X_test, y_test):
    """
    Run the MultiOutputClassifier and the classifier-chain ensemble, then
    pass each result to ``Evaluate``.

    Args:
        X_train: training features.
        y_train: training label matrix.
        X_test: features to predict.
        y_test: ground-truth labels for ``X_test``.
    """
    PRED_MOC_LR = runMOC(X_train, y_train, X_test)
    # runECC takes a fourth ``model_name`` argument; omitting it raised a
    # TypeError before any model was trained.
    PRED_ECC_LR = runECC(X_train, y_train, X_test, "ECC_LR")
    Evaluate(PRED_MOC_LR, y_test)
    Evaluate(PRED_ECC_LR, y_test)


In [59]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import jaccard_score, accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfTransformer

def runMOC(X_train, y_train, X_test):
    """
    MultiOutputClassifier with logistic regression.

    Trains on ``(X_train, y_train)`` with ``max_iter=2000`` and a fixed
    ``random_state``, then returns the predictions for ``X_test``.
    """
    estimator = LogisticRegression(max_iter=2000, random_state=42)
    model = MultiOutputClassifier(estimator)
    return model.fit(X_train, y_train).predict(X_test)


def runECC(X_train, y_train, X_test, Y_test):
    """
    Ensemble of 10 randomly ordered classifier chains.

    Each chain is fitted on ``(X_train, y_train)`` and predicts ``X_test``.
    A label is kept in the ensemble output when the mean of the chain
    predictions is at least 0.5. The sample-averaged Jaccard score of every
    chain and of the ensemble against ``Y_test`` is printed.

    Returns:
        Boolean array holding the ensemble prediction.
    """
    estimator = LogisticRegression(max_iter=2000, random_state=42)

    # Build the chains, then fit them one after the other.
    ensemble = [
        ClassifierChain(estimator, order="random", random_state=seed)
        for seed in range(10)
    ]
    for member in ensemble:
        member.fit(X_train, y_train)

    # Stack the per-chain predictions and combine them by thresholded mean.
    stacked = np.array([member.predict(X_test) for member in ensemble])
    combined = stacked.mean(axis=0) >= 0.5

    # Score each chain on its own, then the ensemble.
    chain_jaccard_scores = []
    for single in stacked:
        chain_jaccard_scores.append(jaccard_score(Y_test, single, average="samples"))
    ensemble_jaccard_score = jaccard_score(Y_test, combined, average="samples")

    print(f"Scores Jaccard des chaînes individuelles : {chain_jaccard_scores}")
    print(f"Score Jaccard de l'ensemble : {ensemble_jaccard_score}")

    return combined


def evaluate(PRED, y_test):
    """
    Evaluate model predictions.

    Computes the accuracy score and a classification report for ``PRED``
    against ``y_test``, prints both and returns the accuracy.
    """
    accuracy = accuracy_score(y_test, PRED)
    summary = classification_report(y_test, PRED, zero_division=0)

    # One print call, one item per line: same output as three separate prints.
    print(f"Accuracy : {accuracy}", "Rapport de classification :", summary, sep="\n")
    return accuracy


def run_model(X_train, y_train, X_test, y_test):
    """
    Run both models and evaluate their predictions.

    Prints a header, trains and evaluates the MultiOutputClassifier, then
    does the same for the ensemble of classifier chains. Returns None.
    """
    print("\n=== MultiOutputClassifier ===")
    evaluate(runMOC(X_train, y_train, X_test), y_test)

    print("\n=== Ensemble Classifier Chains ===")
    evaluate(runECC(X_train, y_train, X_test, y_test), y_test)


In [None]:
# Use the TF-IDF matrices produced by taa.tfidf_vectorize (X_train_vec / X_test_vec);
# no X_train_tfidf / X_test_tfidf is defined anywhere in this notebook.
run_model(X_train_vec, y_train, X_test_vec, y_test)

In [None]:
# Sanity-check that features and labels have matching row counts.
# The TF-IDF training matrix is named X_train_vec in this notebook.
print("X_train_vec shape:", X_train_vec.shape)
print("y_train shape:", y_train.shape)

