# **FULL DATASET BINARY**

### **Preliminary steps**

In [1]:
import os 
print(os.getcwd())
os.chdir('ml-for-nlp')
print(os.getcwd())

/Users/Corentin/Desktop/ENSAE 2024-2025/S2/NLP/final project 
/Users/Corentin/Desktop/ENSAE 2024-2025/S2/NLP/final project /ml-for-nlp


In [2]:
### PACKAGES
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

In [3]:
### FUNCTIONS

from src.configs import constants, ml_config, names
from src.libs import preprocessing, evaluation, visualization

In [4]:
### LOAD DATA

df_poetry_gpt = preprocessing.load_dataset(source="GPT", type="POETRY")
df_poetry_bard = preprocessing.load_dataset(source="BARD", type="POETRY")
df_poetry_human = preprocessing.load_dataset(source="Human", type="POETRY")

df_essay_gpt = preprocessing.load_dataset(source="GPT", type="ESSAY")
df_essay_bard = preprocessing.load_dataset(source="BARD", type="ESSAY")
df_essay_human = preprocessing.load_dataset(source="Human", type="ESSAY")

df_story_gpt = preprocessing.load_dataset(source="GPT", type="STORY")
df_story_bard = preprocessing.load_dataset(source="BARD", type="STORY")
df_story_human = preprocessing.load_dataset(source="Human", type="STORY")

Time taken to load dataset: 0.02466583251953125 seconds
Time taken to load dataset: 0.007227897644042969 seconds
Time taken to load dataset: 0.28362226486206055 seconds
Time taken to load dataset: 0.007528066635131836 seconds
Time taken to load dataset: 0.0073337554931640625 seconds
Time taken to load dataset: 0.07922887802124023 seconds
Time taken to load dataset: 0.009380102157592773 seconds
Time taken to load dataset: 0.010261058807373047 seconds
Time taken to load dataset: 0.009925127029418945 seconds


In [5]:
### GET TRAIN SPLIT

df_train_poetry_gpt, df_test_poetry_gpt = preprocessing.train_valid_split(df_poetry_gpt)
df_train_poetry_bard, df_test_poetry_bard = preprocessing.train_valid_split(
    df_poetry_bard
)
df_train_poetry_human, df_test_poetry_human = preprocessing.train_valid_split(
    df_poetry_human
)

df_train_essay_gpt, df_test_essay_gpt = preprocessing.train_valid_split(df_essay_gpt)
df_train_essay_bard, df_test_essay_bard = preprocessing.train_valid_split(df_essay_bard)
df_train_essay_human, df_test_essay_human = preprocessing.train_valid_split(
    df_essay_human
)

df_train_story_gpt, df_test_story_gpt = preprocessing.train_valid_split(df_story_gpt)
df_train_story_bard, df_test_story_bard = preprocessing.train_valid_split(df_story_bard)
df_train_story_human, df_test_story_human = preprocessing.train_valid_split(
    df_story_human
)

In [6]:
### GET FULL DATASETS

df_train_poetry = preprocessing.group_llms(
    preprocessing.get_poetry_dataset(
        df_train_poetry_gpt,
        df_train_poetry_bard,
        df_train_poetry_human,
        samples_per_source=min(
            len(df_train_poetry_gpt),
            len(df_train_poetry_bard),
            len(df_train_poetry_human),
        ),
    )
)
df_test_poetry = preprocessing.group_llms(
    preprocessing.get_poetry_dataset(
        df_test_poetry_gpt,
        df_test_poetry_bard,
        df_test_poetry_human,
        samples_per_source=None,
    )
)

df_train_essay = preprocessing.group_llms(
    preprocessing.get_essay_dataset(
        df_train_essay_gpt,
        df_train_essay_bard,
        df_train_essay_human,
        samples_per_source=min(
            len(df_train_essay_gpt), len(df_train_essay_bard), len(df_train_essay_human)
        ),
    )
)
df_test_essay = preprocessing.group_llms(
    preprocessing.get_essay_dataset(
        df_test_essay_gpt,
        df_test_essay_bard,
        df_test_essay_human,
        samples_per_source=None,
    )
)

df_train_story = preprocessing.group_llms(
    preprocessing.get_story_dataset(
        df_train_story_gpt,
        df_train_story_bard,
        df_train_story_human,
        samples_per_source=min(
            len(df_train_story_gpt), len(df_train_story_bard), len(df_train_story_human)
        ),
    )
)
df_test_story = preprocessing.group_llms(
    preprocessing.get_story_dataset(
        df_test_story_gpt,
        df_test_story_bard,
        df_test_story_human,
        samples_per_source=None,
    )
)

In [7]:
### CONCATENATE ALL DATASETS

df_train = pd.concat([df_train_poetry, df_train_essay, df_train_story], axis=0).sample(
    frac=1
)
df_test = pd.concat([df_test_poetry, df_test_essay, df_test_story], axis=0).sample(
    frac=1
)

In [8]:
### SPLIT FEATURES AND LABELS

X_train, y_train = preprocessing.split_features_and_labels(df_train)
X_test, y_test = preprocessing.split_features_and_labels(df_test)
X_test_poetry, y_test_poetry = preprocessing.split_features_and_labels(df_test_poetry)
X_test_essay, y_test_essay = preprocessing.split_features_and_labels(df_test_essay)
X_test_story, y_test_story = preprocessing.split_features_and_labels(df_test_story)

## **CLASSIFICATION USING PUNCTUATION & RANDOM FOREST**

In [9]:
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [10]:
from src.configs import ml_config

In [35]:
def punctuation_transform(X: np.ndarray) -> np.ndarray:

    X_series = pd.Series(X).astype(str)
    array_nb_characters = X_series.str.len().to_numpy()

    array_ratio_punctuations = (
        X_series.str.count(r"[^\w\s]").to_numpy() / array_nb_characters
    )
    array_ratio_words_per_sentence = (
        X_series.str.count(r"[.!?]").to_numpy()
        / X_series.str.split().str.len().to_numpy()
    )
    array_ratio_odd_characters = (
        X_series.str.count(r"[!;-_]").to_numpy() / array_nb_characters
    )

    embedding = pd.DataFrame(
        {
            "ratio_punctuations": array_ratio_punctuations,
            "ratio_words_per_sentence": array_ratio_words_per_sentence,
            "ratio_odd_characters": array_ratio_odd_characters,
        }
    ).to_numpy()

    return embedding

def punctuation_map_dimensions_to_names() -> np.ndarray:
    return np.array([
        "ratio_punctuations",
        "ratio_words_per_sentence",
        "ratio_odd_characters"
    ])


In [36]:
### EMBEDDING

X_train_211 = punctuation_transform(X_train)
X_test_211 = punctuation_transform(X_test)
X_test_poetry_211 = punctuation_transform(X_test_poetry)
X_test_essay_211 = punctuation_transform(X_test_essay)
X_test_story_211 = punctuation_transform(X_test_story)

In [16]:
def evaluate(model, X, y):
    """
    Evaluate a model and return confusion matrix + precision, recall, f1.

    Args:
        model: Trained sklearn-like model.
        X (np.ndarray): Input features.
        y (np.ndarray): True labels.

    Returns:
        dict: Dictionary containing confusion matrix, precision, recall, f1 score.
    """
    y_pred = model.predict(X)

    return {
        "confusion_matrix": confusion_matrix(y, y_pred),
        "precision": precision_score(y, y_pred, average="binary"),
        "recall": recall_score(y, y_pred, average="binary"),
        "f1": f1_score(y, y_pred, average="binary")
    }


In [42]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
rf_params = {
    "n_estimators": 100,
    "max_depth": 50,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "bootstrap": True,
    "n_jobs": -1,
    "random_state": 42
}

In [43]:
classifier_211 = RandomForestClassifier(**rf_params)

# Entraîner
classifier_211.fit(X_train_211, y_train)

In [44]:
# Evaluation
metrics_211 = evaluate(classifier_211, X_test_211, y_test)
metrics_poetry_211 = evaluate(classifier_211, X_test_poetry_211, y_test_poetry)
metrics_essay_211 = evaluate(classifier_211, X_test_essay_211, y_test_essay)
metrics_story_211 = evaluate(classifier_211, X_test_story_211, y_test_story)

print(f"Metrics on test set: {metrics_211}")
print(f"Metrics on poetry: {metrics_poetry_211}")
print(f"Metrics on essay: {metrics_essay_211}")
print(f"Metrics on story: {metrics_story_211}")

Metrics on test set: {'confusion_matrix': array([[857, 550],
       [  4,  13]]), 'precision': 0.023090586145648313, 'recall': 0.7647058823529411, 'f1': 0.04482758620689655}
Metrics on poetry: {'confusion_matrix': array([[842, 543],
       [  3,   7]]), 'precision': 0.012727272727272728, 'recall': 0.7, 'f1': 0.025}
Metrics on essay: {'confusion_matrix': array([[13,  7],
       [ 1,  3]]), 'precision': 0.3, 'recall': 0.75, 'f1': 0.42857142857142855}
Metrics on story: {'confusion_matrix': array([[2, 0],
       [0, 3]]), 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


In [45]:
### PARAMS
lgbm_params = {
    "boosting_type": "gbdt",
    "n_estimators": 100,
    "max_depth": 10,
    "num_leaves": 31,
    "subsample": 0.8,
    "n_jobs": -1,
    "learning_rate": 0.05,
    "verbose": 10,
    "random_state": 42
}

In [None]:
### CLASSIFIER

classifier_211 = LGBMClassifier(**lgbm_params)

classifier_211.fit(X_train_211, y_train)

: 

: 

: 

: 

In [None]:
### CLASSIFIER

classifier_211 = LGBMClassifier(**lgbm_params)

classifier_211.fit(X_train_211, y_train)

# Evaluation
metrics_211 = evaluate(classifier_211, X_test_211, y_test)
metrics_poetry_211 = evaluate(classifier_211, X_test_poetry_211, y_test_poetry)
metrics_essay_211 = evaluate(classifier_211, X_test_essay_211, y_test_essay)
metrics_story_211 = evaluate(classifier_211, X_test_story_211, y_test_story)

print(f"Metrics on test set: {metrics_211}")
print(f"Metrics on poetry: {metrics_poetry_211}")
print(f"Metrics on essay: {metrics_essay_211}")
print(f"Metrics on story: {metrics_story_211}")

: 

: 

: 

: 

In [None]:
### INTERPRETABILITY - FEATURE IMPORTANCE

dim_names_211 = punctuation_map_dimensions_to_names()
feature_importances_211 = classifier_211.feature_importances_

most_important_features_211 = np.argsort(feature_importances_211)[-10:][::-1]

for i in most_important_features_211:
    print(f"{dim_names_211[i]}: {feature_importances_211[i]}")


In [None]:
### INTERPRETABILITY - SHAP VALUES

explainer_211 = shap.Explainer(classifier_211)

if hasattr(X_test_211, "toarray"):
    X_test_211_dense = X_test_211.toarray()
else:
    X_test_211_dense = X_test_211

shap_values_211 = explainer_211(X_test_211_dense)
shap.summary_plot(shap_values_211, X_test_211_dense, feature_names=dim_names_211)


## **CLASSIFICATION USING TF-IDF & LGBM**

In [None]:
ID_EXPERIMENT = 111

In [None]:
### EMBEDDING

embedding_111 = TfIdfEmbedding(ID_EXPERIMENT)

embedding_111.fit(X=X_train)

X_train_111 = embedding_111.transform(X=X_train)
X_test_111 = embedding_111.transform(X=X_test)
X_test_poetry_111 = embedding_111.transform(X=X_test_poetry)
X_test_essay_111 = embedding_111.transform(X=X_test_essay)
X_test_story_111 = embedding_111.transform(X=X_test_story)

In [None]:
### CLASSIFIER

classifier_111 = LightGBMClassifier(ID_EXPERIMENT)

classifier_111.train(X_train=X_train_111, y_train=y_train)

metrics_111 = classifier_111.evaluate(X_test_111, y_test)
metrics_poetry_111 = classifier_111.evaluate(X_test_poetry_111, y_test_poetry)
metrics_essay_111 = classifier_111.evaluate(X_test_essay_111, y_test_essay)
metrics_story_111 = classifier_111.evaluate(X_test_story_111, y_test_story)

print(f"Metrics on test set: {metrics_111}")
print(f"Metrics on poetry: {metrics_poetry_111}")
print(f"Metrics on essay: {metrics_essay_111}")
print(f"Metrics on story: {metrics_story_111}")

In [None]:
### INTERPRETABILITY - FEATURE IMPORTANCE

dim_names_111 = embedding_111.map_dimensions_to_names()
feature_importances_111 = classifier_111.get_feature_importance()
most_important_features_111 = np.argsort(feature_importances_111)[-10:][::-1]
for i in most_important_features_111:
    print(f"{dim_names_111[i]}: {feature_importances_111[i]}")

In [None]:
### INTERPRETABILITY - SHAP VALUES

explainer_111 = shap.Explainer(classifier_111.classifier)

if hasattr(X_test_111, "toarray"):
    X_test_111 = X_test_111.toarray()

# Calculate SHAP values
shap_values_111 = explainer_111(X_test_111)


# Visualize the SHAP values
shap.summary_plot(shap_values_111, X_test_111, feature_names=dim_names_111)

## **CLASSIFICATION USING TF-IDF & RANDOM FOREST**

In [None]:
ID_EXPERIMENT = 121

In [None]:
### EMBEDDING

embedding_121 = TfIdfEmbedding(ID_EXPERIMENT)

embedding_121.fit(X=X_train)

X_train_121 = embedding_121.transform(X=X_train)
X_test_121 = embedding_121.transform(X=X_test)
X_test_poetry_121 = embedding_121.transform(X=X_test_poetry)
X_test_essay_121 = embedding_121.transform(X=X_test_essay)
X_test_story_121 = embedding_121.transform(X=X_test_story)

In [None]:
### CLASSIFIER

classifier_121 = RFClassifier(ID_EXPERIMENT)

classifier_121.train(X_train=X_train_121, y_train=y_train)

metrics_121 = classifier_121.evaluate(X_test_121, y_test)
metrics_poetry_121 = classifier_121.evaluate(X_test_poetry_121, y_test_poetry)
metrics_essay_121 = classifier_121.evaluate(X_test_essay_121, y_test_essay)
metrics_story_121 = classifier_121.evaluate(X_test_story_121, y_test_story)

print(f"Metrics on test: {metrics_121}")
print(f"Metrics on poetry: {metrics_poetry_121}")
print(f"Metrics on essay: {metrics_essay_121}")
print(f"Metrics on story: {metrics_story_121}")

## **CLASSIFICATION USING TF-IDF & NAIVE BAYES**

In [None]:
ID_EXPERIMENT = 131

In [None]:
### EMBEDDING

embedding_131 = TfIdfEmbedding(ID_EXPERIMENT)

embedding_131.fit(X=X_train)

X_train_131 = embedding_131.transform(X=X_train)
X_test_131 = embedding_131.transform(X=X_test)
X_test_poetry_131 = embedding_131.transform(X=X_test_poetry)
X_test_essay_131 = embedding_131.transform(X=X_test_essay)
X_test_story_131 = embedding_131.transform(X=X_test_story)

In [None]:
### CLASSIFIER

classifier_131 = NBClassifier(ID_EXPERIMENT)

classifier_131.train(X_train=X_train_131, y_train=y_train)

metrics_131 = classifier_131.evaluate(X_test_131, y_test)
metrics_poetry_131 = classifier_131.evaluate(X_test_poetry_131, y_test_poetry)
metrics_essay_131 = classifier_131.evaluate(X_test_essay_131, y_test_essay)
metrics_story_131 = classifier_131.evaluate(X_test_story_131, y_test_story)

print(f"Metrics on test: {metrics_131}")
print(f"Metrics on poetry: {metrics_poetry_131}")
print(f"Metrics on essay: {metrics_essay_131}")
print(f"Metrics on story: {metrics_story_131}")