In [None]:
import os
import numpy as np
import pandas as pd
import s3fs
import zipfile
import matplotlib.pyplot as plt
from nltk import word_tokenize

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken
import nltk

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from itertools import chain
from collections import Counter
import json

# Download Dataset

In [None]:
# Create the S3 filesystem object. The endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset).
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# List the challenges
#fs.ls("gvimont/diffusion/hackathon-minarm-2024")

# List the files of one challenge. The listing is printed explicitly because
# a bare expression in the middle of a cell is evaluated but never displayed.
print(*fs.ls("civel/diffusion/hackathon-minarm-2024/AIVSAI"), sep="\n")

# Download the training set next to the notebook, making sure the local
# target directory exists first.
PATH_IN = 'civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv'
os.makedirs('data', exist_ok=True)
fs.download(PATH_IN, 'data/hack_train.csv')

In [None]:
def load_csv(file_path="data/hack_train.csv"):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like, default "data/hack_train.csv"
        Location of the CSV file. The default is the path the notebook uses
        for the downloaded training set, so ``load_csv()`` keeps working
        without arguments.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using the default ``pd.read_csv`` options.
    """
    return pd.read_csv(filepath_or_buffer=file_path)
# Load the CSV through load_csv() and show the resulting frame as the cell output.
df = load_csv()
df

# Clean Dataframe

In [None]:
def clean_dataframe_new(df):
    """Return ``df`` without duplicated rows and without incomplete rows.

    Exact duplicate rows are removed first, then every remaining row that
    contains at least one missing value is dropped. A new frame is returned;
    the frame passed in is left as it was.
    """
    deduplicated = df.drop_duplicates()
    complete_rows = deduplicated.dropna()
    return complete_rows
# Replace df with its cleaned version (duplicates and rows with missing
# values removed) and show it as the cell output.
df = clean_dataframe_new(df)
df

# Save Dataset in Onyxia

In [None]:
# Destination key of the dataset copy on the S3 storage; reused later in the
# notebook to read the file back.
PATH_OUT = 'linafarchado/diffusion/projet-mongroupe-hackathon/hack_train.csv'
# Open the remote object in text-write mode through the s3fs filesystem and
# stream the current df into it as CSV, leaving out the index column.
with fs.open(PATH_OUT, 'w') as file_out:
    df.to_csv(file_out, index=False)

# Clean Dataframe

In [None]:
def clean_dataframe(df):
    """Drop duplicate rows, then drop rows holding any missing value.

    Returns a new DataFrame; ``df`` itself is not modified.

    NOTE(review): this has the same body as ``clean_dataframe_new`` defined
    earlier in this notebook - consider keeping a single definition.
    """
    return df.drop_duplicates().dropna()

# Read From Onyxia

In [None]:
# Read the CSV stored under PATH_OUT back from S3 and clean it again right
# after parsing, so df holds the cleaned frame once the file is closed.
with fs.open(PATH_OUT, mode="r") as file_in:
    df = clean_dataframe(pd.read_csv(file_in))

In [None]:
# Preview the first rows of the frame that was read back from S3.
df.head()

# CNN Model

In [None]:
pip install tensorflow

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling start from the same state on every run; the same seed is used for
# the data splits below.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Separate text and labels
texts = df['text'].values
labels = df['label'].values

# Tokenization & padding
# NOTE(review): the tokenizer vocabulary is fitted on the full corpus, i.e.
# before the train/validation/test split - confirm this is acceptable.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
MAX_LEN = 100  # every text becomes a sequence of exactly MAX_LEN token ids

X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)

# Invert label mapping (0 <-> 1). Fail early with an explicit message if the
# column holds anything other than the two expected values.
label_mapping = {0: 1, 1: 0}
unexpected_labels = set(pd.unique(labels)) - set(label_mapping)
if unexpected_labels:
    raise ValueError(
        f"Expected only binary labels 0/1 in df['label'], found: {unexpected_labels}"
    )

labels = np.array([label_mapping[label] for label in labels])

# Split data into training, testing, and validation sets.
# 20% is held out for testing, then 25% of the remaining 80% for validation,
# which gives a 60/20/20 train/validation/test split.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=SEED)

# Model: embedding -> 1D convolution -> global max pooling -> dense -> sigmoid
input_text = Input(shape=(MAX_LEN,), name='input_text')
# +1 because Tokenizer word indices start at 1; index 0 is the padding value
embed = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_text)
conv = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)
dense1 = Dense(64, activation='relu')(pool)
output = Dense(1, activation='sigmoid')(dense1)
cnn_model = Model(inputs=input_text, outputs=output)

# Compile
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training (the History object is kept so the cell does not end on a bare repr)
cnn_history = cnn_model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val))


In [None]:
# Evaluation on the held-out test split
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Predict the labels of the test data: threshold the sigmoid output at 0.5
# and flatten the (n, 1) prediction column so it matches the 1-D y_test.
y_pred = (cnn_model.predict(X_test) > 0.5).astype("int32").ravel()

# Build and print the classification report
print("Classification Report for CNN Model:")
print(classification_report(y_test, y_pred))

# Compute precision, recall and F1-score with each class in turn treated as
# the positive label.
class_metrics = {
    cls: {
        "Precision": precision_score(y_test, y_pred, pos_label=cls),
        "Recall": recall_score(y_test, y_pred, pos_label=cls),
        "F1-score": f1_score(y_test, y_pred, pos_label=cls),
    }
    for cls in (0, 1)
}

for cls, scores in class_metrics.items():
    header = f"Metrics for class {cls}:"
    # A blank line separates the second class from the first one
    print(header if cls == 0 else "\n" + header)
    for metric_name, value in scores.items():
        print(f"{metric_name}:", value)

# Transformer

In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dense, Dropout, Flatten

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling start from the same state on every run; the same seed is used for
# the data splits below.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Tokenization & padding
# NOTE(review): the tokenizer vocabulary is fitted on the full corpus, i.e.
# before the train/validation/test split - confirm this is acceptable.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
MAX_LEN = 100
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
# `labels` is the array already inverted (0 <-> 1) by the CNN cell, so this
# cell has to run after it.
y = labels

# Split data into training, testing, and validation sets (60/20/20).
# pad_sequences already returns (n_samples, MAX_LEN) arrays, so the splits can
# be fed to the model as they are.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=SEED)

# Transformer Model
# +1 because Tokenizer word indices start at 1; index 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1
max_len = MAX_LEN

input_layer = Input(shape=(max_len,), name='transformer_input')
embed = Embedding(vocab_size, 128, mask_zero=True)(input_layer)
# Self-attention over the embedded sequence, followed by a residual
# connection and layer normalisation
attention = MultiHeadAttention(num_heads=8, key_dim=64)(embed, embed)
attention = LayerNormalization()(attention + embed)
dense = Dense(64, activation='relu')(attention)
dropout = Dropout(0.1)(dense)
output = Dense(64, activation='relu')(dropout)
# Collapse the per-token features into one vector per sample before the
# final sigmoid unit
output = Flatten()(output)
output = Dense(1, activation='sigmoid')(output)

transformer_model = Model(inputs=input_layer, outputs=output)

# Compile Transformer model
transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Transformer model (the History object is kept so the cell does not
# end on a bare repr)
transformer_history = transformer_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

In [None]:
# Evaluate Transformer model on the held-out test split
loss, accuracy = transformer_model.evaluate(X_test, y_test)
print(f'Transformer Test Accuracy: {accuracy * 100:.2f}%')

# Predict the labels of the test data: threshold the sigmoid output at 0.5
# and flatten the (n, 1) prediction column so it matches the 1-D y_test.
y_pred = (transformer_model.predict(X_test) > 0.5).astype("int32").ravel()

# Build and print the classification report
print("Classification Report for Transformer Model:")
print(classification_report(y_test, y_pred))

# Compute precision, recall and F1-score with each class in turn treated as
# the positive label.
class_metrics = {
    cls: {
        "Precision": precision_score(y_test, y_pred, pos_label=cls),
        "Recall": recall_score(y_test, y_pred, pos_label=cls),
        "F1-score": f1_score(y_test, y_pred, pos_label=cls),
    }
    for cls in (0, 1)
}

for cls, scores in class_metrics.items():
    header = f"Metrics for class {cls}:"
    # A blank line separates the second class from the first one
    print(header if cls == 0 else "\n" + header)
    for metric_name, value in scores.items():
        print(f"{metric_name}:", value)