# IAVSAI

#### To start the project:

- First of all, you'll need Onyxia in order to create a new service and launch Vscode-python-gpu.
- Open the github project on it.
- Then, please add your Onyxia user's name in the variable below:


In [None]:
# Onyxia user name; it is used as the first component of the S3 output path (PATH_OUT).
user_name = "juliettejin"

#### Now
- Run the following commands to install the dependencies
- After that, restart the kernel
- Then you can comment out these commands so they are not run again

In [None]:
! pip install nltk
! pip install datasets
! pip install spacy
! pip uninstall -y scipy
! pip install scipy==1.12
! pip install tensorflow

import nltk
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
import os
import s3fs
import zipfile

import numpy as np
import pandas as pd
import re
from joblib import dump, load

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import word_tokenize          

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate

### Export Dataset

In [None]:
# Create the S3 filesystem object; the endpoint host is read from the environment.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# List the challenge's files. The listing is printed explicitly because a notebook
# only echoes the value of a cell's last expression.
print(fs.ls("civel/diffusion/hackathon-minarm-2024/AIVSAI"))

# Download the training set into the local `data` directory, creating it if needed.
PATH_IN = 'civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv'
os.makedirs('data', exist_ok=True)
fs.download(PATH_IN, 'data/hack_train.csv')

In [None]:
def load_csv(file_path="data/hack_train.csv"):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read. Defaults to
            "data/hack_train.csv", so existing ``load_csv()`` calls keep
            working unchanged.

    Returns:
        A pandas DataFrame with the contents of the file.
    """
    return pd.read_csv(filepath_or_buffer=file_path)
# Load the training set; the trailing expression displays the dataframe in the notebook.
df = load_csv()
df

### Clean Dataframe

In [None]:
def clean_dataframe_new(df):
    """Return a copy of ``df`` without duplicated rows and without rows containing missing values."""
    # Duplicates are removed first, then any row that still has a missing value.
    return df.drop_duplicates().dropna()
# Replace df with its cleaned version and display it.
df = clean_dataframe_new(df)
df

### Save Dataset in Onyxia

- After running the next cell, you will be able to observe the dataset on Onyxia

In [None]:
# Output location on the S3 storage, rooted at the user's name.
PATH_OUT = f"{user_name}/diffusion/projet-mongroupe-hackathon/hack_train.csv"

# Write the dataframe as CSV (without the index column) through the S3 filesystem.
with fs.open(PATH_OUT, mode='w') as file_out:
    df.to_csv(file_out, index=False)

### Clean Dataframe

In [None]:
def clean_dataframe(df):
    """Drop duplicated rows, then rows with missing values, and return the result."""
    deduplicated = df.drop_duplicates()
    return deduplicated.dropna()

### Read From Onyxia

In [None]:
# Read the CSV back from the S3 storage and clean it in one step.
with fs.open(PATH_OUT, mode="r") as file_in:
    df = clean_dataframe(pd.read_csv(file_in))

In [None]:
# Preview the first rows of the dataframe.
df.head()

### Statistics

In [None]:
# Count the distinct values of the 'src' column.
unique_sources = df['src'].drop_duplicates()
total_number_of_source = len(unique_sources)
print("Total number of source:", total_number_of_source)

In [None]:
# Bar chart with the number of rows per label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribution of Labels')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
# Rotate the x tick labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

- Equal data distribution for classification is advantageous as it prevents bias, aids generalization, ensures fair evaluation, and enhances learning stability.

### Compare the length of the answers between human and machine

In [None]:
def compareLengthAnswer(data, bins=100):
    """Plot overlaid histograms of the answer length (in characters) per label.

    Rows with ``label == 1`` are drawn as 'Human Answers' and rows with
    ``label == 0`` as 'Machine Answers'.
    NOTE(review): this label-to-author mapping is taken from the legend
    strings only; confirm it against the dataset documentation.

    Both histograms share the same bin edges, so their bar heights can be
    compared directly.

    Args:
        data: DataFrame with a 'text' column of strings and a 'label' column.
        bins: Number of equal-width bins shared by both histograms.
    """
    # Collect the length of each answer
    lengthHumanAnswer = data[data['label'] == 1]['text'].apply(len)
    lengthChatGPTAnswer = data[data['label'] == 0]['text'].apply(len)

    # Common bin edges spanning both groups; the float cast keeps this working
    # when one of the groups is empty.
    all_lengths = np.concatenate([
        lengthHumanAnswer.to_numpy(dtype=float),
        lengthChatGPTAnswer.to_numpy(dtype=float),
    ])
    bin_edges = np.histogram_bin_edges(all_lengths, bins=bins)

    # Display the graphic
    plt.figure(figsize=(10, 6))
    plt.hist(lengthHumanAnswer, bins=bin_edges, alpha=0.5, label='Human Answers')
    plt.hist(lengthChatGPTAnswer, bins=bin_edges, alpha=0.5, label='Machine Answers')
    plt.title(' Comparaison: the length of the answers between human and machine')
    plt.xlabel('Number of caracters per response')
    plt.ylabel('Number of response')
    plt.legend()

    # Log scale on the y axis; the x axis starts at zero.
    plt.yscale('log')
    plt.xlim(left=0)

    plt.grid(True)
    plt.show()

In [None]:
# Plot the answer-length histograms for the dataset.
compareLengthAnswer(df)

- We can see that human answers are likely to be longer than machine answers
- Human answers tend to be longer because they include additional context and elaboration, aiming for comprehensive understanding.
- AI-generated responses prioritize brevity while still conveying essential information efficiently.

### Count number of sentences per source

In [None]:
def count_sentences(text):
    """Return the number of non-blank sentences in ``text``.

    The text is split on whitespace that directly follows '.', '!' or '?';
    pieces that are empty or contain only whitespace are not counted.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return sum(1 for piece in pieces if piece.strip())

df_final = df.copy()

# Row masks for the two labels.
is_label_1 = df_final['label'] == 1
is_label_0 = df_final['label'] == 0

# Each count column is filled only for the rows of its own label; the other rows stay NaN.
df_final['human_sentence_count'] = df_final.loc[is_label_1, 'text'].apply(count_sentences)
df_final['chatgpt_sentence_count'] = df_final.loc[is_label_0, 'text'].apply(count_sentences)

# Total sentence count per source for both columns.
grouped = (
    df_final
    .groupby('src')[['human_sentence_count', 'chatgpt_sentence_count']]
    .sum()
    .reset_index()
)

grouped

In [None]:
# Draw the two series side by side: bars drawn at the same x position would
# hide one another.
bar_positions = np.arange(len(grouped))
bar_width = 0.4

plt.figure(figsize=(10, 6))
plt.bar(bar_positions - bar_width / 2, grouped['human_sentence_count'],
        width=bar_width, color='skyblue', label='Human Sentence Count')
plt.bar(bar_positions + bar_width / 2, grouped['chatgpt_sentence_count'],
        width=bar_width, color='salmon', label='Machine Sentence Count')
plt.ylabel('Sentence Count')
plt.xlabel('Source')
plt.title('Comparison of Sentence Counts')
plt.legend()
# The x tick labels are hidden.
plt.xticks([])
plt.tight_layout()
plt.show()

## Models

## Naive Bayes (nb)

Data

In [None]:
# Work on a copy of df for the Naive Bayes section and display it.
dataset_nb = df.copy()
dataset_nb

In [None]:
# Features are the raw texts; targets are the labels.
X = dataset_nb['text']
Y = dataset_nb['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Model

- In this section we are using a grid search for Naive Bayes.

In [None]:
# Pipeline: bigram counts (texts tokenized with NLTK's word_tokenize) fed to a
# multinomial Naive Bayes classifier.
bigram_vectorizer = CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))
nb_classifier = MultinomialNB(alpha=5.0, fit_prior=False)
model = make_pipeline(bigram_vectorizer, nb_classifier)

# Grid search for Naive Bayes: candidate values for the classifier step.
param_grid = {
    # Smoothing parameter
    'multinomialnb__alpha': [1.0, 2.0, 3.0],
    # Whether to learn class prior probabilities
    'multinomialnb__fit_prior': [True, False],
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split.
grid_search.fit(X_train, Y_train)

In [None]:
# Redirecting output to a file

#gridsearch_nb_file = 'gridsearch_nb.txt'
#with open(gridsearch_nb_file, 'w') as f:
#    print("Best parameters:", grid_search.best_params_, file=f)
#    print("Best cross-validation score:", grid_search.best_score_, file=f)
#    best_model = grid_search.best_estimator_
#    test_score = best_model.score(X_test, Y_test)
#    print("Test set score:", test_score, file=f)

In [None]:
# Fit the pipeline with the parameters it was constructed with.
# NOTE(review): this does not use the parameters selected by grid_search
# (grid_search.best_estimator_); confirm that this is intended.
model.fit(X_train, Y_train)

In [None]:
# Predict the labels of the held-out test texts.
y_pred = model.predict(X_test)

In [None]:
# Text report of the per-class metrics on the test split; kept in a variable so it can also be written to a file.
classification_report_naive_bayes = classification_report(Y_test, y_pred)
print("Classification Report:\n", classification_report_naive_bayes)

In [None]:
# Write the classification report to the file
#file_path = 'classification_report_naive_bayes.txt'
#with open(file_path, 'w') as file:
#    file.write(classification_report_naive_bayes)

Confusion matrix

In [None]:
# Compute and draw the confusion matrix in one step; the matrix stays available as `cm`.
disp = ConfusionMatrixDisplay.from_predictions(Y_test, y_pred, labels=model.classes_)
cm = disp.confusion_matrix
plt.show()

Cross Validation

In [None]:
# Cross-validate with 5 and then 10 folds, printing the mean and standard
# deviation of the macro-averaged F1 score for each run.
scoring = 'f1_macro'
cv_scores = {}
for n_folds in (5, 10):
    fold_scores = cross_val_score(model, X, Y, cv=n_folds, scoring=scoring, n_jobs=-1)
    cv_scores[n_folds] = fold_scores
    print(f"Mean {scoring}: {fold_scores.mean()}")
    print(f"Standard deviation {scoring}: {fold_scores.std()}")

# Keep the per-run score arrays under their original names.
scores, scores_1 = cv_scores[5], cv_scores[10]

## CNN Model

Prepare data and create model

In [None]:
# Texts and labels as arrays
texts = df['text'].values
labels = df['label'].values

# Fit the tokenizer on the texts, then pad/truncate every sequence to MAX_LEN tokens
MAX_LEN = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)

# Swap the two classes (0 <-> 1); any other label value raises KeyError
label_mapping = {0: 1, 1: 0}
labels = np.array([label_mapping[original_label] for original_label in labels])

# 80/20 split into train+validation and test, then 75/25 into train and validation
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Model: embedding -> 1D convolution -> global max pooling -> dense -> sigmoid output
input_text = Input(shape=(MAX_LEN,), name='input_text')
hidden = input_text
for layer in (
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
):
    hidden = layer(hidden)
output = Dense(1, activation='sigmoid')(hidden)
cnn_model = Model(inputs=input_text, outputs=output)

# Compile for binary classification
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train, monitoring the validation split after each epoch
cnn_model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val))


Evaluation and prediction

In [None]:
# Evaluate on the test split
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Predict the test labels by thresholding the model output at 0.5
y_pred = (cnn_model.predict(X_test) > 0.5).astype("int32")

# Build and print the classification report
print("Classification Report for CNN Model:")
print(classification_report(y_test, y_pred))

# Precision, recall and F1 score, computed with each class in turn as the positive label
class_metrics = {
    positive_class: (
        precision_score(y_test, y_pred, pos_label=positive_class),
        recall_score(y_test, y_pred, pos_label=positive_class),
        f1_score(y_test, y_pred, pos_label=positive_class),
    )
    for positive_class in (0, 1)
}
precision_class_0, recall_class_0, f1_class_0 = class_metrics[0]
precision_class_1, recall_class_1, f1_class_1 = class_metrics[1]

print("Metrics for class 0:")
print("Precision:", precision_class_0)
print("Recall:", recall_class_0)
print("F1-score:", f1_class_0)

print("\nMetrics for class 1:")
print("Precision:", precision_class_1)
print("Recall:", recall_class_1)
print("F1-score:", f1_class_1)

## Transformer from scratch

Prepare data and create model

In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dense, Dropout, Flatten

# Fit a fresh tokenizer on the texts, then pad/truncate every sequence to MAX_LEN tokens
MAX_LEN = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
# `texts` and `labels` are reused as they exist in the notebook at this point
y = labels

# 80/20 split into train+validation and test, then 75/25 into train and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Give every split the explicit shape (number of samples, MAX_LEN)
X_train, X_val, X_test = (
    split.reshape(split.shape[0], MAX_LEN) for split in (X_train, X_val, X_test)
)

# Transformer model
vocab_size = len(tokenizer.word_index) + 1
max_len = MAX_LEN

input_layer = Input(shape=(max_len,), name='transformer_input')
embed = Embedding(vocab_size, 128, mask_zero=True)(input_layer)
# Attention with the embeddings as both query and value, added back to the
# embeddings and layer-normalized
self_attention = MultiHeadAttention(num_heads=8, key_dim=64)(embed, embed)
normalized = LayerNormalization()(self_attention + embed)
# Two dense layers with dropout in between, flattened into a single sigmoid unit
hidden = Dense(64, activation='relu')(normalized)
hidden = Dropout(0.1)(hidden)
hidden = Dense(64, activation='relu')(hidden)
flattened = Flatten()(hidden)
output = Dense(1, activation='sigmoid')(flattened)

transformer_model = Model(inputs=input_layer, outputs=output)

# Compile for binary classification
transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train, monitoring the validation split after each epoch
transformer_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

Evaluation and prediction

In [None]:
# Evaluate the Transformer model on the test split
loss, accuracy = transformer_model.evaluate(X_test, y_test)
print(f'Transformer Test Accuracy: {accuracy * 100:.2f}%')

# Predict the test labels by thresholding the model output at 0.5
y_pred = (transformer_model.predict(X_test) > 0.5).astype("int32")

# Build and print the classification report
print("Classification Report for Transformer Model:")
print(classification_report(y_test, y_pred))

# Precision, recall and F1 score with class 0 as the positive label
precision_class_0, recall_class_0, f1_class_0 = (
    precision_score(y_test, y_pred, pos_label=0),
    recall_score(y_test, y_pred, pos_label=0),
    f1_score(y_test, y_pred, pos_label=0),
)

# Same metrics with class 1 as the positive label
precision_class_1, recall_class_1, f1_class_1 = (
    precision_score(y_test, y_pred, pos_label=1),
    recall_score(y_test, y_pred, pos_label=1),
    f1_score(y_test, y_pred, pos_label=1),
)

# Print both groups of metrics with the same layout
for header, (precision, recall, f1) in (
    ("Metrics for class 0:", (precision_class_0, recall_class_0, f1_class_0)),
    ("\nMetrics for class 1:", (precision_class_1, recall_class_1, f1_class_1)),
):
    print(header)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

## Logistic regression (lr)

Data

In [None]:
# df_lr is another name for df (no copy is made).
df_lr = df

# Features are the raw texts; targets are the labels.
X_lr = df_lr['text']
y_lr = df_lr['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_lr_train, X_lr_test, y_lr_train, y_lr_test = train_test_split(X_lr, y_lr, test_size=0.2, random_state=42)

# Token-count vectorizer with default settings.
text_transformer = CountVectorizer()

# The vocabulary is learned from the training texts only; the test texts are
# transformed with that same vocabulary.
X_lr_train_vectorized = text_transformer.fit_transform(X_lr_train)
X_lr_test_vectorized = text_transformer.transform(X_lr_test)

Model

- In this cell we will create our model and train it, which will take some time.
- You can skip this, and directly load the model that we stored.

In [None]:
# Logistic regression (liblinear solver, balanced class weights, fixed seed) trained on the vectorized training texts.
model_lr = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model_lr.fit(X_lr_train_vectorized, y_lr_train)

In [None]:
# Reuse the stored model when its file exists; otherwise save the model trained
# in the previous cell so that later runs can skip the training.
# NOTE: joblib files are pickles; only load a file that you created yourself.
MODEL_LR_PATH = 'trained_model_lr.joblib'
if os.path.exists(MODEL_LR_PATH):
    model_lr = load(MODEL_LR_PATH)
else:
    dump(model_lr, MODEL_LR_PATH)

Analysis

In [None]:
# Predict the test labels, then compute the accuracy and the per-class report.
y_lr_pred = model_lr.predict(X_lr_test_vectorized)
test_accuracy_lr = accuracy_score(y_lr_test, y_lr_pred)
# The report is kept in a variable so it can also be written to a file.
classification_report_lr = classification_report(y_lr_test, y_lr_pred)
print("Classification Report:\n", classification_report_lr)
print("Testing Accuracy:", test_accuracy_lr)

In [None]:
#Write the classification report to the file
#file_path = 'classification_report_logistic_regression.txt'
#with open(file_path, 'w') as file:
#    file.write(classification_report_lr)

In [None]:
# Compute and draw the confusion matrix in one step; the matrix stays available as `cm`.
disp = ConfusionMatrixDisplay.from_predictions(y_lr_test, y_lr_pred, labels=model_lr.classes_)
cm = disp.confusion_matrix
plt.show()

This confusion matrix evaluates our model's performance:

- True Negative (TN): 4940
- False Positive (FP): 612
- False Negative (FN): 823
- True Positive (TP): 4989

And there is still room for improvement. We can now use a grid search to get the best parameters for our model.
You can skip this cell, and print the results in the next one

In [None]:
# Pipeline searched below: token counts followed by logistic regression.
lr_pipeline_steps = [
    CountVectorizer(),
    LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear'),
]
model = make_pipeline(*lr_pipeline_steps)

# Candidate values for the vectorizer and classifier steps.
param_grid = dict(
    countvectorizer__lowercase=[False, True],
    countvectorizer__tokenizer=[word_tokenize],
    countvectorizer__ngram_range=[(1, 1), (2, 2)],
    countvectorizer__stop_words=[None, 'english'],
    logisticregression__C=[0.001, 0.01, 0.1],
    logisticregression__penalty=['l1', 'l2'],
)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the raw training texts (the pipeline vectorizes them itself).
grid_search.fit(X_lr_train, y_lr_train)

In [None]:
# Text file that holds the saved grid-search summary.
gridsearch_lr_file = 'gridsearch_lr.txt'

# Redirecting output to a file
# (uncomment to regenerate the summary; this requires the fitted grid_search)

#with open('gridsearch_lr.txt', 'w') as f:
#    print("Best parameters:", grid_search.best_params_, file=f)
#    print("Best cross-validation score:", grid_search.best_score_, file=f)
#    best_model = grid_search.best_estimator_
#    test_score = best_model.score(X_lr_test, y_lr_test)
#    print("Test set score:", test_score, file=f)

# Print the stored summary; opening the file fails if it has not been written yet.
with open(gridsearch_lr_file, 'r') as file:
    gridsearch_contents = file.read()
print(gridsearch_contents)