## IAvsAI with old data

In [None]:
#! pip install nltk
#! pip install datasets
#! pip install tiktoken
#! pip install spacy

#import nltk
#nltk.download('punkt')
#nltk.download('wordnet')

In [None]:
import os

import pandas as pd
import s3fs
import zipfile
import matplotlib.pyplot as plt
from nltk import word_tokenize, sent_tokenize
#import eli5

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken
import nltk

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from itertools import chain
from collections import Counter
import json
from scipy.sparse import hstack
import numpy as np
import re
import matplotlib.pyplot as plt

### Récupérer les données d'un challenge

In [None]:
# Build the S3 filesystem client from the endpoint exposed in the environment
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# List the challenges
#fs.ls("gvimont/diffusion/hackathon-minarm-2024")

# List the files of one challenge
fs.ls("civel/diffusion/hackathon-minarm-2024/AIVSAI")

In [None]:
# Download the challenge archive into the service's local storage
PATH_IN = 'civel/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip'
fs.download(PATH_IN, 'data/HC3.zip')

# Unpack the archive into the data/ directory
with zipfile.ZipFile("data/HC3.zip", mode="r") as zip_file:
    zip_file.extractall(path="data/")

NB : les données peuvent également être téléchargées directement si besoin, pour être utilisées hors du SSP Cloud.
Exemple pour le fichier ci-dessus (même format de lien pour les autres challenges) : 

http://minio.lab.sspcloud.fr/gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip

### Exporter des données

In [None]:
def load_jsonl_files():
    """Load ``data/HC3/all.jsonl`` into a DataFrame.

    The file is parsed as JSON Lines (one JSON record per line).

    Returns:
        pandas.DataFrame: one row per record of the file.
    """
    return pd.read_json("data/HC3/all.jsonl", lines=True)

df = load_jsonl_files()
df

### Clean questions

In [None]:
def clean_question(question):
    """Strip a leading ``'Q. '`` marker from a question.

    Args:
        question: the question text.

    Returns:
        The text after the marker with surrounding whitespace stripped when
        the question starts with ``'Q. '``; otherwise the question unchanged
        (no stripping is applied in that case).
    """
    marker = 'Q. '
    if question[:len(marker)] != marker:
        return question
    return question[len(marker):].strip()

def clean_questions(df):
    """Return a copy of *df* whose ``question`` column went through ``clean_question``.

    Args:
        df: DataFrame with a ``question`` column.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    result = df.copy()
    result['question'] = result['question'].apply(clean_question)
    return result

### Clean Answers

In [None]:
def clean_answer(answer):
    """Flatten an answer given as a list into one space-separated string.

    Args:
        answer: either a list of strings or any other value.

    Returns:
        The list items joined by single spaces when *answer* is a list;
        otherwise *answer* itself, unchanged.
    """
    if isinstance(answer, list):
        return ' '.join(answer)
    return answer

def clean_answers(df):
    """Return a copy of *df* with both answer columns passed through ``clean_answer``.

    Args:
        df: DataFrame with ``human_answers`` and ``chatgpt_answers`` columns.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    result = df.copy()
    for column in ('human_answers', 'chatgpt_answers'):
        result[column] = result[column].apply(clean_answer)
    return result

In [None]:
def clean_type_data(df) :
    """Return a copy of *df* whose answer columns hold strings.

    ``Series.astype`` returns a new Series, so its result has to be assigned
    back for the conversion to take effect. Missing values are deliberately
    left missing (not turned into the text ``'nan'``/``'None'``) so that a
    later ``dropna()`` can still remove those rows.

    Args:
        df: DataFrame with ``human_answers`` and ``chatgpt_answers`` columns.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    df_cleaned = df.copy()
    for column in ('human_answers', 'chatgpt_answers'):
        values = df_cleaned[column]
        # where() keeps the original entry wherever it is missing and takes
        # the string version everywhere else
        df_cleaned[column] = values.where(values.isna(), values.astype(str))

    return df_cleaned

### Clean Dataframe

In [None]:
def clean_dataframe(df):
    """Run the whole cleaning pipeline on *df* and return the result.

    Steps, in order: flatten the answers, drop duplicated rows, clean the
    questions, normalise the answer column types, remove an ``index`` column
    when one is present, and drop rows containing missing values.

    Args:
        df: raw DataFrame with ``question``, ``human_answers`` and
            ``chatgpt_answers`` columns.

    Returns:
        The cleaned DataFrame.
    """
    # Answers are flattened before de-duplication (presumably because list
    # cells cannot be hashed by drop_duplicates -- confirm against the raw data)
    cleaned = clean_type_data(clean_questions(clean_answers(df).drop_duplicates()))
    if 'index' in cleaned.columns:
        cleaned = cleaned.drop(columns="index")
    return cleaned.dropna()

In [None]:
df = clean_dataframe(df)
df

In [None]:
# Export vers un bucket personnel
PATH_OUT = 'misterfacile/diffusion/projet-mongroupe-hackathon/all_dataset.csv'
with fs.open(PATH_OUT, 'w') as file_out:
    df.to_csv(file_out, index=False)


In [None]:
# NB : le dossier 'diffusion' permet un accès en lecture à tous les membres du groupe !
# Tous les membres peuvent donc le voir et l'utiliser dans un service
fs.ls("misterfacile/diffusion/projet-mongroupe-hackathon")

In [None]:
with fs.open(PATH_OUT, mode="r") as file_in:
    df_test = pd.read_csv(file_in)
    df_test = clean_dataframe(df_test)

In [None]:
df_test.head()

# Tokenizer

In [None]:
sources = ['finance','wiki_csai','open_qa','medicine','reddit_eli5']
answers_by_source = {source: " ".join(df.loc[df['source'] == source, 'human_answers']) for source in df['source'].unique()}

In [None]:
def nltk_tokenize(doc: str) -> list[str]:
    """Tokenize *doc* with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(doc)
    return tokens

def gpt_tokenize(doc: str) -> list:
    """Encode *doc* with the tiktoken encoding registered for ``"gpt-4"``.

    Returns:
        The token ids produced by the encoder, each converted to ``str``.
    """
    encoder = tiktoken.encoding_for_model("gpt-4")
    return list(map(str, encoder.encode(doc)))

In [None]:
def tokenize(doc: str, base_tokenizer=word_tokenize, do_lower=False, do_remove_stop_word=False, custom_stop_words=None, do_lemmatize=False) -> list:
    """Tokenize *doc* with optional lower-casing, stop-word removal and lemmatization.

    Args:
        doc: text to tokenize.
        base_tokenizer: callable turning a string into a list of tokens.
        do_lower: lower-case the text before tokenizing.
        do_remove_stop_word: drop tokens found in the stop-word set (spaCy's
            English stop words, single punctuation characters, ``'...'``,
            ``"''"``, `` "``" `` and ``custom_stop_words``). The comparison is
            exact, so it is case-sensitive unless ``do_lower`` is set.
        custom_stop_words: extra tokens to drop; ``None`` means none.
        do_lemmatize: lemmatize every remaining token with ``WordNetLemmatizer``.

    Returns:
        The list of processed tokens.
    """
    if do_lower:
        doc = doc.lower()
    list_token = base_tokenizer(doc)

    if do_remove_stop_word:
        # `or ()` maps the None default (and any empty value) to "no extra words"
        stop_words = (
            en_stop
            | set("-.!?()_;:,'[]$%*/|\\")
            | {'...', "''", "``"}
            | set(custom_stop_words or ())
        )
        list_token = [token for token in list_token if token not in stop_words]

    if do_lemmatize:
        wnl = WordNetLemmatizer()
        list_token = [wnl.lemmatize(t) for t in list_token]

    return list_token

def use_tokenizer(tokenizer, base_tokenizer=word_tokenize):
    """Return ``tokenizer`` unchanged; ``base_tokenizer`` is accepted but not used."""
    return tokenizer

def remove_stop_words(custom_stop_words):
    """Return ``custom_stop_words`` unchanged (no filtering happens here)."""
    return custom_stop_words

def add_stop_words(words, custom_stop_words):
    """Append every item of ``words`` to ``custom_stop_words`` and return it.

    The list passed as ``custom_stop_words`` is extended in place, so the
    caller's list is modified; the returned object is that same list.
    """
    custom_stop_words.extend(words)
    return custom_stop_words

def lower():
    """Return ``True``.

    NOTE(review): presumably meant as a readable value for ``tokenize``'s
    ``do_lower`` flag -- no caller is visible here, confirm before relying on it.
    """
    return True

def lemmatize():
    """Return ``True``.

    NOTE(review): presumably meant as a readable value for ``tokenize``'s
    ``do_lemmatize`` flag -- no caller is visible here, confirm before relying on it.
    """
    return True

In [None]:
"""
custom_stop_words = []
list_token_per_source = {}
for source in sources:
    list_token_per_source[source] = tokenize(answers_by_source[source], do_lower=True, do_remove_stop_word=True, custom_stop_words=custom_stop_words)
"""

In [None]:
"""for source, list_token in list_token_per_source.items():
    word_counts = Counter(list_token)
    top_10_words = word_counts.most_common(10)
    print(source,":\n", top_10_words)"""

In [None]:
"""file_path = 'list_token_per_source.json'

# Open the file in write mode
with open(file_path, 'w') as json_file:
    # Write the dictionary to the file
    json.dump(list_token_per_source, json_file, indent=4)

print("Dictionary has been written to", file_path)"""

## Statistiques

#### Compare the length of the answer between human and ChatGPT

In [None]:
def compareLengthAnswer(data, category=""):
    """Plot overlaid histograms of human and ChatGPT answer lengths.

    Lengths are measured with ``len`` on each answer (characters for string
    answers). Both histograms share the same bin edges so that their bar
    heights can be compared directly; the y axis is logarithmic.

    Args:
        data: DataFrame with ``human_answers`` and ``chatgpt_answers`` columns
            (and a ``source`` column when ``category`` is given).
        category: value of ``source`` to restrict the plot to; the empty
            string (default) uses every row.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Select the rows once instead of filtering separately for each column
    subset = data if category == "" else data[data['source'] == category]
    lengthHumanAnswer = subset['human_answers'].apply(len)
    lengthChatGPTAnswer = subset['chatgpt_answers'].apply(len)

    # Common bin edges over both distributions: with different bin widths the
    # wider bins would collect more answers and look artificially taller
    all_lengths = np.concatenate([
        lengthHumanAnswer.to_numpy(dtype=float),
        lengthChatGPTAnswer.to_numpy(dtype=float),
    ])
    bin_edges = np.histogram_bin_edges(all_lengths, bins=100)

    # Display the graphic
    plt.figure(figsize=(10, 6))
    plt.hist(lengthHumanAnswer, bins=bin_edges, alpha=0.5, label='Human Answers')
    plt.hist(lengthChatGPTAnswer, bins=bin_edges, alpha=0.5, label='ChatGPT Answers')
    plt.title('Comparaison de la taille des réponses')
    plt.xlabel('Quantité de caractères par réponse')
    plt.ylabel('Quantité de réponse')
    plt.legend()

    plt.yscale('log')
    plt.xlim(left=0)

    plt.grid(True)
    plt.show()

In [None]:
compareLengthAnswer(df_test)

In [None]:
df_wiki = df_test[df_test['source'] == 'wiki_csai']['human_answers']


In [None]:
df_test.head()

#### Count number of sentences 

In [None]:
def count_sentences(text):
    """Count the non-blank sentences of *text*.

    The text is split on whitespace that follows a ``.``, ``!`` or ``?``;
    pieces that are empty or whitespace-only are not counted.

    Args:
        text: the string to analyse.

    Returns:
        int: number of non-blank pieces.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return sum(1 for piece in pieces if piece.strip())

# Count the sentences of every answer and store the counts in new columns
for author in ('human', 'chatgpt'):
    df[f'{author}_sentence_count'] = df[f'{author}_answers'].apply(count_sentences)

# Total number of sentences per source, one row per source
grouped = (
    df.groupby('source')[['human_sentence_count', 'chatgpt_sentence_count']]
    .sum()
    .reset_index()
)

grouped

In [None]:
positions = np.arange(len(grouped['source']))

fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

# Create bars
human_bars = ax.bar(positions - width/2, grouped['human_sentence_count'],width, label='Human Sentence Count')
chatgpt_bars = ax.bar(positions + width/2, grouped['chatgpt_sentence_count'], width, label='ChatGPT Sentence Count')

ax.set_xlabel('Source')
ax.set_ylabel('Sentence Count')
ax.set_title('Sentence Count Comparison by Source')
ax.set_xticks(positions)
ax.set_xticklabels(grouped['source'])
ax.legend()
plt.show()


#### Correlation between answers and questions length

In [None]:
df_plot = df.copy()

df_plot['question_length'] = df['question'].apply(len)
df_plot['human_answer_length'] = df['human_answers'].apply(len)
df_plot['chatgpt_answer_length'] = df['chatgpt_answers'].apply(len)

plt.figure(figsize=(20, 6))

# Scatter plot for Q and A
plt.scatter(df_plot['question_length'], df_plot['human_answer_length'], label='Human Answers', alpha=0.3)
plt.scatter(df_plot['question_length'], df_plot['chatgpt_answer_length'], label='ChatGPT Answers', alpha=0.3)

plt.xlabel('Question Length')
plt.ylabel('Answer Length')
plt.title('Correlation between Question and Answer Length')
plt.legend()
plt.show()

## Models

### Neural network

In [None]:
pip install tensorflow

In [None]:
with fs.open(PATH_OUT, mode="r") as file_in:
    df_test = pd.read_csv(file_in)
    df_test = clean_dataframe(df_test)

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate

# Separate answers
chatgpt_answers = df['chatgpt_answers'].values
human_answers = df['human_answers'].values

# Labels used by this model: 0 = ChatGPT answer, 1 = human answer
texts = np.concatenate((chatgpt_answers, human_answers), axis=0)
y = np.array([0] * len(chatgpt_answers) + [1] * len(human_answers))

# Split the raw texts into training, testing, and validation sets BEFORE
# fitting the tokenizer, so the vocabulary is learned from training data only
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)
texts_train, texts_val, y_train, y_val = train_test_split(texts_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Tokenization & padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)
MAX_LEN = 100


def encode_texts(batch):
    """Turn texts into integer sequences padded/truncated to MAX_LEN."""
    return pad_sequences(tokenizer.texts_to_sequences(batch), maxlen=MAX_LEN)


X_train = encode_texts(texts_train)
X_val = encode_texts(texts_val)
X_test = encode_texts(texts_test)

# Model: embedding -> 1D convolution -> global max pooling -> dense classifier
input_text = Input(shape=(MAX_LEN,), name='input_text')
embed = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_text)
conv = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)
dense1 = Dense(64, activation='relu')(pool)
output = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=input_text, outputs=output)

# Compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


### Logistic Regression with different uses of the data

Setting training and test sets

0: human answers
1: chatgpt answers

In [None]:
# Labels used below: 0 = human answer, 1 = ChatGPT answer

#Data with only answers
dataset_answers = pd.DataFrame({
    'answers': df['human_answers'].tolist() + df['chatgpt_answers'].tolist(),
    'category': [0] * len(df) + [1] * len(df)
})

#Data with questions and answers
# A space separates question and answer: without it the last word of the
# question and the first word of the answer fuse into a single token
dataset_questions_answers = pd.DataFrame({
    'answers': (df['question'] + ' ' + df['human_answers']).tolist()
               + (df['question'] + ' ' + df['chatgpt_answers']).tolist(),
    'category': [0] * len(df) + [1] * len(df)
})

#Separating data for questions and answers then concatenate
X_questions = df['question'].tolist() + df['question'].tolist()
X_answers = df['human_answers'].tolist() + df['chatgpt_answers'].tolist()
y_targets = [0] * len(df) + [1] * len(df)

X_questions = np.array(X_questions)
X_answers = np.array(X_answers)
y = np.array(y_targets)

Data with only answers

In [None]:
X_a = dataset_answers['answers']  # Features (answers)
y_a = dataset_answers['category']  # Target variable (category)

# Split the data into training and testing sets (80% train, 20% test)
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(X_a, y_a, test_size=0.2, random_state=42)

# Applying CountVectorizer()
text_transformer = CountVectorizer()

X_a_train_vectorized = text_transformer.fit_transform(X_a_train)
X_a_test_vectorized = text_transformer.transform(X_a_test)

In [None]:
model_a = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced')
model_a.fit(X_a_train_vectorized, y_a_train)
y_a_pred = model_a.predict(X_a_test_vectorized)

In [None]:
test_accuracy = accuracy_score(y_a_test, y_a_pred)
print("Classification Report:\n", classification_report(y_a_test, y_a_pred))
print("Testing Accuracy:", test_accuracy)

Data with questions and answers

In [None]:
X_qa = dataset_questions_answers['answers']  # Features (questions and answers)
y_qa = dataset_questions_answers['category']  # Target variable (category)

# Split the data into training and testing sets (80% train, 20% test)
X_qa_train, X_qa_test, y_qa_train, y_qa_test = train_test_split(X_qa, y_qa, test_size=0.2, random_state=42)

text_transformer = CountVectorizer()

X_qa_train_vectorized = text_transformer.fit_transform(X_qa_train)
X_qa_test_vectorized = text_transformer.transform(X_qa_test)

In [None]:
model_qa = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced')
model_qa.fit(X_qa_train_vectorized, y_qa_train)
y_qa_pred = model_qa.predict(X_qa_test_vectorized)

In [None]:
test_accuracy = accuracy_score(y_qa_test, y_qa_pred)
print("Classification Report:\n", classification_report(y_qa_test, y_qa_pred))
print("Testing Accuracy:", test_accuracy)

Separating data for questions and answers then concatenate

In [None]:
# Hold out 40% of the (question, answer, label) triples for testing
(
    X_questions_train, X_questions_test,
    X_answers_train, X_answers_test,
    y_train, y_test,
) = train_test_split(X_questions, X_answers, y, test_size=0.4, random_state=42)

# One bag-of-words vocabulary for the questions and another for the answers
vectorizer_question = CountVectorizer()
vectorizer_answer = CountVectorizer()

X_questions_train_counts = vectorizer_question.fit_transform(X_questions_train)
X_answers_train_counts = vectorizer_answer.fit_transform(X_answers_train)

X_questions_test_counts = vectorizer_question.transform(X_questions_test)
X_answers_test_counts = vectorizer_answer.transform(X_answers_test)

# Put the question counts and the answer counts side by side
X_train = hstack([X_questions_train_counts, X_answers_train_counts])
X_test = hstack([X_questions_test_counts, X_answers_test_counts])

# Fit the logistic regression on the combined features
model_3 = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced')
model_3.fit(X_train, y_train)

# Score the held-out set
y_pred = model_3.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing Accuracy:", test_accuracy)

Analysis

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model_3.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_3.classes_)
disp.plot()
plt.show()

### Bag of Words (CountVectorizer) : Naive Bayes avec MultinomialNB

In [None]:
#Création d'un dataset composé de toutes les phrases avec ces labels
dataset = pd.DataFrame({
    'answers': df['human_answers'].tolist() + df['chatgpt_answers'].tolist(),
    'category': ['human'] * len(df) + ['chatgpt'] * len(df)
})

In [None]:
X = dataset['answers']
Y = dataset['category']

In [None]:
# Split into training and test sets, then compute class-balanced sample weights
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
sample_weights = compute_sample_weight(class_weight='balanced', y=Y_train)

In [None]:
#Construction de modèle avec un tokenizer et ngram_range correspondant aux mots adjacents

model = make_pipeline(CountVectorizer(tokenizer=word_tokenize, ngram_range = (3,3)), MultinomialNB(alpha=4.0, fit_prior=False))

#Construction d'un grid_search pour Naives Bayes
# Define the parameter grid
#param_grid = {
#    'alpha': [1.0, 2.0, 3.0, 4.0],  # Smoothing parameter
#    'fit_prior': [True, False]       # Whether to learn class prior probabilities
#}
#grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
# Fit the grid search to the data
#grid_search.fit(X_train_counts, Y_train)

# Get the best parameters
#best_params = grid_search.best_params_
#print("Best parameters:", best_params)

In [None]:
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(Y_test, y_pred))

In [None]:
cm = confusion_matrix(Y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()

### Cross Validation

In [None]:
# Perform cross-validation and print the mean and standard deviation of the macro-averaged F1 score
scoring = 'f1_macro'
scores = cross_val_score(model, X, Y, cv=5, scoring=scoring, n_jobs = -1)
print(f"Mean {scoring}: {scores.mean()}")
print(f"Standard deviation {scoring}: {scores.std()}")

scores_1 = cross_val_score(model, X, Y, cv=10, scoring=scoring, n_jobs = -1)
print(f"Mean {scoring}: {scores_1.mean()}")
print(f"Standard deviation {scoring}: {scores_1.std()}")

### Logistic regression model trained on Word2Vec document embeddings

In [None]:
X_list = dataset['answers']

# Split each answer into sentences on its own, then word-tokenize every
# sentence. Working per answer keeps a sentence from running across the
# boundary between two answers (which can happen when all answers are joined
# into one string first) and avoids building one huge intermediate string.
X_sents = [
    word_tokenize(sentence)
    for answer in X_list
    for sentence in sent_tokenize(answer)
]

In [None]:
#Creation du model Word2Vec
model = Word2Vec(sentences=X_sents, vector_size=50, window=5, min_count=5, workers=4)
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors") # Store the words and their trained embeddings
del model

In [None]:
#Charger les vecteurs des mots
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [None]:
def document_vector(doc, wv):
    """Embed a document as the mean of the vectors of its known words.

    Args:
        doc: text of the document; it is split with ``word_tokenize``.
        wv: word-vector store supporting ``word in wv``, ``wv[word]`` and
            ``wv.vector_size``.

    Returns:
        numpy.ndarray: the element-wise mean of the vectors of the tokens
        found in *wv*, or a zero vector of length ``wv.vector_size`` when no
        token is found.
    """
    known_vectors = [wv[token] for token in word_tokenize(doc) if token in wv]
    if not known_vectors:
        return np.zeros(wv.vector_size)
    return np.mean(np.array(known_vectors), axis=0)

In [None]:
X = dataset['answers']
Y = dataset['category']
X = np.array([document_vector(text, wv) for text in X])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
sample_weights = compute_sample_weight(class_weight='balanced', y=Y_train)

print(f"Features dimension: {X_train.shape}")

In [None]:
model = make_pipeline(LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear'))

# Settings explored for every penalty
common_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10],
    #'logisticregression__max_iter': [10, 50, 100, 200, 300],
    #'logisticregression__tol': [1e-4, 1e-3, 1e-2],
    'logisticregression__fit_intercept': [True, False],
    #'logisticregression__warm_start': [True, False]
}

# Only combinations the liblinear solver accepts are listed: the dual
# formulation exists for the l2 penalty only, and 'multinomial' is not
# available with liblinear. `multi_class` is left out entirely because, for
# this two-class target, 'auto' and 'ovr' describe the same model.
param_grid = [
    {
        **common_grid,
        'logisticregression__penalty': ['l1'],
        'logisticregression__dual': [False],
    },
    {
        **common_grid,
        'logisticregression__penalty': ['l2'],
        'logisticregression__dual': [False, True],
    },
]


grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, Y_test)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
# Report the held-out score, which was previously computed but never displayed
print("Test score (accuracy):", test_score)

In [None]:
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(Y_test, y_pred))