## Tutoriel : interagir avec le système de stockage S3 du SSP Cloud (MinIO)

In [12]:
! pip install nltk
! pip install datasets
! pip install tiktoken
! pip install spacy

import nltk
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
import os

import pandas as pd
import s3fs
import zipfile
import matplotlib.pyplot as plt
from nltk import word_tokenize

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken
import nltk

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from itertools import chain

### Récupérer les données d'un challenge

In [14]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Lister les challenges
#fs.ls("gvimont/diffusion/hackathon-minarm-2024")

# Lister les fichiers d'un challenge
fs.ls("gvimont/diffusion/hackathon-minarm-2024/AIVSAI")

['gvimont/diffusion/hackathon-minarm-2024/AIVSAI/.keep',
 'gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip']

In [15]:
# Télécharger les données dans le service
PATH_IN = 'gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip'
fs.download(PATH_IN, 'data/HC3.zip')

# Décompresser les données
with zipfile.ZipFile("data/HC3.zip","r") as zip_file:
    zip_file.extractall("data/")

NB : les données peuvent être également téléchargées directement si besoin, pour être utilisées hors du SSP CLoud.
Exemple pour le fichier ci-dessus (même format de lien pour les autres challenges) : 

http://minio.lab.sspcloud.fr/gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip

### Exporter des données

In [16]:
def load_jsonl_files():
    directory = 'data/HC3'

    # Get a list of all .jsonl files in the directory, excluding all.jsonl and reddit_eli5.jsonl
    jsonl_files = [file for file in os.listdir(directory) if file.endswith('.jsonl') and file not in ['all.jsonl', 'reddit_eli5.jsonl']]

    
    # Add data in dataframe
    dfs = pd.DataFrame()
    for file in jsonl_files:
        file_path = os.path.join(directory, file)
        df = pd.read_json(file_path, lines=True)
        
        # Add a new column with the source file name
        df['source'] = file[:-6]
        
        # Concatenate to DataFrame
        dfs = pd.concat([dfs, df], ignore_index=True)
    
    return dfs

df = load_jsonl_files()
df


Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Historical P/E ratios of small-cap vs. large-c...,[There is most likely an error in the WSJ's da...,[Historical price-to-earnings (P/E) ratios for...,finance
1,Should you co-sign a personal loan for a frien...,[I know this question has a lot of answers alr...,[Co-signing a personal loan for a friend or fa...,finance
2,Should I avoid credit card use to improve our ...,[If you pay it off before the cycle closes it ...,[It can be a good idea to avoid using credit c...,finance
3,Difference between 'split and redemption' of s...,[It is the first time I encounter redemption p...,[Share split and redemption are two different ...,finance
4,Pros & cons of investing in gold vs. platinum?,[Why Investors Buy Platinum is an old (1995) a...,[Gold and platinum are both precious metals th...,finance
...,...,...,...,...
7205,Is rise in pressure from 116/66 to 140/80 norm...,[Hello!Welcome and thank you for asking on HCM...,[It's not uncommon for blood pressure to fluct...,medicine
7206,What could cause a painless lump in the right ...,"[Hi, * As per my surgical experience, the issu...",[There are several possible causes of a painle...,medicine
7207,Can Acutret be given to a child for treatment ...,[Although it is difficult to comment whether A...,[It is not appropriate for me to recommend a s...,medicine
7208,Are BP of 119/65 and pulse of 35 causes for co...,[Welcome and thank you for asking on HCM! I ha...,[It is not uncommon for people with rheumatoid...,medicine


### Clean questions

In [17]:
def clean_question(question):
    # Remove if question starts with 'Q.'
    return question[3:].strip() if question[:3] == 'Q. ' else question

def clean_questions(df):
    df_cleaned = df.copy()
    
    df_cleaned['question'] = df_cleaned['question'].apply(lambda x: clean_question(x))
    
    return df_cleaned

### Clean Answers

In [18]:
def clean_answer(answer):
    # Transform list answer into string
    return ' '.join(answer) if isinstance(answer, list) else answer

def clean_answers(df):
    df_cleaned = df.copy()
    
    df_cleaned['human_answers'] = df_cleaned['human_answers'].apply(lambda x: clean_answer(x))
    df_cleaned['chatgpt_answers'] = df_cleaned['chatgpt_answers'].apply(lambda x: clean_answer(x))
    
    return df_cleaned

### Clean Dataframe

In [19]:
def clean_dataframe(df):
    df = clean_answers(df)
    df = df.drop_duplicates()
    df = clean_questions(df)
    df = df.dropna()
    return df

In [20]:
df = clean_dataframe(df)
df

Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Historical P/E ratios of small-cap vs. large-c...,There is most likely an error in the WSJ's dat...,Historical price-to-earnings (P/E) ratios for ...,finance
1,Should you co-sign a personal loan for a frien...,I know this question has a lot of answers alre...,Co-signing a personal loan for a friend or fam...,finance
2,Should I avoid credit card use to improve our ...,If you pay it off before the cycle closes it w...,It can be a good idea to avoid using credit ca...,finance
3,Difference between 'split and redemption' of s...,It is the first time I encounter redemption pr...,Share split and redemption are two different c...,finance
4,Pros & cons of investing in gold vs. platinum?,Why Investors Buy Platinum is an old (1995) ar...,Gold and platinum are both precious metals tha...,finance
...,...,...,...,...
7205,Is rise in pressure from 116/66 to 140/80 norm...,Hello!Welcome and thank you for asking on HCM!...,It's not uncommon for blood pressure to fluctu...,medicine
7206,What could cause a painless lump in the right ...,"Hi, * As per my surgical experience, the issue...",There are several possible causes of a painles...,medicine
7207,Can Acutret be given to a child for treatment ...,Although it is difficult to comment whether Ac...,It is not appropriate for me to recommend a sp...,medicine
7208,Are BP of 119/65 and pulse of 35 causes for co...,Welcome and thank you for asking on HCM! I hav...,It is not uncommon for people with rheumatoid ...,medicine


In [22]:
# Export vers un bucket personnel
PATH_OUT = 'juliettejin/diffusion/projet-mongroupe-hackathon/all_dataset.csv'
with fs.open(PATH_OUT, 'w') as file_out:
    df.to_csv(file_out, index=False)

In [23]:
# NB : le dossier 'diffusion' permet un accès en lecture à tous les membres du groupe !
# Tous les membres peuvent donc le voir et l'utiliser dans un service
fs.ls("juliettejin/diffusion/projet-mongroupe-hackathon")

['juliettejin/diffusion/projet-mongroupe-hackathon/all_dataset.csv',
 'juliettejin/diffusion/projet-mongroupe-hackathon/medicine.csv']

In [24]:
with fs.open(PATH_OUT, mode="r") as file_in:
    df_test = pd.read_csv(file_in)

In [25]:
df_test.head()

Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Historical P/E ratios of small-cap vs. large-c...,There is most likely an error in the WSJ's dat...,Historical price-to-earnings (P/E) ratios for ...,finance
1,Should you co-sign a personal loan for a frien...,I know this question has a lot of answers alre...,Co-signing a personal loan for a friend or fam...,finance
2,Should I avoid credit card use to improve our ...,If you pay it off before the cycle closes it w...,It can be a good idea to avoid using credit ca...,finance
3,Difference between 'split and redemption' of s...,It is the first time I encounter redemption pr...,Share split and redemption are two different c...,finance
4,Pros & cons of investing in gold vs. platinum?,Why Investors Buy Platinum is an old (1995) ar...,Gold and platinum are both precious metals tha...,finance


# Tokenizer

In [26]:
whole_doc = " ".join(df_test['human_answers'])

In [27]:
def nltk_tokenize(doc: str) -> list[str]:
    return word_tokenize(doc)

def gpt_tokenize(doc: str) -> list:
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(doc)
    return [str(token) for token in tokens]

In [28]:
def tokenize(doc: str, base_tokenizer=word_tokenize, do_lower=False, do_remove_stop_word=False, custom_stop_words=[], do_lemmatize=False) -> tuple[list,list]:
    if do_lower:
        doc = doc.lower()
    list_token = base_tokenizer(doc)

    if do_remove_stop_word:
        stop_words = en_stop | set("-.!?()_;:,'") | {'...'} | set(custom_stop_words) 
        list_token = [token for token in list_token if token not in stop_words]
        
    if do_lemmatize:
        wnl = WordNetLemmatizer()
        list_token = [wnl.lemmatize(t) for t in list_token]

    return list_token

def use_tokenizer(tokenizer, base_tokenizer=word_tokenize):
    return tokenizer

def remove_stop_words(custom_stop_words):
    return custom_stop_words

def add_stop_words(words, custom_stop_words):
    custom_stop_words.extend(words)
    return custom_stop_words

def lower():
    return True

def lemmatize():
    return True

In [30]:
custom_stop_words = []
list_token = tokenize(whole_doc, do_lower=True, do_remove_stop_word=True, custom_stop_words=custom_stop_words)
vocab = set(list_token)

In [None]:
occ = { token : list_token.count(token) for token in vocab }
sorted(occ.items(), key = lambda x : x[1], reverse=True)[:10]