## Tutoriel : interagir avec le système de stockage S3 du SSP Cloud (MinIO)

In [3]:
import os

import pandas as pd
import s3fs
import zipfile

### Récupérer les données d'un challenge

In [4]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Lister les challenges
#fs.ls("gvimont/diffusion/hackathon-minarm-2024")

# Lister les fichiers d'un challenge
fs.ls("gvimont/diffusion/hackathon-minarm-2024/AIVSAI")

['gvimont/diffusion/hackathon-minarm-2024/AIVSAI/.keep',
 'gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip']

In [5]:
# Télécharger les données dans le service
PATH_IN = 'gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip'
fs.download(PATH_IN, 'data/HC3.zip')

# Décompresser les données
with zipfile.ZipFile("data/HC3.zip","r") as zip_file:
    zip_file.extractall("data/")

NB : les données peuvent être également téléchargées directement si besoin, pour être utilisées hors du SSP CLoud.
Exemple pour le fichier ci-dessus (même format de lien pour les autres challenges) : 

http://minio.lab.sspcloud.fr/gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip

### Exporter des données

In [6]:
def load_jsonl_files():
    directory = 'data/HC3'

    # Get a list of all .jsonl files in the directory, excluding all.jsonl and reddit_eli5.jsonl
    jsonl_files = [file for file in os.listdir(directory) if file.endswith('.jsonl') and file not in ['all.jsonl', 'reddit_eli5.jsonl']]

    
    # Add data in dataframe
    dfs = pd.DataFrame()
    for file in jsonl_files:
        file_path = os.path.join(directory, file)
        df = pd.read_json(file_path, lines=True)
        
        # Add a new column with the source file name
        df['source'] = file[:-6]
        
        # Concatenate to DataFrame
        dfs = pd.concat([dfs, df], ignore_index=True)
    
    return dfs

df = load_jsonl_files()
df


Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Does Primolut N taken during pregnancy affect ...,"[Hi, Thanks for the query. I understand yo...",[It is not recommended to use Primolut N durin...,medicine
1,Bloating and pain on right lower abdomen. Shou...,"[Hello,Thanks for the query to H.C.M. Forum.Pa...",[If you are experiencing abdominal pain and bl...,medicine
2,Is chest pain related to intake of clindamycin...,"[Hello, The use of Clindamycin can cause stoma...",[It is possible that chest pain could be relat...,medicine
3,Q. Noticed a yellowish sag in the gums of my 1...,[Hello. Revert back with the photos to a denti...,[It is difficult to accurately diagnose a cond...,medicine
4,"Suggest remedy for low grade fever, hot and co...","[Hi Dear,Welcome to Healthcaremagic Team.Under...",[I'm sorry to hear that you're feeling sick. I...,medicine
...,...,...,...,...
7205,"Please explain what is ""Punched cards""",[A punched card (also punch card or punched-ca...,[Punched cards were a means of storing and pro...,wiki_csai
7206,"Please explain what is ""Paper tape""",[Punched tape or perforated paper tape is a fo...,[Paper tape is a medium for storing and transm...,wiki_csai
7207,"Please explain what is ""BBC Model B""",[The British Broadcasting Corporation Microcom...,[The BBC Model B is a computer that was made b...,wiki_csai
7208,"Please explain what is ""O level""",[The O-Level (Ordinary Level) is a subject-bas...,"[""O level"" refers to the General Certificate o...",wiki_csai


### Clean questions

In [7]:
def clean_question(question):
    # Remove if question starts with 'Q.'
    return question[3:].strip() if question[:3] == 'Q. ' else question

def clean_questions(df):
    df_cleaned = df.copy()
    
    df_cleaned['question'] = df_cleaned['question'].apply(lambda x: clean_question(x))
    
    return df_cleaned

### Clean Answers

In [8]:
def clean_answer(answer):
    # Transform list answer into string
    return ' '.join(answer) if isinstance(answer, list) else answer

def clean_answers(df):
    df_cleaned = df.copy()
    
    df_cleaned['human_answers'] = df_cleaned['human_answers'].apply(lambda x: clean_answer(x))
    df_cleaned['chatgpt_answers'] = df_cleaned['chatgpt_answers'].apply(lambda x: clean_answer(x))
    
    return df_cleaned

### Clean Dataframe

In [9]:
def clean_dataframe(df):
    df = clean_answers(df)
    df = df.drop_duplicates()
    df = clean_questions(df)
    df = df.dropna()
    return df

In [10]:
df = clean_dataframe(df)
df

Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Does Primolut N taken during pregnancy affect ...,"Hi, Thanks for the query. I understand you...",It is not recommended to use Primolut N during...,medicine
1,Bloating and pain on right lower abdomen. Shou...,"Hello,Thanks for the query to H.C.M. Forum.Pai...",If you are experiencing abdominal pain and blo...,medicine
2,Is chest pain related to intake of clindamycin...,"Hello, The use of Clindamycin can cause stomac...",It is possible that chest pain could be relate...,medicine
3,Noticed a yellowish sag in the gums of my 13 m...,Hello. Revert back with the photos to a dentis...,It is difficult to accurately diagnose a condi...,medicine
4,"Suggest remedy for low grade fever, hot and co...","Hi Dear,Welcome to Healthcaremagic Team.Unders...",I'm sorry to hear that you're feeling sick. It...,medicine
...,...,...,...,...
7205,"Please explain what is ""Punched cards""",A punched card (also punch card or punched-car...,Punched cards were a means of storing and proc...,wiki_csai
7206,"Please explain what is ""Paper tape""",Punched tape or perforated paper tape is a for...,Paper tape is a medium for storing and transmi...,wiki_csai
7207,"Please explain what is ""BBC Model B""",The British Broadcasting Corporation Microcomp...,The BBC Model B is a computer that was made by...,wiki_csai
7208,"Please explain what is ""O level""",The O-Level (Ordinary Level) is a subject-base...,"""O level"" refers to the General Certificate of...",wiki_csai


In [11]:
# Export vers un bucket personnel
PATH_OUT = 'linafarchado/diffusion/projet-mongroupe-hackathon/all_dataset.csv'
with fs.open(PATH_OUT, 'w') as file_out:
    df.to_csv(file_out, index=False)

In [12]:
# NB : le dossier 'diffusion' permet un accès en lecture à tous les membres du groupe !
# Tous les membres peuvent donc le voir et l'utiliser dans un service
fs.ls("linafarchado/diffusion/projet-mongroupe-hackathon")

['linafarchado/diffusion/projet-mongroupe-hackathon/all_dataset.csv',
 'linafarchado/diffusion/projet-mongroupe-hackathon/medicine.csv']

In [13]:
with fs.open(PATH_OUT, mode="r") as file_in:
    df_test = pd.read_csv(file_in)

In [14]:
df_test.head()

Unnamed: 0,question,human_answers,chatgpt_answers,source
0,Does Primolut N taken during pregnancy affect ...,"Hi, Thanks for the query. I understand you...",It is not recommended to use Primolut N during...,medicine
1,Bloating and pain on right lower abdomen. Shou...,"Hello,Thanks for the query to H.C.M. Forum.Pai...",If you are experiencing abdominal pain and blo...,medicine
2,Is chest pain related to intake of clindamycin...,"Hello, The use of Clindamycin can cause stomac...",It is possible that chest pain could be relate...,medicine
3,Noticed a yellowish sag in the gums of my 13 m...,Hello. Revert back with the photos to a dentis...,It is difficult to accurately diagnose a condi...,medicine
4,"Suggest remedy for low grade fever, hot and co...","Hi Dear,Welcome to Healthcaremagic Team.Unders...",I'm sorry to hear that you're feeling sick. It...,medicine


# Tokenizer

In [None]:
whole_doc = " ".join(df_test['human_answers'])

In [2]:
def nltk_tokenize(doc: str) -> list[str]:
    return word_tokenize(doc)

def gpt_tokenize(doc: str) -> list:
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(doc)
    return [str(token) for token in tokens]

In [None]:
def tokenize(doc: str, base_tokenizer=word_tokenize, do_lower=False, do_remove_stop_word=False, custom_stop_words=[], do_lemmatize=False) -> tuple[list,list]:
    if do_lower:
        doc = doc.lower()
    list_token = base_tokenizer(doc)

    if do_remove_stop_word:
        stop_words = en_stop | set("-.!?()_;:,'") | {'...'} | set(custom_stop_words) 
        list_token = [token for token in list_token if token not in stop_words]
        
    if do_lemmatize:
        wnl = WordNetLemmatizer()
        list_token = [wnl.lemmatize(t) for t in list_token]

    return list_token

def use_tokenizer(tokenizer, base_tokenizer=word_tokenize):
    return tokenizer

def remove_stop_words(custom_stop_words):
    return custom_stop_words

def add_stop_words(words, custom_stop_words):
    custom_stop_words.extend(words)
    return custom_stop_words

def lower():
    return True

def lemmatize():
    return True

In [None]:
custom_stop_words = []
list_token = tokenize(whole_doc, do_lower=True, do_remove_stop_word=True, custom_stop_words=custom_stop_words)
vocab = set(list_token)
occ = { token : list_token.count(token) for token in vocab }
sorted(occ.items(), key = lambda x : x[1], reverse=True)[:10]