<a href="https://colab.research.google.com/github/lcontrerasroa/glossaire/blob/main/glossaire_pipeline_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Pipeline complet : Extraction terminologique + Définition enrichie
Ce carnet inclut :
- Récupération HAL
- Extraction de termes
- Détection de langue contextuelle
- Génération de définitions brutes et reformulées
- Intégration possible avec ChatGPT pour enrichir les définitions

In [None]:
try:
    import google.colab
    !pip install scikit-learn langdetect openai
except:
    pass

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=3c104a26f33e263b6b28c686548dcc9c4b7a460d56a649c1ff5651b1917655a2
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import pandas as pd
import requests
import re
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
import openai

In [None]:
# Fetch the publication records attached to one HAL author identifier and
# flatten them into one row per document (title, abstract, keywords).
hal_id = "leonardo-contreras-roa"
# NOTE(review): the query asks for rows=100, so a longer publication list
# would presumably be truncated — confirm whether paging is needed.
url = f"https://api.archives-ouvertes.fr/search/?q=authIdHal_s:{hal_id}&fl=title_s,abstract_s,abstractFr_s,abstract_en_s,keyword_s&rows=100&wt=json"
# The timeout prevents the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# show up later as a confusing JSON/KeyError failure.
response = requests.get(url, timeout=30)
response.raise_for_status()
docs = response.json()['response']['docs']

records = []
for doc in docs:
    # `or` also covers a present-but-empty title list, which would otherwise
    # raise IndexError on [0].
    title = (doc.get('title_s') or ['Sans titre'])[0]
    keywords = doc.get('keyword_s') or []
    # Keep the first abstract field that is present and non-empty.
    abstract_raw = doc.get('abstract_s') or doc.get('abstract_en_s') or doc.get('abstractFr_s')
    # List-valued abstracts are joined into one string; other values pass through.
    abstract = " ".join(abstract_raw) if isinstance(abstract_raw, list) else abstract_raw
    records.append({'title': title, 'abstract': abstract, 'keywords': keywords})

# Explicit columns keep the expected schema even when no document is returned.
hal_df = pd.DataFrame(records, columns=['title', 'abstract', 'keywords'])
hal_df.to_csv("hal_data_full.csv", index=False)

In [None]:
# Assemble the candidate-term table from three sources: the records' keywords,
# plus 2- and 3-word n-grams taken from titles and from abstracts.

# Keywords: lower-cased and tagged with their origin.
keywords = []
for kws in hal_df['keywords']:
    for kw in kws:
        keywords.append((kw.lower(), 'keyword'))
df_keywords = pd.DataFrame(keywords, columns=['term', 'source'])

# Title n-grams (English stop-word list applied; missing titles count as empty).
vec_title = CountVectorizer(ngram_range=(2,3), stop_words='english')
title_ngrams = vec_title.fit_transform(hal_df['title'].fillna(''))
title_terms = vec_title.get_feature_names_out()
df_title = pd.DataFrame({'term': title_terms, 'source': 'title'})

# Abstract n-grams, extracted with the same settings.
vec_abs = CountVectorizer(ngram_range=(2,3), stop_words='english')
abs_ngrams = vec_abs.fit_transform(hal_df['abstract'].fillna(''))
abs_terms = vec_abs.get_feature_names_out()
df_abs = pd.DataFrame({'term': abs_terms, 'source': 'abstract'})

# Stack the three tables, drop exact (term, source) duplicates and add an
# empty 'definition' column.
df_all = pd.concat(
    [df_keywords, df_title, df_abs],
    ignore_index=True,
).drop_duplicates()
df_all['definition'] = ''

In [None]:
def detect_lang_with_context(row):
    """Guess the language of a glossary row from its term and definition.

    Args:
        row: Mapping-like object (pandas Series or dict) read through
            ``row.get("term")`` and ``row.get("definition")``.

    Returns:
        ``'en'``, ``'fr'`` or ``'es'`` when the detector reports one of them,
        ``'fr'`` for any other detected language, and ``"unknown"`` when
        there is no text to analyse or detection raises.
    """
    # Missing values (None/NaN) are skipped rather than stringified, so the
    # literal words "None"/"nan" never leak into the text being analysed.
    parts = []
    for key in ("term", "definition"):
        value = row.get(key)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        parts.append(str(value))
    context = " ".join(parts).strip()
    if not context:
        return "unknown"
    try:
        lang = detect(context)
    except Exception:
        # Best effort: a detection failure yields "unknown", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return "unknown"
    # Languages outside the supported set fall back to French.
    return lang if lang in ['en', 'fr', 'es'] else 'fr'
# Tag every candidate term with a detected language, one row at a time.
# NOTE(review): when the cells run top to bottom, 'definition' is still the
# empty string assigned when df_all was built, so detection effectively sees
# only the term itself.
df_all['langue'] = df_all.apply(lambda row: detect_lang_with_context(row), axis=1)

In [None]:
# Search corpus: one lower-cased "title. abstract" string per record, with
# missing titles or abstracts treated as empty strings.
title_text = hal_df['title'].fillna('')
abstract_text = hal_df['abstract'].fillna('')
texts = (title_text + '. ' + abstract_text).str.lower().tolist()
def find_definition(term, texts):
    """Return the first sentence in ``texts`` that mentions ``term``.

    The term must appear as a whole word or phrase: it may not be glued to
    other word characters, so "art" does not match inside "part". Matching
    ignores case.

    Args:
        term: Word or phrase to look for. Surrounding whitespace is ignored.
        texts: Iterable of strings, each split into sentences on ``.``, ``!``
            or ``?`` followed by whitespace.

    Returns:
        The matching sentence, stripped and passed through
        ``str.capitalize()``, or ``""`` when the term is empty, not a string,
        or found nowhere.
    """
    # An empty term would trivially "match" the first sentence of the corpus.
    if not isinstance(term, str) or not term.strip():
        return ""
    # Lookarounds instead of \b so terms that start or end with punctuation
    # (e.g. "c++") still get a sensible boundary check.
    pattern = re.compile(r'(?<!\w)' + re.escape(term.strip()) + r'(?!\w)', re.IGNORECASE)
    for text in texts:
        for sentence in re.split(r'[.!?]\s+', text):
            if pattern.search(sentence):
                return sentence.strip().capitalize()
    return ""
# For each candidate term, store the first corpus sentence that mentions it
# (find_definition returns an empty string when none does).
df_all['auto_definition'] = df_all['term'].map(lambda term: find_definition(term, texts))

In [None]:
def reformuler_definition(term, phrase):
    """Wrap a source sentence into a "<Term> refers to <sentence>." definition.

    Args:
        term: Term being defined; its first letter is upper-cased and the
            rest is left untouched.
        phrase: Sentence used as the body of the definition.

    Returns:
        The reformulated definition, or ``""`` when ``phrase`` is missing,
        not a string, or shorter than 10 characters once stripped, or when
        ``term`` is not a non-empty string.
    """
    if not isinstance(phrase, str) or len(phrase.strip()) < 10:
        return ""
    if not isinstance(term, str) or not term.strip():
        return ""
    label = term.strip()
    label = label[0].upper() + label[1:]
    body = phrase.strip().rstrip('.')
    # The sentence is embedded mid-sentence, so its leading capital is
    # lowered ("refers to the ...", not "refers to The ..."). A second
    # upper-case letter suggests an acronym such as "DNA", which is kept.
    if len(body) > 1 and not body[1].isupper():
        body = body[0].lower() + body[1:]
    return f"{label} refers to {body}."
def est_valide(term, phrase):
    """Flag whether an automatically found definition looks usable.

    Args:
        term: Candidate term.
        phrase: Sentence proposed as its definition.

    Returns:
        ``"non"`` when the phrase is missing, when the term is a single word
        of at most five characters, or when the stripped phrase has fewer
        than 20 characters; ``"oui"`` otherwise.
    """
    if pd.isna(phrase):
        return "non"
    single_short_word = len(term.split()) < 2 and len(term) <= 5
    # Short-circuit keeps the phrase untouched when the term already fails.
    rejected = single_short_word or len(phrase.strip()) < 20
    return "non" if rejected else "oui"
# Derive two per-row columns from (term, auto_definition): the reformulated
# definition first, then the automatic validity flag.
for column, builder in (('definition_finale', reformuler_definition), ('valide_auto', est_valide)):
    df_all[column] = df_all.apply(lambda row, fn=builder: fn(row['term'], row['auto_definition']), axis=1)

In [None]:
def enrichir_definition_chatgpt(term, langue="en"):
    """Ask an OpenAI chat model for a concise glossary definition of ``term``.

    Works with both generations of the ``openai`` package: the 1.x interface
    (``openai.chat.completions.create``) is used when the module exposes
    ``OpenAI``; otherwise the legacy ``openai.ChatCompletion.create`` call is
    used.

    Args:
        term: Term to define; interpolated into the prompt.
        langue: Language code for the answer, upper-cased in the prompt
            (default ``"en"``).

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the request fails (a warning then describes the failure).
    """
    import warnings

    prompt = f"Give a concise glossary-style definition for the following term in {langue.upper()}: '{term}'"
    messages = [{"role": "user", "content": prompt}]
    # NOTE(review): no API key is set in this function — presumably it is
    # configured on the `openai` module or via the environment; confirm.
    try:
        if hasattr(openai, "OpenAI"):
            # openai >= 1.0: ChatCompletion was removed; responses are objects.
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=messages,
            )
            content = response.choices[0].message.content
        else:
            # Legacy openai < 1.0: dict-style response.
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
            )
            content = response['choices'][0]['message']['content']
        return (content or "").strip()
    except Exception as exc:
        # Best effort: keep returning "" so a batch run continues, but report
        # the cause instead of hiding it.
        warnings.warn(f"enrichir_definition_chatgpt failed for {term!r}: {exc}")
        return ""

In [None]:
# Write the full glossary table to CSV (row index omitted), then preview the
# first ten rows as the cell's output.
df_all.to_csv(path_or_buf="glossaire_complet_chatgpt.csv", index=False)
df_all.head(n=10)

Unnamed: 0,term,source,definition,langue,auto_definition,definition_finale,valide_auto
0,stages of teaching and learning,keyword,,en,The second part identifies the tasks the teach...,Stages of teaching and learning refers to The ...,oui
1,foreign languages,keyword,,fr,,,non
2,pleasure,keyword,,fr,Second language class planning : means of prom...,Pleasure refers to Second language class plann...,oui
3,class planning,keyword,,en,Second language class planning : means of prom...,Class planning refers to Second language class...,oui
4,didactic strategies,keyword,,fr,Each stage is illustrated with didactic strate...,Didactic strategies refers to Each stage is il...,oui
5,external conditions of learning,keyword,,en,,,non
6,voicing periodicity assimilation pronunciation...,keyword,,fr,,,non
7,mots-clés analyse comparative,keyword,,fr,,,non
8,surréalisme,keyword,,fr,,,non
9,traduction littéraire,keyword,,fr,,,non
