In [None]:
# 2. Preprocessing: tokenization, cleaning, and normalization.

In [1]:
import re
import unicodedata

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# --- Download the NLTK resources only when they are missing ---
try:
    _ = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# --- Tool initialisation ---
lemmatizer = WordNetLemmatizer()

# Probe WordNet by actually using it, mirroring the stopwords check above.
# The previous probe, nltk.data.find('corpora/wordnet'), raised LookupError for
# an installation that nltk.download() then reported as "already up-to-date",
# so both downloads were re-attempted on every run of this cell.
try:
    lemmatizer.lemmatize('tests')
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# Set membership is what preprocess_text() tests for every token.
stop_words = set(stopwords.words('english'))

# --- PREPROCESSING FUNCTION ---

def preprocess_text(text: str) -> str:
    """Clean a raw text and return it as a space-separated string of lemmas.

    Steps, in order: fold accented letters to their base letter, lowercase,
    replace every character outside ``a-z`` and whitespace with a space, split
    on whitespace, drop tokens found in the module-level ``stop_words`` set or
    shorter than three characters, then lemmatize each remaining token with
    the module-level ``lemmatizer``.

    Args:
        text: Raw text. Any non-string value (e.g. ``None`` or a float NaN)
            is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is not a string or no token survives the filters.
    """
    if not isinstance(text, str):
        return ''

    # Decompose accented characters and drop the combining marks so that
    # "naïve" becomes "naive". Without this step the a-z filter below would
    # replace the accented letter with a space and split the word in two.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Lowercase
    text = text.lower()

    # Replace special characters, digits and punctuation with spaces
    text = re.sub(r"[^a-z\s]", " ", text)

    # Simple tokenisation: split() with no argument already collapses runs of
    # whitespace and ignores leading/trailing whitespace.
    tokens = text.split()

    # Drop stopwords and very short tokens, lemmatize the rest, and rebuild
    # the cleaned text.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    )

# Driver: load the cleaned CSV, add a 'clean_text' column produced by
# preprocess_text(), and write the result to a new CSV.
if __name__ == "__main__":

    INPUT_PATH = "../data/data_clean.csv"
    OUTPUT_PATH = "../data/preprocessed_data.csv"

    print(f" Chargement du dataset depuis {INPUT_PATH}...")
    df = pd.read_csv(INPUT_PATH)

    # Pick the text column: prefer 'TI', fall back to 'abstract'. Fail with an
    # explicit message when neither exists instead of a bare KeyError later.
    text_column = next((col for col in ('TI', 'abstract') if col in df.columns), None)
    if text_column is None:
        raise KeyError(
            f"Aucune colonne texte ('TI' ou 'abstract') dans {INPUT_PATH} ; "
            f"colonnes disponibles : {list(df.columns)}"
        )

    print(f" Prétraitement du texte ({text_column})...")
    # Registers progress_apply on pandas objects so the apply shows a progress bar.
    tqdm.pandas()
    df['clean_text'] = df[text_column].progress_apply(preprocess_text)

    # Drop the 'label' column if it exists (just in case)
    if 'label' in df.columns:
        df = df.drop(columns=['label'])

    print(f" Sauvegarde du fichier nettoyé dans {OUTPUT_PATH}...")
    df.to_csv(OUTPUT_PATH, index=False)

    print(" Prétraitement terminé avec succès !")
    print(f" Fichier sauvegardé : {OUTPUT_PATH}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hasna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hasna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


 Chargement du dataset depuis ../data/data_clean.csv...
 Prétraitement du texte (TI)...


100%|██████████| 194938/194938 [00:13<00:00, 14105.94it/s]


 Sauvegarde du fichier nettoyé dans ../data/preprocessed_data.csv...
 Prétraitement terminé avec succès !
 Fichier sauvegardé : ../data/preprocessed_data.csv


In [2]:
# Reload the preprocessed dataset through the same relative path the
# preprocessing cell wrote to, rather than an absolute path that only exists
# on one machine.
df = pd.read_csv("../data/preprocessed_data.csv")
df['PY'].values

array([2018, 2018, 2018, ..., 2019, 2019, 2019], dtype=int64)

In [4]:
# Summary statistics of the PY column, followed by its ten smallest and ten
# largest distinct values.
print(df["PY"].describe())
py_distinct_sorted = sorted(df["PY"].unique())
print(py_distinct_sorted[:10], " ... ", py_distinct_sorted[-10:])


count    194938.000000
mean       2013.708056
std           5.940576
min        1961.000000
25%        2011.000000
50%        2015.000000
75%        2018.000000
max        2020.000000
Name: PY, dtype: float64
[1961, 1967, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976]  ...  [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


In [None]:
import pandas as pd
import os

# Data directory relative to the notebook, like the paths used by the
# preprocessing cell, so the notebook does not depend on one user's absolute
# Windows path.
DATA_DIR = os.path.join("..", "data")

# Number of rows kept in the reduced dataset
N_ROWS_SMALL = 30000

# Path to the full dataset
DATA_PATH = os.path.join(DATA_DIR, "preprocessed_data.csv")

# Load the full dataset
df = pd.read_csv(DATA_PATH)
print(f" Dataset complet chargé : {df.shape[0]} lignes")

# Keep the first N_ROWS_SMALL rows (the previous comment said 10 000 while the
# code took 30000). For a random sample instead, use
# df.sample(n=N_ROWS_SMALL, random_state=42).
df_small = df.head(N_ROWS_SMALL)

print(f" Dataset réduit : {df_small.shape[0]} lignes")

# Save the reduced dataset
SAVE_PATH = os.path.join(DATA_DIR, "preprocessed_data_small.csv")
df_small.to_csv(SAVE_PATH, index=False)
print(f" Dataset réduit sauvegardé dans : {SAVE_PATH}")

In [2]:
# Preview the first 10 rows of the reduced dataset (df_small).
df_small.head(10)

Unnamed: 0,PY,id,eid,TI,author,clean_text
0,2018,85058550296,2-s2.0-85058550296,knowledge capture and reuse through expert’s a...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",knowledge capture reuse expert activity monito...
1,2018,85058873457,2-s2.0-85058873457,an approach of energy resources control system...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",approach energy resource control system design
2,2018,85058993388,2-s2.0-85058993388,som-like neural network and differential evolu...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",som like neural network differential evolution...
3,2018,85059318297,2-s2.0-85059318297,drug discovery and drug marketing with the cri...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",drug discovery drug marketing critical role mo...
4,2018,85059937495,2-s2.0-85059937495,towards a natural language compiler,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",towards natural language compiler
5,2018,85044445280,2-s2.0-85044445280,spontaneous emergence of programs from “primor...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",spontaneous emergence program primordial soup ...
6,2018,85044714101,2-s2.0-85044714101,towards high-performance python,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",towards high performance python
7,2018,85045383599,2-s2.0-85045383599,the online set aggregation problem,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",online set aggregation problem
8,2018,85045426562,2-s2.0-85045426562,"practical, anonymous, and publicly linkable un...","[{'@_fa': 'true', '@seq': '1', 'author-url': '...",practical anonymous publicly linkable universa...
9,2018,85046626398,2-s2.0-85046626398,a distributional semantics model for idiom det...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",distributional semantics model idiom detection...


In [1]:
import pandas as pd
import os

# Path to the full dataset, relative to the notebook like the paths used by
# the preprocessing cell, instead of one user's absolute Windows path.
DATA_PATH = os.path.join("..", "data", "preprocessed_data.csv")

# Row index at which the dataset is split. A single constant keeps the two
# slices below complementary.
SPLIT_AT = 30000

# Load the full dataset
df = pd.read_csv(DATA_PATH)
print(f" Dataset complet chargé : {df.shape[0]} lignes")

# ---- First SPLIT_AT rows ----
df_first_30k = df.head(SPLIT_AT)

# ---- Rest of the dataset: from row SPLIT_AT onwards, re-indexed from 0 ----
df_rest = df.iloc[SPLIT_AT:].reset_index(drop=True)
print(f" Reste du dataset : {df_rest.shape[0]} lignes")

df_rest.head(10)

 Dataset complet chargé : 194938 lignes
 Reste du dataset : 164938 lignes


Unnamed: 0,PY,id,eid,TI,author,clean_text
0,2017,85026204687,2-s2.0-85026204687,an approach using the design science research ...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",approach using design science research develop...
1,2017,85026192956,2-s2.0-85026192956,group matching for peer mentorship in small gr...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",group matching peer mentorship small group
2,2017,85025176830,2-s2.0-85025176830,an innovative hybrid model based on data pre-p...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",innovative hybrid model based data pre process...
3,2017,85025169375,2-s2.0-85025169375,ivhd: a robust linear-time and memory efficien...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",ivhd robust linear time memory efficient metho...
4,2017,85025166493,2-s2.0-85025166493,self-adaptive uis: integrated model-driven dev...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",self adaptive uis integrated model driven deve...
5,2017,85025144345,2-s2.0-85025144345,optimization for large-scale machine learning ...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",optimization large scale machine learning dist...
6,2017,85025124532,2-s2.0-85025124532,multi-view approach to parkinson’s disease qua...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",multi view approach parkinson disease quality ...
7,2017,85025124016,2-s2.0-85025124016,context-awareness and mobile hci: implications...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",context awareness mobile hci implication chall...
8,2017,85025123755,2-s2.0-85025123755,introducing a decision making framework to hel...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",introducing decision making framework help use...
9,2017,85025115857,2-s2.0-85025115857,a new approach to telecommunications network d...,"[{'@_fa': 'true', '@seq': '1', 'author-url': '...",new approach telecommunication network design ...
