In [1]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Téléchargez les ressources nécessaires pour la lemmatisation
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import sys
sys.path.append('../AJA')
import AJA as aja

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arnau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arnau\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# récupération des données 
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

In [3]:
# Fonction de prétraitement du texte
def preprocess_text(text):
    # Tokenisation
    tokens = text.split()

    # Suppression de la ponctuation et des caractères spéciaux
    tokens = [token.strip(string.punctuation) for token in tokens]

    # Suppression des mots vides
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Reconstitution du texte à partir des tokens traités
    processed_text = ' '.join(tokens)
    return processed_text

# Appliquer le prétraitement aux colonnes 'text' dans les DataFrames
df_train_nodes['text'] = df_train_nodes['text'].apply(preprocess_text)
df_test_nodes['text'] = df_test_nodes['text'].apply(preprocess_text)


In [4]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm  # Importer tqdm pour la barre de progression

# Charger le tokenizer et le modèle BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Fonction pour obtenir l'embedding BERT pour une phrase
def get_bert_embedding(sentence):
    # Tokeniser la phrase et l'encoder
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    
    # Obtenir l'embedding à partir du modèle BERT
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Récupérer l'embedding de la couche d'output
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.numpy()

# Appliquer la fonction d'embedding aux colonnes 'text' dans les DataFrames avec une barre de progression
df_train_nodes['bert_embeddings'] = df_train_nodes['text'].apply(lambda x: tqdm(get_bert_embedding(x), leave=False))
df_test_nodes['bert_embeddings'] = df_test_nodes['text'].apply(lambda x: tqdm(get_bert_embedding(x), leave=False))


  0%|          | 0/768 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


















                                       [A[A[A[A[A[A
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[