In [None]:
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from db.models import get_restaurants_with_reviews_and_users
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from wordcloud import WordCloud

class NLPAnalysis:
    def __init__(self, db_path='sqlite:///restaurant_reviews.db'):
        # Initialisation de la connexion à la base de données
        self.engine = create_engine(db_path)
        self.session = sessionmaker(bind=self.engine)()
        self.data = pd.DataFrame()
        self.model = None
        self.tokenizer = None
        self.summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
#1
    def load_data(self):
        # Extraction et transformation des données
        raw_data = get_restaurants_with_reviews_and_users(self.session)
        data = []
        for restaurant in raw_data:
            for review in restaurant['reviews']:
                data.append({
                    'restaurant': restaurant['restaurant'],
                    'restaurant_address': restaurant['restaurant_address'],
                    'title': review['title'],
                    'user_profile': review['user_profile'],
                    'date_review': review['date_review'],
                    'rating': review['rating'],
                    'type_visit': review['type_visit'],
                    'num_contributions': review['num_contributions'],
                    'review': review['review'],
                    'review_cleaned': review['review_cleaned']
                })
        self.data = pd.DataFrame(data)
        self.data['sentiment'] = self.data['rating'].apply(self._sentiment_class)
#2
    @staticmethod
    def _sentiment_class(rating):
        return 2 if rating == 3 else (1 if rating >= 4 else 0)
#3
    def preprocess_reviews(self):
        # Tokenisation et padding
        self.tokenizer = Tokenizer(num_words=10000)
        self.tokenizer.fit_on_texts(self.data['review_cleaned'])
        sequences = self.tokenizer.texts_to_sequences(self.data['review_cleaned'])
        return pad_sequences(sequences, padding='post', maxlen=200)
#4
    def train_lstm_model(self):
        # Préparation des données
        X_pad = self.preprocess_reviews()
        y = self.data['sentiment']
        X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

        # Définition et entraînement du modèle LSTM
        self.model = Sequential([
            Embedding(input_dim=10000, output_dim=128, input_length=200),
            LSTM(128, dropout=0.2, recurrent_dropout=0.2),
            Dense(3, activation='softmax')
        ])
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=1)

        # Évaluation du modèle
        y_pred = np.argmax(self.model.predict(X_test), axis=1)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Précision du modèle LSTM : {accuracy * 100:.2f}%")
#4
    def summarize_restaurant_reviews(self, restaurant_name, taille_segment=100, max_length=15, min_length=5):
        def segmenter_avis(texte):
            mots = texte.split()
            for i in range(0, len(mots), taille_segment):
                yield ' '.join(mots[i:i + taille_segment])

        # Filtrage des données pour le restaurant spécifié
        restaurant_data = self.data[self.data['restaurant'] == restaurant_name]
        
        if restaurant_data.empty:
            return pd.DataFrame({"restaurant": [restaurant_name], "resume_avis": ["Aucun avis trouvé pour ce restaurant"]})

        # Récupération et concaténation des avis pour le restaurant spécifié
        avis = ' '.join(restaurant_data['review_cleaned'])
        segments = list(segmenter_avis(avis))
        resumes = []

        for segment in segments:
            try:
                resume = self.summarizer(segment, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
                resumes.append(resume)
            except Exception as e:
                resumes.append(f"Erreur lors du résumé : {str(e)}")

        # Combinaison des résumés partiels en un résumé global limité à 15 mots
        resume_global = ' '.join(resumes)
        resume_limit = ' '.join(resume_global.split()[:15])

        # Retour sous forme de DataFrame
        return pd.DataFrame({"restaurant": [restaurant_name], "resume_avis": [resume_limit]})
#5
    def generate_wordcloud(self):
        tous_avis = ' '.join(self.data['review_cleaned'])
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(tous_avis)
        return wordcloud

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#test ma class
nlp = NLPAnalysis()
nlp.load_data()
nlp.preprocess_reviews()


KeyError: 'review_cleaned'

In [None]:
nlp.train_lstm_model()
nlp.summarize_restaurant_reviews()
nlp.generate_wordcloud()