In [101]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.manifold import TSNE
import plotly.express as px

class Word2Vec:
    def __init__(self, window_size=2, embedding_dim=1000):
        self.window_size = window_size
        self.embedding_dim = embedding_dim
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0
        self.W = None
        self.C = None
        self.vocab = None
        

    def train(self, word2idx,  sentences, epochs=10, learning_rate=0.01):
         # Initialize weight matrices
        self.vocab_size = len(word2idx)
        self.W = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        self.C = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        
        for epoch in range(epochs):
            loss = 0
            for sentence in tqdm(sentences):
                for i, word in enumerate(sentence):
                    # Get target word index
                    target_idx = self.word2idx[word]
                    
                    # Get context words within window
                    context_words = []
                    for j in range(max(0, i - self.window_size), min(len(sentence), i + self.window_size + 1)):
                        if i != j:
                            context_words.append(self.word2idx[sentence[j]])
                    
                    # Training step
                    for context_idx in context_words:
                        # Forward pass
                        target_vector = self.W[target_idx]
                        context_vector = self.C[context_idx]
                        score = np.dot(target_vector, context_vector)
                        
                        # Compute gradients
                        grad = context_vector * (1 - sigmoid(score))
                        grad_context = target_vector * (1 - sigmoid(score))
                        
                        # Update weights
                        self.W[target_idx] += learning_rate * grad
                        self.C[context_idx] += learning_rate * grad_context
                        
                        loss += -np.log(sigmoid(score))
            
            print(f"Epoch {epoch+1}, Loss: {loss}")

class GloVe:
    def __init__(self, embedding_dim=100, window_size=10):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.word2idx = {}
        self.cooccurrence = defaultdict(float)
        self.W = None
        
    def build_vocab_and_cooccurrence(self, sentences):
        # Build vocabulary
        vocab = set()
        for sentence in sentences:
            for word in sentence:
                vocab.add(word)
        
        for idx, word in enumerate(vocab):
            self.word2idx[word] = idx
        
        # Build co-occurrence matrix
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(max(0, i - self.window_size), min(len(sentence), i + self.window_size + 1)):
                    if i != j:
                        self.cooccurrence[(self.word2idx[word], self.word2idx[sentence[j]])] += 1.0

    def train(self, epochs=50, learning_rate=0.05):
        vocab_size = len(self.word2idx)
        self.W = np.random.randn(vocab_size, self.embedding_dim) * 0.01
        
        for epoch in range(epochs):
            loss = 0
            for (i, j), X_ij in self.cooccurrence.items():
                weight = (X_ij/100)**0.75 if X_ij > 100 else 1
                
                # Compute loss
                diff = np.dot(self.W[i], self.W[j]) - np.log(X_ij + 1)
                loss += 0.5 * weight * (diff ** 2)
                
                # Compute gradients
                grad = weight * diff
                
                # Update vectors
                self.W[i] -= learning_rate * grad * self.W[j]
                self.W[j] -= learning_rate * grad * self.W[i]
            
            print(f"Epoch {epoch+1}, Loss: {loss}")

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def visualize_embeddings_tsne(embeddings, words, title, top_n=50 , perplexity=5):
    # Select the top_n embeddings and corresponding words
    embeddings = embeddings[:top_n]
    words = words[:top_n]
    print(f"Visualizing {len(words)} words")
    # Dynamically set perplexity
   
    
    tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
    embedding_3d = tsne.fit_transform(embeddings)
    
    fig = px.scatter_3d(
        x=embedding_3d[:, 0], y=embedding_3d[:, 1], z=embedding_3d[:, 2],
        text=words, title=title,
        labels={'x': 'X', 'y': 'Y', 'z': 'Z'}
    )
    fig.show()


In [12]:
# clean my review data
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# from nltk.stem import SnowballStemmer
# SnowballStemmer("french")
def clean_text(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = re.sub(r'\s+', ' ', text)
    #remove numbers and special characters from text
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # remove single characters
    text = re.sub(r'\b\w\b', '', text)
    # remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # remove leading and trailing spaces
    text = text.strip()
    
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('french'))
    words = word_tokenize(text)
    return [word for word in words if word not in stop_words]

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

df['cleaned_reviewbis'] = df['review'].apply(clean_text)
df['cleaned_reviewbis'] = df['cleaned_reviewbis'].apply(remove_stopwords)
df['cleaned_reviewbis'] = df['cleaned_reviewbis'].apply(lemmatize)


In [10]:
# make all the reviews in a list
sentences = df['cleaned_reviewbis'].tolist()
# make a list of all the words in the reviews
words = [word for sentence in sentences for word in sentence]

In [11]:
sentences

[['trop',
  'bon',
  'moment',
  'accueil',
  'plat',
  'ambiance',
  'tout',
  'passer',
  'bon',
  'moment',
  'entre',
  'amis',
  'ntant',
  'rgion',
  'hte',
  'dcouvrir',
  'genre',
  'dtablissement',
  'navons',
  'dus'],
 ['toujours',
  'aussi',
  'goteux',
  'lambiance',
  'bonne',
  'humeur',
  'moment',
  'convivialit',
  'amis',
  'collgues',
  'passer',
  'ct'],
 ['serveurs',
  'serveuses',
  'sympas',
  'longlet',
  'nest',
  'tendre',
  'terrine',
  'moyenne',
  'lentilles',
  'dcevantes',
  'fondant',
  'carrment',
  'top',
  'cuisine',
  'amliorer'],
 ['excellente',
  'soire',
  'petit',
  'restaurant',
  'tions',
  'peu',
  'lcart',
  'ct',
  'cuisine',
  'moment',
  'privilgi',
  'patron',
  'cuisiniers',
  'plat',
  'dlicieux',
  'lambiance',
  'trs',
  'sympa',
  'grce',
  'bonne',
  'humeur',
  'restaurateur',
  'manqu'],
 ['super',
  'moment',
  'entre',
  'amis',
  'plat',
  'taient',
  'trs',
  'dlicieux',
  'conseille',
  'fortement',
  'cette',
  'adresse',
 

In [61]:
words[0:5]

['trop', 'bon', 'moment', 'accueil', 'plat']

In [62]:
words[0]

'trop'

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(words)
tokens = vectorizer.get_feature_names_out()
token_matrix = X.toarray()



In [107]:
len(set(tokens))

4543

In [105]:
len(words)

28335

In [106]:
len()

4547

In [108]:
word2idx = {word: idx for idx, word in enumerate(set(words))}


In [109]:
word2idx

{'touriste': 0,
 'classiques': 1,
 'certains': 2,
 'dentres': 3,
 'prvenue': 4,
 'antiquits': 5,
 'linventivit': 6,
 'servent': 7,
 'mot': 8,
 'faisant': 9,
 'affichait': 10,
 'bruyant': 11,
 'pourtant': 12,
 'violence': 13,
 'sale': 14,
 'doute': 15,
 'navaient': 16,
 'vendanges': 17,
 'alcooliss': 18,
 'biensans': 19,
 'gote': 20,
 'bonne': 21,
 'roulante': 22,
 'recommanderons': 23,
 'fminine': 24,
 'praline': 25,
 'pioche': 26,
 'direambiance': 27,
 'sucre': 28,
 'old': 29,
 'adresses': 30,
 'coll': 31,
 'souriants': 32,
 'servicechaleureux': 33,
 'voleuses': 34,
 'honntes': 35,
 'arrondissement': 36,
 'correcte': 37,
 'surface': 38,
 'oprent': 39,
 'simplement': 40,
 'rustique': 41,
 'stphanoises': 42,
 'faussent': 43,
 'arrivs': 44,
 'inventives': 45,
 'nombreuses': 46,
 'association': 47,
 'depuis': 48,
 'prfres': 49,
 'installes': 50,
 'entrant': 51,
 'eh': 52,
 'vraiment': 53,
 'portable': 54,
 'paye': 55,
 'discrets': 56,
 'dlay': 57,
 'sentiment': 58,
 'reserver': 59,
 'fame

In [110]:
# Train Word2Vec
w2v = Word2Vec(window_size=3, embedding_dim=1000)
w2v.word2idx = word2idx
w2v.train(word2idx, sentences , epochs=10, learning_rate=0.01)

100%|██████████| 795/795 [00:02<00:00, 391.01it/s]


Epoch 1, Loss: 109541.58903970568


100%|██████████| 795/795 [00:02<00:00, 388.78it/s]


Epoch 2, Loss: 91387.07454243721


100%|██████████| 795/795 [00:02<00:00, 391.46it/s]


Epoch 3, Loss: 70494.0806539791


100%|██████████| 795/795 [00:02<00:00, 384.05it/s]


Epoch 4, Loss: 56266.04288874361


100%|██████████| 795/795 [00:02<00:00, 388.73it/s]


Epoch 5, Loss: 46483.6336434366


100%|██████████| 795/795 [00:02<00:00, 385.45it/s]


Epoch 6, Loss: 39421.773160866425


100%|██████████| 795/795 [00:01<00:00, 398.99it/s]


Epoch 7, Loss: 34094.45370802349


100%|██████████| 795/795 [00:02<00:00, 394.44it/s]


Epoch 8, Loss: 29934.801067475997


100%|██████████| 795/795 [00:02<00:00, 383.81it/s]


Epoch 9, Loss: 26598.967300332915


100%|██████████| 795/795 [00:02<00:00, 392.96it/s]

Epoch 10, Loss: 23866.57181366743





In [111]:
w2v.W.shape

(4547, 1000)

In [112]:
w2v.vocab_size


4547

In [113]:
w2v.W

array([[-0.02967446, -0.03098272,  0.00371896, ..., -0.00595652,
        -0.00265873, -0.00749667],
       [-0.03754046, -0.0072625 ,  0.01149858, ...,  0.01044861,
        -0.03756396, -0.03205405],
       [-0.03084021, -0.02758714,  0.00940491, ...,  0.00095203,
        -0.02602498, -0.00913874],
       ...,
       [ 0.00629228, -0.00173301, -0.00360592, ..., -0.00350293,
         0.0045678 , -0.00349289],
       [ 0.00593521, -0.00895686,  0.01003124, ..., -0.00570487,
        -0.01156177, -0.01647145],
       [-0.03483199, -0.01463233,  0.00908088, ...,  0.00704977,
        -0.00822424,  0.00315612]])

In [114]:
w2v.vocab

In [116]:
# Visualize embeddings
visualize_embeddings_tsne(w2v.W, tokens, title="Word2Vec Embeddings", top_n=100, perplexity=5)

Visualizing 100 words


In [None]:
# Train GloVe
glove = GloVe(embedding_dim=100)
glove.build_vocab_and_cooccurrence(sentences)
glove.train(epochs=50)


In [None]:
import numpy as np
from tqdm import tqdm

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class Word2Vec:
    def __init__(self, window_size=2, embedding_dim=100):
        """
        Initialize the Word2Vec model.

        Parameters:
            window_size (int): Context window size.
            embedding_dim (int): Dimensionality of the word embeddings.
        """
        self.window_size = window_size
        self.embedding_dim = embedding_dim
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0
        self.W = None  # Target word embedding matrix
        self.C = None  # Context word embedding matrix

    def build_vocab(self, sentences):
        """
        Build the vocabulary and mapping of words to indices.

        Parameters:
            sentences (list of list of str): Tokenized sentences.
        """
        vocab = set(word for sentence in sentences for word in sentence)
        self.vocab_size = len(vocab)
        self.word2idx = {word: idx for idx, word in enumerate(vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    def initialize_weights(self):
        """
        Initialize weight matrices for the Word2Vec model.
        """
        self.W = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        self.C = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01

    def train(self, sentences, epochs=10, learning_rate=0.01):
        """
        Train the Word2Vec model using skip-gram.

        Parameters:
            sentences (list of list of str): Tokenized sentences.
            epochs (int): Number of training epochs.
            learning_rate (float): Learning rate for gradient descent.
        """
        # Build vocabulary and initialize weights
        self.build_vocab(sentences)
        self.initialize_weights()

        for epoch in range(epochs):
            total_loss = 0
            for sentence in tqdm(sentences, desc=f"Epoch {epoch+1}"):
                for i, target_word in enumerate(sentence):
                    target_idx = self.word2idx[target_word]

                    # Get context word indices within the window
                    context_indices = list(range(max(0, i - self.window_size), i)) + \
                                      list(range(i + 1, min(len(sentence), i + self.window_size + 1)))

                    context_words = [self.word2idx[sentence[j]] for j in context_indices]

                    # Perform training for each context word
                    for context_idx in context_words:
                        # Forward pass
                        target_vector = self.W[target_idx]
                        context_vector = self.C[context_idx]
                        score = np.dot(target_vector, context_vector)
                        prediction = sigmoid(score)

                        # Compute the loss and gradients
                        loss = -np.log(prediction)
                        total_loss += loss

                        grad = (1 - prediction) * context_vector
                        grad_context = (1 - prediction) * target_vector

                        # Update weights
                        self.W[target_idx] += learning_rate * grad
                        self.C[context_idx] += learning_rate * grad_context

            print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    def get_word_vector(self, word):
        """
        Get the embedding vector for a specific word.

        Parameters:
            word (str): The word to retrieve the embedding for.

        Returns:
            numpy.ndarray: Embedding vector for the word.
        """
        if word not in self.word2idx:
            raise ValueError(f"Word '{word}' not in vocabulary")
        return self.W[self.word2idx[word]]

    def get_context_vector(self, word):
        """
        Get the context vector for a specific word.

        Parameters:
            word (str): The word to retrieve the context vector for.

        Returns:
            numpy.ndarray: Context vector for the word.
        """
        if word not in self.word2idx:
            raise ValueError(f"Word '{word}' not in vocabulary")
        return self.C[self.word2idx[word]]



In [1]:
from gensim.models import Word2Vec

In [None]:
# make a bag of words from the reviews and train a word2vec model


In [16]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px

# Tokenize and create Word2Vec model
reviews = df['cleaned_reviewbis']
tokenized_reviews = list(reviews)
model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

# Get top 50 words and their vectors
most_important_words = model.wv.index_to_key[:50]
word_vectors = [model.wv[word] for word in most_important_words]

# Reduce to 3D and get explained variance
pca = PCA(n_components=3)
word_vectors_3d = pca.fit_transform(word_vectors)
explained_var = pca.explained_variance_ratio_
var_percentages = [f"{var:.1%}" for var in explained_var]
total_var = f"{sum(explained_var):.1%}"

# Perform clustering
kmeans = KMeans(n_clusters=5)
clusters = kmeans.fit_predict(word_vectors)

# Create plot DataFrame
plot_df = pd.DataFrame(word_vectors_3d, columns=['x', 'y', 'z'])
plot_df['word'] = most_important_words
plot_df['cluster'] = clusters

# Create enhanced 3D plot
fig = px.scatter_3d(
    plot_df,
    x='x', y='y', z='z',
    color='cluster',
    text='word',
    title=f'Word Clusters (Total variance explained: {total_var})',
    labels={
        'x': f'PC1 ({var_percentages[0]})',
        'y': f'PC2 ({var_percentages[1]})',
        'z': f'PC3 ({var_percentages[2]})'
    }
)

fig.update_layout(
    width=900,
    height=900,
    showlegend=True
)

fig.show()

In [None]:
# Ca start la

In [3]:
from src.pipeline import Pipeline

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gabry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gabry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
pipe = Pipeline()

In [155]:
df = pipe.get_every_reviews()

In [6]:
df.head()

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,review_cleaned
0,1,1,1,Très belle soirée,SetC77,2024-12-16,5.0,friends,67,"Trop bon moment!! \nAccueil, plats, ambiance t...",trop bon moment accueil plat ambiance tout pas...
1,1,2,2,Vive la bonne cuisine dans une ambiance conviv...,H3293ZGsylviel,2024-11-26,5.0,friends,2,"Toujours aussi goûteux !\nL'ambiance, la bonne...",toujours aussi goûteux lambiance bonne humeur ...
2,1,3,3,Sans plus,marieno_lleb739,2024-11-23,3.0,friends,96,Les serveurs et serveuses sont sympas. Mais l’...,serveurs serveuses sympas longlet nest tendre ...
3,1,4,4,Bon et joyeux,Vymsbmm,2024-11-14,5.0,friends,225,Une excellente soirée dans ce petit restaurant...,excellente soirée petit restaurant peu lécart ...
4,1,5,5,Bon restaurant et endroit pour se retrouver en...,Youliic974,2024-11-01,4.0,friends,38,Un super moment entre amis.\nLes plats étaient...,super moment entre amis plat très délicieux co...


In [204]:
import plotly.express as px

# 2. Créer le graphique de densité
fig = px.density_contour(df, x='rating', color='type_visit', title='Graphique de Densité de [Votre Colonne]')

# 4. Ajouter des titres et labels
fig.update_layout(
    xaxis_title='[Nom de la Colonne]',
    yaxis_title='Densité'
)

# 5. Afficher le graphique
fig.show()

In [207]:
df['Length'] = df['review'].apply(len)

In [13]:
import seaborn as sns
import plotly.express as px


In [205]:
df.head()

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,review_cleaned,id_restaurant
0,1,1,1,Très belle soirée,SetC77,2024-12-16,5.0,friends,67,"Trop bon moment!! \nAccueil, plats, ambiance t...",trop bon moment accueil plat ambiance tout pas...,1
1,1,2,2,Vive la bonne cuisine dans une ambiance conviv...,H3293ZGsylviel,2024-11-26,5.0,friends,2,"Toujours aussi goûteux !\nL'ambiance, la bonne...",toujours aussi goûteux lambiance bonne humeur ...,1
2,1,3,3,Sans plus,marieno_lleb739,2024-11-23,3.0,friends,96,Les serveurs et serveuses sont sympas. Mais l’...,serveurs serveuses sympas longlet nest tendre ...,1
3,1,4,4,Bon et joyeux,Vymsbmm,2024-11-14,5.0,friends,225,Une excellente soirée dans ce petit restaurant...,excellente soirée petit restaurant peu lécart ...,1
4,1,5,5,Bon restaurant et endroit pour se retrouver en...,Youliic974,2024-11-01,4.0,friends,38,Un super moment entre amis.\nLes plats étaient...,super moment entre amis plat très délicieux co...,1


In [208]:
fig = px.density_contour(
    data_frame=df,
    x='Length',
    color='rating',
    marginal_x='histogram',
    marginal_y='histogram'
)
fig.update_layout(width=900, height=300)
fig.show()

In [209]:
# 2. Créer le graphique de densité
fig = px.density_contour(df, x='Length', color='rating', title='Graphique de Densité de [Votre Colonne]')

# 4. Ajouter des titres et labels
fig.update_layout(
    xaxis_title='[Nom de la Colonne]',
    yaxis_title='Densité'
)

# 5. Afficher le graphique
fig.show()

In [77]:
tokens = df["review"].tolist()

In [52]:
stop_words_fr = "https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt"
stop_words = pd.read_csv(stop_words_fr, header=None)
stop_words = stop_words[0].tolist()

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['review'])
tokens = vectorizer.get_feature_names_out()
token_matrix = X.toarray()



Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['quelqu'] not in stop_words.



In [53]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocess the reviews
def preprocess(review):
    review = review.lower()
    tokens = nltk.word_tokenize(review)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Apply preprocessing to the reviews
reviews = [preprocess(review) for review in df['review']] 

In [97]:
reviews

[['moment',
  'accueil',
  'plat',
  'ambiance',
  'passer',
  'moment',
  'amis',
  'région',
  'hâte',
  'découvrir',
  'genre',
  'établissement',
  'déçus'],
 ['goûteux',
  "l'ambiance",
  'bonne',
  'humeur',
  'moment',
  'convivialité',
  'amis',
  'collègues',
  'passer',
  'côté'],
 ['serveurs',
  'serveuses',
  'sympas',
  'onglet',
  'tendre',
  'terrine',
  'moyenne',
  'lentilles',
  'décevantes',
  'fondant',
  'carrément',
  'top',
  'cuisine',
  'améliorer'],
 ['excellente',
  'soirée',
  'petit',
  'restaurant',
  'écart',
  'côté',
  'cuisine',
  'moment',
  'privilégié',
  'patron',
  'cuisiniers',
  'plat',
  'délicieux',
  'ambiance',
  'sympa',
  'grâce',
  'bonne',
  'humeur',
  'restaurateur',
  'manqué'],
 ['super',
  'moment',
  'amis',
  'plat',
  'délicieux',
  'conseille',
  'fortement',
  'adresse',
  'bouchon',
  'lyonnais',
  'grave',
  'régalé'],
 ['petite',
  'salade',
  'sympas',
  'gustativement',
  'proposer',
  'betteraves',
  'harengs',
  'terrine

In [19]:
import gensim
from gensim import corpora

In [20]:
dictionary = corpora.Dictionary(reviews)

In [21]:
corpus = [dictionary.doc2bow(review) for review in reviews]

In [22]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(2, 1),
  (7, 1),
  (8, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1)],
 [(1, 1),
  (7, 1),
  (9, 1),
  (12, 1),
  (15, 1),
  (17, 1),
  (21, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(2, 1),
  (7, 1),
  (9, 1),
  (34, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1)],
 [(29, 1),
  (31, 1),
  (51, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  

In [32]:
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)

In [33]:
for topic in lda_model.show_topics(num_topics=3):
    print('Topic', topic[0])
    print('Top words:', topic[1], '\n')

Topic 0
Top words: 0.018*"bouchon" + 0.017*"service" + 0.014*"menu" + 0.014*"cuisine" + 0.014*"..." + 0.013*"plat" + 0.009*"lyonnais" + 0.009*"fille" + 0.008*"restaurant" + 0.007*"table" 

Topic 1
Top words: 0.018*"plat" + 0.014*"bouchon" + 0.011*"menu" + 0.009*"lyonnais" + 0.009*"dessert" + 0.008*"..." + 0.007*"service" + 0.007*"c'est" + 0.007*"fromage" + 0.007*"bonne" 

Topic 2
Top words: 0.026*"plat" + 0.016*"dessert" + 0.016*"bouchon" + 0.013*"service" + 0.010*"menu" + 0.010*"c'est" + 0.009*"choix" + 0.009*"bonne" + 0.008*"lyonnais" + 0.008*"cuisine" 



In [25]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis

In [34]:
vis = gensimvis.prepare(lda_model, corpus, dictionary)

In [35]:
pyLDAvis.display(vis)

In [36]:
import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from textblob import TextBlob
import plotly.express as px
import plotly.graph_objects as go

def analyze_restaurant(df):
    # Temporal Analysis
    df['review_date'] = pd.to_datetime(df["date_review"])
    df['year_month'] = df['review_date'].dt.strftime('%Y-%m')
    
    # Correctly calculate time analysis
    time_analysis = df.groupby('year_month').agg({
        'rating': ['mean', 'count']
    }).reset_index()
    
    # Flatten column names
    time_analysis.columns = ['year_month', 'rating_mean', 'rating_count']
    
    # Create time trend plot
    time_trend = px.line(
        time_analysis,
        x='year_month',
        y='rating_mean',
        title='Rating Trends Over Time'
    )
    
    # Visit type analysis
    visit_dist = px.box(
        df,
        x='type_visit',
        y='rating',
        title='Rating Distribution by Visit Type'
    )
    
    return {
        'time_trend': time_trend,
        'visit_distribution': visit_dist,
        'statistics': time_analysis.to_dict()
    }

# Usage
# results = analyze_restaurant(df)
# results['time_trend'].show()
# results['visit_distribution'].show()

In [37]:
results = analyze_restaurant(df[df['restaurant_id'] == 1])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
results['time_trend'].show()

In [39]:
results['visit_distribution'].show()

In [42]:
results['statistics']

{'year_month': {0: '2010-01',
  1: '2011-05',
  2: '2011-07',
  3: '2011-11',
  4: '2012-06',
  5: '2012-07',
  6: '2012-08',
  7: '2012-09',
  8: '2012-10',
  9: '2013-01',
  10: '2013-03',
  11: '2013-04',
  12: '2013-05',
  13: '2013-06',
  14: '2013-07',
  15: '2013-08',
  16: '2013-09',
  17: '2013-10',
  18: '2013-11',
  19: '2013-12',
  20: '2014-01',
  21: '2014-02',
  22: '2014-03',
  23: '2014-04',
  24: '2014-05',
  25: '2014-07',
  26: '2014-08',
  27: '2014-09',
  28: '2014-10',
  29: '2014-11',
  30: '2014-12',
  31: '2015-01',
  32: '2015-02',
  33: '2015-03',
  34: '2015-04',
  35: '2015-05',
  36: '2015-06',
  37: '2015-07',
  38: '2015-08',
  39: '2015-09',
  40: '2015-10',
  41: '2015-11',
  42: '2015-12',
  43: '2016-01',
  44: '2016-02',
  45: '2016-03',
  46: '2016-04',
  47: '2016-05',
  48: '2016-06',
  49: '2016-07',
  50: '2016-08',
  51: '2016-09',
  52: '2016-10',
  53: '2016-11',
  54: '2016-12',
  55: '2017-01',
  56: '2017-02',
  57: '2017-03',
  58: '201

In [55]:
import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from textblob import TextBlob
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
from nltk import ngrams
from nltk.corpus import stopwords
import nltk
from transformers import pipeline

# Download NLTK stopwords for French
nltk.download('stopwords')
stop_words = set(stopwords.words('french'))

def analyze_restaurant(df):
    # 1. Data Preparation
    df['review_date'] = pd.to_datetime(df["date_review"])
    df['year_month'] = df['review_date'].dt.strftime('%Y-%m')
    
    # Handle missing reviews
    df = df.dropna(subset=['review'])
    
    # 2. Sentiment Analysis
    df['sentiment'] = df['review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    
    # 3. Temporal Analysis
    time_analysis = df.groupby('year_month').agg({
        'rating': ['mean', 'count'],
        'sentiment': 'mean'
    }).reset_index()
    time_analysis.columns = ['year_month', 'rating_mean', 'rating_count', 'sentiment_mean']
    
    # 4. Visit Type Analysis
    visit_analysis = df.groupby("type_visit").agg({
        'rating': ['mean', 'count'],
        'sentiment': 'mean'
    }).reset_index()
    visit_analysis.columns = ['type_visit', 'rating_mean', 'rating_count', 'sentiment_mean']
    
    # 5. Topic Modeling
    vectorizer = TfidfVectorizer(max_features=100)
    tfidf_matrix = vectorizer.fit_transform(df['review'].astype(str))
    
    nmf = NMF(n_components=5, random_state=42)
    topic_matrix = nmf.fit_transform(tfidf_matrix)
    
    # Get top words per topic
    feature_names = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
        topics[f'Topic_{topic_idx}'] = top_words
    
    # 6. Aspect-Based Sentiment Analysis
    aspects = ['nourriture', 'service', 'ambiance', 'prix', 'emplacement']
    for aspect in aspects:
        df[f'{aspect}_sentiment'] = df['review'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity if aspect in x.lower() else np.nan
        )
    aspect_sentiments = df[[
        'nourriture_sentiment', 'service_sentiment', 'ambiance_sentiment',
        'prix_sentiment', 'emplacement_sentiment'
    ]].mean().reset_index()
    aspect_sentiments.columns = ['Aspect', 'Average_Sentiment']
    
    # 7. Emotion Detection
    emotion_analyzer = pipeline(
        "text-classification",
        model="nlptown/bert-base-multilingual-uncased-sentiment",
        return_all_scores=True,
        truncation=True,
        max_length=512
    )
    
    def detect_emotions(text):
        result = emotion_analyzer(str(text))
        emotions = {res['label']: res['score'] for res in result[0]}
        return emotions
    
    df['emotions'] = df['review'].apply(detect_emotions)
    emotions_df = pd.json_normalize(df['emotions']).mean().reset_index()
    emotions_df.columns = ['Emotion', 'Average_Score']
    
    # 8. Word Frequency Analysis
    all_words = ' '.join(df['review'].tolist()).lower().split()
    all_words = [word for word in all_words if word.isalpha() and word not in stop_words]
    word_counts = Counter(all_words).most_common(20)
    words_df = pd.DataFrame(word_counts, columns=['Word', 'Count'])
    
    # 9. Bigram Analysis
    bigram_list = list(ngrams(all_words, 2))
    bigram_counts = Counter([' '.join(bigram) for bigram in bigram_list]).most_common(20)
    bigrams_df = pd.DataFrame(bigram_counts, columns=['Bigram', 'Count'])
    
    
    # 11. Dependency Parsing (Example: Average sentence length)
    df['sentence_length'] = df['review'].apply(lambda x: len(str(x).split('. ')))
    average_sentence_length = df['sentence_length'].mean()
    
    # 12. Visualizations
    # Time Trend Plot
    fig_time = px.line(
        time_analysis,
        x='year_month',
        y='rating_mean',
        title='Average Rating Over Time',
        labels={'year_month': 'Date', 'rating_mean': 'Average Rating'}
    )
    
    # Visit Type Distribution
    fig_visit = px.box(
        df,
        x='type_visit',
        y='rating',
        title='Rating Distribution by Visit Type',
        points='all',
        labels={'type_visit': 'Type of Visit', 'rating': 'Rating'}
    )
    
    # Sentiment vs Rating
    fig_sentiment = px.scatter(
        df,
        x='rating',
        y='sentiment',
        title='Sentiment vs Rating',
        trendline='ols',
        labels={'rating': 'Rating', 'sentiment': 'Sentiment Polarity'}
    )
    
    # Aspect Sentiment Bar Chart
    fig_aspect = px.bar(
        aspect_sentiments,
        x='Aspect',
        y='Average_Sentiment',
        title='Average Sentiment by Aspect',
        labels={'Aspect': 'Aspect', 'Average_Sentiment': 'Average Sentiment Polarity'}
    )
    
    # Emotion Scores Bar Chart
    fig_emotions = px.bar(
        emotions_df,
        x='Emotion',
        y='Average_Score',
        title='Average Emotion Scores in Reviews',
        labels={'Emotion': 'Emotion', 'Average_Score': 'Average Score'}
    )
    
    # Word Frequency Bar Chart
    fig_words = px.bar(
        words_df,
        x='Word',
        y='Count',
        title='Top 20 Most Common Words',
        labels={'Word': 'Word', 'Count': 'Frequency'}
    )
    
    # Bigram Frequency Bar Chart
    fig_bigrams = px.bar(
        bigrams_df,
        x='Bigram',
        y='Count',
        title='Top 20 Most Common Bigrams',
        labels={'Bigram': 'Bigram', 'Count': 'Frequency'}
    )
    
    # Dependency Parsing Visualization (Average Sentence Length)
    fig_sentence = px.bar(
        x=['Average Sentence Length'],
        y=[average_sentence_length],
        title='Average Sentence Length in Reviews',
        labels={'x': 'Metric', 'y': 'Number of Sentences'}
    )
    
    # Compile all visualizations
    visualizations = {
        'time_trend': fig_time,
        'visit_distribution': fig_visit,
        'sentiment_analysis': fig_sentiment,
        'aspect_sentiment': fig_aspect,
        'emotion_scores': fig_emotions,
        'word_frequency': fig_words,
        'bigram_frequency': fig_bigrams,

        'average_sentence_length': fig_sentence
    }
    
    # Return all results
    return {
        'visualizations': visualizations,
        'topics': topics,
        'statistics': {
            'time_analysis': time_analysis.to_dict(),
            'visit_analysis': visit_analysis.to_dict(),
            'aspect_sentiments': aspect_sentiments.to_dict(),
            'emotions': emotions_df.to_dict(),
            'word_counts': words_df.to_dict(),
            'bigram_counts': bigrams_df.to_dict(),
            'average_sentence_length': average_sentence_length
        }
    }

# Usage example:
# results = analyze_restaurant(restaurant_df)
# results['visualizations']['time_trend'].show()
# results['visualizations']['visit_distribution'].show()
# results['visualizations']['sentiment_analysis'].show()
# results['visualizations']['aspect_sentiment'].show()
# results['visualizations']['emotion_scores'].show()
# results['visualizations']['word_frequency'].show()
# results['visualizations']['bigram_frequency'].show()
# results['visualizations']['wordcloud_positive'].show()
# results['visualizations']['wordcloud_negative'].show()
# results['visualizations']['average_sentence_length'].show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
results = analyze_restaurant(df[df['restaurant_id'] == 1])




Device set to use cpu

`return_all_scores` is now deprecated,  if want a similar functionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.



In [57]:
results['visualizations']['time_trend'].show()

In [58]:
results['visualizations']['visit_distribution'].show()

In [None]:
# results = analyze_restaurant(restaurant_df)
# results['visualizations']['time_trend'].show()
# results['visualizations']['visit_distribution'].show()
# results['visualizations']['sentiment_analysis'].show()
# results['visualizations']['aspect_sentiment'].show()
# results['visualizations']['emotion_scores'].show()
# results['visualizations']['word_frequency'].show()
# results['visualizations']['bigram_frequency'].show()
# results['visualizations']['wordcloud_positive'].show()
# results['visualizations']['wordcloud_negative'].show()
# results['visualizations']['average_sentence_length'].show()

In [59]:
results['visualizations']['sentiment_analysis'].show()

In [60]:
results['visualizations']['aspect_sentiment'].show()

In [61]:
results['visualizations']['emotion_scores'].show()

In [62]:
results['visualizations']['word_frequency'].show()

In [63]:
results['visualizations']['bigram_frequency'].show()


In [64]:
results['visualizations']['average_sentence_length'].show()

In [83]:
rest = pipe.get_all_restaurants()

In [129]:
rest_scrapped = [r for r in rest if r.scrapped == 1]

In [None]:
#make a df of all the restaurants
df_rest = pd.DataFrame([r.to_dict() for r in rest_scrapped])

AttributeError: 'Restaurant' object has no attribute 'to_dict'

In [131]:
scrapped_restaurants = [{
                    "id_restaurant": restaurant.id_restaurant,    
                    "nom": restaurant.nom,
                    "latitude": restaurant.latitude,
                    "longitude": restaurant.longitude,
                    "rank": restaurant.rank,
                    "prix_min": restaurant.prix_min,
                    "prix_max": restaurant.prix_max,
                    "etoiles_michelin": restaurant.etoiles_michelin,
                    "note_globale": restaurant.note_globale,
                    "qualite_prix_note": restaurant.qualite_prix_note,
                    "cuisine_note": restaurant.cuisine_note,
                    "service_note": restaurant.service_note,
                    "ambiance_note": restaurant.ambiance_note,
                    "cuisines": restaurant.cuisines,
                    "repas": restaurant.repas,
                    "fonctionnalite": restaurant.fonctionnalite,
                }
                for restaurant in [restaurant for restaurant in rest_scrapped] ]

In [132]:
scrapped_restaurants

[{'id_restaurant': 1,
  'nom': 'Le Bouchon des Filles',
  'latitude': 45.768677,
  'longitude': 4.829356,
  'rank': 149,
  'prix_min': None,
  'prix_max': None,
  'etoiles_michelin': 0,
  'note_globale': 4.0,
  'qualite_prix_note': 4.2,
  'cuisine_note': 4.2,
  'service_note': 4.2,
  'ambiance_note': 4.2,
  'cuisines': 'Française, Européenne',
  'repas': 'Déjeuner, Dîner, Ouvert tard',
  'fonctionnalite': "Cartes bancaires acceptées; Places assises; Réservations; Sert de l'alcool; Service de table; "},
 {'id_restaurant': 2,
  'nom': 'Les Terrasses de Lyon',
  'latitude': 45.76339,
  'longitude': 4.826154,
  'rank': 195,
  'prix_min': None,
  'prix_max': None,
  'etoiles_michelin': 1,
  'note_globale': 4.5,
  'qualite_prix_note': 3.8,
  'cuisine_note': 4.5,
  'service_note': 4.3,
  'ambiance_note': 4.3,
  'cuisines': 'Française',
  'repas': 'Déjeuner, Dîner',
  'fonctionnalite': "Accessible en fauteuil roulant; Bar complet; Cartes bancaires acceptées; Chaises hautes disponibles; Chèques

In [98]:
df

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,review_cleaned,Length
0,1,1,1,Très belle soirée,SetC77,2024-12-16,5.0,friends,67,"Trop bon moment!! \nAccueil, plats, ambiance t...",trop bon moment accueil plat ambiance tout pas...,203
1,1,2,2,Vive la bonne cuisine dans une ambiance conviv...,H3293ZGsylviel,2024-11-26,5.0,friends,2,"Toujours aussi goûteux !\nL'ambiance, la bonne...",toujours aussi goûteux lambiance bonne humeur ...,123
2,1,3,3,Sans plus,marieno_lleb739,2024-11-23,3.0,friends,96,Les serveurs et serveuses sont sympas. Mais l’...,serveurs serveuses sympas longlet nest tendre ...,170
3,1,4,4,Bon et joyeux,Vymsbmm,2024-11-14,5.0,friends,225,Une excellente soirée dans ce petit restaurant...,excellente soirée petit restaurant peu lécart ...,256
4,1,5,5,Bon restaurant et endroit pour se retrouver en...,Youliic974,2024-11-01,4.0,friends,38,Un super moment entre amis.\nLes plats étaient...,super moment entre amis plat très délicieux co...,158
...,...,...,...,...,...,...,...,...,...,...,...,...
790,1,785,791,Un super petit restaurant Bouchon propose de d...,206Nancy,2016-04-28,5.0,none,4,Nous avons marché jusqu'à droite après avoir o...,avon marché jusquà droite après avoir ouvert d...,430
791,1,786,792,Tout simplement fantastique,SajaChu,2016-03-24,5.0,friends,65,Il n'y a pas de meilleur endroit pour manger q...,meilleur endroit manger cela lyon cest tout si...,174
792,1,787,793,Délicieux,GFMVHAC,2016-01-30,4.0,couples,58,Le Bouchon charmante et chaleureuse et traditi...,bouchon charmante chaleureuse traditionnelle b...,174
793,1,788,794,Moderne Bouchon,SoonerSteve,2015-09-24,5.0,couples,1826,C'est un bouchon mais plus moderne et haut de ...,cest bouchon plus moderne haut gamme femme avo...,302


In [133]:
df_rest = pd.DataFrame(scrapped_restaurants)

In [94]:
# make a clsuter of the restaurants
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

df_rest = df_rest.dropna()
# Select relevant features
features = ['prix_min', 'prix_max', 'note_globale', 'qualite_prix_note', 'cuisine_note', 'service_note', 'ambiance_note']
X = df_rest[features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=5)
df_rest['cluster'] = kmeans.fit_predict(X_scaled)

df_rest

Unnamed: 0,nom,latitude,longitude,rank,prix_min,prix_max,etoiles_michelin,note_globale,qualite_prix_note,cuisine_note,service_note,ambiance_note,cuisines,repas,fonctionnalite,cluster
2,Frazarin Bistrot Franco Italien,45.751328,4.829706,3,16.0,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...,1
6,Agastache Restaurant,45.766384,4.847662,4,25.0,42.0,0,5.0,4.9,4.9,5.0,4.7,"Française, Moderne, Saine","Déjeuner, Dîner",Cartes bancaires acceptées; Places assises; Ré...,1
7,Le Casse Museau,45.76547,4.832431,7,10.0,38.0,0,5.0,4.6,4.8,4.8,4.9,Française,"Déjeuner, Dîner",American Express; Bières & Vins; Cartes bancai...,1
8,Le Vieux Lyon,45.76234,4.827478,84,15.0,25.0,0,4.5,4.4,4.4,4.3,4.4,"Française, Européenne","Déjeuner, Dîner",Buffet; Cartes bancaires acceptées; Places ass...,3
9,L'affreux Jojo,45.75963,4.847064,1,9.0,15.0,0,5.0,4.7,4.8,4.9,4.8,"Italienne, Méditerranéenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Cartes bancair...,1
10,La Table d’Ambre,45.746998,4.845089,9,28.0,79.0,0,4.0,3.4,3.8,4.1,4.2,Française,"Déjeuner, Dîner",Cartes bancaires acceptées; Places assises; Ré...,0
12,Carmelo,45.765316,4.834973,249,9.0,25.0,0,4.0,3.6,3.9,4.0,3.9,"Italienne, Toscane, Romana, Latium, Sicilienne...","Déjeuner, Dîner",Accessible en fauteuil roulant; Bar complet; B...,0
13,Wasabi,45.74897,4.841493,82,39.0,79.0,0,4.5,4.0,4.7,4.4,4.3,"Japonaise, Sushi",Dîner,Accessible en fauteuil roulant; Réservations; ...,4
16,La Source,45.74624,4.82741,8,7.0,29.0,0,5.0,4.8,4.9,4.9,4.8,"Française, Saine, Bars-restaurants","Petit déjeuner, Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...,1
21,Empanadas Club,45.76673,4.83463,5,15.0,25.0,0,5.0,4.8,4.9,4.9,4.8,"Latino, Saine, Argentine, Sud-américaine","Déjeuner, Dîner, Ouvert tard, Boissons",Bar complet; Bières & Vins; Mastercard; Places...,1


In [96]:
# plot in 3 d the clusters
fig = px.scatter_3d(
    df_rest,
    x='note_globale',
    y='qualite_prix_note',
    z='cuisine_note',
    color='cluster',
    title='Restaurant Clusters',
    labels={'note_globale': 'Note Globale', 'qualite_prix_note': 'Qualité/Prix', 'cuisine_note': 'Cuisine'}
)
fig.show()

In [None]:
# Step 1: Preprocess reviews
vectorizer = TfidfVectorizer(stop_words='english')
X_reviews = vectorizer.fit_transform(df['review'])

# Step 2: Combine review features with restaurant characteristics
restaurant_features = df[['characteristic1', 'characteristic2', 'restaurant_info1', 'restaurant_info2']]
X_combined = pd.concat([pd.DataFrame(X_reviews.toarray()), restaurant_features.reset_index(drop=True)], axis=1)

# Step 3: Perform clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Step 4: Visualize clusters in a 3D chart
fig = px.scatter_3d(
    df,
    x='characteristic1',
    y='characteristic2',
    z='restaurant_info1',
    color='cluster',
    hover_data=['restaurant_info2']
)
fig.update_layout(title='3D Clustering of Restaurants')
fig.show()

In [101]:
from textblob import TextBlob

# Ajouter une colonne de sentiment
df["sentiment"] = df["review"].apply(lambda x: TextBlob(x).sentiment.polarity)


In [102]:
df

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,review_cleaned,Length,sentiment
0,1,1,1,Très belle soirée,SetC77,2024-12-16,5.0,friends,67,"Trop bon moment!! \nAccueil, plats, ambiance t...",trop bon moment accueil plat ambiance tout pas...,203,0.000000
1,1,2,2,Vive la bonne cuisine dans une ambiance conviv...,H3293ZGsylviel,2024-11-26,5.0,friends,2,"Toujours aussi goûteux !\nL'ambiance, la bonne...",toujours aussi goûteux lambiance bonne humeur ...,123,0.000000
2,1,3,3,Sans plus,marieno_lleb739,2024-11-23,3.0,friends,96,Les serveurs et serveuses sont sympas. Mais l’...,serveurs serveuses sympas longlet nest tendre ...,170,0.500000
3,1,4,4,Bon et joyeux,Vymsbmm,2024-11-14,5.0,friends,225,Une excellente soirée dans ce petit restaurant...,excellente soirée petit restaurant peu lécart ...,256,0.000000
4,1,5,5,Bon restaurant et endroit pour se retrouver en...,Youliic974,2024-11-01,4.0,friends,38,Un super moment entre amis.\nLes plats étaient...,super moment entre amis plat très délicieux co...,158,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,1,785,791,Un super petit restaurant Bouchon propose de d...,206Nancy,2016-04-28,5.0,none,4,Nous avons marché jusqu'à droite après avoir o...,avon marché jusquà droite après avoir ouvert d...,430,0.000000
791,1,786,792,Tout simplement fantastique,SajaChu,2016-03-24,5.0,friends,65,Il n'y a pas de meilleur endroit pour manger q...,meilleur endroit manger cela lyon cest tout si...,174,0.000000
792,1,787,793,Délicieux,GFMVHAC,2016-01-30,4.0,couples,58,Le Bouchon charmante et chaleureuse et traditi...,bouchon charmante chaleureuse traditionnelle b...,174,0.000000
793,1,788,794,Moderne Bouchon,SoonerSteve,2015-09-24,5.0,couples,1826,C'est un bouchon mais plus moderne et haut de ...,cest bouchon plus moderne haut gamme femme avo...,302,0.000000


In [108]:
df

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,review_cleaned,Length,sentiment
0,1,1,1,Très belle soirée,SetC77,2024-12-16,5.0,friends,67,"Trop bon moment!! \nAccueil, plats, ambiance t...",trop bon moment accueil plat ambiance tout pas...,203,0.000000
1,1,2,2,Vive la bonne cuisine dans une ambiance conviv...,H3293ZGsylviel,2024-11-26,5.0,friends,2,"Toujours aussi goûteux !\nL'ambiance, la bonne...",toujours aussi goûteux lambiance bonne humeur ...,123,0.000000
2,1,3,3,Sans plus,marieno_lleb739,2024-11-23,3.0,friends,96,Les serveurs et serveuses sont sympas. Mais l’...,serveurs serveuses sympas longlet nest tendre ...,170,0.500000
3,1,4,4,Bon et joyeux,Vymsbmm,2024-11-14,5.0,friends,225,Une excellente soirée dans ce petit restaurant...,excellente soirée petit restaurant peu lécart ...,256,0.000000
4,1,5,5,Bon restaurant et endroit pour se retrouver en...,Youliic974,2024-11-01,4.0,friends,38,Un super moment entre amis.\nLes plats étaient...,super moment entre amis plat très délicieux co...,158,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,1,785,791,Un super petit restaurant Bouchon propose de d...,206Nancy,2016-04-28,5.0,none,4,Nous avons marché jusqu'à droite après avoir o...,avon marché jusquà droite après avoir ouvert d...,430,0.000000
791,1,786,792,Tout simplement fantastique,SajaChu,2016-03-24,5.0,friends,65,Il n'y a pas de meilleur endroit pour manger q...,meilleur endroit manger cela lyon cest tout si...,174,0.000000
792,1,787,793,Délicieux,GFMVHAC,2016-01-30,4.0,couples,58,Le Bouchon charmante et chaleureuse et traditi...,bouchon charmante chaleureuse traditionnelle b...,174,0.000000
793,1,788,794,Moderne Bouchon,SoonerSteve,2015-09-24,5.0,couples,1826,C'est un bouchon mais plus moderne et haut de ...,cest bouchon plus moderne haut gamme femme avo...,302,0.000000


In [135]:
df_rest

Unnamed: 0,id_restaurant,nom,latitude,longitude,rank,prix_min,prix_max,etoiles_michelin,note_globale,qualite_prix_note,cuisine_note,service_note,ambiance_note,cuisines,repas,fonctionnalite
2,3,Frazarin Bistrot Franco Italien,45.751328,4.829706,3,16.0,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
6,7,Agastache Restaurant,45.766384,4.847662,4,25.0,42.0,0,5.0,4.9,4.9,5.0,4.7,"Française, Moderne, Saine","Déjeuner, Dîner",Cartes bancaires acceptées; Places assises; Ré...
7,8,Le Casse Museau,45.76547,4.832431,7,10.0,38.0,0,5.0,4.6,4.8,4.8,4.9,Française,"Déjeuner, Dîner",American Express; Bières & Vins; Cartes bancai...
8,9,Le Vieux Lyon,45.76234,4.827478,84,15.0,25.0,0,4.5,4.4,4.4,4.3,4.4,"Française, Européenne","Déjeuner, Dîner",Buffet; Cartes bancaires acceptées; Places ass...
9,10,L'affreux Jojo,45.75963,4.847064,1,9.0,15.0,0,5.0,4.7,4.8,4.9,4.8,"Italienne, Méditerranéenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Cartes bancair...
10,11,La Table d’Ambre,45.746998,4.845089,9,28.0,79.0,0,4.0,3.4,3.8,4.1,4.2,Française,"Déjeuner, Dîner",Cartes bancaires acceptées; Places assises; Ré...
12,13,Carmelo,45.765316,4.834973,249,9.0,25.0,0,4.0,3.6,3.9,4.0,3.9,"Italienne, Toscane, Romana, Latium, Sicilienne...","Déjeuner, Dîner",Accessible en fauteuil roulant; Bar complet; B...
13,14,Wasabi,45.74897,4.841493,82,39.0,79.0,0,4.5,4.0,4.7,4.4,4.3,"Japonaise, Sushi",Dîner,Accessible en fauteuil roulant; Réservations; ...
16,17,La Source,45.74624,4.82741,8,7.0,29.0,0,5.0,4.8,4.9,4.9,4.8,"Française, Saine, Bars-restaurants","Petit déjeuner, Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...
21,22,Empanadas Club,45.76673,4.83463,5,15.0,25.0,0,5.0,4.8,4.9,4.9,4.8,"Latino, Saine, Argentine, Sud-américaine","Déjeuner, Dîner, Ouvert tard, Boissons",Bar complet; Bières & Vins; Mastercard; Places...


In [151]:
df_rest = pd.DataFrame(scrapped_restaurants)
df_rest = df_rest.dropna()
df = df.dropna()

In [156]:
df["id_restaurant"] = df["restaurant_id"]

In [157]:
# jointure des deux dataframes
df_rest = pd.merge(df, df_rest, on='id_restaurant')

In [159]:
df_rest

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,...,prix_max,etoiles_michelin,note_globale,qualite_prix_note,cuisine_note,service_note,ambiance_note,cuisines,repas,fonctionnalite
0,3,1492,1531,Très belle expérience,fredsX5996LQ,2024-12-15,5.0,family,8,"Ambiance soignée, patrons amicaux et donnant t...",...,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
1,3,1493,1532,"Déjeuner au Frazarin, que du bonheur",Guide38409630404,2024-11-27,5.0,friends,1,"Vous voulez bien manger, alors allez dans ce r...",...,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
2,3,1494,1533,La perfection tout simplement,chartinm,2024-11-20,5.0,couples,4,Tout simplement parfait de l’entrée au dessert...,...,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
3,3,1495,1534,Plat frais et de qualité,Axelfoley009,2024-11-18,5.0,couples,38,Belle soirée passée au Frazarin. Certainement ...,...,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
4,3,1496,1535,très bien,karinepU103AS,2024-11-13,5.0,business,1,Très bon restaurant ! j'ai adoré le service et...,...,38.0,0,5.0,4.9,4.9,4.9,4.9,"Italienne, Française, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bières & Vins;...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860,38,15183,19613,A vite découvrir !,philsjm,2020-08-17,5.0,none,13,"Lyonnais depuis 4 ans, nous avons testé par ha...",...,27.0,0,5.0,4.8,4.9,5.0,4.9,"Française, Européenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...
5861,38,15415,19614,Très bien,laulau3069,2020-08-17,5.0,couples,0,Terrasse tranquille à l'ombre Le service très ...,...,27.0,0,5.0,4.8,4.9,5.0,4.9,"Française, Européenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...
5862,38,15416,19615,Un super moment,Francois_petavy,2020-08-17,5.0,family,0,Merci au restaurant la Gache pour ce midi. Un ...,...,27.0,0,5.0,4.8,4.9,5.0,4.9,"Française, Européenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...
5863,38,15417,19616,Nous nous y sommes arrêté,solenneg9,2020-08-17,5.0,none,0,Nous nous y sommes arrêté par hasard ce midi E...,...,27.0,0,5.0,4.8,4.9,5.0,4.9,"Française, Européenne, Saine","Déjeuner, Dîner, Boissons",Accessible en fauteuil roulant; Bar complet; C...


In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF pour représenter les avis en vecteurs
vectorizer = TfidfVectorizer(max_features=100)  # Limiter à 100 mots-clés
X_tfidf = vectorizer.fit_transform(df_rest["review"])


In [169]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Préparer les données pour le clustering
features = pd.concat([df_rest[["note_globale", "rank", "prix_min", "prix_max", "etoiles_michelin"]],
                      pd.DataFrame(X_tfidf.toarray())], axis=1)
scaler = StandardScaler()
features.columns = features.columns.astype(str)
features_scaled = scaler.fit_transform(features)

# Réduction de dimensions pour visualisation
pca = PCA(n_components=3)
features_3d = pca.fit_transform(features_scaled)

# Appliquer KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
df_rest["cluster"] = kmeans.fit_predict(features_scaled)


In [176]:
features

Unnamed: 0,note_globale,rank,prix_min,prix_max,etoiles_michelin,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,5.0,3,16.0,38.0,0,0.000000,0.000000,0.000000,0.000000,0.242203,...,0.244363,0.000000,0.000000,0.257452,0.216599,0.0,0.0,0.000000,0.000000,0.000000
1,5.0,3,16.0,38.0,0,0.142135,0.000000,0.000000,0.000000,0.000000,...,0.092329,0.000000,0.000000,0.000000,0.491034,0.0,0.0,0.000000,0.000000,0.000000
2,5.0,3,16.0,38.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.143704,0.000000,0.326029,0.000000,0.0,0.0,0.000000,0.000000,0.000000
3,5.0,3,16.0,38.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.214015,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
4,5.0,3,16.0,38.0,0,0.000000,0.000000,0.000000,0.431606,0.000000,...,0.206895,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5500,4.5,135,60.0,179.0,1,0.000000,0.216527,0.000000,0.000000,0.000000,...,0.098978,0.275781,0.111647,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
5501,4.5,135,60.0,179.0,1,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.097632,0.181353,0.110128,0.000000,0.000000,0.0,0.0,0.000000,0.397109,0.220077
5502,4.5,135,60.0,179.0,1,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.185724,0.344986,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
5503,4.5,135,60.0,179.0,1,0.000000,0.000000,0.181291,0.000000,0.000000,...,0.297559,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.223581


In [178]:
features["id_restaurant"] = df_rest["id_restaurant"]

In [186]:
# plot in 3d pca the clusters
fig = px.scatter_3d(
    df_rest,
    x=features_3d[:, 0],
    y=features_3d[:, 1],
    z=features_3d[:, 2],
    color='id_restaurant',
    title='Restaurant Clusters',
    labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3'}
)

fig.update_layout(
    width=1200,
    height=800
)

fig.show()

In [188]:
# df_rest id 98 rest
df_rest[df_rest['id_restaurant'] == 98]

Unnamed: 0,restaurant_id,user_id,review_id,title,user_profile,date_review,rating,type_visit,num_contributions,review,...,etoiles_michelin,note_globale,qualite_prix_note,cuisine_note,service_note,ambiance_note,cuisines,repas,fonctionnalite,cluster
4455,98,14392,18076,Époustouflant,80sebastienr,2025-01-11,5.0,couples,17,"Il ne s'agit pas simplement de qualité, mais ...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
4456,98,14393,18077,"Acceuil ,calme, service et quiétude.",Pierre-Loui,2024-12-15,4.0,business,322,"Dès l'entrée au restaurant , nous sommes surpr...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
4457,98,13684,18078,Bonne table,Nono0825,2024-11-19,4.0,family,1062,Nous avons fêté notre anniversaire de mariage!...,...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
4458,98,14394,18079,Une belle expérience,317pierremarcp,2024-10-17,5.0,couples,6,"Très bonne table, beaucoup de saveurs dans les...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
4459,98,14395,18080,Divin,maliceste,2024-07-23,5.0,couples,150,"Un voyage culinaire ! Bravo au chef, sa brigad...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5230,98,8218,18851,Du très très haut niveau !!!,Olivierly,2014-07-21,5.0,friends,70,Une nouvelle très grande adresse à LYON. Le ch...,...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
5231,98,14955,18852,Comment dire... Extraordinaire,Bert-2820,2014-07-17,5.0,couples,11,Nous sommes allés diner le 16 juillet. Le rest...,...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
5232,98,14956,18853,merci,Eric691166686,2014-06-26,5.0,couples,73,"alors la,je dois vous dire que c'est l'excepti...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1
5233,98,14358,18854,Expérience de haute qualité,Arlington7175,2024-12-31,4.0,couples,243,"Dans l'ensemble, ce fut une bonne expérience, ...",...,2,5.0,4.4,4.8,4.7,4.3,"Française, Saine","Déjeuner, Dîner",Accessible en fauteuil roulant; Cartes bancair...,1


In [182]:
# Créer un DataFrame pour la visualisation
visual_df = pd.DataFrame(features_3d, columns=["PCA1", "PCA2", "PCA3"])
visual_df["cluster"] = df_rest["cluster"]

# Visualisation en 3D
fig = px.scatter_3d(visual_df, x="PCA1", y="PCA2", z="PCA3", color="cluster",
                    title="Clusters des restaurants")
fig.show()

In [185]:
# Groupement par cluster pour inspection
df_rest.groupby("cluster")[["note_globale", "prix_min", "prix_max"]].mean()


Unnamed: 0_level_0,note_globale,prix_min,prix_max
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4.770248,13.717355,29.842975
1,4.875488,85.533203,163.980469
2,4.697389,16.061444,36.504352
3,4.615578,19.050251,40.600503
4,4.398689,18.167213,40.561967


In [190]:
reviews

[['moment',
  'accueil',
  'plat',
  'ambiance',
  'passer',
  'moment',
  'amis',
  'région',
  'hâte',
  'découvrir',
  'genre',
  'établissement',
  'déçus'],
 ['goûteux',
  "l'ambiance",
  'bonne',
  'humeur',
  'moment',
  'convivialité',
  'amis',
  'collègues',
  'passer',
  'côté'],
 ['serveurs',
  'serveuses',
  'sympas',
  'onglet',
  'tendre',
  'terrine',
  'moyenne',
  'lentilles',
  'décevantes',
  'fondant',
  'carrément',
  'top',
  'cuisine',
  'améliorer'],
 ['excellente',
  'soirée',
  'petit',
  'restaurant',
  'écart',
  'côté',
  'cuisine',
  'moment',
  'privilégié',
  'patron',
  'cuisiniers',
  'plat',
  'délicieux',
  'ambiance',
  'sympa',
  'grâce',
  'bonne',
  'humeur',
  'restaurateur',
  'manqué'],
 ['super',
  'moment',
  'amis',
  'plat',
  'délicieux',
  'conseille',
  'fortement',
  'adresse',
  'bouchon',
  'lyonnais',
  'grave',
  'régalé'],
 ['petite',
  'salade',
  'sympas',
  'gustativement',
  'proposer',
  'betteraves',
  'harengs',
  'terrine

In [193]:
# Agréger les avis par restaurant
aggregated_reviews = df_rest.groupby("restaurant_id").agg({
    "rating": ["mean", "std"],  # Note moyenne et écart type
    "review": lambda x: " ".join(x),  # Combiner tous les avi
}).reset_index()

# Renommer les colonnes pour simplifier
aggregated_reviews.columns = ["restaurant_id", "rating_mean", "rating_std", "reviews_combined"]


In [196]:
# Joindre les caractéristiques des restaurants avec les agrégations des avis
restaurant_features = df_rest.merge(aggregated_reviews, on="restaurant_id", how="left")

# Sélectionner les colonnes pertinentes pour le clustering
features = restaurant_features[[
    "rank", "prix_min", "prix_max", "etoiles_michelin", "note_globale", 
    "rating_mean", "rating_std"
]].fillna(0)  # Remplir les valeurs manquantes si nécessaire


In [197]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Standardiser les données
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Réduction de dimensions avec PCA (pour visualisation)
pca = PCA(n_components=3)
features_3d = pca.fit_transform(features_scaled)

# Clustering avec KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
restaurant_features["cluster"] = kmeans.fit_predict(features_scaled)


In [198]:
import plotly.express as px

# Créer un DataFrame pour la visualisation
visual_df = pd.DataFrame(features_3d, columns=["PCA1", "PCA2", "PCA3"])
visual_df["cluster"] = restaurant_features["cluster"]
visual_df["restaurant_name"] = restaurant_features["nom"]

# Graphique 3D
fig = px.scatter_3d(
    visual_df, x="PCA1", y="PCA2", z="PCA3", color="cluster",
    hover_data=["restaurant_name"], title="Clustering des restaurants"
)
fig.show()


In [203]:
# Groupement par cluster pour inspection des caractéristiques
cluster_analysis = restaurant_features.groupby("cluster").agg({
    "rank": "mean",
    "prix_min": "mean",
    "prix_max": "mean",
    "note_globale": "mean",
    "rating_mean": "mean",
    "etoiles_michelin": "mean"
})

cluster_analysis


Unnamed: 0_level_0,rank,prix_min,prix_max,note_globale,rating_mean,etoiles_michelin
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.895522,12.910448,29.067164,5.0,4.848259,0.0
1,135.0,60.0,179.0,4.5,4.592593,1.0
2,249.0,9.0,25.0,4.0,4.081633,0.0
3,83.096491,23.657895,50.368421,4.381579,4.547953,0.0
4,39.0,94.0,159.0,5.0,4.766667,2.0
