In [51]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [52]:
# Load the dataset
df = pd.read_csv("yelp.csv")  # Adjust the path if needed
print(df.head())

              business_id        date               review_id  stars  \
0  9yKzy9PApeiPPOUJEtnvkg  2011-01-26  fWKvX83p0-ka4JS3dc6E5A      5   
1  ZRJwVLyzEJq1VAihDhYiow  2011-07-27  IjZ33sJrzXqU-0X6U8NwyA      5   
2  6oRAC4uyJCsJl1X0WZpVSA  2012-06-14  IESLBzqUCLdSzSqm0eCSxQ      4   
3  _1QQZuf4zZOyFCvXc0o6Vg  2010-05-27  G-WvGaISbqqaMHlNnByodA      5   
4  6ozycU1RpktNG2-1BroVtw  2012-01-05  1uJFq2r5QfJG_6ExMRCaGw      5   

                                                text    type  \
0  My wife took me here on my birthday for breakf...  review   
1  I have no idea why some people give bad review...  review   
2  love the gyro plate. Rice is so good and I als...  review   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review   
4  General Manager Scott Petello is a good egg!!!...  review   

                  user_id  cool  useful  funny  
0  rLtl8ZkDX5vH5nAx9C3q5Q     2       5      0  
1  0a2KyEL0d3Yb1V6aivbIuQ     0       0      0  
2  0hT2KtfLiobPvh6cDC8JQg     0    

In [53]:
# Select relevant columns and preprocess data
df = df[['text', 'stars']]

In [54]:
# Label reviews: Positive (1) for 4-5 stars, Negative (0) for 1-2 stars
# Optionally, exclude neutral reviews (3 stars)
df = df[df['stars'] != 3]
df['sentiment'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

In [55]:
print(df['sentiment'].value_counts())


sentiment
1    6863
0    1676
Name: count, dtype: int64


In [56]:
# Extract text and sentiment
texts = df['text'].values
labels = df['sentiment'].values

In [57]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [58]:
# Tokenize and pad the text data
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1

In [59]:
import joblib

# Save the tokenizer
joblib.dump(tokenizer, 'tokenizer.joblib')


['tokenizer.joblib']

In [60]:
# Convert texts to sequences and pad them
max_sequence_length = 200
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')

In [61]:
# Define the LSTM model
embedding_dim = 128
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])



In [62]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [63]:
# Train the model
history = model.fit(X_train_padded, y_train, validation_split=0.2, epochs=30, batch_size=64, verbose=1)

Epoch 1/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 254ms/step - accuracy: 0.7718 - loss: 0.5394 - val_accuracy: 0.7944 - val_loss: 0.5067
Epoch 2/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 247ms/step - accuracy: 0.8071 - loss: 0.4805 - val_accuracy: 0.7966 - val_loss: 0.4952
Epoch 3/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 234ms/step - accuracy: 0.8419 - loss: 0.4274 - val_accuracy: 0.7893 - val_loss: 0.5124
Epoch 4/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 236ms/step - accuracy: 0.8563 - loss: 0.3881 - val_accuracy: 0.7959 - val_loss: 0.5795
Epoch 5/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 235ms/step - accuracy: 0.8614 - loss: 0.3688 - val_accuracy: 0.7937 - val_loss: 0.5777
Epoch 6/30
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 237ms/step - accuracy: 0.8629 - loss: 0.3645 - val_accuracy: 0.7930 - val_loss: 0.6195
Epoch 7/30
[1m86/86[

In [65]:
# Evaluate the model on the test set
y_pred = (model.predict(X_test_padded) > 0.5).astype(int).flatten()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Generate a classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step
Model Accuracy: 0.88
              precision    recall  f1-score   support

    Negative       0.70      0.68      0.69       348
    Positive       0.92      0.93      0.92      1360

    accuracy                           0.88      1708
   macro avg       0.81      0.80      0.81      1708
weighted avg       0.87      0.88      0.88      1708



In [66]:
# Save the model
model.save('yelp_sentiment_lstm_model.h5')

