Importing Libraries

In [2]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

Loading dataset

In [3]:
# Read the review dataset; the relative path is resolved against the
# current working directory.
data = pd.read_csv('swiggy.csv')

# Print the column names as a quick check of what was loaded.
column_names = list(data.columns)
print("Columns in the dataset:")
print(column_names)

Columns in the dataset:
['ID', 'Area', 'City', 'Restaurant Price', 'Avg Rating', 'Total Rating', 'Food Item', 'Food Type', 'Delivery Time', 'Review']


In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,"Good, but nothing extraordinary."
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,"Good, but nothing extraordinary."
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,Late delivery ruined it.
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,Best meal I've had in a while!
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,Mediocre experience.


In [5]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
ID,0
Area,0
City,0
Restaurant Price,0
Avg Rating,0
Total Rating,0
Food Item,0
Food Type,0
Delivery Time,0
Review,0


# Text cleaning and sentiment labeling

In [6]:
# Normalise the review text: lowercase it, then strip every character that
# is not a lowercase letter, a digit or whitespace.
data["Review"] = (
    data["Review"]
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

# Binary target: 1 when the average rating is above 3.5, otherwise 0.
# NOTE(review): the label is derived from the 'Avg Rating' column rather than
# from the review text itself -- confirm this is the intended target.
data['sentiment'] = (data['Avg Rating'] > 3.5).astype('int64')


In [7]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

Tokenization and Padding

In [8]:
max_features = 5000  # vocabulary cap passed to the tokenizer (top words kept)
max_length = 200     # fixed length of every input sequence after padding

# Build the word index from the cleaned reviews.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data["Review"])

# Encode each review as a sequence of word indexes, then pad to max_length.
review_sequences = tokenizer.texts_to_sequences(data["Review"])
X = pad_sequences(review_sequences, maxlen=max_length)

# Sentiment labels as a NumPy array for model training.
y = data['sentiment'].to_numpy()

Splitting data

In [9]:
# Step 1: hold out 20% of the samples as the test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 2: take 10% of the remaining samples as the validation set, again
# stratified on the label; the rest is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=42
)

Building RNN model

In [10]:
# Fix the random seeds so that weight initialisation and batch shuffling are
# repeatable after Restart Kernel -> Run All.
set_random_seed(42)

# Sequential stack: embedding -> recurrent layer -> sigmoid output.
model = Sequential([
    # Maps each word index to a 16-dimensional vector. The sequence length is
    # taken from the input data, so the deprecated `input_length` argument is
    # no longer passed.
    Embedding(input_dim=max_features, output_dim=16),
    # Recurrent layer with 64 units and tanh activation; only the final
    # state is returned (return_sequences=False).
    SimpleRNN(64, activation='tanh', return_sequences=False),
    # Single sigmoid unit producing the probability for the binary label.
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)



Training and Evaluating Model

In [11]:
# Train for 5 epochs in batches of 32, reporting validation metrics each epoch.
fit_options = dict(
    epochs=5,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Evaluate on the held-out test set; index 1 of the result is the accuracy
# metric requested at compile time (index 0 is the loss).
score = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = score[1]
print(f"Test accuracy: {test_accuracy:.2f}")

Epoch 1/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 53ms/step - accuracy: 0.7063 - loss: 0.6103 - val_accuracy: 0.7156 - val_loss: 0.6010
Epoch 2/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.7190 - loss: 0.5958 - val_accuracy: 0.7156 - val_loss: 0.5982
Epoch 3/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 41ms/step - accuracy: 0.7098 - loss: 0.6030 - val_accuracy: 0.7156 - val_loss: 0.5964
Epoch 4/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 46ms/step - accuracy: 0.7103 - loss: 0.6024 - val_accuracy: 0.7156 - val_loss: 0.5986
Epoch 5/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - accuracy: 0.7107 - loss: 0.6054 - val_accuracy: 0.7156 - val_loss: 0.6034
Test accuracy: 0.72


**Training and Evaluating Model**
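Our model achieved a test accuracy of 72%. Note that the validation accuracy stays at 0.7156 in every epoch and the loss barely changes, which suggests the model may simply be predicting the majority class, so this figure should be compared against the class distribution before it is treated as a good result for an RNN model. We can further fine-tune the model to achieve higher accuracy.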

In [None]:
# Predicting Sentiment

In [12]:
def predict_sentiment(review_text, threshold=0.5):
    """Classify a single review as positive or negative with the trained model.

    The text is cleaned the same way as the training reviews (lowercased, then
    everything except a-z, 0-9 and whitespace removed), converted to a padded
    index sequence of length ``max_length`` and passed to ``model``.

    Args:
        review_text: Raw review string.
        threshold: Probability at or above which the review is reported as
            "Positive". Defaults to 0.5.

    Returns:
        A string such as ``"Positive (Probability: 0.76)"`` or
        ``"Negative (Probability: 0.12)"``.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', review_text.lower())

    sequences = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequences, maxlen=max_length)

    # verbose=0 stops Keras from printing a progress bar on every call.
    probability = float(model.predict(padded, verbose=0)[0][0])
    label = 'Positive' if probability >= threshold else 'Negative'
    return f"{label} (Probability: {probability:.2f})"


# Run the classifier on one hand-written example and show the result.
sample_review = "The food was great."
print(f"Review: {sample_review}")
sample_sentiment = predict_sentiment(sample_review)
print(f"Sentiment: {sample_sentiment}")

Review: The food was great.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
Sentiment: Positive (Probability: 0.76)


In summary, the model processes textual reviews through an RNN to predict sentiment from raw data. This supports actionable insights by helping us understand customer sentiment.