In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the Swiggy review dataset from a CSV file in the current working directory.
df=pd.read_csv("swiggy.csv")

In [None]:
# Preview the first rows to check the available columns and sample values.
df.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,"Good, but nothing extraordinary."
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,"Good, but nothing extraordinary."
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,Late delivery ruined it.
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,Best meal I've had in a while!
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,Mediocre experience.


In [None]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                8000 non-null   int64  
 1   Area              8000 non-null   object 
 2   City              8000 non-null   object 
 3   Restaurant Price  8000 non-null   int64  
 4   Avg Rating        8000 non-null   float64
 5   Total Rating      8000 non-null   int64  
 6   Food Item         8000 non-null   object 
 7   Food Type         8000 non-null   object 
 8   Delivery Time     8000 non-null   object 
 9   Review            8000 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 625.1+ KB


In [None]:
# Look at the raw review strings as an array (the repr is truncated for display).
df["Review"].values

array(['Good, but nothing extraordinary.',
       'Good, but nothing extraordinary.', 'Late delivery ruined it.',
       ..., 'Nothing special but edible.', 'It was okay.',
       'Delicious and fresh.'], dtype=object)

In [None]:
# Count missing values in each column.
df.isnull().sum()

ID                  0
Area                0
City                0
Restaurant Price    0
Avg Rating          0
Total Rating        0
Food Item           0
Food Type           0
Delivery Time       0
Review              0
dtype: int64

In [None]:
# Count fully duplicated rows. `drop_duplicates().sum()` does something else:
# it adds up the numeric columns and concatenates the text columns of the
# de-duplicated frame, which says nothing about how many duplicates exist.
df.duplicated().sum()

ID                                                           32004000
Area                SuburbBusiness DistrictSuburbBusiness District...
City                AhmedabadPuneBangaloreMumbaiMumbaiHyderabadBan...
Restaurant Price                                              4356700
Avg Rating                                                    33039.2
Total Rating                                                 39839820
Food Item           SushiPepperoni PizzaWafflesSushiSpring RollsSm...
Food Type           Fast FoodNon-VegetarianFast FoodVegetarianGlut...
Delivery Time       30-40 min50-60 min50-60 min50-60 min20-30 min3...
Review              Good, but nothing extraordinary.Good, but noth...
dtype: object

In [None]:


# Review text cleaning

# English stop-word set consulted by clean_reviews. The NLTK corpus has to be
# available locally, so fetch it once if the first lookup fails.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Compiled once so the patterns are not rebuilt for every review.
_HTML_TAG_RE = re.compile(r'<.*?>')
# `A-Z`, not `A-z`: the latter range also matches [ \ ] ^ _ and the backtick.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_DIGITS_RE = re.compile(r'\d+')


def clean_reviews(text):
    """Normalise one review string and return its tokens.

    Steps, in order: strip ``<...>`` tags, drop every character that is not
    an ASCII letter, digit or whitespace, drop digit runs, lower-case,
    tokenise with ``word_tokenize`` and discard English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Lower-cased tokens with stop words removed.

    Notes
    -----
    ``word_tokenize`` needs NLTK's tokenizer data to be installed.
    """
    # Remove anything that looks like an HTML tag.
    text = _HTML_TAG_RE.sub('', text)

    # Remove punctuation and other special characters such as @, #, $.
    text = _NON_ALNUM_RE.sub('', text)

    # Remove numbers.
    text = _DIGITS_RE.sub('', text)

    text = text.lower()

    tokens = word_tokenize(text)

    return [word for word in tokens if word not in stop_words]


In [None]:
# Binary target: 1 when the average rating is above 3.5, otherwise 0.
# NOTE(review): the label is derived from the `Avg Rating` column rather than
# from the review text itself - confirm this is the intended target.
df['sentiment'] = df['Avg Rating'].gt(3.5).astype('int64')

In [None]:
# Display the derived 0/1 sentiment labels.
print(df["sentiment"])

0       1
1       1
2       1
3       1
4       1
       ..
7995    1
7996    1
7997    1
7998    1
7999    1
Name: sentiment, Length: 8000, dtype: int64


In [None]:
# Display the raw review text. Note that clean_reviews is not applied here;
# the column is printed exactly as loaded.
print(df["Review"])

0        Good, but nothing extraordinary.
1        Good, but nothing extraordinary.
2                Late delivery ruined it.
3          Best meal I've had in a while!
4                    Mediocre experience.
                      ...                
7995                My new favorite dish!
7996    Amazing taste and quick delivery.
7997          Nothing special but edible.
7998                         It was okay.
7999                 Delicious and fresh.
Name: Review, Length: 8000, dtype: object


In [None]:
# Vocabulary cap for the tokenizer. It is bound here because this cell runs
# before the configuration cell further down; without it a fresh-kernel
# "Run All" stops with a NameError. Keep the two values in sync.
max_word = 500

# Build the word index from the raw review text; only the `max_word` most
# frequent words are kept when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=max_word)
tokenizer.fit_on_texts(df["Review"])

In [None]:
# Materialise the review column as a plain Python list before printing it;
# the name was previously used without ever being assigned.
reveiw_to_list = df["Review"].tolist()
print(reveiw_to_list)

['Good, but nothing extraordinary.', 'Good, but nothing extraordinary.', 'Late delivery ruined it.', "Best meal I've had in a while!", 'Mediocre experience.', "Best meal I've had in a while!", 'Amazing taste and quick delivery.', 'My new favorite dish!', 'Absolutely loved it!', 'Superb packaging and presentation.', 'Absolutely loved it!', 'Disappointed.', 'Average taste.', 'Average taste.', 'Worst experience ever.', 'Mediocre experience.', 'Disappointed.', 'Perfectly cooked and well-seasoned.', 'Disappointed.', 'Not as described.', 'Would order again if needed.', 'Good, but nothing extraordinary.', 'Tasty and worth the price.', 'Nothing special but edible.', 'Absolutely loved it!', 'Perfectly cooked and well-seasoned.', 'Absolutely loved it!', 'Terrible taste!', 'Highly recommended!', 'Highly recommended!', 'Standard quality.', 'My new favorite dish!', 'Tasty and worth the price.', 'Nothing special but edible.', 'Nothing special but edible.', 'Mediocre experience.', 'Tasty and worth th

In [None]:
# Vocabulary size: passed to the tokenizer as `num_words` and to the Embedding
# layer as `input_dim`.
max_word=500
# Every review is padded/truncated to this many tokens before entering the model.
# NOTE(review): the reviews shown in this notebook are only a few words long,
# so 200 is probably far more padding than needed - confirm.
max_length=200

In [None]:
# Turn each review into a list of word indices, then pad to a fixed length so
# the reviews can be stacked into a single 2-D array.
review_sequences = tokenizer.texts_to_sequences(df["Review"])
X = pad_sequences(review_sequences, maxlen=max_length)

# Target vector: the 0/1 sentiment labels.
y = df['sentiment'].values

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Using an RNN Model

In [None]:
# Embedding -> SimpleRNN -> sigmoid unit, declared as a single layer list.
rnn = Sequential(
    [
        # Map each word index to a 16-dimensional vector.
        Embedding(input_dim=max_word, output_dim=16, input_length=max_length),
        # Recurrent layer with 64 units.
        SimpleRNN(64, activation="tanh"),
        # Single sigmoid output for the binary label.
        Dense(1, activation="sigmoid"),
    ],
    name="SimpleRNN",
)

In [None]:
# Print the layer-by-layer overview of the model.
rnn.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:

# Train for 5 epochs in mini-batches of 32. No validation data is passed, so
# the log below reports training loss and accuracy only.
history = rnn.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    verbose=1
)

Epoch 1/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.6858 - loss: 0.6141
Epoch 2/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.7208 - loss: 0.5936
Epoch 3/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.7051 - loss: 0.6061
Epoch 4/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.7098 - loss: 0.6022
Epoch 5/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.7216 - loss: 0.5923


In [None]:
# Evaluate on the held-out split. The model was compiled with a single
# 'accuracy' metric, so index 0 is the loss and index 1 is the accuracy.
score = rnn.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {score[1]:.2f}")

Test accuracy: 0.72


Prediction

In [None]:
def predict_sentiment(review_text):
    """Classify one review with the trained model.

    Parameters
    ----------
    review_text : str
        Raw review text.

    Returns
    -------
    str
        ``"Positive"`` or ``"Negative"`` followed by the model's output
        probability formatted to two decimals.
    """
    # Pass the raw text to the fitted tokenizer so that inference uses exactly
    # the same lower-casing and punctuation handling as training did. Stripping
    # punctuation by hand first would glue words such as "well-seasoned" into a
    # single token the tokenizer never saw, which is then silently dropped.
    seq = tokenizer.texts_to_sequences([review_text])
    padded = pad_sequences(seq, maxlen=max_length)

    # predict() returns one row per input with a single sigmoid value.
    prediction = rnn.predict(padded)[0][0]
    label = 'Positive' if prediction >= 0.5 else 'Negative'
    return f"{label} (Probabilitys: {prediction:.2f})"


sample_review = "The food was great."
print(f"Review: {sample_review}")
print(f"Sentiment: {predict_sentiment(sample_review)}")


Review: The food was great.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Sentiment: Positive (Probabilitys: 0.71)
