Taken help from GenAI's

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

In [3]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep = '\t')

In [4]:
data = dataset.copy()

### Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
def remove_stopwords(text) :
    tokens = word_tokenize(text)
    return [token for token in tokens if token.isalpha() and token not in stop_words]

In [7]:
remove_stopwords('My name is Harsh Maniya')

['My', 'name', 'Harsh', 'Maniya']

In [8]:
data['Tokens'] = data['Review'].apply(remove_stopwords)

In [9]:
data.head(5)

Unnamed: 0,Review,Liked,Tokens
0,Wow... Loved this place.,1,"[Wow, Loved, place]"
1,Crust is not good.,0,"[Crust, good]"
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,"[Stopped, late, May, bank, holiday, Rick, Stev..."
4,The selection on the menu was great and so wer...,1,"[The, selection, menu, great, prices]"


### Training Word2Vec Model

In [10]:
model = Word2Vec(sentences = data['Tokens'], vector_size = 200, min_count = 5)

In [11]:
def sentence_vector(tokens, model):
    valid_tokens = [word for word in tokens if word in model.wv]
    if valid_tokens:
        word_vectors = np.array([model.wv[word] for word in valid_tokens])
        return word_vectors.mean(axis=0)
    else:
        return np.zeros(model.vector_size)

In [12]:
data['Vector'] = data['Tokens'].apply(lambda x: sentence_vector(x, model))

### K-Means clustering for Text classification (Unlabeled data)

In [13]:
X = np.vstack(data['Vector'].values) 
kmeans = KMeans(n_clusters=3, random_state=27)
data['Cluster'] = kmeans.fit_predict(X)



In [14]:
cluster_labels = {0: 'Service', 1: 'Ambiance', 2: 'Food'}
data['Category'] = data['Cluster'].map(cluster_labels)

In [15]:
data.head()

Unnamed: 0,Review,Liked,Tokens,Vector,Cluster,Category
0,Wow... Loved this place.,1,"[Wow, Loved, place]","[0.004115201, -0.0025320498, -0.00015886906, 0...",1,Ambiance
1,Crust is not good.,0,"[Crust, good]","[-0.0046123504, 0.0041337516, 0.0023621756, 0....",2,Food
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, texture, nasty]","[-0.0016557387, -0.0009831111, 0.002366534, 0....",0,Service
3,Stopped by during the late May bank holiday of...,1,"[Stopped, late, May, bank, holiday, Rick, Stev...","[-0.00035783992, 0.0016495173, 0.00014460708, ...",0,Service
4,The selection on the menu was great and so wer...,1,"[The, selection, menu, great, prices]","[-0.0009349378, 0.002154593, 0.0006511372, 0.0...",2,Food


### Context based Classification and Sentiment analysis

In [16]:
data2 = dataset.copy()

#### Preprocessing

In [17]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text) 
    text = text.lower()
    return text

data2["Cleaned_Review"] = data2["Review"].apply(preprocess_text)

#### Analysing sentiment

In [18]:
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    score = sia.polarity_scores(text)
    if score['compound'] > 0.05:
        return "Positive"
    elif score['compound'] < -0.05:
        return "Negative"
    else:
        return "Neutral"

data2["Sentiment"] = data2["Cleaned_Review"].apply(analyze_sentiment)

#### Defining context words and then classification

In [19]:
categories = {
    "Food": ["taste", "delicious", "pizza", "food", "meal", "dish", "tasty", "crust", "breakfast"],  # we have to manually
    "Service": ["service", "staff", "wait", "helpful", "slow", "menu"],                              # add context words
    "Ambiance": ["ambiance", "clean", "noisy", "decor", "environment", "atmosphere", "place"]        # for each category
}

def categorize_review(text):
    for category, keywords in categories.items():
        if any(keyword in text for keyword in keywords):
            return category
    return "Other"

data2["Category"] = data2["Cleaned_Review"].apply(categorize_review)

data2["Category_Sentiment"] = data2["Category"] + " - " + data2["Sentiment"]

data2.head()

Unnamed: 0,Review,Liked,Cleaned_Review,Sentiment,Category,Category_Sentiment
0,Wow... Loved this place.,1,wow loved this place,Positive,Ambiance,Ambiance - Positive
1,Crust is not good.,0,crust is not good,Negative,Food,Food - Negative
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty,Negative,Food,Food - Negative
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...,Positive,Other,Other - Positive
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...,Positive,Service,Service - Positive
