In [1]:
import pandas as pd
import streamlit as st
import os



In [2]:
df=pd.read_csv("C:\\cafe-sentiment-analytics\\data\\cafe_reviews_10000.csv")
df.head()

Unnamed: 0,review_id,rating,review_text,review_date,reviewer_name,branch_name,category,sentiment_manual
0,1,4,Very average experience. Overpriced for the po...,6/21/2024,Rahul,"ChaiCofi, Kochi",,
1,2,5,Not worth the hype. Staff seemed uninterested.,6/3/2024,Saniya,"French Toast, Panampilly",,
2,3,3,Service was slow and the food was cold when se...,9/4/2024,Gokul,"Kashi Art Café, Fort Kochi",,
3,4,3,Loved the desserts here! Coffee was perfect.,12/28/2024,Joel,"Kashi Art Café, Fort Kochi",,
4,5,3,One of the best cafés in Kochi. Highly recomme...,9/7/2024,Merin,"Kashi Art Café, Fort Kochi",,


In [3]:
df.shape

(10000, 8)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   review_id         10000 non-null  int64  
 1   rating            10000 non-null  int64  
 2   review_text       10000 non-null  object 
 3   review_date       10000 non-null  object 
 4   reviewer_name     10000 non-null  object 
 5   branch_name       10000 non-null  object 
 6   category          0 non-null      float64
 7   sentiment_manual  0 non-null      float64
dtypes: float64(2), int64(2), object(4)
memory usage: 625.1+ KB


In [5]:
df.isnull().sum()

review_id               0
rating                  0
review_text             0
review_date             0
reviewer_name           0
branch_name             0
category            10000
sentiment_manual    10000
dtype: int64

In [6]:
df['review_date']=pd.to_datetime(df['review_date'])
df['review_date'].dtype

dtype('<M8[ns]')

In [7]:
import re

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # remove links
    text = re.sub(r"[^a-z\s]", "", text)     # keep letters only
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_review'] = df['review_text'].apply(clean_text)


In [9]:
# creating useful features
df['review_length'] = df['clean_review'].apply(lambda x: len(x.split()))
df['is_weekend']=df['review_date'].dt.weekday >=5
df['month']=df['review_date'].dt.month

In [10]:
df[['review_text', 'clean_review', 'review_length', 'is_weekend', 'month']].head()

Unnamed: 0,review_text,clean_review,review_length,is_weekend,month
0,Very average experience. Overpriced for the po...,very average experience overpriced for the por...,8,False,6
1,Not worth the hype. Staff seemed uninterested.,not worth the hype staff seemed uninterested,7,False,6
2,Service was slow and the food was cold when se...,service was slow and the food was cold when se...,10,False,9
3,Loved the desserts here! Coffee was perfect.,loved the desserts here coffee was perfect,7,True,12
4,One of the best cafés in Kochi. Highly recomme...,one of the best cafs in kochi highly recommended,9,True,9


In [11]:
!pip install vadersentiment




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\ASUS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [13]:
analyzer = SentimentIntensityAnalyzer()

In [14]:
def get_sentiment(text):
    score=analyzer.polarity_scores(text)['compound']
    if score >=0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

In [15]:
df['sentiment_auto']=df['clean_review'].apply(get_sentiment)

In [16]:
df[['review_text','sentiment_auto']].head(10)

Unnamed: 0,review_text,sentiment_auto
0,Very average experience. Overpriced for the po...,neutral
1,Not worth the hype. Staff seemed uninterested.,negative
2,Service was slow and the food was cold when se...,neutral
3,Loved the desserts here! Coffee was perfect.,positive
4,One of the best cafés in Kochi. Highly recomme...,positive
5,Not worth the hype. Staff seemed uninterested.,negative
6,Not worth the hype. Staff seemed uninterested.,negative
7,Not worth the hype. Staff seemed uninterested.,negative
8,Service was slow and the food was cold when se...,neutral
9,Very average experience. Overpriced for the po...,neutral


In [17]:
df['sentiment_auto'].value_counts()

sentiment_auto
positive    5977
neutral     2978
negative    1045
Name: count, dtype: int64

In [18]:
df_sample=df.sample(30,random_state=42)
df_sample[['review_text','sentiment_auto']]

Unnamed: 0,review_text,sentiment_auto
6252,Service was slow and the food was cold when se...,neutral
4684,Perfect spot for brunch. Peaceful and aesthetic.,positive
1731,Perfect spot for brunch. Peaceful and aesthetic.,positive
4742,Small portions. Price doesn’t match quality.,neutral
4521,Not worth the hype. Staff seemed uninterested.,negative
6340,Very average experience. Overpriced for the po...,neutral
576,Crowded but food quality is really good.,positive
5202,Perfect spot for brunch. Peaceful and aesthetic.,positive
6363,Small portions. Price doesn’t match quality.,neutral
439,Service was slow and the food was cold when se...,neutral


In [19]:
df_sample.shape

(30, 13)

In [20]:
df_sample['sentiment_manual'] = [
    'neutral',
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "negative",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    "positive",
    "neutral",
    "positive",
    ]

In [21]:
df_sample[['sentiment_auto','sentiment_manual']].head(10)

Unnamed: 0,sentiment_auto,sentiment_manual
6252,neutral,neutral
4684,positive,positive
1731,positive,neutral
4742,neutral,positive
4521,negative,positive
6340,neutral,neutral
576,positive,negative
5202,positive,neutral
6363,neutral,positive
439,neutral,positive


In [22]:
# calculate the accuracy
accuracy = (df_sample['sentiment_auto'] == df_sample['sentiment_manual']).mean()
accuracy

np.float64(0.3333333333333333)

In [23]:
# keep only positive and negative for validation
df_binary = df_sample[df_sample['sentiment_manual'] != 'neutral'].copy()

In [24]:
len(df_binary)

19

In [25]:
# recalculate the accuracy
binary_accuracy = (df_binary['sentiment_auto'] == df_binary['sentiment_manual']).mean()
binary_accuracy

np.float64(0.3684210526315789)

In [26]:
# inspect mismatches
df_binary[df_binary['sentiment_auto'] != df_binary['sentiment_manual']][
    ['review_text','sentiment_auto','sentiment_manual']
]

Unnamed: 0,review_text,sentiment_auto,sentiment_manual
4742,Small portions. Price doesn’t match quality.,neutral,positive
4521,Not worth the hype. Staff seemed uninterested.,negative,positive
576,Crowded but food quality is really good.,positive,negative
6363,Small portions. Price doesn’t match quality.,neutral,positive
439,Service was slow and the food was cold when se...,neutral,positive
7487,Very average experience. Overpriced for the po...,neutral,positive
3999,Service was slow and the food was cold when se...,neutral,positive
9930,Not worth the hype. Staff seemed uninterested.,negative,positive
2249,Very average experience. Overpriced for the po...,neutral,positive
9485,Small portions. Price doesn’t match quality.,neutral,positive


In [27]:
# creating sentiment flag for business uses
df['sentiment_business']=df['sentiment_auto'].apply(lambda x: 1 if x=='positive' else 'negative_or_neutral')

In [28]:
df['sentiment_business'].value_counts(normalize=True)*100

sentiment_business
1                      59.77
negative_or_neutral    40.23
Name: proportion, dtype: float64

In [29]:
df.groupby('branch_name')['sentiment_business'].value_counts(normalize=True)*100

branch_name                 sentiment_business 
Bloomsbury's, Kakkanad      1                      61.090909
                            negative_or_neutral    38.909091
ChaiCofi, Kochi             1                      59.751702
                            negative_or_neutral    40.248298
French Toast, Panampilly    1                      59.434343
                            negative_or_neutral    40.565657
Kashi Art Café, Fort Kochi  1                      58.832746
                            negative_or_neutral    41.167254
Name: proportion, dtype: float64

In [30]:
# weekend vs weekday
df.groupby('is_weekend')['sentiment_business'].value_counts(normalize=True)*100

is_weekend  sentiment_business 
False       1                      60.097971
            negative_or_neutral    39.902029
True        1                      58.949212
            negative_or_neutral    41.050788
Name: proportion, dtype: float64