In [43]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

class SentimentAnalysis:
    """
    Per-factor sentiment scoring of location reviews using NLTK's VADER analyzer.

    For every review, each sentence that mentions a factor keyword contributes its
    VADER compound score to that factor. Per-review factor scores are rescaled
    from [-1, 1] to [1, 5] and then averaged per location.

    Input:
    - reviews_df: DataFrame of reviews. It is read by analyze() and never modified.
    """

    # Source column name -> output column name, applied by analyze().
    COLUMN_RENAMES = {
        'title': 'Place_Name', 'rating_x': 'Rating', 'address': 'Address',
        'gps_coordinates.latitude': 'Latitude', 'gps_coordinates.longitude': 'Longitude',
        'operating_hours.monday': 'Monday_Hours', 'operating_hours.tuesday': 'Tuesday_Hours',
        'operating_hours.wednesday': 'Wednesday_Hours', 'operating_hours.thursday': 'Thursday_Hours',
        'operating_hours.friday': 'Friday_Hours', 'operating_hours.saturday': 'Saturday_Hours',
        'operating_hours.sunday': 'Sunday_Hours', 'extracted_snippet.original': 'Review',
        'details.service': 'Service', 'details.food': 'Food', 'details.atmosphere': 'Atmosphere',
    }

    # Columns that analyze() discards before scoring.
    UNUSED_COLUMNS = ['reviews', 'iso_date', 'likes', 'rating_y']

    # Default factor -> keywords mapping. Keywords are matched as lowercase
    # substrings of a sentence, so e.g. 'pay' also matches 'payment'.
    FACTOR_KEYWORDS = {
        'Internet': ['internet', 'good connection', 'bad connection', 'wifi'],
        'Price': ['cheap', 'expensive', 'very expensive', 'too expensive', 'affordable', 'value', 'cost',
                  'pricey', 'pricy', 'pricier', 'price', 'pricing', 'paid', 'pay'],
        'Lighting': ['bright', 'dark', 'dim', 'well-lit', 'lighting', 'sunny'],
        'Noise': ['quiet', 'noisy', 'loud', 'peaceful', 'silent', 'noise', 'calm'],
        'Comfort': ['comfortable', 'uncomfortable', 'spacious', 'cozy', 'cramped', 'seating', 'space',
                    'comfort', 'sunny', 'sun', 'shade', 'stuffy'],
        'Food': ['delicious', 'bland', 'tasty', 'flavor', 'spicy', 'sweet', 'savory', 'flavorful', 'taste',
                 'perfect', 'raw', 'best', 'high quality', 'low quality', 'avoid', 'good', 'bad', 'solid',
                 'fresh'],
        'Service': ['friendly', 'rude', 'attentive', 'slow', 'efficient', 'courteous', 'quick', 'fast',
                    'service', 'poor', 'nice', 'hospitality', 'kind', 'welcoming'],
        'Atmosphere': ['ambiance', 'vibe', 'vibes', 'decor', 'welcoming', 'modern', 'cozy', 'atmosphere',
                       'cute', 'chill', 'hectic', 'chaotic', 'warm', 'cold'],
    }

    # NLTK packages fetched before any tokenizing or scoring takes place.
    # NOTE(review): recent NLTK releases may look for the sentence tokenizer under
    # 'punkt_tab' instead of 'punkt' -- confirm against the installed version.
    NLTK_RESOURCES = ('vader_lexicon', 'punkt', 'stopwords')

    # English stopword set, loaded on first use by preprocess() and then reused.
    _stop_words = None

    def __init__(self, reviews_df):
        # Keep the frame that was passed in; do not rely on a notebook-level global.
        self.reviews_df = reviews_df

    def preprocess(self, text):
        '''
        Turns one review into a list of lowercase, stopword-free word tokens.

        Input:
        - text: text to be preprocessed (in this case, a review)

        Output:
        - processed_revs: list of tokens; [] when text is not a string or is blank
        '''

        # NaN, None and blank strings have nothing to tokenize
        if not isinstance(text, str) or not text.strip():
            return []

        # drop everything that is neither a word character nor whitespace
        # (punctuation, symbols, emojis)
        text = re.sub(r'[^\w\s]', '', text)

        # drop any remaining non-ASCII characters
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # tokenize and lowercase
        tokens = word_tokenize(text.lower())

        # load the stopword list once instead of re-reading it for every review
        if SentimentAnalysis._stop_words is None:
            SentimentAnalysis._stop_words = frozenset(stopwords.words('english'))
        stop_words = SentimentAnalysis._stop_words

        processed_revs = [word for word in tokens if word not in stop_words]

        return processed_revs

    def review_sentiment_score(self, review, factors, sia):
        """
        Calculate one sentiment score per factor for a single review using VADER.

        The review is split into sentences; a sentence counts towards a factor when
        its lowercase text contains any of that factor's keywords as a substring.

        Inputs:
        - review: raw review text (not the token list produced by preprocess)
        - factors: dict mapping factor name -> list of lowercase keywords
        - sia: object whose polarity_scores(sentence) returns a dict with a
          'compound' entry (e.g. nltk's SentimentIntensityAnalyzer)

        Outputs:
        - average_scores: dict mapping every factor to the mean compound score of
          its matching sentences, or NaN when no sentence matched (including when
          the review is missing or blank)
        """
        # nothing to score: avoid handing NaN/None to the tokenizer
        if not isinstance(review, str) or not review.strip():
            return {factor: np.nan for factor in factors}

        scores = {factor: [] for factor in factors}

        # score sentence by sentence so each factor only sees its own context
        for sentence in sent_tokenize(review):
            lowered = sentence.lower()
            matched = [factor for factor, keywords in factors.items()
                       if any(keyword in lowered for keyword in keywords)]
            if not matched:
                continue

            # one VADER call per sentence, shared by every factor it mentions
            compound = sia.polarity_scores(sentence)['compound']
            for factor in matched:
                scores[factor].append(compound)

        average_scores = {factor: (np.mean(values) if values else np.nan)
                          for factor, values in scores.items()}
        return average_scores

    def normalize(self, score, old_min=-1, old_max=1, new_min=1, new_max=5):
        """
        Rescale a sentiment score linearly from [old_min, old_max] to [new_min, new_max].

        Inputs:
        - score: sentiment score as returned by review_sentiment_score
        - old_min, old_max: limits of the incoming scale (default -1 to 1)
        - new_min, new_max: limits of the desired scale (default 1 to 5)

        Output:
        - rescaled score (min-max scaling), or NaN when score is missing
        """
        if pd.isna(score):  # missing scores stay missing
            return np.nan

        # min-max scaling
        return new_min + (score - old_min) * (new_max - new_min) / (old_max - old_min)

    @classmethod
    def _ensure_nltk_resources(cls):
        """Download the NLTK packages listed in NLTK_RESOURCES."""
        for resource in cls.NLTK_RESOURCES:
            nltk.download(resource)

    def analyze(self, factors=None):
        """
        Score every review for every factor and average the scores per location.

        Scoring runs on the raw 'Review' text rather than on preprocess() output.
        A factor value that is already present in the data is kept (the source
        data is assumed to hold these on a 1-5 scale); only missing values are
        filled with the normalized sentiment score. A present but non-numeric value
        (the source data contains strings such as "Dine-in") is not filled; it is
        coerced to NaN and therefore ignored by the mean.

        Input:
        - factors: optional dict mapping factor name -> keyword list; defaults to
          FACTOR_KEYWORDS

        Output:
        - location_scores: DataFrame with 'Place_Name', 'place_id' and one mean
          score column per factor, one row per location. A score is NaN when no
          review of that location produced a value for the factor.

        Raises:
        - KeyError: if 'Place_Name', 'place_id' or 'Review' is unavailable after
          the column renames
        """
        factors = self.FACTOR_KEYWORDS if factors is None else factors
        factor_columns = list(factors)

        # fetch tokenizer/lexicon data BEFORE anything tries to use it
        self._ensure_nltk_resources()
        sia = SentimentIntensityAnalyzer()

        # rename and drop on a new frame so the caller's DataFrame stays untouched
        reviews = (
            self.reviews_df
            .rename(columns=self.COLUMN_RENAMES)
            .drop(columns=self.UNUSED_COLUMNS, errors='ignore')
        )

        missing = [col for col in ('Place_Name', 'place_id', 'Review') if col not in reviews.columns]
        if missing:
            raise KeyError(f"reviews_df is missing required column(s): {missing}")

        # replace NaN and ensure all reviews are strings
        reviews['Review'] = reviews['Review'].fillna("").astype(str)

        # score each review exactly once; every factor comes out of the same pass
        raw_scores = pd.DataFrame(
            [self.review_sentiment_score(text, factors, sia) for text in reviews['Review']],
            index=reviews.index,
            columns=factor_columns,
        )

        for factor in factor_columns:
            sentiment = raw_scores[factor].map(self.normalize)
            if factor in reviews.columns:
                # keep values that came with the data, fill only the gaps
                existing = reviews[factor]
                reviews[factor] = existing.where(existing.notna(), sentiment)
            else:
                reviews[factor] = sentiment

        # ensure factor columns are numeric (strings become NaN)
        reviews[factor_columns] = reviews[factor_columns].apply(pd.to_numeric, errors='coerce')

        # group and average scores by location
        location_scores = reviews.groupby(['Place_Name', 'place_id'])[factor_columns].mean().reset_index()

        return location_scores

In [45]:
# read the reviews CSV file into a DataFrame
reviews_df_updated = pd.read_csv('reviews_df_updated.csv')

# create the analyzer for this set of reviews
sentiment_analyzer = SentimentAnalysis(reviews_df_updated)

# run the sentiment analysis; the result holds the averaged factor scores per location
location_scores = sentiment_analyzer.analyze()
# last expression of the cell: display the resulting DataFrame
location_scores

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cpfly\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cpfly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cpfly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Place_Name,place_id,Internet,Price,Lighting,Noise,Comfort,Food,Service,Atmosphere
0,Alfred Coffee,ChIJXWBzZqe8woARmHCkaNO295Q,,4.764,3.843,,3.34,4.5431,5.0,4.833333
1,Alfred Coffee,ChIJx755_Qu9woARKGmYHprkK0c,,3.9878,,2.9226,4.4702,4.666667,4.555556,4.333333
2,Anderson Cafe (Starbucks),ChIJ99Yi5Nu9woARG62g6qTSbx8,,2.4308,,,,2.506914,2.4,2.8
3,Blue Bottle Coffee,ChIJQ_kheoy7woARo3jjflGLgxM,,,,,3.6644,4.04755,3.928833,3.66576
4,Bluestone Lane Westwood Coffee Shop,ChIJJXHT4LS9woARyL7sbitjv8o,,2.7271,,,3.784433,3.625,4.0,3.74735
5,Board House Coffee,ChIJaX6LUmi7woARQG0iLDsAmsY,4.804,,,,4.4368,4.710981,4.669543,4.5
6,Bruin Buzz,ChIJ1xoN2oi8woARXJ6Ahcp7GS8,,,,,3.0,4.5012,,
7,Bruin Cafe,ChIJezN24o28woARcqiE5XHiRhc,,,,,2.8968,4.06124,3.33175,4.5
8,Cafe 451,ChIJm-IBiom8woAR-CxVeBRQVIM,,,,,,4.5,3.67714,4.204
9,Caffe Luxxe,ChIJ83Ob_V-7woARyB2vkPZjBEc,2.0122,4.1045,,4.7038,4.5819,4.64456,4.0,4.445383


In [47]:
# save the per-location scores to CSV, leaving out the DataFrame index column
location_scores.to_csv('location_scores2.csv', index=False)