In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import re
import time
from datetime import datetime

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Scikit-learn for baseline models and metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# PyTorch and Hugging Face Transformers
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # Correct import for AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    DistilBertTokenizer,  # Example using DistilBERT
    DistilBertForSequenceClassification
)

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("Libraries imported successfully.")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Libraries imported successfully.


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import re
import time
from datetime import datetime

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Scikit-learn for baseline models and metrics
# ... (rest of the imports) ...

# Set random seed for reproducibility
# ... (seed setting) ...

print("Libraries imported successfully.")

# --- File Paths ---
# Mount Google Drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
    # Adjust the file paths accordingly
    review_file_path = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/yelp_academic_dataset_review.json'
    business_file_path = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/yelp_academic_dataset_business.json'
    print("Google Drive mounted.")
    print(f"Review data path: {review_file_path}")
    print(f"Business data path: {business_file_path}")
except ModuleNotFoundError:
    # If not in Colab, use the local path directly (adjust if needed)
    review_file_path = r"G:\My Drive\CMPS 6730 - NLP\FinalProject\yelp_academic_dataset_review.json" # Use raw string
    business_file_path = r"G:\My Drive\CMPS 6730 - NLP\FinalProject\yelp_academic_dataset_business.json" # Use raw string
    print("Running locally. Ensure the file paths are correct.")
    print(f"Review data path: {review_file_path}")
    print(f"Business data path: {business_file_path}")

# --- Load Review Data in Chunks ---
print(f"\nLoading review data from: {review_file_path}")
start_time = time.time()

all_data = []
chunksize = 100000  # Adjust as needed

for chunk in pd.read_json(review_file_path, lines=True, chunksize=chunksize):
    all_data.append(chunk)

df_reviews = pd.concat(all_data, ignore_index=True)

loading_time = time.time() - start_time
print(f"Review data loaded in {loading_time:.2f} seconds.")
print(f"Review dataset shape: {df_reviews.shape}")
print("\nReview Data Info:")
df_reviews.info()
print("\nReview Data Head:")
print(df_reviews.head())

# Convert 'date' column to datetime objects
df_reviews['date'] = pd.to_datetime(df_reviews['date'])
print("\n'date' column converted to datetime.")

# --- Load Business Data ---
print(f"\nLoading business data from: {business_file_path}")
start_time = time.time()
try:
    df_business = pd.read_json(business_file_path, lines=True)
except ValueError as e:
    print(f"Error reading business JSON with pandas: {e}. Trying line-by-line.")
    business_data = []
    with open(business_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                business_data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Skipping malformed line in businesses")
                continue
    df_business = pd.DataFrame(business_data)

loading_time = time.time() - start_time
print(f"Business data loaded in {loading_time:.2f} seconds.")
print(f"Business dataset shape: {df_business.shape}")
print("\nBusiness Data Info:")
df_business.info()
print("\nBusiness Data Head:")
print(df_business.head())
print("\nExample Business Categories:", df_business['categories'].iloc[0])

Libraries imported successfully.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Mounted at /content/drive
Google Drive mounted.
Review data path: /content/drive/My Drive/CMPS 6730 - NLP/FinalProject/yelp_academic_dataset_review.json
Business data path: /content/drive/My Drive/CMPS 6730 - NLP/FinalProject/yelp_academic_dataset_business.json

Loading review data from: /content/drive/My Drive/CMPS 6730 - NLP/FinalProject/yelp_academic_dataset_review.json
Review data loaded in 101.03 seconds.
Review dataset shape: (6990280, 9)

Review Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usa

In [3]:
# Filter data based on your criteria (date range AND category)

# 1. Filter Businesses by Category


# Filter by category (must contain 'Restaurants' or 'Food') - ALL locations
df_business['categories'] = df_business['categories'].fillna('')
all_restaurants = df_business[
    df_business['categories'].str.contains('Restaurant|Food', case=False, regex=True)
].copy()

target_business_ids = set(all_restaurants['business_id'])
print(f"Number of unique target restaurant business IDs: {len(target_business_ids)}")

if not target_business_ids:
    print("\nWarning: No restaurant business IDs found for the specified location. Check city names and categories.")
    # Handle this case - perhaps stop execution or proceed with only date filtering?
    # For now, we'll let it proceed, resulting in an empty review dataframe later.

# 2. Filter Reviews by Date
start_date = datetime(2019, 1, 1)
end_date = datetime(2024, 12, 31) # Use end of 2024 for completeness

df_reviews_filtered_date = df_reviews[(df_reviews['date'] >= start_date) & (df_reviews['date'] <= end_date)].copy()
print(f"\nReviews filtered by date ({start_date.date()} to {end_date.date()}). Shape: {df_reviews_filtered_date.shape}")

# 3. Filter Date-Filtered Reviews by Target Business IDs
if target_business_ids:
    df_filtered = df_reviews_filtered_date[
        df_reviews_filtered_date['business_id'].isin(target_business_ids)
    ].copy()
    print(f"Reviews filtered by target business IDs. Shape: {df_filtered.shape}")
else:
    print("\nSkipping business ID filtering as no target IDs were found.")
    df_filtered = pd.DataFrame(columns=df_reviews_filtered_date.columns) # Create empty DataFrame matching columns


# Select relevant columns for final processing
# Keep 'business_id' if you might need it later, otherwise drop it
df_processed = df_filtered[['review_id', 'text', 'stars', 'date', 'business_id']].copy()

print(f"\nFinal shape for processing (filtered by date and location/category): {df_processed.shape}")

if df_processed.empty:
     print("\nWARNING: No reviews match the filtering criteria (Date + Location/Category). Subsequent steps will fail.")
     # Consider stopping execution here if the dataframe is empty.
else:
    print("\nSelected relevant columns ('review_id', 'text', 'stars', 'date', 'business_id').")
    print(df_processed.head())

# --- Cleanup (Optional: remove intermediate dataframes to save memory) ---
# del df_reviews, df_business, df_reviews_filtered_date, businesses_in_location, restaurants_in_location
# import gc
# gc.collect()
# print("\nIntermediate dataframes cleaned up.")

Number of unique target restaurant business IDs: 64629

Reviews filtered by date (2019-01-01 to 2024-12-31). Shape: (2111695, 9)
Reviews filtered by target business IDs. Shape: (1525330, 9)

Final shape for processing (filtered by date and location/category): (1525330, 5)

Selected relevant columns ('review_id', 'text', 'stars', 'date', 'business_id').
                     review_id  \
194087  F6VdYuJiefNBfn3HNELv0A   
194089  nAMDCKElSKxOhzm9Lpt6Eg   
194093  3CmdoGKBZUX3Nb5IfbztMg   
203214  -4Nv_JAolCM0gzKM4DZpmQ   
204271  uObbDRxP_cwJVADckZJDZw   

                                                     text  stars  \
194087  The food is INCREDIBLE! We didn't have time to...      5   
194089  We had a great time, and excellent service. Al...      5   
194093  My favorite coffee shop in New Orleans for sur...      5   
203214  Old school circa 1979, the cozy, intimate banq...      4   
204271  Delicious! Tried this place on Saturday and it...      5   

                      date     

In [4]:
import pandas as pd
import numpy as np
import nltk
import re # For basic text cleaning
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

# --- Download NLTK data (only needs to be done once) ---
!pip install nltk
import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError: # Changed to catch LookupError
    print("Downloading NLTK 'punkt' tokenizer...")
    nltk.download('punkt', quiet=True)
try:
    nltk.data.find('corpora/stopwords')
except LookupError: # Changed to catch LookupError
    print("Downloading NLTK 'stopwords'...")
    nltk.download('stopwords', quiet=True)
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError: # Changed to catch LookupError
    print("Downloading NLTK 'vader_lexicon'...")
    nltk.download('vader_lexicon', quiet=True)

try:
    nltk.data.find('tokenizers/punkt_tab') # Check if the resource is already present
except LookupError:
    print("Downloading NLTK 'punkt_tab'...")
    nltk.download('punkt_tab') # Download specifically 'punkt_tab'


# --- Configuration ---
# Define the path to your processed data (assuming df_processed is loaded elsewhere)
# If df_processed is not loaded, you'll need to load it first, e.g.:
# input_data_file = 'path/to/your/processed_reviews.csv'
# df_processed = pd.read_csv(input_data_file)

# Define where to save the file with sentiment scores
try:
    # Assume Google Drive is mounted if in Colab
    output_sentiment_file = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv'
    print(f"Output file with sentiment scores will be saved to: {output_sentiment_file}")
except NameError: # If 'drive' object doesn't exist (running locally)
    # Adjust local path if necessary
    output_sentiment_file = r"G:\My Drive\CMPS 6730 - NLP\FinalProject\reviews_with_sentiment_scores.csv"
    print(f"Output file with sentiment scores will be saved to: {output_sentiment_file}")

# --- Aspect Keyword Lists (Illustrative Examples - EXPAND THESE SIGNIFICANTLY) ---
# Based on the paper, these lists should be comprehensive.
# Consider synonyms, related terms, common phrases.

# --- Aspect Keyword Lists (EXPANDED - CONTINUE ADDING MORE!) ---
aspect_keywords = {
    'food': [
        # General Food
        'food', 'dish', 'meal', 'plate', 'menu', 'cuisine', 'recipe', 'ingredients', 'eat', 'ate', 'dining',
        # Taste & Quality
        'taste', 'flavor', 'delicious', 'tasty', 'yummy', 'savory', 'sweet', 'sour', 'bitter', 'spicy', 'hot',
        'fresh', 'quality', 'authentic', 'homemade', 'cooked', 'preparation', 'appetizer', 'entree', 'dessert',
        'portion', 'serving', 'burnt', 'undercooked', 'overcooked', 'bland', 'seasoning',
        # Specific Items (Examples - Add many more common ones)
        'chicken', 'beef', 'pork', 'fish', 'seafood', 'shrimp', 'crab', 'lobster', 'steak', 'burger', 'sandwich',
        'pizza', 'pasta', 'sushi', 'taco', 'salad', 'soup', 'bread', 'fries', 'rice', 'noodles', 'vegetables',
        'cake', 'pie', 'ice cream', 'coffee', 'tea', 'drink', 'beverage', 'wine', 'beer', 'cocktail'
    ],
    'service': [
        # Staff General
        'service', 'staff', 'server', 'waiter', 'waitress', 'waitstaff', 'host', 'hostess', 'bartender', 'manager', 'employee',
        # Staff Behavior
        'friendly', 'attentive', 'helpful', 'professional', 'polite', 'courteous', 'welcoming', 'accommodating',
        'rude', 'unfriendly', 'inattentive', 'slow', 'ignored', 'forgetful', 'unprofessional', 'arrogant', 'bad', 'terrible'
        # Process
        'wait', 'waiting', 'order', 'refill', 'check', 'bill', 'reservation', 'seated', 'prompt', 'quick', 'efficient',
        'mistake', 'error', 'issue', 'problem', 'complaint', 'request'
    ],
    'ambiance': [
        # General Feel
        'ambiance', 'atmosphere', 'vibe', 'decor', 'setting', 'environment', 'interior', 'design', 'layout',
        # Sensory
        'music', 'lighting', 'loud', 'noisy', 'quiet', 'sound', 'smell',
        # Comfort & Cleanliness
        'comfortable', 'cozy', 'relaxing', 'upscale', 'casual', 'romantic', 'view',
        'clean', 'dirty', 'tidy', 'messy', 'hygiene', 'restroom', 'bathroom', 'tables',
        # Space
        'crowded', 'spacious', 'seating', 'booth', 'patio', 'outdoor'
    ],
    'price': [
        # General Cost
        'price', 'cost', 'value', 'money', 'budget', 'bill', 'charge', 'worth', 'pay', 'paid', 'tab',
        # Affordability
        'cheap', 'expensive', 'affordable', 'overpriced', 'reasonable', 'deal', 'bargain', 'pricey', 'costly',
        'value for money', 'rip-off', 'discount', 'coupon', 'special'
    ],
    'context': [
        # Occasion
        'occasion', 'birthday', 'anniversary', 'celebration', 'date', 'romantic', 'special', 'holiday',
        # Company
        'friends', 'family', 'kids', 'children', 'group', 'party', 'business', 'work', 'solo', 'couple',
        # Time/Meal Type
        'lunch', 'dinner', 'brunch', 'breakfast', 'late night', 'happy hour',
        # Location Related (can overlap with ambiance)
        'location', 'neighborhood', 'parking', 'visit', 'trip', 'tourist'
     ]
}
# --- Initialize Sentiment Analyzer ---
analyzer = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

# --- Helper Functions ---

def clean_text(text):
    """Basic text cleaning."""
    if not isinstance(text, str):
        return ""
    text = text.lower() # Lowercase
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = text.strip() # Remove leading/trailing whitespace
    return text

def classify_sentence_aspect(sentence, aspect_kw):
    """
    Classifies a sentence into one aspect based on keyword counts.
    Follows the paper's method: assign to aspect with most keywords.
    """
    # Clean and tokenize the sentence
    cleaned_sentence = clean_text(sentence)
    words = word_tokenize(cleaned_sentence)
    # Remove stop words for better keyword matching
    words = [word for word in words if word not in stop_words]

    scores = {aspect: 0 for aspect in aspect_kw}
    max_score = 0
    best_aspect = 'other' # Default if no keywords match

    # Count keywords for each aspect
    for aspect, keywords in aspect_kw.items():
        # Use a set for faster checking
        keyword_set = set(keywords)
        # Count occurrences of keywords in the sentence words
        score = sum(1 for word in words if word in keyword_set)
        scores[aspect] = score
        if score > max_score:
            max_score = score
            best_aspect = aspect
        # Handle ties (optional, could assign to 'multiple' or prioritize)
        # elif score == max_score and max_score > 0:
        #     best_aspect = 'multiple' # Or handle differently

    # Ensure we only classify if at least one keyword was found
    if max_score == 0:
        best_aspect = 'other'

    return best_aspect

def get_sentence_sentiment(sentence):
    """
    Calculates the VADER compound sentiment score for a sentence.
    Ranges from -1 (most negative) to +1 (most positive).
    The paper used AFINN (-5 to +5), VADER is a common alternative.
    """
    # VADER's polarity_scores returns dict: {'neg': %, 'neu': %, 'pos': %, 'compound': score}
    vs = analyzer.polarity_scores(sentence)
    return vs['compound'] # Use the compound score as the overall sentiment

def analyze_review_sentiment(review_text, aspect_kw):
    """
    Analyzes a full review text to get weighted sentiment scores per aspect.
    Implements the methodology from the paper.
    """
    if not isinstance(review_text, str) or not review_text.strip():
        # Return default scores for empty or invalid reviews
        return {f'{aspect}_sentiment': 0.0 for aspect in aspect_kw} | {'other_sentiment': 0.0}

    # 1. Tokenize into sentences
    sentences = sent_tokenize(review_text)
    total_sentences = len(sentences)

    if total_sentences == 0:
        # Handle reviews that couldn't be sentence-tokenized
         return {f'{aspect}_sentiment': 0.0 for aspect in aspect_kw} | {'other_sentiment': 0.0}

    # 2. Classify aspect and get sentiment for each sentence
    sentence_results = []
    for sentence in sentences:
        aspect = classify_sentence_aspect(sentence, aspect_kw)
        sentiment = get_sentence_sentiment(sentence)
        sentence_results.append({'aspect': aspect, 'sentiment': sentiment})

    # 3. Aggregate sentiment scores and count sentences per aspect
    aspect_sentiments_sum = defaultdict(float)
    aspect_sentence_counts = defaultdict(int)

    for result in sentence_results:
        aspect = result['aspect']
        sentiment = result['sentiment']
        aspect_sentiments_sum[aspect] += sentiment
        aspect_sentence_counts[aspect] += 1

    # 4. Calculate weighted sentiment scores (as per paper's formula)
    weighted_scores = {}
    all_aspects = list(aspect_kw.keys()) + ['other'] # Include 'other' category

    for aspect in all_aspects:
        sum_score = aspect_sentiments_sum[aspect]
        count = aspect_sentence_counts[aspect]

        # Calculate weight (proportion of sentences for this aspect)
        weight = count / total_sentences if total_sentences > 0 else 0

        # Weighted score = Sum * Weight (or proportion)
        # Note: The paper's formula description is slightly ambiguous.
        # Sentiment_ij = Sentiment Score_ij * (# sentences Attribute j / # sentences Attribute_ij)
        # This implies weighting the *average* score per attribute.
        # Let's calculate both average and weighted sum for clarity.
        # average_score = sum_score / count if count > 0 else 0.0
        # weighted_score_avg_based = average_score * weight # Avg score weighted by proportion

        # Alternative interpretation: Weight the *total* sentiment sum for the aspect
        # This seems more aligned with capturing overall impact.
        weighted_score_sum_based = sum_score * weight

        # Store the weighted score based on the sum interpretation
        weighted_scores[f'{aspect}_sentiment'] = weighted_score_sum_based

    return weighted_scores

def get_overall_sentiment(review_text):  # <-- Add this function here
    """Calculates the overall sentiment score for the entire review."""
    if not isinstance(review_text, str) or not review_text.strip():
        return 0.0  # Return 0 for empty or invalid reviews

    # Use VADER to get the compound sentiment score
    scores = analyzer.polarity_scores(review_text)
    return scores['compound']

# --- Main Processing Logic ---

# Check if df_processed exists and is populated
# IMPORTANT: Ensure df_processed is loaded before this point if not already in memory.
# Example:
# try:
#     df_processed = pd.read_csv('path/to/your/filtered_reviews.csv')
#     print(f"Loaded {len(df_processed)} reviews.")
# except FileNotFoundError:
#     print("Error: Processed review file not found.")
#     df_processed = pd.DataFrame() # Create empty df to avoid error

if 'df_processed' not in locals() or not isinstance(df_processed, pd.DataFrame) or df_processed.empty:
    print("\nError: The 'df_processed' DataFrame is empty or does not exist.")
    print("Please ensure your data loading step ran successfully.")
    # Optionally, raise an error:
    # raise ValueError("Cannot proceed without data in df_processed.")
else:
    print(f"\nStarting sentiment analysis process on {len(df_processed)} filtered reviews.")

    # Make a copy to avoid modifying the original DataFrame directly
    df_analysis = df_processed.copy()

    # --- Apply Sentiment Analysis ---
    print("Analyzing review sentiments (this may take a while)...")

    # Use .apply() to process the 'text' column of each review
    # The result of apply will be a Series of dictionaries
    sentiment_results = df_analysis['text'].apply(lambda text: analyze_review_sentiment(text, aspect_keywords))

    # Convert the Series of dictionaries into separate columns in the DataFrame
    # pd.json_normalize is efficient for this
    sentiment_df = pd.json_normalize(sentiment_results)

    # Add these new sentiment score columns to the analysis DataFrame
    df_analysis = pd.concat([df_analysis.reset_index(drop=True), sentiment_df.reset_index(drop=True)], axis=1)

    print("Sentiment analysis complete.")

    # --- Select and Reorder Columns for Output ---
    # Keep original identifiers and add the new sentiment scores
    output_columns = ['review_id', 'business_id', 'date', 'stars', 'text'] + \
                     [f'{aspect}_sentiment' for aspect in aspect_keywords] + \
                     ['other_sentiment'] # Include 'other' if calculated

    # Ensure all expected columns exist before selecting
    final_columns = [col for col in output_columns if col in df_analysis.columns]
    df_final_output = df_analysis[final_columns]

    # --- Export to CSV ---
    try:
        # Use index=False to avoid writing the DataFrame index as a column
        df_final_output.to_csv(output_sentiment_file, index=False, encoding='utf-8')
        print(f"\nSuccessfully exported {len(df_final_output)} reviews with sentiment scores to:")
        print(output_sentiment_file)
        print("\n--- NEXT STEPS ---")
        print("1. Explore the calculated sentiment scores (e.g., correlations with 'stars').")
        print("2. Refine the aspect keyword lists for better classification accuracy.")
        print("3. Consider alternative sentiment lexicons or models if needed.")
        print("4. Use these sentiment scores as features for further analysis or modeling.")

    except Exception as e:
        print(f"\nError exporting file: {e}")
        print("Please check the output path and ensure you have write permissions.")

# Optional: Display the first few rows with the new scores
if 'df_final_output' in locals():
    print("\nPreview of DataFrame with Sentiment Scores:")
    print(df_final_output.head())

Downloading NLTK 'vader_lexicon'...
Downloading NLTK 'punkt_tab'...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Output file with sentiment scores will be saved to: /content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv

Starting sentiment analysis process on 1525330 filtered reviews.
Analyzing review sentiments (this may take a while)...
Sentiment analysis complete.

Successfully exported 1525330 reviews with sentiment scores to:
/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv

--- NEXT STEPS ---
1. Explore the calculated sentiment scores (e.g., correlations with 'stars').
2. Refine the aspect keyword lists for better classification accuracy.
3. Consider alternative sentiment lexicons or models if needed.
4. Use these sentiment scores as features for further analysis or modeling.

Preview of DataFrame with Sentiment Scores:
                review_id             business_id                date  stars  \
0  F6VdYuJiefNBfn3HNELv0A  z7em5co2qckbAXoDGXynsA 2019-01-04 02:18:09      5   
1  nAMDCKElSKxOhzm9Lpt6Eg  M0r9lUn2gLFYgIwIfG

In [5]:
# Initialize lemmatizer and stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import time

# Download the necessary NLTK data if it's not already present
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Cleans and preprocesses text data."""
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(processed_tokens)

# Apply preprocessing (can take time on large datasets)
print("\nStarting text preprocessing...")
start_time = time.time()
df_processed['processed_text'] = df_processed['text'].apply(preprocess_text)
processing_time = time.time() - start_time
print(f"Text preprocessing completed in {processing_time:.2f} seconds.")

# Display some processed text examples
print("\nOriginal vs Processed Text Examples:")
for i in range(3):
    print(f"--- Example {i+1} ---")
    print("Original:", df_processed['text'].iloc[i][:200] + "...") # Show first 200 chars
    print("Processed:", df_processed['processed_text'].iloc[i])
    print("-" * 20)

# Drop rows where processed text is empty
df_processed = df_processed[df_processed['processed_text'] != '']
print(f"\nShape after removing empty processed texts: {df_processed.shape}")


Starting text preprocessing...
Text preprocessing completed in 710.21 seconds.

Original vs Processed Text Examples:
--- Example 1 ---
Original: The food is INCREDIBLE! We didn't have time to get any of the cocktails, but they had some pretty creative and classy ones on the menu. Will be going back for that alone. But anyways they have this wa...
Processed: food incredible didnt time get cocktail pretty creative classy one menu going back alone anyways way taking unique twist thing absolutely killing oh god im going talk cole slaw minute dont know whole world suck cant cole slaw perfect like somehow straight red cabbage think also creamy tangy light checked box didnt even know possible also need mention peppermint cheesecake desert chocolately browniey crust wonderful world caught cheese cake world thing make sense course also mention fried chicken supper came cole slaw deliberately mentioning absolutely phenomenal breading flavorful there nice spicy honey drizzle really finish also c

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # Import SentimentIntensityAnalyzer


# Initialize SentimentIntensityAnalyzer (if not already initialized)
analyzer = SentimentIntensityAnalyzer()

def get_overall_sentiment(review_text):  # <-- Add this function here
    """Calculates the overall sentiment score for the entire review."""
    if not isinstance(review_text, str) or not review_text.strip():
        return 0.0  # Return 0 for empty or invalid reviews

    # Use VADER to get the compound sentiment score
    scores = analyzer.polarity_scores(review_text)
    return scores['compound']
# Assuming df_processed is already loaded and contains the preprocessed text data in the 'processed_text' column

# 1. Load or calculate sentiment scores if not in df_processed
try:
    # If sentiment scores are already in df_processed, use them
    sentiment_columns = ['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']
    Y = df_processed[sentiment_columns].copy()  # Use .copy() to avoid SettingWithCopyWarning
except KeyError:
    print("Sentiment score columns not found in df_processed. Calculating or loading them...")
    # Calculate overall sentiment for each review
    df_processed['overall_sentiment'] = df_processed['text'].apply(get_overall_sentiment)
    # Option 2: Load from a separate file (if you saved them earlier)
    from google.colab import drive
    drive.mount('/content/drive')  # Mount Google Drive
    sentiment_file_path = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv'

    sentiment_df = pd.read_csv(sentiment_file_path)
    # Merge sentiment scores with the original dataframe based on a common column (e.g., 'review_id')
    df_processed = pd.merge(
        df_processed,
        sentiment_df[['review_id', 'food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment']],
        on='review_id',
        how='left'
    )
    Y = df_processed[['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']].copy()

# --- 2. Correct sentiment binning using fixed thresholds ---

# Define fixed VADER thresholds and corresponding integer labels
# Bins: (-inf, -0.05], (-0.05, 0.05), [0.05, +inf)
# We add +/- infinity to ensure all values are covered.
bin_edges = [-float('inf'), -0.05, 0.05, float('inf')]
# Assign integer labels: 0 for Negative, 1 for Neutral, 2 for Positive
bin_labels = [0, 1, 2]
label_names = ['negative', 'neutral', 'positive'] # For potential use in reports if needed

print(f"Using fixed bin edges: {bin_edges}")
print(f"Assigning integer labels: {bin_labels} ({', '.join(label_names)})")

# Ensure Y is a DataFrame (it should be from previous steps)
if not isinstance(Y, pd.DataFrame):
     raise TypeError("Y should be a pandas DataFrame at this stage.")

# Create a new DataFrame for binned labels to avoid modifying Y inplace initially
Y_binned = pd.DataFrame(index=Y.index)

for column in Y.columns:
    # Cast column values to float64 explicitly to avoid dtype issues
    # Use .loc to avoid SettingWithCopyWarning if Y is a slice
    Y_column_float = Y.loc[:, column].astype(float)

    # Use pd.cut for binning with fixed edges
    # include_lowest=True: includes the lowest value (-inf) in the first bin
    # right=True (default): bins are (edge1, edge2], except first which is [edge1, edge2] due to include_lowest
    # If you want bins like [edge1, edge2), use right=False
    binned_data = pd.cut(
        Y_column_float,
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
        right=True # Standard VADER thresholds often use >= 0.05 for positive, <= -0.05 for negative
    )

    # Check for NaNs introduced by binning (shouldn't happen with inf edges, but good practice)
    if binned_data.isnull().any():
        print(f"Warning: NaNs found in '{column}' after binning. Check original data.")
        # Handle NaNs if necessary, e.g., fill with neutral=1 or drop rows
        # binned_data = binned_data.fillna(1) # Example: Fill NaN with Neutral

    # Assign the binned data (as integers) to the new DataFrame
    Y_binned[column] = binned_data.astype(int)

# Replace the original Y with the binned version
Y = Y_binned
print("\nSentiment scores binned using fixed thresholds:")
print(Y.head())
print("\nValue counts for 'food_sentiment' (example):")
print(Y['food_sentiment'].value_counts())

# --- End of binning modification ---

# 3. Prepare data for training and testing
# Assuming 'processed_text' column contains preprocessed text data
X = df_processed['processed_text'].copy()  # Use .copy() to ensure a deep copy

# Get the first sentiment column for stratification
stratify_column = Y.iloc[:, 0]  # Assuming Y is a DataFrame

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=stratify_column
)

# Print dataset sizes
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# ... (rest of the code, including TF-IDF, tokenization, and model training) ...

Sentiment score columns not found in df_processed. Calculating or loading them...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using fixed bin edges: [-inf, -0.05, 0.05, inf]
Assigning integer labels: [0, 1, 2] (negative, neutral, positive)

Sentiment scores binned using fixed thresholds:
   food_sentiment  service_sentiment  ambiance_sentiment  price_sentiment  \
0               2                  1                   1                1   
1               2                  2                   1                1   
2               2                  1                   1                1   
3               1                  1                   1                1   
4               2                  1                   1                1   

   context_sentiment  overall_sentiment  
0                  1                  2  
1                  1                  2  
2                  1                 

In [7]:
# --- Import necessary libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

# Assuming df_processed is already loaded and contains the preprocessed text data in the 'processed_text' column

# 1. Load or calculate sentiment scores if not in df_processed
try:
    # If sentiment scores are already in df_processed, use them
    sentiment_columns = ['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']
    Y = df_processed[sentiment_columns].copy()  # Use .copy() to avoid SettingWithCopyWarning
except KeyError:
    print("Sentiment score columns not found in df_processed. Calculating or loading them...")
    # Option 2: Load from a separate file (if you saved them earlier)
    from google.colab import drive
    drive.mount('/content/drive')  # Mount Google Drive
    sentiment_file_path = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv'

    sentiment_df = pd.read_csv(sentiment_file_path)
    # Merge sentiment scores with the original dataframe based on a common column (e.g., 'review_id')
    df_processed = pd.merge(
        df_processed,
        sentiment_df[['review_id', 'food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']],
        on='review_id',
        how='left'
    )
    Y = df_processed[['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']].copy()

# --- 2. Correct sentiment binning using fixed thresholds ---

# Define fixed VADER thresholds and corresponding integer labels
# Bins: (-inf, -0.05], (-0.05, 0.05), [0.05, +inf)
# We add +/- infinity to ensure all values are covered.
bin_edges = [-float('inf'), -0.05, 0.05, float('inf')]
# Assign integer labels: 0 for Negative, 1 for Neutral, 2 for Positive
bin_labels = [0, 1, 2]
label_names = ['negative', 'neutral', 'positive'] # For potential use in reports if needed

print(f"Using fixed bin edges: {bin_edges}")
print(f"Assigning integer labels: {bin_labels} ({', '.join(label_names)})")

# Ensure Y is a DataFrame (it should be from previous steps)
if not isinstance(Y, pd.DataFrame):
     raise TypeError("Y should be a pandas DataFrame at this stage.")

# Create a new DataFrame for binned labels to avoid modifying Y inplace initially
Y_binned = pd.DataFrame(index=Y.index)

for column in Y.columns:
    # Cast column values to float64 explicitly to avoid dtype issues
    # Use .loc to avoid SettingWithCopyWarning if Y is a slice
    Y_column_float = Y.loc[:, column].astype(float)

    # Use pd.cut for binning with fixed edges
    # include_lowest=True: includes the lowest value (-inf) in the first bin
    # right=True (default): bins are (edge1, edge2], except first which is [edge1, edge2] due to include_lowest
    # If you want bins like [edge1, edge2), use right=False
    binned_data = pd.cut(
        Y_column_float,
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
        right=True # Standard VADER thresholds often use >= 0.05 for positive, <= -0.05 for negative
    )

    # Check for NaNs introduced by binning (shouldn't happen with inf edges, but good practice)
    if binned_data.isnull().any():
        print(f"Warning: NaNs found in '{column}' after binning. Check original data.")
        # Handle NaNs if necessary, e.g., fill with neutral=1 or drop rows
        # binned_data = binned_data.fillna(1) # Example: Fill NaN with Neutral

    # Assign the binned data (as integers) to the new DataFrame
    Y_binned[column] = binned_data.astype(int)

# Replace the original Y with the binned version
Y = Y_binned
print("\nSentiment scores binned using fixed thresholds:")
print(Y.head())
print("\nValue counts for 'food_sentiment' (example):")
print(Y['food_sentiment'].value_counts())

# --- End of binning modification ---

# 3. Prepare data for training and testing
# Assuming 'processed_text' column contains preprocessed text data
X = df_processed['processed_text'].copy()  # Use .copy() to ensure a deep copy

# Get the first sentiment column for stratification
stratify_column = Y.iloc[:, 0]  # Assuming Y is a DataFrame

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=stratify_column
)

# Print dataset sizes
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


# --- TF-IDF Vectorization ---
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit the vectorizer to the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# --- Baseline Model Training and Evaluation ---
# Using OneVsRestClassifier: Trains one classifier per class (or per output)
# Here, we wrap Logistic Regression. One LR model will be trained for each aspect.
print("\nTraining Baseline Model (Logistic Regression with OneVsRest)...")

# Create a base estimator
log_reg = LogisticRegression(solver='liblinear', random_state=42) # Liblinear often good for high-dim sparse data

# Wrap it with OneVsRestClassifier for multi-label/multi-output scenario
# It trains one classifier per column in Y
baseline_model = MultiOutputClassifier(log_reg) # Simpler API for multi-output integer targets

# Train the model
baseline_model.fit(X_train_tfidf, Y_train) # Use the numeric labels DataFrame/Array

# Make predictions on the test set
Y_pred_baseline = baseline_model.predict(X_test_tfidf)

# --- Baseline Evaluation ---
# Assuming 'aspects' and 'sentiment_classes' are defined (from your Transformer code)

baseline_report = {} # To store results in a dictionary
print("\n--- Baseline (Logistic Regression) Classification Report (Per Aspect) ---")

baseline_f1_scores = {} # Store F1-scores per aspect
all_true_flat_baseline = []  # To store flattened true labels for overall metrics
all_pred_flat_baseline = []  # To store flattened predicted labels for overall metrics

# Add these lines before the loop in your code:
aspects = ['food', 'service', 'ambiance', 'price', 'context', 'overall_sentiment']
sentiment_classes = ['negative', 'neutral', 'positive']

for i, aspect_name in enumerate(aspects):
    print(f"\n--- Aspect: {aspect_name} ---")
    true_labels = Y_test.iloc[:, i].values # Get true labels for this aspect
    pred_labels = Y_pred_baseline[:, i] # Get predicted labels for this aspect

    # Get unique labels in pred_labels and true_labels
    unique_labels = np.unique(np.concatenate((pred_labels, true_labels)))

    # Filter target_names to include only the present labels
    present_target_names = [name for idx, name in enumerate(sentiment_classes) if idx in unique_labels]


    report = classification_report(true_labels, pred_labels, target_names=present_target_names, zero_division=0)
    print(report)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted', zero_division=0)
    accuracy = accuracy_score(true_labels, pred_labels)
    baseline_report[aspect_name] = {'precision': precision, 'recall': recall, 'f1-score': f1, 'accuracy': accuracy}
    baseline_f1_scores[aspect_name] = f1  # Store F1-score for this aspect

    all_true_flat_baseline.extend(true_labels)  # Extend for overall metrics
    all_pred_flat_baseline.extend(pred_labels)  # Extend for overall metrics

# Overall Micro and Macro Averages for Baseline
print("\n--- Baseline Overall Micro/Macro Averages ---")
precision_micro_b, recall_micro_b, f1_micro_b, _ = precision_recall_fscore_support(all_true_flat_baseline, all_pred_flat_baseline, average='micro', zero_division=0)
precision_macro_b, recall_macro_b, f1_macro_b, _ = precision_recall_fscore_support(all_true_flat_baseline, all_pred_flat_baseline, average='macro', zero_division=0)
print(f"Micro Average: Precision={precision_micro_b:.4f}, Recall={recall_micro_b:.4f}, F1-Score={f1_micro_b:.4f}")
print(f"Macro Average: Precision={precision_macro_b:.4f}, Recall={recall_macro_b:.4f}, F1-Score={f1_macro_b:.4f}")

baseline_report['overall_micro'] = {'precision': precision_micro_b, 'recall': recall_micro_b, 'f1-score': f1_micro_b}
baseline_report['overall_macro'] = {'precision': precision_macro_b, 'recall': recall_macro_b, 'f1-score': f1_macro_b}

Using fixed bin edges: [-inf, -0.05, 0.05, inf]
Assigning integer labels: [0, 1, 2] (negative, neutral, positive)

Sentiment scores binned using fixed thresholds:
   food_sentiment  service_sentiment  ambiance_sentiment  price_sentiment  \
0               2                  1                   1                1   
1               2                  2                   1                1   
2               2                  1                   1                1   
3               1                  1                   1                1   
4               2                  1                   1                1   

   context_sentiment  overall_sentiment  
0                  1                  2  
1                  1                  2  
2                  1                  2  
3                  2                  2  
4                  1                  2  

Value counts for 'food_sentiment' (example):
food_sentiment
2    1019023
1     385113
0     121177
Name: count, dtype: in

In [8]:
# --- Import necessary libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # Correct import for AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    DistilBertTokenizer,  # Example using DistilBERT
    DistilBertForSequenceClassification
)
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Assuming df_processed is already loaded and contains the preprocessed text data in the 'processed_text' column

# 1. Load or calculate sentiment scores if not in df_processed
try:
    # If sentiment scores are already in df_processed, use them
    sentiment_columns = ['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']
    Y = df_processed[sentiment_columns].copy()  # Use .copy() to avoid SettingWithCopyWarning
except KeyError:
    print("Sentiment score columns not found in df_processed. Calculating or loading them...")
    # Option 2: Load from a separate file (if you saved them earlier)
    from google.colab import drive
    drive.mount('/content/drive')  # Mount Google Drive
    sentiment_file_path = '/content/drive/My Drive/CMPS 6730 - NLP/FinalProject/reviews_with_sentiment_scores.csv'

    sentiment_df = pd.read_csv(sentiment_file_path)
    # Merge sentiment scores with the original dataframe based on a common column (e.g., 'review_id')
    df_processed = pd.merge(
        df_processed,
        sentiment_df[['review_id', 'food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']],
        on='review_id',
        how='left'
    )
    Y = df_processed[['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']].copy()

# Calculate overall sentiment for each review
df_processed['overall_sentiment'] = df_processed['text'].apply(get_overall_sentiment)

# Now include 'overall_sentiment' in your target columns
Y = df_processed[['food_sentiment', 'service_sentiment', 'ambiance_sentiment', 'price_sentiment', 'context_sentiment', 'overall_sentiment']].copy()

# --- 2. Correct sentiment binning using fixed thresholds ---

# Define fixed VADER thresholds and corresponding integer labels
# Bins: (-inf, -0.05], (-0.05, 0.05), [0.05, +inf)
# We add +/- infinity to ensure all values are covered.
bin_edges = [-float('inf'), -0.05, 0.05, float('inf')]
# Assign integer labels: 0 for Negative, 1 for Neutral, 2 for Positive
bin_labels = [0, 1, 2]
label_names = ['negative', 'neutral', 'positive'] # For potential use in reports if needed

print(f"Using fixed bin edges: {bin_edges}")
print(f"Assigning integer labels: {bin_labels} ({', '.join(label_names)})")

# Ensure Y is a DataFrame (it should be from previous steps)
if not isinstance(Y, pd.DataFrame):
     raise TypeError("Y should be a pandas DataFrame at this stage.")

# Create a new DataFrame for binned labels to avoid modifying Y inplace initially
Y_binned = pd.DataFrame(index=Y.index)

for column in Y.columns:
    # Cast column values to float64 explicitly to avoid dtype issues
    # Use .loc to avoid SettingWithCopyWarning if Y is a slice
    Y_column_float = Y.loc[:, column].astype(float)

    # Use pd.cut for binning with fixed edges
    # include_lowest=True: includes the lowest value (-inf) in the first bin
    # right=True (default): bins are (edge1, edge2], except first which is [edge1, edge2] due to include_lowest
    # If you want bins like [edge1, edge2), use right=False
    binned_data = pd.cut(
        Y_column_float,
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
        right=True # Standard VADER thresholds often use >= 0.05 for positive, <= -0.05 for negative
    )

    # Check for NaNs introduced by binning (shouldn't happen with inf edges, but good practice)
    if binned_data.isnull().any():
        print(f"Warning: NaNs found in '{column}' after binning. Check original data.")
        # Handle NaNs if necessary, e.g., fill with neutral=1 or drop rows
        # binned_data = binned_data.fillna(1) # Example: Fill NaN with Neutral

    # Assign the binned data (as integers) to the new DataFrame
    Y_binned[column] = binned_data.astype(int)

# Replace the original Y with the binned version
Y = Y_binned
print("\nSentiment scores binned using fixed thresholds:")
print(Y.head())
print("\nValue counts for 'food_sentiment' (example):")
print(Y['food_sentiment'].value_counts())

# --- End of binning modification ---

# 3. Prepare data for training and testing
# Assuming 'processed_text' column contains preprocessed text data
X = df_processed['processed_text'].copy()  # Use .copy() to ensure a deep copy

# Get the first sentiment column for stratification
stratify_column = Y.iloc[:, 0]  # Assuming Y is a DataFrame

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=stratify_column
)

# Print dataset sizes
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


# --- Tokenization ---
# Initialize tokenizer (if not done earlier)
model_name = 'distilbert-base-uncased'  # Or your preferred model name
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Tokenize and encode the text data
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

# --- Convert Y_train and Y_test to NumPy arrays ---
Y_train_np = Y_train.values  # Assuming Y_train is a pandas DataFrame
Y_test_np = Y_test.values   # Assuming Y_test is a pandas DataFrame


# --- Define PyTorch Dataset ---
class YelpAspectDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Ensure labels are shaped correctly (N_samples, N_aspects)
        self.labels = labels

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Handle cases where encodings might be lists of tensors or similar structures
        item = {}
        for key, val in self.encodings.items():
            # Ensure the value associated with the key is indexable and convert to tensor
            if isinstance(val, list):
                item[key] = torch.tensor(val[idx])
            else:  # Assuming it might already be a tensor or numpy array
                item[key] = torch.tensor(val[idx]).clone().detach()  # Make sure it's a tensor

        # Labels should be FloatTensor for BCEWithLogitsLoss or LongTensor for CrossEntropyLoss
        # For multi-label (predicting probability for each class per aspect) or multi-output (predicting one class per aspect)
        # Let's assume multi-output: predicting one class (0, 1, 2) per aspect. Labels are LongTensor.
        # Shape should be (num_aspects,) for a single item
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        # Use the number of samples in one of the encoding keys (e.g., 'input_ids')
        return len(self.encodings['input_ids'])


# Create datasets
train_dataset = YelpAspectDataset(train_encodings, Y_train_np)
test_dataset = YelpAspectDataset(test_encodings, Y_test_np)

# --- Define Model ---
# We need a multi-output classification head.
# Load pre-trained DistilBERT and modify the classifier layer.
aspects = ['food', 'service', 'ambiance', 'price', 'context', 'overall']  # Define your aspects
sentiment_classes = ['negative', 'neutral', 'positive']  # Define sentiment classes

num_aspects = len(aspects)
num_classes_per_aspect = len(sentiment_classes)  # 3 classes: positive, negative, neutral

# Load the base model
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_aspects * num_classes_per_aspect  # TEMPORARY - see below
)

# *** IMPORTANT: Modifying the Classifier Head for Multi-Output ***
# The standard `DistilBertForSequenceClassification` has ONE output layer for num_labels.
# For multi-output (one prediction per aspect), we need to customize.
# Option 1: Treat as N independent classification tasks (simpler to implement with standard model if labels handled outside)
# Option 2: Create N classification heads (more complex custom model)
# Option 3: Reshape the output of a single large head (requires careful loss calculation)

# Let's try Option 3 conceptually: A single head predicting scores for all aspect-class combinations.
# Then reshape and calculate loss per aspect.
# The output dimension should be num_aspects * num_classes_per_aspect
# The final layer needs to be replaced or adapted.

# Get the original classifier's input dimension
original_classifier_in_features = model.classifier.in_features

# Replace the classifier head. Output size is num_aspects * num_classes (e.g., 5 * 3 = 15)
model.classifier = torch.nn.Linear(original_classifier_in_features, num_aspects * num_classes_per_aspect)
# The pre_classifier layer might also need adjustment if present (DistilBERT has one)
if hasattr(model, 'pre_classifier') and model.pre_classifier is not None:
    original_pre_classifier_in_features = model.pre_classifier.in_features
    model.pre_classifier = torch.nn.Linear(original_pre_classifier_in_features, original_classifier_in_features)  # Keep standard pre-classifier -> classifier connection
else:
    print("Model does not have a separate pre_classifier layer.")

print(f"\nCustomized DistilBERT model loaded. Output layer size: {model.classifier.out_features}")

# --- Training Setup ---
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
model.to(device)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Adjust batch_size based on GPU memory
test_loader = DataLoader(test_dataset, batch_size=32)  # Larger batch size for evaluation is fine

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)  # Typical learning rate for fine-tuning
num_epochs = 3  # Adjust as needed (start small)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss Function: CrossEntropyLoss is suitable for multi-class classification (per aspect)
loss_fn = torch.nn.CrossEntropyLoss()

# --- Training Loop ---
print("\nStarting Transformer Model Training...")
model.train()
for epoch in range(num_epochs):
    print(f"--- Epoch {epoch + 1}/{num_epochs} ---")
    epoch_start_time = time.time()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # Shape: (batch_size, num_aspects)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, num_aspects * num_classes_per_aspect)

        # Calculate loss: Reshape logits and labels for CrossEntropyLoss
        # Logits need to be (batch_size * num_aspects, num_classes_per_aspect)
        # Labels need to be (batch_size * num_aspects,)
        reshaped_logits = logits.view(-1, num_classes_per_aspect)  # (batch * aspects, classes)
        reshaped_labels = labels.view(-1)  # (batch * aspects,)

        loss = loss_fn(reshaped_logits, reshaped_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        if (i + 1) % 100 == 0:  # Print progress every 100 batches
            print(f"  Batch {i + 1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch + 1} completed in {epoch_time:.2f}s. Average Training Loss: {avg_train_loss:.4f}")

print("Transformer training finished.")
# Consider saving the trained model
# model.save_pretrained('./my_aspect_sentiment_model')
# tokenizer.save_pretrained('./my_aspect_sentiment_model')

Using fixed bin edges: [-inf, -0.05, 0.05, inf]
Assigning integer labels: [0, 1, 2] (negative, neutral, positive)

Sentiment scores binned using fixed thresholds:
   food_sentiment  service_sentiment  ambiance_sentiment  price_sentiment  \
0               2                  1                   1                1   
1               2                  2                   1                1   
2               2                  1                   1                1   
3               1                  1                   1                1   
4               2                  1                   1                1   

   context_sentiment  overall_sentiment  
0                  1                  2  
1                  1                  2  
2                  1                  2  
3                  2                  2  
4                  1                  2  

Value counts for 'food_sentiment' (example):
food_sentiment
2    1019023
1     385113
0     121177
Name: count, dtype: in

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Customized DistilBERT model loaded. Output layer size: 18
Using device: cuda

Starting Transformer Model Training...
--- Epoch 1/3 ---
  Batch 100/76266, Loss: 0.4605
  Batch 200/76266, Loss: 0.5363
  Batch 300/76266, Loss: 0.3642
  Batch 400/76266, Loss: 0.3450
  Batch 500/76266, Loss: 0.4664
  Batch 600/76266, Loss: 0.4256
  Batch 700/76266, Loss: 0.3760
  Batch 800/76266, Loss: 0.3525
  Batch 900/76266, Loss: 0.3631
  Batch 1000/76266, Loss: 0.3212
  Batch 1100/76266, Loss: 0.3153
  Batch 1200/76266, Loss: 0.4185
  Batch 1300/76266, Loss: 0.3455
  Batch 1400/76266, Loss: 0.3565
  Batch 1500/76266, Loss: 0.2568
  Batch 1600/76266, Loss: 0.3890
  Batch 1700/76266, Loss: 0.2809
  Batch 1800/76266, Loss: 0.2546
  Batch 1900/76266, Loss: 0.2207
  Batch 2000/76266, Loss: 0.2464
  Batch 2100/76266, Loss: 0.2484
  Batch 2200/76266, Loss: 0.3675
  Batch 2300/76266, Loss: 0.2434
  Batch 2400/76266, Loss: 0.2172
  Batch 2500/76266, Loss: 0.3450
  Batch 2600/76266, Loss: 0.2391
  Batch 2700/76

In [9]:
# --- Save the Model and Tokenizer ---
print("\nSaving the fine-tuned model and tokenizer...")

# Define the directory where you want to save them
drive.mount('/content/drive')  # Mount Google Drive
save_directory = '/content/drive/My Drive/Colab Notebooks' # You can change this path
# Save the model's weights and configuration file
model.save_pretrained(save_directory)

# Save the tokenizer's vocabulary and configuration file
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

# --- (Optional) How to load the model and tokenizer later ---
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)
# loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)
# print("\nModel and tokenizer loaded successfully (example).")
# loaded_model.to(device) # Remember to move the loaded model to the correct device


Saving the fine-tuned model and tokenizer...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/My Drive/Colab Notebooks


In [10]:
print("\nEvaluating Transformer Model...")
model.eval()  # Set model to evaluation mode

all_preds_transformer = []
all_labels_transformer = []

with torch.no_grad():  # Disable gradient calculations for inference
    for batch in test_loader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # Shape: (batch_size, num_aspects)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, num_aspects * num_classes_per_aspect)

        # Get predictions: Reshape logits and find the class with max probability for each aspect
        # Reshape logits to (batch_size, num_aspects, num_classes_per_aspect)
        reshaped_logits = logits.view(input_ids.size(0), num_aspects, num_classes_per_aspect)
        predictions = torch.argmax(reshaped_logits, dim=2)  # Get predicted class index along the class dimension

        all_preds_transformer.extend(predictions.cpu().numpy())
        all_labels_transformer.extend(labels.cpu().numpy())

# Convert collected predictions and labels into numpy arrays
Y_pred_transformer_np = np.array(all_preds_transformer)  # Shape: (num_test_samples, num_aspects)
Y_true_transformer_np = np.array(all_labels_transformer)  # Shape: (num_test_samples, num_aspects)

# Calculate metrics per aspect
transformer_report = {}
print("\n--- Transformer Classification Report (Per Aspect) ---")

all_true_flat_transformer = []
all_pred_flat_transformer = []

# Assuming 'aspects' is defined earlier and includes 'overall'
# aspects = ['food', 'service', 'ambiance', 'price', 'context', 'overall']

for i, aspect_name in enumerate(aspects):  # Iterate through all aspects, including 'overall'
    print(f"\n--- Aspect: {aspect_name} ---")
    true_labels = Y_true_transformer_np[:, i]
    pred_labels = Y_pred_transformer_np[:, i]

    all_true_flat_transformer.extend(true_labels)
    all_pred_flat_transformer.extend(pred_labels)

    # Get unique labels in pred_labels and true_labels
    unique_labels = np.unique(np.concatenate((pred_labels, true_labels)))

    # Filter target_names to include only the present labels
    present_target_names = [name for idx, name in enumerate(sentiment_classes) if idx in unique_labels]

    report = classification_report(true_labels, pred_labels, target_names=present_target_names, zero_division=0)
    print(report)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted', zero_division=0)
    accuracy = accuracy_score(true_labels, pred_labels)
    transformer_report[aspect_name] = {'precision': precision, 'recall': recall, 'f1-score': f1, 'accuracy': accuracy}

# Overall Micro and Macro Averages
print("\n--- Transformer Overall Micro/Macro Averages ---")
precision_micro_t, recall_micro_t, f1_micro_t, _ = precision_recall_fscore_support(all_true_flat_transformer, all_pred_flat_transformer, average='micro', zero_division=0)
precision_macro_t, recall_macro_t, f1_macro_t, _ = precision_recall_fscore_support(all_true_flat_transformer, all_pred_flat_transformer, average='macro', zero_division=0)
print(f"Micro Average: Precision={precision_micro_t:.4f}, Recall={recall_micro_t:.4f}, F1-Score={f1_micro_t:.4f}")
print(f"Macro Average: Precision={precision_macro_t:.4f}, Recall={recall_macro_t:.4f}, F1-Score={f1_macro_t:.4f}")

transformer_report['overall_micro'] = {'precision': precision_micro_t, 'recall': recall_micro_t, 'f1-score': f1_micro_t}
transformer_report['overall_macro'] = {'precision': precision_macro_t, 'recall': recall_macro_t, 'f1-score': f1_macro_t}

# Store transformer results for comparison plots
transformer_f1_scores = {aspect: metrics['f1-score'] for aspect, metrics in transformer_report.items() if 'overall' not in aspect}  # Exclude overall averages here if needed


Evaluating Transformer Model...

--- Transformer Classification Report (Per Aspect) ---

--- Aspect: food ---
              precision    recall  f1-score   support

    negative       0.66      0.63      0.65     24235
     neutral       0.85      0.74      0.79     77023
    positive       0.90      0.95      0.92    203805

    accuracy                           0.87    305063
   macro avg       0.80      0.77      0.79    305063
weighted avg       0.87      0.87      0.87    305063


--- Aspect: service ---
              precision    recall  f1-score   support

    negative       0.73      0.74      0.74     25208
     neutral       0.91      0.89      0.90    187376
    positive       0.81      0.85      0.83     92479

    accuracy                           0.86    305063
   macro avg       0.82      0.83      0.82    305063
weighted avg       0.87      0.86      0.86    305063


--- Aspect: ambiance ---
              precision    recall  f1-score   support

    negative       0.

In [11]:
# --- Define the sentiment classes corresponding to your integer labels ---
# This should match the 'bin_labels' used during binning (e.g., 0, 1, 2)
# and the 'label_names' (e.g., 'negative', 'neutral', 'positive')
sentiment_classes = ['negative', 'neutral', 'positive'] # MUST match the order used for bin_labels=[0, 1, 2]
integer_to_string_label_map = { i: label for i, label in enumerate(sentiment_classes) }
print(f"Integer-to-String Label Map for Prediction: {integer_to_string_label_map}")

def predict_sentiment(review_text, model, tokenizer, aspects, int_to_str_map):
    """Predicts sentiment for all aspects for a given review text."""

    # Preprocess the input text
    processed_text = preprocess_text(review_text) # Assumes preprocess_text is defined elsewhere
    if not processed_text:
        return {"error": "Review text is empty after preprocessing."}

    # Tokenize
    inputs = tokenizer(processed_text, return_tensors='pt', truncation=True, padding=True, max_length=128) # Adjust max_length if needed

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Predict
    model.eval() # Ensure model is in eval mode
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits # Shape: (1, num_aspects * num_classes_per_aspect)

    # Process logits to get predictions per aspect
    num_classes_per_aspect = len(int_to_str_map) # Get number of classes (e.g., 3)
    # Reshape to (1, num_aspects, num_classes_per_aspect)
    reshaped_logits = logits.view(1, len(aspects), num_classes_per_aspect)

    # Get the index of the max logit for each aspect (these are your 0, 1, 2 labels)
    predictions_indices = torch.argmax(reshaped_logits, dim=2).squeeze().cpu().numpy() # Shape: (num_aspects,)

    # Handle case where there's only one aspect (numpy might return a scalar)
    if predictions_indices.ndim == 0:
        predictions_indices = [predictions_indices.item()] # Make it a list

    # Map predicted integer indices back to sentiment string labels
    predicted_sentiments = {}
    for i, pred_idx in enumerate(predictions_indices):
         # Ensure the predicted index exists in the map, handle potential errors
         sentiment_label = int_to_str_map.get(pred_idx, "Unknown Label")
         predicted_sentiments[aspects[i]] = sentiment_label


    # # --- OLD MAPPING LOGIC (REMOVE/REPLACE) ---
    # # Map indices back to sentiment labels
    # # Create inverse map (0: 'neutral', 1: 'positive', -1: 'negative'} <--- This was likely incorrect
    # # inverse_label_map = {v: k for k, v in label_map.items()} <--- Remove this line
    # # predicted_sentiments = {aspects[i]: inverse_label_map[pred_idx] for i, pred_idx in enumerate(predictions_indices)} <--- Remove this line
    # # --- END OLD MAPPING LOGIC ---

    return predicted_sentiments

# --- Example Usage (Location: Near the end of the notebook, around page 20) ---

# Ensure 'aspects' list is defined correctly (should match training)
aspects = ['food', 'service', 'ambiance', 'price', 'context', 'overall']

# Define the mapping from integer labels (0, 1, 2) to strings ('negative', 'neutral', 'positive')
# This map MUST align with the 'bin_labels' used during binning.
# If bin_labels = [0, 1, 2] corresponds to ['negative', 'neutral', 'positive']
integer_to_string_label_map = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}
print(f"\nUsing label map for prediction: {integer_to_string_label_map}")


new_review = "The pizza was amazing, truly authentic Italian style! However, the waiter was quite rude and ignored us for a long time."
# Pass the correct integer-to-string map to the function
predicted_results = predict_sentiment(new_review, model, tokenizer, aspects, integer_to_string_label_map)

print("\n--- Example Prediction ---")
print(f"Review: \"{new_review}\"")
print("\nPredicted Sentiments:")
if "error" in predicted_results:
    print(predicted_results["error"])
else:
    for aspect, sentiment in predicted_results.items():
        print(f"- {aspect.capitalize()}: {sentiment.capitalize()}") # Keep .capitalize() for display


new_review_2 = "Service was terrible, and the food was bad and pricey. Terrible atmosphere."
predicted_results_2 = predict_sentiment(new_review_2, model, tokenizer, aspects, integer_to_string_label_map)
print("\n--- Example Prediction 2 ---")
print(f"Review: \"{new_review_2}\"")
print("\nPredicted Sentiments:")
if "error" in predicted_results_2:
    print(predicted_results_2["error"])
else:
    for aspect, sentiment in predicted_results_2.items():
        print(f"- {aspect.capitalize()}: {sentiment.capitalize()}")

new_review_3 = "This restaurant is bad."
predicted_results_3 = predict_sentiment(new_review_3, model, tokenizer, aspects, integer_to_string_label_map)
print("\n--- Example Prediction 3 ---") # Corrected print statement index
print(f"Review: \"{new_review_3}\"")
print("\nPredicted Sentiments:")
if "error" in predicted_results_3:
    print(predicted_results_3["error"])
else:
    for aspect, sentiment in predicted_results_3.items():
        print(f"- {aspect.capitalize()}: {sentiment.capitalize()}")

new_review_4 = "This restaurant is great. The food was yummy. The service was excellent. The price was inexpensive and the ambiance was outstanding. We went for my birthday."
predicted_results_4 = predict_sentiment(new_review_4, model, tokenizer, aspects, integer_to_string_label_map)
print("\n--- Example Prediction 4 ---") # Corrected print statement index
print(f"Review: \"{new_review_4}\"")
print("\nPredicted Sentiments:")
if "error" in predicted_results_4:
    print(predicted_results_4["error"])
else:
    for aspect, sentiment in predicted_results_4.items():
        print(f"- {aspect.capitalize()}: {sentiment.capitalize()}")



Integer-to-String Label Map for Prediction: {0: 'negative', 1: 'neutral', 2: 'positive'}

Using label map for prediction: {0: 'negative', 1: 'neutral', 2: 'positive'}

--- Example Prediction ---
Review: "The pizza was amazing, truly authentic Italian style! However, the waiter was quite rude and ignored us for a long time."

Predicted Sentiments:
- Food: Positive
- Service: Negative
- Ambiance: Neutral
- Price: Neutral
- Context: Neutral
- Overall: Positive

--- Example Prediction 2 ---
Review: "Service was terrible, and the food was bad and pricey. Terrible atmosphere."

Predicted Sentiments:
- Food: Neutral
- Service: Negative
- Ambiance: Negative
- Price: Neutral
- Context: Neutral
- Overall: Negative

--- Example Prediction 3 ---
Review: "This restaurant is bad."

Predicted Sentiments:
- Food: Neutral
- Service: Negative
- Ambiance: Neutral
- Price: Neutral
- Context: Neutral
- Overall: Negative

--- Example Prediction 4 ---
Review: "This restaurant is great. The food was yummy. Th

In [None]:
# --- Get User Input and Predict ---
while True:
    user_review = input("Enter a restaurant review (or type 'exit' to quit): ")
    if user_review.lower() == 'exit':
        break

    predicted_results = predict_sentiment(user_review, model, tokenizer, aspects, integer_to_string_label_map)

    print("\n--- User Review Prediction ---")
    print(f"Review: \"{user_review}\"")
    print("\nPredicted Sentiments:")
    if "error" in predicted_results:
        print(predicted_results["error"])
    else:
        for aspect, sentiment in predicted_results.items():
            print(f"- {aspect.capitalize()}: {sentiment.capitalize()}")

    print("-" * 30) # Separator for multiple reviews

print("Exiting sentiment analysis.")