In [4]:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from transformers import pipeline

import os
import gc
import json

torch.set_num_threads(4)
np.set_printoptions(linewidth=200) # default 75


In [5]:

from CFG import Config
cfg_ref = Config()
config = cfg_ref.get_config()
# config = {'data_dir': '/kaggle/input/yelp-compressed-dataset'}


In [6]:

business_df = pq.read_table(os.path.join(config['data_dir'], 'business.parquet')).to_pandas()
restaurants_df = business_df[business_df['categories'].apply(\
                    lambda x: 'Restaurants' in x if x is not None else False)]
print("restaurants_df.shape", restaurants_df.shape)
del business_df

# users_df = pq.read_table(os.path.join(config['data_dir'], 'user.parquet')).to_pandas()
# checkin_df = pq.read_table(os.path.join(config['data_dir'], 'checkin.parquet')).to_pandas()

review_df_sample = pd.read_csv(os.path.join(config['data_dir'], 'review_df_sample.csv'))



In [17]:
# restanrant_name = 'Willie Mae\'s Scotch House'
# city = 'New Orleans'

# curr_rest_business_id = restaurants_df[(restaurants_df['name'] == restanrant_name) & 
#                             (restaurants_df['city'] == city)].business_id
# curr_rest_business_id = curr_rest_business_id.values[0]

curr_rest_business_id = '-Tskf8WK17rb3ZfeFuRSWA'


In [18]:
curr_restau_review_df = review_df_sample[review_df_sample['business_id'] == curr_rest_business_id]

In [19]:

def get_sentiment_classifier_pipeline():
    sentiment_classifier_pipeline = pipeline(
            "sentiment-analysis", 
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1, # Force CPU usage (0 = index of first CPU)
            truncation=True,
            max_length=512, # Increase the max length to 512
        )
    
    return sentiment_classifier_pipeline


def classify_sentiment(df, classifier_pipeline):
    results = classifier_pipeline(df['text'].tolist())  
    df['sentiment_label'] = [1 if result['label'] == 'POSITIVE' else 0 for result in results]
    return df


In [23]:
dummy_input_df = pd.DataFrame({
    'text': ['This product is amazing!', 'Terrible customer service.', 'It works okay.']
})

sent_classifier_pipeline = get_sentiment_classifier_pipeline()
curr_restau_review_df = classify_sentiment(curr_restau_review_df, sent_classifier_pipeline)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_label'] = [1 if result['label'] == 'POSITIVE' else 0 for result in results]


In [33]:
curr_restau_pos_reviews = curr_restau_review_df[curr_restau_review_df['sentiment_label'] == 1]['text'].to_frame()
curr_restau_neg_reviews = curr_restau_review_df[curr_restau_review_df['sentiment_label'] == 0]['text'].to_frame()

print("Number of positive reviews:", len(curr_restau_pos_reviews))
print("Number of negative reviews:", len(curr_restau_neg_reviews))


Number of positive reviews: 2
Number of negative reviews: 1


In [34]:

def get_text_summarizer_pipeline():
    # Create the summarization pipeline
    summarizer = pipeline(
            "summarization",
            model="t5-base",
            device=-1,  # Force CPU usage (0 = index of first CPU)
            truncation=True,
            max_length=512,  # Set the maximum output length
        )
    
    return summarizer

# Function to summarize the reviews
def summarize_reviews(df, summarizer):
    # Get the list of reviews
    reviews = df['text'].tolist()
    
    # Summarize the reviews
    summaries = summarizer(reviews, truncation=True, max_length=512)
    
    # Add the summaries to the dataframe
    df['summary'] = [summary['summary_text'] for summary in summaries]
    
    return df



In [35]:
def get_review_chunks(reviews, num_chunks=5, each_chunk_reviews=10):
    # Create n chunks of k reviews each - we want n revuew summaries 
    review_chunks = []
    for i in range(num_chunks):
        # Generate 10 random indices
        random_indices = np.random.choice(reviews.shape[0], size=min(reviews.shape[0], 
                        each_chunk_reviews), replace=False)

        # Extract 'text' field from each row using the random indices
        selected_texts = reviews.iloc[random_indices]['text'].tolist()

        # Merge all the text rows
        merged_text = ''.join(selected_texts)

        review_chunks.append(merged_text)

    return pd.DataFrame({'text': review_chunks})


In [36]:

num_chunks = 5  # How many review chunks, each of size 'each_chunk_reviews', to summaize
each_chunk_reviews = 10  # Number of reviews in each chunk

positive_reviews_chunk = get_review_chunks(curr_restau_pos_reviews, num_chunks, each_chunk_reviews)
negative_reviews_chunk = get_review_chunks(curr_restau_pos_reviews, num_chunks, each_chunk_reviews)


In [37]:
summarizer_pipeline = get_text_summarizer_pipeline()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [38]:
print("Summarizing positive reviews...")
positive_review_summaries = summarize_reviews(positive_reviews_chunk, summarizer_pipeline)
print("Summarizing negative reviews...")
negative_review_summaries = summarize_reviews(negative_reviews_chunk, summarizer_pipeline)


Summarizing positive reviews...


Your max_length is set to 512, but your input_length is only 266. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=133)
Your max_length is set to 512, but your input_length is only 266. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=133)
Your max_length is set to 512, but your input_length is only 266. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=133)
Your max_length is set to 512, but your input_length is only 268. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

Summarizing negative reviews...


Your max_length is set to 512, but your input_length is only 268. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=134)
Your max_length is set to 512, but your input_length is only 268. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=134)
Your max_length is set to 512, but your input_length is only 266. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=133)
Your max_length is set to 512, but your input_length is only 268. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

In [42]:
''.join(positive_review_summaries.summary.values)

'the service was amazing, from the bus boy, to the server, and the person doing our dessert . the gumbo was delicious, and it was a beautiful place to duck in and escape the snow after visiting the statue down the street .the service was amazing, from the bus boy, to the server, and the person doing our dessert . the gumbo was delicious, and it was a beautiful place to duck in and escape the snow after visiting the statue down the street .the service was amazing, from the bus boy, to the server, and the person doing our dessert . the gumbo was delicious, and it was a beautiful place to duck in and escape the snow after visiting the statue down the street .ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .'

In [43]:
''.join(negative_review_summaries.summary.values)

'ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .the service was amazing, from the bus boy, to the server, and the person doing our dessert . the gumbo was delicious, and it was a beautiful place to duck in and escape the snow after visiting the statue down the street .ignatius j Reilly statue was down the street . we arrived around 8pm on a friday night . the wait was fairly short: 10-15 minutes .'