In [2]:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from transformers import pipeline

import os
import gc
import json

torch.set_num_threads(4)
np.set_printoptions(linewidth=200) # default 75


In [3]:

# from CFG import Config
# cfg_ref = Config()
# config = cfg_ref.get_config()
config = {'data_dir': '/kaggle/input/yelp-compressed-dataset'}


In [4]:

business_df = pq.read_table(os.path.join(config['data_dir'], 'business.parquet')).to_pandas()
restaurants_df = business_df[business_df['categories'].apply(\
                    lambda x: 'Restaurants' in x if x is not None else False)]
print("restaurants_df.shape", restaurants_df.shape)
del business_df
gc.collect()

# users_df = pq.read_table(os.path.join(config['data_dir'], 'user.parquet')).to_pandas()
# checkin_df = pq.read_table(os.path.join(config['data_dir'], 'checkin.parquet')).to_pandas()

# review_df_sample = pd.read_csv(os.path.join(config['data_dir'], 'review_df_sample.csv'))
review_df_sample = pq.read_table(os.path.join(config['data_dir'], 'review.parquet')).to_pandas()



restaurants_df.shape (52268, 14)


In [20]:
restanrant_name = 'Willie Mae\'s Scotch House'
city = 'New Orleans'

'''
'Willie Mae\'s Scotch House'
'New Orleans'
id: VVH6k9-ycttH3TV_lk5WfQ
'''
curr_rest_business_id = restaurants_df[(restaurants_df['name'] == restanrant_name) & 
                            (restaurants_df['city'] == city)].business_id
curr_rest_business_id = curr_rest_business_id.values[0]

# curr_rest_business_id = '-Tskf8WK17rb3ZfeFuRSWA'

print('curr_rest_business_id: ', curr_rest_business_id)

curr_rest_business_id:  VVH6k9-ycttH3TV_lk5WfQ


In [8]:
curr_restau_review_df = review_df_sample[review_df_sample['business_id'] == curr_rest_business_id]

In [9]:
curr_restau_review_df.shape

(3633, 9)

In [10]:

def get_sentiment_classifier_pipeline():
    sentiment_classifier_pipeline = pipeline(
            "sentiment-analysis", 
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1, # Force CPU usage (0 = index of first CPU)
            truncation=True,
            max_length=512, # Increase the max length to 512
        )
    
    return sentiment_classifier_pipeline


def classify_sentiment(df, classifier_pipeline):
    results = classifier_pipeline(df['text'].tolist())  
    df['sentiment_label'] = [1 if result['label'] == 'POSITIVE' else 0 for result in results]
    return df


In [11]:
# dummy_input_df = pd.DataFrame({
#     'text': ['This product is amazing!', 'Terrible customer service.', 'It works okay.']
# })

sent_classifier_pipeline = get_sentiment_classifier_pipeline()
curr_restau_review_df = classify_sentiment(curr_restau_review_df, sent_classifier_pipeline)




config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_label'] = [1 if result['label'] == 'POSITIVE' else 0 for result in results]


In [12]:
curr_restau_pos_reviews = curr_restau_review_df[curr_restau_review_df['sentiment_label'] == 1]['text'].to_frame()
curr_restau_neg_reviews = curr_restau_review_df[curr_restau_review_df['sentiment_label'] == 0]['text'].to_frame()

print("Number of positive reviews:", len(curr_restau_pos_reviews))
print("Number of negative reviews:", len(curr_restau_neg_reviews))


Number of positive reviews: 2878
Number of negative reviews: 755


In [13]:

def get_text_summarizer_pipeline():
    # Create the summarization pipeline
    summarizer = pipeline(
            "summarization",
            model="t5-base",
            device=-1,  # Force CPU usage (0 = index of first CPU)
            truncation=True,
            max_length=512,  # Set the maximum output length
        )
    
    return summarizer

# Function to summarize the reviews
def summarize_reviews(df, summarizer):
    # Get the list of reviews
    reviews = df['text'].tolist()
    
    # Summarize the reviews
    summaries = summarizer(reviews, truncation=True, max_length=512)
    
    # Add the summaries to the dataframe
    df['summary'] = [summary['summary_text'] for summary in summaries]
    
    return df



In [14]:
def get_review_chunks(reviews, num_chunks=5, each_chunk_reviews=10):
    # Create n chunks of k reviews each - we want n revuew summaries 
    review_chunks = []
    for i in range(num_chunks):
        # Generate 10 random indices
        random_indices = np.random.choice(reviews.shape[0], size=min(reviews.shape[0], 
                        each_chunk_reviews), replace=False)

        # Extract 'text' field from each row using the random indices
        selected_texts = reviews.iloc[random_indices]['text'].tolist()

        # Merge all the text rows
        merged_text = ''.join(selected_texts)

        review_chunks.append(merged_text)

    return pd.DataFrame({'text': review_chunks})


In [15]:

num_chunks = 5  # How many review chunks, each of size 'each_chunk_reviews', to summaize
each_chunk_reviews = 10  # Number of reviews in each chunk

positive_reviews_chunk = get_review_chunks(curr_restau_pos_reviews, num_chunks, each_chunk_reviews)
negative_reviews_chunk = get_review_chunks(curr_restau_pos_reviews, num_chunks, each_chunk_reviews)


In [16]:
summarizer_pipeline = get_text_summarizer_pipeline()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [17]:
print("Summarizing positive reviews...")
positive_review_summaries = summarize_reviews(positive_reviews_chunk, summarizer_pipeline)
print("Summarizing negative reviews...")
negative_review_summaries = summarize_reviews(negative_reviews_chunk, summarizer_pipeline)


Summarizing positive reviews...
Summarizing negative reviews...


In [18]:
''.join(positive_review_summaries.summary.values)

"willie mae's is off the beaten track, out of the French Quarter . fried chicken is amazing, mashed potatoes and gravy were amazing . the mac n cheese was drier than most peoples, but this is the way i love it .the fried chicken was certainly memorable. It was delicious. the wait outside gave me a good feeling for the neighborhood and the culture . the chicken is so flaky and good. i would definitely recommend the mac n cheese .willie mae's is one of the best fried chicken joints i've ever tasted . the mashed potatoes are amazing and the chicken is tender and flavorful . it's a must visit if you're in new orleans .willie mae's fried chicken was absolutely spectacular . it was the juiciest, most well-seasoned bite of chicken that wasn't made by my grandmother . the service was pretty slow and the dining room was only a third full .the only fried chicken in the world comes wrapped in a Publix bag . the smell of the place screams soul food . if you're in new orleans, come here ."

In [19]:
''.join(negative_review_summaries.summary.values)

"fried chicken is crispy, golden, juicy, and seemed to be made to order due to the high volume . mac n cheese was okay didn't blow me away as I thought it would . the strawberry cheese cake was crazy good .the wait was long but they made up for it by throwing a few extra wings on all of our plates . the chicken was as advertised, very moist and crispy . mac and cheese is above average but not really special .the chicken was moist, juicy, and spicy, but not too spicy . the wait was justified when the fried chicken and butter beans arrived at our table . despite being on a cleanse, i had to cheat a little bit and let loose .fried chicken, fish & okra were fantastic, mac n cheese, butter beans, peas, candied yams . the service was good as well. took a bit to refill drinks & had to ask twice . willimae's has a shaded waiting area .fried catfish, butter beans and cornbread muffins were delectable . the star of the show was the chicken of course . fried chicken is perhaps one of the best i h