In [None]:
#FinalSummerization.py
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Read the review dataset that the summarization helpers below query.
csv_file_path = 'processed_reviews_SentimentLabels.csv'
data = pd.read_csv(csv_file_path)

# Ask the user which listing to summarize; the value is compared against
# the 'listing_id' column, so it is converted to int up front.
listing_id = int(input("Please enter the listing_id to summarize reviews for: "))

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the T5 checkpoint, move the model to the chosen
# device and switch it to evaluation mode.
model_name = 't5-large'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device).eval()


def chunk_text(text, tokenizer, chunk_size=512):
    """
    Split a long text into smaller chunks based on tokenized length.

    The text is tokenized without special tokens. If the tokenizer appended
    its end-of-sequence token, a text whose token count is an exact multiple
    of ``chunk_size`` (or an empty text) would produce a trailing chunk that
    holds nothing but that token and decodes to an empty string.

    Args:
        text (str): Input text to be chunked.
        tokenizer (T5Tokenizer): Tokenizer for encoding the text.
        chunk_size (int): Maximum number of tokens per chunk. Must be positive.

    Returns:
        list[torch.Tensor]: A list of tokenized chunks, in document order.
            Empty when the text yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # encode() returns a (1, n_tokens) tensor; take the single row.
    token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=False)[0]
    return [token_ids[start:start + chunk_size] for start in range(0, len(token_ids), chunk_size)]


def summarize_chunk(chunk, tokenizer, model, device, chunk_max_length=100, verbose=True):
    """
    Produce a summary for one tokenized chunk.

    The chunk is decoded back to text, prefixed with the "summarize: " task
    prompt, re-encoded, and passed to ``model.generate`` with beam search.

    Args:
        chunk (torch.Tensor): Token ids of the chunk to summarize.
        tokenizer (T5Tokenizer): Tokenizer used to decode the chunk and to
            encode the prompted input.
        model (T5ForConditionalGeneration): Model whose ``generate`` method
            produces the summary.
        device (torch.device): Device the encoded input is moved to.
        chunk_max_length (int): ``max_length`` passed to generation.
        verbose (bool): When true, print the first 300 characters of the
            prompted input and the resulting summary.

    Returns:
        str: The decoded summary of the chunk.
    """
    input_text = "summarize: " + tokenizer.decode(chunk, skip_special_tokens=True)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Decoding options gathered in one place for readability.
    generation_options = {
        "num_beams": 4,
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.2,
        "min_length": 30,
        "max_length": chunk_max_length,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)
    chunk_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    if not verbose:
        return chunk_summary

    print(f"[CHUNK INPUT]: {input_text[:300]}...")
    print(f"[CHUNK SUMMARY]: {chunk_summary}")
    return chunk_summary


def summarize_large_text(text, tokenizer, model, device, chunk_size=512, chunk_max_length=100,
                         final_min_length=100, final_max_length=200, verbose=True):
    """
    Summarize a long text in two passes: per-chunk summaries, then a summary of those.

    The text is split with ``chunk_text``, each chunk is summarized with
    ``summarize_chunk``, the chunk summaries are joined with spaces, and the
    result (prefixed with a fixed guest-review prompt) is summarized once more.

    Args:
        text (str): Input text to be summarized.
        tokenizer (T5Tokenizer): Tokenizer for encoding and decoding.
        model (T5ForConditionalGeneration): Model used for both passes.
        device (torch.device): Device the encoded inputs are moved to.
        chunk_size (int): Maximum number of tokens per chunk.
        chunk_max_length (int): ``max_length`` for each chunk summary.
        final_min_length (int): ``min_length`` for the final summary.
        final_max_length (int): ``max_length`` for the final summary.
        verbose (bool): Whether to print progress messages.

    Returns:
        str: The decoded final summary.
    """
    def log(message):
        # Progress output is emitted only in verbose mode.
        if verbose:
            print(message)

    log("\nSplitting text into chunks...")
    pieces = chunk_text(text, tokenizer, chunk_size)
    total = len(pieces)
    log(f"Total chunks created: {total}")

    log("\nSummarizing individual chunks...")
    partial_summaries = []
    for position, piece in enumerate(pieces, start=1):
        log(f"\nSummarizing chunk {position}/{total}")
        partial_summaries.append(
            summarize_chunk(piece, tokenizer, model, device, chunk_max_length, verbose=verbose)
        )

    # Second pass: summarize the concatenated chunk summaries.
    final_prompt = (
        "summarize: Provide a detailed and coherent summary of guest reviews, "
        "highlighting themes about the host's personality, accommodation quality, neighborhood vibe, "
        "safety, and accessibility to transportation. "
    )
    final_input_text = final_prompt + " ".join(partial_summaries)
    final_input_ids = tokenizer.encode(final_input_text, return_tensors="pt").to(device)

    log("\nGenerating final summary...")
    generated = model.generate(
        final_input_ids,
        num_beams=4,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        min_length=final_min_length,
        max_length=final_max_length,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def summarize_reviews_for_listing(listing_id, verbose=True):
    """
    Summarize all reviews associated with a given listing ID.

    Reads the module-level ``data`` frame, keeps the non-missing, non-blank
    values of its 'review' column for the requested listing, joins them with
    spaces and passes the text to ``summarize_large_text`` together with the
    module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        listing_id (int): ID of the listing to summarize reviews for.
        verbose (bool): Whether to print detailed progress and logs.

    Returns:
        str: Final summary of reviews for the listing ID, or a
            "No reviews found ..." message when the listing has no usable
            review text.
    """
    if verbose:
        print(f"\nRetrieving reviews for listing_id {listing_id}...")

    reviews = data.loc[data['listing_id'] == listing_id, 'review']
    # Empty CSV cells load as NaN (a float), which would make " ".join()
    # raise TypeError; drop them and coerce the rest to str.
    reviews = reviews.dropna().astype(str)
    # Whitespace-only reviews contribute nothing to the summary.
    reviews = reviews[reviews.str.strip() != ""]
    if reviews.empty:
        return f"No reviews found for listing_id {listing_id}"

    if verbose:
        print("\nSample reviews:")
        for idx, review in enumerate(reviews.head(3), start=1):
            print(f"[Sample Review {idx}]: {review}")

    combined_reviews = " ".join(reviews)

    if verbose:
        print("\nSummarizing all reviews...")
    return summarize_large_text(
        combined_reviews,
        tokenizer,
        model,
        device,
        chunk_size=512,
        chunk_max_length=100,
        final_min_length=100,
        final_max_length=200,
        verbose=verbose
    )


# Execute the summarization for the listing_id entered at the prompt above,
# with progress logging enabled, then print the returned text (either the
# final summary or the "No reviews found" message).
final_summary = summarize_reviews_for_listing(listing_id, verbose=True)
print(f"\nFINAL SUMMARY for listing_id {listing_id}:\n{final_summary}")


## Review Summary for Listing ID: 10452

### Retrieving Reviews
**Listing ID:** 10452

**Status:** Retrieving reviews...

### Sample Reviews
1. **Sample Review 1:**  
   Angela was a great host. Great location. Spacious bedroom with comfortable bed. Very good value. Enjoyed my stay!

2. **Sample Review 2:**  
   I’ve stayed in a bunch of Airbnbs and Angela was by far one of the best hosts. She kept the place super clean and even offered to take care of the laundry throughout my stay. She responded within 5 mins of me needing anything. She was happy and always willing to go above and beyond for anything I needed. She’s a sweetheart and she made it feel like it was my own home. Definitely recommend this place to anyone.

3. **Sample Review 3:**  
   Great location! We loved staying in Crown Heights. There were wonderful coffee shops and restaurants nearby. The room we were given was not exactly the one pictured but it was fine. Angela was amazing!  

### Summarizing All Reviews
- **Process:**
  - Splitting text into chunks...
  - **Total Chunks Created:** 12

- **Summarizing Individual Chunks:**
  - **Chunk 1/12**
    - **Input:**  
      Angela was a great host. Great location. Spacious bedroom with comfortable bed. Very good value. Enjoyed my stay! I’ve stayed in a bunch of Airbnbs and Angela was by far one of the best hosts. She kept the place super clean and even offered to take care of the laundry throughout my stay....
    - **Summary:**  
      Angela is a lovely host. She keeps the place clean, responds to messages quickly, and goes out of her way to make sure her guests have everything they need. The neighborhood was wonderful, I felt very safe. The subway was a 4-minute walk from the apartment!

    <!-- Repeat similar formatting for chunks 2-12 -->

### **Final Summary for Listing ID 10452**

**Angela is a lovely host. She keeps the place clean, responds to messages quickly, and goes out of her way to make sure her guests have everything they need. For $50/night, it's not a terrible value. The location is good, with plenty of stores, bars, and a train station really close that can get you to Manhattan in 15 minutes. I spent one month at Angela's and to be honest, it was great. She is always available and ready to help you in any matter.**


In [None]:
#ABSA_plots.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'AspectbasedSentimentAnalysis.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()


# Listing and aspect to plot; defined once so the filter and the chart title
# cannot drift apart.
target_listing_id = 1217318
target_aspect = 'Value and Pricing'

# Filter the dataset for the given listing_id and aspect.
# .copy() makes the result an independent frame, so the column assignment
# below does not write to a slice of `data` (SettingWithCopyWarning).
filtered_data_value_pricing = data[(data['listing_id'] == target_listing_id) &
                                   (data['aspects'] == target_aspect)].copy()

# Convert the 'review_posted_date' column to datetime for proper sorting and plotting
filtered_data_value_pricing['review_posted_date'] = pd.to_datetime(
    filtered_data_value_pricing['review_posted_date']
)

# Group data by date and sentiment to see the evolution over time
evolution_data_value_pricing = (
    filtered_data_value_pricing
    .groupby(['review_posted_date', 'sentiment_label'])
    .size()
    .unstack(fill_value=0)
)

# The plot below reads sentiment columns 0 and 1. If the listing has reviews
# of only one sentiment, the other column is absent after unstack() and the
# lookup would raise KeyError, so add any missing column as all zeros.
for sentiment in (0, 1):
    if sentiment not in evolution_data_value_pricing.columns:
        evolution_data_value_pricing[sentiment] = 0

# Prepare data for stacked area plot
evolution_data_value_pricing_cumsum = evolution_data_value_pricing.cumsum()

# Plotting a stacked area chart
plt.figure(figsize=(10, 6))
plt.stackplot(evolution_data_value_pricing_cumsum.index,
              evolution_data_value_pricing_cumsum[0],
              evolution_data_value_pricing_cumsum[1],
              labels=['Negative', 'Positive'], alpha=0.7)
plt.title(f"Cumulative Evolution of '{target_aspect}' Aspect for Listing ID {target_listing_id}")
plt.xlabel("Date")
plt.ylabel("Cumulative Count of Reviews")
plt.legend(title="Sentiment Label")
plt.grid()
plt.show()
