In [18]:
!pip install nbstripout
!nbstripout review.ipynb


Could not strip 'review.ipynb': file not found


In [17]:
# Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive (used for the review files) are reachable.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import requests
import os
import json
from datetime import datetime
import pandas as pd
from collections import Counter, defaultdict
from transformers import pipeline

# 1. System settings
# The Google API key is a credential and must not be stored in the notebook.
# It is read from the GOOGLE_API_KEY environment variable instead; set it
# before running this cell (e.g. os.environ["GOOGLE_API_KEY"] = "<your key>"
# in a cell that is not committed, or from a secrets store).
# The key that used to be embedded here should be treated as leaked and rotated.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not API_KEY:
    # Fail fast: without a key every API call below would be rejected and
    # reported as a misleading "can't find a place" message.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in the environment before running this notebook."
    )

# Directory on the mounted Drive where the cleaned review files are stored;
# created on first run, left alone when it already exists.
BASE_PATH = "/content/drive/MyDrive/NLP/GoogleReview"
os.makedirs(BASE_PATH, exist_ok=True)

# 2. place_id searching
def get_place_id(place_name, language='en'):
    """Return the Google Places ``place_id`` of the first text-search match.

    Args:
        place_name: Free-text name of the place to search for.
        language: Language code sent to the API (default ``'en'``).

    Returns:
        The ``place_id`` of the first result, or ``None`` (after printing a
        message) when the API status is not ``OK`` or no result is returned.
    """
    url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    # Let requests build the query string: a place name containing '&', '#',
    # '+' or non-ASCII characters would otherwise corrupt the hand-built URL.
    params = {"query": place_name, "language": language, "key": API_KEY}
    # The timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=10)
    result = response.json()
    if result.get('status') == 'OK' and result.get('results'):
        return result['results'][0]['place_id']
    print("❌ Can't find a place.")
    return None

# 3. Collect review
def get_reviews(place_id, language='en'):
    """Fetch the reviews of a place from the Google Places Details endpoint.

    Args:
        place_id: Identifier of the place, as returned by ``get_place_id``.
        language: Language code sent to the API; it is also stored in each
            returned record under ``"lang"``.

    Returns:
        A list of dicts with the keys ``author``, ``text``, ``time`` (the raw
        timestamp from the API), ``datetime`` (that timestamp formatted as
        ``YYYY-MM-DD HH:MM:SS``) and ``lang``. The list is empty when the
        response contains no reviews.
    """
    url = "https://maps.googleapis.com/maps/api/place/details/json"
    params = {
        "place_id": place_id,
        "fields": "reviews",
        "language": language,
        "key": API_KEY,
    }
    # The timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=10)
    reviews = response.json().get('result', {}).get('reviews', [])

    records = []
    for r in reviews:
        posted_at = r.get("time", 0)
        records.append({
            "author": r.get("author_name", ""),
            "text": r.get("text", ""),
            "time": posted_at,
            # fromtimestamp() without a tz argument converts to the machine's local time
            "datetime": datetime.fromtimestamp(posted_at).strftime('%Y-%m-%d %H:%M:%S'),
            "lang": language
        })
    return records

# 4. translate function
def translate_to_english(text, source_lang):
    """Translate ``text`` into English through the Google Translation v2 REST API.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``. ``'en'`` short-circuits and
            returns ``text`` unchanged without any network call.

    Returns:
        The translated text; ``text`` itself when it is already English or
        empty; ``None`` when the API reports an error or any exception occurs
        (details are printed in both cases).
    """
    if source_lang == 'en':
        return text
    if not text:
        # Nothing to translate (e.g. a rating-only review): skip the API
        # round-trip. Callers test the result for truthiness, so an empty
        # value is still treated as "no usable translation".
        return text

    url = "https://translation.googleapis.com/language/translate/v2"
    params = {
        'q': text,
        'source': source_lang,
        'target': 'en',
        'format': 'text',
        'key': API_KEY
    }
    try:
        # The timeout keeps one stalled request from blocking the whole run;
        # a timeout is reported by the generic handler below.
        response = requests.post(url, data=params, timeout=10)
        result = response.json()
        if "error" in result:
            print("❌ translate API error:")
            print(f"🔹 original text: {text[:80]}...")
            print(f"🔹 error message: {result['error'].get('message')}")
            return None
        return result['data']['translations'][0]['translatedText']
    except Exception as e:
        # Deliberate best effort: a single failed translation must not abort
        # the collection of the remaining reviews.
        print("⚠️ exception occurs during translation:")
        print(f"🔹 original text: {text[:80]}...")
        print(f"🔹 exceptional message: {str(e)}")
        return None

# 5. Save the reviews in English (non-English reviews are translated first)
def save_clean_reviews(place_name, reviews):
    """Append not-yet-saved reviews, in English, to the place's JSON-lines file.

    The file is ``<place name lower-cased, spaces replaced by '_'>_cleaned.txt``
    inside ``BASE_PATH``. A review is identified by its ``(author, time)`` pair:
    pairs already present in the file, or already written earlier in this same
    call, are skipped. Non-English reviews are translated with
    ``translate_to_english`` and dropped when the translation fails.

    Args:
        place_name: Name of the place; used only to build the file name.
        reviews: Iterable of review dicts with at least the keys ``author``,
            ``time``, ``text`` and ``lang``. The dicts are not modified.

    Returns:
        None. A short status line is printed.
    """
    filename = place_name.lower().replace(" ", "_") + "_cleaned.txt"
    file_path = os.path.join(BASE_PATH, filename)

    # (author, time) keys of reviews that must not be written again.
    seen = set()
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    saved = json.loads(line.strip())
                    seen.add((saved['author'], saved['time']))
                except (ValueError, KeyError, TypeError):
                    # Blank or corrupt line, missing key, or a JSON value that
                    # is not an object: ignore it and keep reading.
                    continue

    new_count = 0
    with open(file_path, "a", encoding="utf-8") as f:
        for review in reviews:
            key = (review["author"], review["time"])
            if key in seen:
                continue

            # Work on a copy so the caller's dictionaries stay untouched.
            record = dict(review)
            if record["lang"] != "en":
                translated = translate_to_english(record["text"], record["lang"])
                if not translated:
                    continue  # skip when translation fails
                record["text"] = translated

            # Only the English text is stored; drop the bookkeeping fields.
            record.pop("lang", None)
            record.pop("text_translated", None)
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

            # Remember the key so that the same review arriving again in this
            # batch (e.g. fetched under another language) is not saved twice.
            seen.add(key)
            new_count += 1

    if new_count > 0:
        print(f"✅ {new_count} reviews are saved in '{filename}'.")
    else:
        print(f"ℹ️ no more new review.")

# 6. Pipeline entry point: ask for a place, collect its reviews in several languages, save them in English
def run_review_translation_pipeline():
    """Interactively collect a place's reviews in several languages and save them.

    Prompts for a place name, resolves it to a place id, fetches the reviews
    once per language code (en, ko, es, zh-CN, ja) and hands the combined list
    to ``save_clean_reviews``. Returns ``None`` in every case; it stops early
    when the name is blank or the place cannot be resolved.
    """
    name = input("🔍 Enter the place name: ").strip()
    if name == "":
        print("❗ plcae name doesn't exist .")
        return

    resolved_id = get_place_id(name)
    if not resolved_id:
        return

    collected = []
    for code in ('en', 'ko', 'es', 'zh-CN', 'ja'):
        print(f"🌐 {code} reviews are collecting...")
        collected += get_reviews(resolved_id, language=code)

    save_clean_reviews(name, collected)

# 7. Run the pipeline (prompts for a place name on standard input)
run_review_translation_pipeline()


🔍 Enter the place name: Bar Totti's
🌐 en reviews are collecting...
🌐 ko reviews are collecting...
🌐 es reviews are collecting...
🌐 zh-CN reviews are collecting...
🌐 ja reviews are collecting...
✅ 25 reviews are saved in 'bar_totti's_cleaned.txt'.


In [3]:
!pip install --upgrade transformers



In [4]:
import glob
import os

# Function to automatically find the latest cleaned file
def find_latest_cleaned_file(base_path):
    """Return the most recently modified ``*_cleaned.txt`` file in ``base_path``.

    Args:
        base_path: Directory to search (not recursive).

    Returns:
        The path of the matching file with the newest modification time, or
        ``None`` (after printing a message) when no file matches.
    """
    pattern = os.path.join(base_path, '*_cleaned.txt')
    candidates = glob.glob(pattern)
    if len(candidates) == 0:
        print("❗ Cannot find any cleaned file.")
        return None
    # Newest by modification time
    return max(candidates, key=os.path.getmtime)


In [5]:
# Automatically find the latest cleaned file
BASE_PATH = "/content/drive/MyDrive/NLP/GoogleReview"
file_path = find_latest_cleaned_file(BASE_PATH)

if file_path:
    print(f"✅ Latest cleaned file path: {file_path}")

    # Load review data: the file holds one JSON object per line. Only the
    # review text is kept, preferring "text_translated" over "text".
    reviews = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                review = json.loads(line)
                text = review.get("text_translated") or review.get("text")
            except (ValueError, AttributeError):
                # Blank or corrupt line, or a JSON value that is not an object:
                # skip it instead of hiding every possible error.
                continue
            if text:
                reviews.append({"text": text})

    # Passing `columns` guarantees that the "text" column exists even when no
    # review could be loaded, so later cells that read df["text"] still work.
    df = pd.DataFrame(reviews, columns=["text"])
    # NOTE: inside an `if` block this expression is not rendered by the
    # notebook; only the last top-level expression of a cell is displayed.
    df.head()
else:
    print("❌ Cannot proceed: no cleaned file found.")


✅ Latest cleaned file path: /content/drive/MyDrive/NLP/GoogleReview/bar_totti's_cleaned.txt


In [6]:
from transformers import pipeline

# Load sentiment analysis model (predicts 1 to 5 stars).
# Created once at module level and reused by analyze_sentiment_star.
# presumably its labels look like "4 stars" (convert_star_to_label parses the
# leading integer) — TODO confirm against the model card
star_model = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Function to convert star rating to sentiment label
def convert_star_to_label(label):
    """Map a star-rating label to a three-way sentiment class.

    Args:
        label: String whose first whitespace-separated token is the number of
            stars, e.g. ``"4 stars"``.

    Returns:
        ``0`` (negative) for 2 stars or fewer, ``1`` (neutral) for exactly
        3 stars, ``2`` (positive) for more than 3 stars.
    """
    star_count = int(label.split()[0])
    if star_count > 3:
        return 2  # Positive
    if star_count == 3:
        return 1  # Neutral
    return 0  # Negative

# Final sentiment analysis function
def analyze_sentiment_star(text):
    """Classify one review text as negative (0), neutral (1) or positive (2).

    The first 512 characters of ``text`` are passed to the module-level
    ``star_model`` pipeline and the predicted star label is mapped with
    ``convert_star_to_label``.

    Args:
        text: The review text.

    Returns:
        ``0``, ``1`` or ``2``. Any error raised while classifying (including a
        non-string ``text``) yields ``1``, i.e. the review counts as neutral.
    """
    try:
        result = star_model(text[:512])[0]
        return convert_star_to_label(result["label"])
    except Exception:
        # Deliberate best effort: one unclassifiable review must not stop the
        # whole labeling run. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit through, so the cell can be stopped.
        return 1  # Treat as neutral if an exception occurs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [7]:
# Apply sentiment labeling (adds a "label" column: 0 = negative, 1 = neutral, 2 = positive)
df["label"] = df["text"].apply(analyze_sentiment_star)

# Summarize sentiment distribution, ordered by label and shown with readable names
label_names = {0: "Negative 😡", 1: "Neutral 😐", 2: "Positive 😊"}
label_counts = df["label"].value_counts().sort_index().rename(index=label_names)

print("📊 Sentiment distribution:")
print(label_counts)

# Check sample entries. The sample size is capped at the number of rows because
# DataFrame.sample raises when asked for more rows than exist (without replacement).
df[["text", "label"]].sample(min(5, len(df)), random_state=42)


📊 Sentiment distribution:
label
Negative 😡     4
Neutral 😐      2
Positive 😊    19
Name: count, dtype: int64


Unnamed: 0,text,label
8,The best restaurant in Ssinni 🥇\nIt was worth ...,2
16,The Asian girl at the front desk might be a li...,2
0,"You can not find pizza here, but their wood fi...",2
23,This is a popular restaurant in Sydney.\nI thi...,2
11,This restaurant had a great atmosphere and the...,2


In [11]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [12]:
from transformers import TrainingArguments, Trainer, pipeline
import numpy as np
import os

# Disable W&B (Weights & Biases) logging.
# The environment variable is kept for older transformers releases, but it is
# deprecated there, so `report_to="none"` is also passed to TrainingArguments below.
os.environ["WANDB_DISABLED"] = "true"

# 1. Check number of reviews by sentiment (0 = negative, 1 = neutral, 2 = positive)
label_counts = df["label"].value_counts()
has_positive = label_counts.get(2, 0) >= 2
has_negative = label_counts.get(0, 0) >= 2
has_single_negative = label_counts.get(0, 0) == 1

# 2. If there are enough positive reviews → Train a model
if has_positive:
    from transformers import BertTokenizer, BertForSequenceClassification
    from datasets import Dataset
    from sklearn.model_selection import train_test_split

    # Split positive reviews into train/validation sets
    # NOTE(review): only rows with label 2 are used, so every training and
    # validation example carries the same label and the classifier never sees
    # labels 0 or 1 — confirm that this is intended.
    positive_df = df[df["label"] == 2]
    train_df, val_df = train_test_split(positive_df, test_size=0.2, random_state=42)

    # Convert to HuggingFace Dataset
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_fn(batch):
        """Tokenize a batch of texts, padded/truncated to 128 tokens."""
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

    train_dataset = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
    val_dataset = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)
    # Expose only the tensors used for training and evaluation
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Three output classes, matching the 0/1/2 sentiment labels
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

    # Set up Trainer and train the model
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        # Supported way to switch off W&B and other logging integrations
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()

    # Add predictions to validation set (argmax over the class scores)
    predictions = trainer.predict(val_dataset)
    pred_labels = predictions.predictions.argmax(axis=1)
    val_df = val_df.reset_index(drop=True)
    val_df["predicted_label"] = pred_labels

    # Print a sample of predictions; the size is capped so small validation sets do not raise
    n = min(5, len(val_df))
    print("📊 Sample predictions on validation set:")
    print(val_df[["text", "label", "predicted_label"]].sample(n, random_state=42))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


📊 Sample predictions on validation set:
                                                text  label  predicted_label
1  I came here a year ago while traveling.\nThis ...      2                2
3  📍Bar Totti’s, 330A/330B George St, Sydney NSW ...      2                2
0  You can not find pizza here, but their wood fi...      2                2
2  Without a doubt one of my favorite places in S...      2                2


In [13]:
from transformers import pipeline

# 1. Load summarization model (using BART: facebook/bart-large-cnn).
# Created once at module level and reused by summarize_reviews_by_compression.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# 2. Function for individual summarization + final summarization
def summarize_reviews_by_compression(text_list, chunk_size=512, final_maxlen=100):
    """Summarize many reviews in two passes: each review, then the joined summaries.

    Args:
        text_list: Iterable of review texts.
        chunk_size: Number of leading characters of each review passed to the
            summarizer.
        final_maxlen: ``max_length`` passed to the summarizer for the final pass.

    Returns:
        The final summary text. A warning string is returned instead when no
        review could be summarized or when the final pass fails.
    """
    individual_summaries = []

    for text in text_list:
        try:
            # Summarize each individual review
            result = summarizer(text[:chunk_size], max_length=60, min_length=20, do_sample=False)
            individual_summaries.append(result[0]["summary_text"])
        except Exception:
            # Best effort: skip a review that cannot be summarized. `Exception`
            # (not a bare `except`) keeps Ctrl-C / kernel interrupt working.
            continue

    if not individual_summaries:
        return "⚠️ No text available for summarization."

    # Combine individual summaries and summarize again
    combined = " ".join(individual_summaries)[:3000]  # Limit the total length
    try:
        final_summary = summarizer(combined, max_length=final_maxlen, min_length=40, do_sample=False)[0]["summary_text"]
    except Exception:
        final_summary = "⚠️ Final summarization failed"

    return final_summary


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [14]:
# Texts of every review labelled positive (2)
positive_reviews = df.loc[df["label"] == 2, "text"].tolist()

# Number of reviews labelled negative (0); 0 when there are none
negative_count = df["label"].value_counts().get(0, 0)

# Condense all positive reviews into a single summary
positive_summary = summarize_reviews_by_compression(positive_reviews)

# Negative side: summarize several reviews, quote a single one verbatim,
# or report that there is nothing to show
if negative_count >= 2:
    negative_reviews = df.loc[df["label"] == 0, "text"].tolist()
    negative_summary = summarize_reviews_by_compression(negative_reviews)
elif negative_count == 1:
    negative_summary = df.loc[df["label"] == 0, "text"].iloc[0]
else:
    negative_summary = "❌ No negative reviews"


Your max_length is set to 60, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 60, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 60, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 60, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


In [15]:
# Print both summaries. print() separates its arguments with a space, so each
# summary starts with one leading space on the line after its header.
print("✅ Summary of Positive Reviews:\n", positive_summary)
print("\n❌ Summary of Negative Reviews:\n", negative_summary)


✅ Summary of Positive Reviews:
 Bar Totti's in Sydney is an excellent choice for good food and a lively atmosphere. Although there is often a wait, it's usually under 30 minutes, and it's definitely worth it. The fresh taste, rich flavor, and chewy bread are really appealing.

❌ Summary of Negative Reviews:
 "The wait time was really too long, and the waiter didn't look happy" "The fettuccine was amazing! We also had beef cutlet!" "Perhaps they didn't understand English, but there was a misunderstanding"
