In [6]:
import json
import spacy
import time
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

nlp = spacy.load("en_core_web_sm")

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
NVIDIA GeForce RTX 3090


In [5]:
# Load Spacy model for English
nlp = spacy.load("en_core_web_sm")

# Load sentiment analysis pipeline from transformers
sentiment_pipeline = pipeline("sentiment-analysis")

def process_review(review):
    # Tokenize the review into sentences
    doc = nlp(review)
    sentences = [sent.text.strip() for sent in doc.sents]

    # Perform sentiment analysis on each sentence
    sentiments = []
    for sentence in sentences:
        result = sentiment_pipeline(sentence)[0]
        sentiments.append((sentence, result['label'], result['score']))

    return sentiments

# Replace 'your_file.json' with the path to your JSON file
with open('data/ys-reviews-restaurants.json', 'r') as file:
    data = json.load(file)
    data = data[0:10]
# Process each review
for item in data:
    review = item['text']
    sentiments = process_review(review)
    print(f"Review Sentiments: {sentiments}\n")


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Review Sentiments: [('Decent Chinese Food.', 'POSITIVE', 0.9998672008514404), ('The Hunan items are my favorite.', 'POSITIVE', 0.9989293217658997)]

Review Sentiments: [("We've been coming here for well over 15 years.", 'POSITIVE', 0.8908674120903015), ('The management has changed once or twice, but it remains our favorite Chinese food spot in Folsom.', 'POSITIVE', 0.9993718266487122), ('My favorites are the wonton soup and General chicken.', 'POSITIVE', 0.9910495281219482), ('A+ customer service and great food at great prices.', 'POSITIVE', 0.9997970461845398)]

Review Sentiments: [('Great food, decent prices.', 'POSITIVE', 0.9998748302459717), ('You get A lot of food w/e you order.', 'POSITIVE', 0.997488260269165), ('The dinner or lunch combos are a good choice.', 'POSITIVE', 0.999488115310669)]

Review Sentiments: [('Another great meal.', 'POSITIVE', 0.9998217225074768), ('Great service dine in or delivery.', 'POSITIVE', 0.9840313196182251), ('China House is our go to.', 'POSITIVE',

First model

In [6]:
# Load Spacy model for English
nlp = spacy.load("en_core_web_sm")

# Load the tokenizer and model from Hugging Face
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Check if CUDA (GPU support) is available and use it
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

def process_review(review_text, review_id):
    doc = nlp(review_text)
    sentences = []
    for sent in doc.sents:
        # Check if the sentence is too long
        tokens = tokenizer.encode(sent.text, add_special_tokens=True)
        if len(tokens) > 512:
            print(f"Sentence too long in review {review_id}: {sent.text}")
            continue  # Skip this sentence

        sentiment_result = sentiment_pipeline(sent.text)[0]
        sentences.append({
            "text": sent.text.strip(),
            "topics": [],  # Placeholder for topics, to be filled later
            "sentiment": sentiment_result['label']
        })

    if review_id % 10000 == 0:
        print(f"Processed {review_id} reviews.")

    return sentences

# Function to load a hits file and return a structured dictionary
def load_hits_file(file_name):
    with open(file_name, 'r') as file:
        hits_data = json.load(file)
    hits_dict = {}
    for hit in hits_data:
        doc_index = hit['doc_index']
        sentence_index = hit['sentence_index']
        if doc_index not in hits_dict:
            hits_dict[doc_index] = {}
        hits_dict[doc_index][sentence_index] = hit['lemma']
    return hits_dict

# Load main reviews file
with open('data/ys-reviews-restaurants.json', 'r') as file:
    reviews_data = json.load(file)

# Display total number of reviews
print(f"Total reviews to process: {len(reviews_data)}")

# Load topic hits files
topics = ["clean", "food", "location", "price", "service"]
hits_files = [f"data/topics/{topic}-hits-restaurant-reviews.json" for topic in topics]
hits_dicts = {topic: load_hits_file(file) for topic, file in zip(topics, hits_files)}

# Process and structure the reviews data
restaurants = {}
for i, review in enumerate(reviews_data):
    gmap_id = review["gmap_id"]
    if gmap_id not in restaurants:
        restaurants[gmap_id] = {"reviews": []}
    processed_sentences = process_review(review["text"], i)
    for j, sentence in enumerate(processed_sentences):
        for topic, hits_dict in hits_dicts.items():
            if i in hits_dict and j in hits_dict[i]:
                sentence["topics"].append(topic)
    restaurants[gmap_id]["reviews"].append({"sentences": processed_sentences})

# Convert the structured data to the desired format
final_structure = {"restaurants": [{"gmap_id": gmap_id, "reviews": data["reviews"]} for gmap_id, data in restaurants.items()]}

# Write to a new JSON file
with open('processed_restaurant_reviews.json', 'w') as outfile:
    json.dump(final_structure, outfile, indent=4)

print("Processing complete. Data saved in 'processed_restaurant_reviews.json'")


Total reviews to process: 678759
Processed 0 reviews.
Processed 10000 reviews.
Processed 20000 reviews.
Processed 30000 reviews.
Processed 40000 reviews.
Processed 50000 reviews.
Processed 60000 reviews.
Processed 70000 reviews.
Processed 80000 reviews.
Processed 90000 reviews.
Processed 100000 reviews.
Processed 110000 reviews.


Token indices sequence length is longer than the specified maximum sequence length for this model (880 > 512). Running this sequence through the model will result in indexing errors


Sentence too long in review 119007: (Original)
อาหารอร่อยบริการดีพนักงานยิ้มแย้มแจ่มใส แต่กว่าจะได้เสิร์ฟได้กินนานมากครับ แล้วนี่เป็นวันพุธไม่ใช่ศุกร์เสาร์อาทิตย์เหมือนกับพนักงานไม่พอครับ หรือไม่มีมือครัวเพราะว่ากว่าจะได้อาหารเหมือนกับไปหั่นผักหั่นเนื้ออยู่ๆธรรมดาร้านอาหารต้องคั่วไปเลยมือครัวเขาจะตัดไว้ให้แล้ว เดี๋ยวไปฝากลูกหลานเป็นมือครัวไหมครับ คิดว่าอาหารอร่อยจนขายดีครับแขกมาเต็มเลยแต่เขาก็เซ็งรอกันนั่นแหละ ผมอ่านรีวิวในบอร์ดนี้เขาก็พูดประมาณเหมือนกันนั่นแหละอาหารอร่อยแต่ว่ารอนาน
Processed 120000 reviews.
Processed 130000 reviews.
Processed 140000 reviews.
Processed 150000 reviews.
Processed 160000 reviews.
Processed 170000 reviews.
Processed 180000 reviews.
Processed 190000 reviews.
Processed 200000 reviews.
Processed 210000 reviews.
Processed 220000 reviews.
Processed 230000 reviews.
Processed 240000 reviews.
Processed 250000 reviews.
Processed 260000 reviews.
Processed 270000 reviews.
Processed 280000 reviews.
Processed 290000 reviews.
Processed 300000 reviews.
Processed 310000 r

Testing different model and speed

In [13]:
# Load the tokenizer and model from Hugging Face
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Check if CUDA (GPU support) is available and use it
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

def process_reviews(reviews_data, max_reviews=1000000):
    all_sentences = []
    sentence_mappings = []
    review_structures = []
    batch_size = 10000  # Adjust based on your GPU memory usage

    # Collect sentences from the first 'max_reviews' reviews
    for review_id, review in enumerate(reviews_data[:max_reviews]):
        doc = nlp(review["text"])
        sentences = []
        for sent in doc.sents:
            tokens = tokenizer.encode(sent.text, add_special_tokens=True)
            if len(tokens) <= 512:
                all_sentences.append(sent.text.strip())
                sentence_mappings.append((review_id, len(sentences)))
                sentences.append({
                    "text": sent.text.strip(),
                    "topics": [],
                    "sentiment": ""
                })
        review_structures.append(sentences)

    print(f"Collected sentences for {max_reviews} reviews.")

    # Perform sentiment analysis on the collected sentences
    start_time = time.time()
    for i in range(0, len(all_sentences), batch_size):
        batch = all_sentences[i:i + batch_size]
        batch_sentiments = sentiment_pipeline(batch)
        for j, sentiment_result in enumerate(batch_sentiments):
            review_id, sentence_id = sentence_mappings[i + j]
            if review_id < len(review_structures) and sentence_id < len(review_structures[review_id]):
                review_structures[review_id][sentence_id]["sentiment"] = sentiment_result['label']

    elapsed_time = time.time() - start_time
    print(f"Processed sentiment analysis for {len(all_sentences)} sentences in {elapsed_time:.2f} seconds.")

    return review_structures

def load_hits_file(file_name):
    with open(file_name, 'r') as file:
        hits_data = json.load(file)
    hits_dict = {}
    for hit in hits_data:
        doc_index = hit['doc_index']
        sentence_index = hit['sentence_index']
        if doc_index not in hits_dict:
            hits_dict[doc_index] = {}
        hits_dict[doc_index][sentence_index] = hit['lemma']
    return hits_dict

# Load main reviews file
with open('data/ys-reviews-restaurants.json', 'r') as file:
    reviews_data = json.load(file)

# Display total number of reviews
print(f"Total reviews in the file: {len(reviews_data)}")

# Load topic hits files
topics = ["clean", "food", "location", "price", "service"]
hits_files = [f"data/topics/{topic}-hits-restaurant-reviews.json" for topic in topics]
hits_dicts = {topic: load_hits_file(file) for topic, file in zip(topics, hits_files)}

# Process reviews and obtain structured data
review_structures = process_reviews(reviews_data)

# Assign topics to sentences in review structures
for topic, hits_dict in hits_dicts.items():
    for doc_index, sentence_indices in hits_dict.items():  # Change here
        for sentence_index in sentence_indices:  # Iterate over sentence indices
            if doc_index < len(review_structures) and sentence_index < len(review_structures[doc_index]):
                review_structures[doc_index][sentence_index]["topics"].append(topic)

# Build the final data structure
restaurants = {}
for i, review in enumerate(review_structures):
    gmap_id = reviews_data[i]["gmap_id"]
    if gmap_id not in restaurants:
        restaurants[gmap_id] = {"reviews": []}
    restaurants[gmap_id]["reviews"].append({"sentences": review})

final_structure = {"restaurants": [{"gmap_id": gmap_id, "reviews": data["reviews"]} for gmap_id, data in restaurants.items()]}

# Write to a new JSON file
with open('processed_restaurant_reviews_2.json', 'w') as outfile:
    json.dump(final_structure, outfile, indent=4)

print("Processing complete. Data saved in 'processed_restaurant_reviews_2.json'")



Total reviews in the file: 678759


Token indices sequence length is longer than the specified maximum sequence length for this model (3057 > 512). Running this sequence through the model will result in indexing errors


Collected sentences for 1000000 reviews.
Processed sentiment analysis for 1629759 sentences in 10797.93 seconds.
Processing complete. Data saved in 'processed_restaurant_reviews_2.json'
