In [1]:
import json
import torch
from transformers import pipeline
from datetime import datetime
from colorama import init, Fore

# Initialize colorama
init(autoreset=True)

def load_classification_model():
    model_name = "./deception_detector_bert_cased"
    print(Fore.YELLOW + f"Loading model from: {model_name}")
    classifier = pipeline(
        "text-classification", 
        model=model_name, 
        tokenizer=model_name,
        device=0 if torch.cuda.is_available() else -1
    )
    print(Fore.GREEN + f"Model loaded on {'GPU' if torch.cuda.is_available() else 'CPU'}")
    return classifier

def combine_hotel_data(details_file, reviews_file, output_file, classifier):
    print(Fore.YELLOW + f"\nLoading hotel details from: {details_file}")
    with open(details_file, 'r', encoding='utf-8') as f:
        hotel_details = json.load(f)
    
    print(Fore.YELLOW + f"Loading hotel reviews from: {reviews_file}")
    with open(reviews_file, 'r', encoding='utf-8') as f:
        hotel_reviews = json.load(f)
    
    reviews_dict = {hotel['hotel_number']: hotel['reviews'] for hotel in hotel_reviews}
    print(Fore.CYAN + f"\nFound {len(hotel_details)} hotels in details and {len(hotel_reviews)} hotels in reviews")
    
    combined_hotels = []
    total_reviews = 0
    kept_reviews = 0
    
    for hotel in hotel_details:
        hotel_number = hotel['hotel_number']
        hotel_name = hotel['name']
        print(Fore.MAGENTA + f"\nProcessing hotel #{hotel_number}: {hotel_name}")
        
        if hotel_number in reviews_dict:
            combined_hotel = hotel.copy()
            hotel_reviews_list = reviews_dict[hotel_number]
            print(Fore.CYAN + f"  Found {len(hotel_reviews_list)} reviews")
            
            truthful_reviews = []
            
            for review in hotel_reviews_list:
                total_reviews += 1
                review_text = review['review_text']
                
                # Print progress without spamming too much
                if total_reviews % 5 == 0:
                    print(Fore.WHITE + f"  Processing review {total_reviews}...", end='\r')
                
                try:
                    result = classifier(review_text)[0]
                    label = result['label']
                    score = result['score']
                    
                    if label.lower() == "truthful":
                        truthful_reviews.append(review)
                        kept_reviews += 1
                        print(Fore.GREEN + f"  ✓ Kept review (classified as {label} with  {score:.2%} confidence)")
                    else:
                        print(Fore.RED + f"  ✗ Filtered out (classified as {label} with {score:.2%} confidence)")
                        
                except Exception as e:
                    print(Fore.YELLOW + f"  ! Error processing review: {str(e)} - remove it to be safe")

            
            if truthful_reviews:
                combined_hotel['reviews'] = truthful_reviews
                combined_hotels.append(combined_hotel)
                print(Fore.GREEN + f"  Kept {len(truthful_reviews)}/{len(hotel_reviews_list)} reviews")
            else:
                print(Fore.YELLOW + "  No reviews kept for this hotel")
        else:
            print(Fore.YELLOW + "  No reviews found for this hotel")
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(combined_hotels, f, ensure_ascii=False, indent=2)
    
    print(Fore.CYAN + "\nSUMMARY:")
    print(Fore.WHITE + f"Total reviews processed: {total_reviews}")
    print(Fore.GREEN + f"Reviews kept (truthful): {kept_reviews} ({kept_reviews/total_reviews:.1%})")
    print(Fore.RED + f"Reviews filtered out: {total_reviews - kept_reviews} ({(total_reviews - kept_reviews)/total_reviews:.1%})")
    print(Fore.MAGENTA + f"Results saved to: {output_file}")

if __name__ == "__main__":
    try:
        classifier = load_classification_model()
        combine_hotel_data(
            'hotels_details_no_reviews.json',
            'hotels_reviews_only.json',
            'hotels_details_reviews_filtered.json',
            classifier
        )
    except Exception as e:
        print(Fore.RED + f"\nERROR: {str(e)}")
        raise

Loading model from: ./deception_detector_bert_cased



Device set to use cpu


Model loaded on CPU

Loading hotel details from: hotels_details_no_reviews.json
Loading hotel reviews from: hotels_reviews_only.json

Found 169 hotels in details and 169 hotels in reviews

Processing hotel #1: The Residence Tunis
  Found 20 reviews
  ✓ Kept review (classified as truthful with  64.79% confidence)
  ✓ Kept review (classified as truthful with  90.71% confidence)
  ✓ Kept review (classified as truthful with  92.27% confidence)
  ✗ Filtered out (classified as deceptive with 62.51% confidence)
  ✓ Kept review (classified as truthful with  91.09% confidence)
  ✓ Kept review (classified as truthful with  64.75% confidence)
  ✓ Kept review (classified as truthful with  91.74% confidence)
  ✓ Kept review (classified as truthful with  79.84% confidence)
  ✗ Filtered out (classified as deceptive with 86.60% confidence)
  ✗ Filtered out (classified as deceptive with 89.67% confidence)
  ✓ Kept review (classified as truthful with  89.00% confidence)
  ✓ Kept review (classified as tr

Token indices sequence length is longer than the specified maximum sequence length for this model (1140 > 512). Running this sequence through the model will result in indexing errors


  ! Error processing review: The size of tensor a (1140) must match the size of tensor b (512) at non-singleton dimension 1 - remove it to be safe
  ✓ Kept review (classified as truthful with  92.83% confidence)
  ✓ Kept review (classified as truthful with  87.66% confidence)
  ! Error processing review: The size of tensor a (581) must match the size of tensor b (512) at non-singleton dimension 1 - remove it to be safe
  ✓ Kept review (classified as truthful with  93.35% confidence)
  ✓ Kept review (classified as truthful with  64.66% confidence)
  ✓ Kept review (classified as truthful with  93.77% confidence)
  ✓ Kept review (classified as truthful with  91.63% confidence)
  ✓ Kept review (classified as truthful with  89.73% confidence)
  ✓ Kept review (classified as truthful with  86.01% confidence)
  ✓ Kept review (classified as truthful with  91.12% confidence)
  ✓ Kept review (classified as truthful with  84.07% confidence)
  ! Error processing review: The size of tensor a (523) m