In [1]:
import os
import re
import string
import unicodedata
import pickle
import numpy as np
import nltk
import pandas as pd
from googletrans import Translator  # Requires: pip install googletrans==4.0.0-rc1

# ====== SETUP NLTK PATHS ====== #
# Add the project-local NLTK data directory to NLTK's search path. This is
# done before the stemmer is created below — presumably so RSLPStemmer can
# locate its data files there (TODO confirm the directory ships them).
# NOTE: every path in this section is relative, so it resolves against the
# current working directory of the running process.
nltk.data.path.append('../static/model/nltk_data')
from nltk.stem import RSLPStemmer

# ====== INITIALIZE COMPONENTS ====== #
# Module-level singletons shared by the functions defined further down:
# `translator` is used by translate_to_portuguese, `stemmer` by
# preprocess_text_pt.
translator = Translator()
stemmer = RSLPStemmer()

# ====== LOAD RESOURCES ====== #
# Trained classifier used by analyze_english_text via model.predict().
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model.pkl from a trusted source.
with open('../static/model/model.pkl', 'rb') as f:
    model = pickle.load(f)

# Vocabulary, one entry per line. The list order defines the feature order
# produced by vectorizer(); presumably it matches the order the model was
# trained with — verify against the training code.
with open('../static/model/portuguese_vocabulary.txt', 'r', encoding='utf-8') as f:
    tokens = f.read().splitlines()

# Portuguese stopwords, one per line, read straight from the NLTK corpus
# file and held in a set for fast membership tests in preprocess_text_pt.
with open('../static/model/nltk_data/corpora/stopwords/portuguese', 'r', encoding='utf-8') as f:
    pt_stopwords = set(f.read().splitlines())

# ====== TRANSLATION FUNCTION ====== #
def translate_to_portuguese(text):
    """Return the Portuguese translation of English *text*.

    Translation is best-effort: if the translator call (or reading its
    ``.text`` attribute) raises, the error is printed and the input is
    returned untranslated so the rest of the pipeline can still run.
    """
    try:
        response = translator.translate(text, src='en', dest='pt')
        portuguese = response.text
    except Exception as e:
        print(f"Translation error: {e}")
        # Fall back to the untranslated input
        return text
    return portuguese

# ====== TEXT PREPROCESSING FUNCTION ====== #
def preprocess_text_pt(text):
    """Clean Portuguese text and reduce it to space-separated word stems.

    The steps run in this order:
      1. lowercase the text;
      2. delete URLs (``http...``), ``@mentions`` and ``#hashtags``;
      3. delete every character that is neither a word character nor
         whitespace (this removes most punctuation);
      4. delete characters of Unicode category ``So`` (e.g. emoji);
      5. strip accents by NFD-decomposing and dropping non-ASCII bytes;
      6. delete remaining ASCII punctuation (including ``_``) and digits;
      7. drop stopwords found in ``pt_stopwords``;
      8. stem each remaining word with ``stemmer``.

    Args:
        text: Raw text to clean. A missing value (anything ``pd.isna``
            reports as NA, such as ``None`` or ``NaN``) is accepted.

    Returns:
        The cleaned, stemmed words joined by single spaces; ``''`` when
        the input is missing or nothing survives the cleaning.
    """
    if pd.isna(text):
        return ''

    # Text normalization
    text = text.lower()
    # Remove URLs/mentions/hashtags
    text = re.sub(r'http\S+|@\w+|#\w+', '', text)
    # Keep word characters (including Portuguese letters) and whitespace
    text = re.sub(r'[^\w\sáàâãéêíóôõúç]', '', text)
    # Remove emojis
    text = ''.join(c for c in text if unicodedata.category(c) != 'So')
    # Remove accents. NFD splits each accented letter into base letter +
    # combining mark; the ASCII round-trip then discards the marks. The
    # chain is parenthesised so it can legally span several lines.
    text = (
        unicodedata.normalize('NFD', text)
        .encode('ascii', 'ignore')
        .decode("utf-8")
    )
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Language processing: stopword removal runs on the already unaccented
    # text, then the surviving words are stemmed.
    kept_words = [word for word in text.split() if word not in pt_stopwords]
    return ' '.join(stemmer.stem(word) for word in kept_words)

# ====== VECTORIZATION FUNCTION ====== #
def vectorizer(text, vocabulary):
    """Encode *text* as a binary bag-of-words row over *vocabulary*.

    Position ``i`` of the result is 1.0 when ``vocabulary[i]`` occurs among
    the whitespace-separated words of *text*, and 0.0 otherwise. The result
    is a float32 array of shape ``(1, len(vocabulary))``.
    """
    present = set(text.split())
    flags = [1.0 if entry in present else 0.0 for entry in vocabulary]
    return np.asarray(flags, dtype=np.float32).reshape(1, len(vocabulary))

# ====== FULL PREDICTION PIPELINE ====== #
def analyze_english_text(text):
    """Classify the sentiment of an English text.

    The text is translated to Portuguese, cleaned with
    ``preprocess_text_pt``, turned into a binary feature row over the
    module-level ``tokens`` vocabulary and passed to ``model.predict``.
    The translated and the cleaned text are printed along the way.

    Returns:
        ``"positive"`` when the model's prediction equals 1, otherwise
        ``"negative"``.
    """
    # Translate, then show the intermediate result
    pt_text = translate_to_portuguese(text)
    print(f"Translated Portuguese text: {pt_text}")

    # Clean, then show the intermediate result
    cleaned_text = preprocess_text_pt(pt_text)
    print(f"Preprocessed text: {cleaned_text}")

    # Vectorize and take the single prediction for this one-row input
    features = vectorizer(cleaned_text, tokens)
    label = model.predict(features)[0]

    if label == 1:
        return "positive"
    return "negative"

# ====== USAGE EXAMPLE ====== #
if __name__ == "__main__":
    # Sample English tweets, each paired with the label we expect back
    test_tweets = [
        {"text": "This new phone is amazing! The camera quality blows me away 😍", "expected": "positive"},
        {"text": "So disappointed Bad customer service ever! Waited 2 hours just to get disconnected 👎", "expected": "negative"},
        {"text": "Loving the new park renovation! Perfect place for family weekends 🌳", "expected": "positive"},
        {"text": "My order arrived damaged and they refuse to refund me! Scammers! 😡", "expected": "negative"},
        {"text": "This Portuguese course is fantastic! Finally learning proper grammar 📚", "expected": "positive"}
    ]
    # Rule printed after each tweet's report
    separator = "-------------------------------------------------------------------------------------------------------"

    print("=== Sentiment Analysis Test Batch ===")
    for idx, tweet in enumerate(test_tweets, start=1):
        print(f"\n📩 Tweet #{idx}:")
        print(f"ENGLISH: {tweet['text']}")

        # The pipeline prints its own intermediate steps under this heading
        print("\n🔍 Analysis:")
        result = analyze_english_text(tweet['text'])

        print(f"\n✅ Expected: {tweet['expected'].upper()}")
        print(f"🤖 Predicted: {result.upper()}")
        print(separator)

=== Sentiment Analysis Test Batch ===

📩 Tweet #1:
ENGLISH: This new phone is amazing! The camera quality blows me away 😍

🔍 Analysis:
Translated Portuguese text: Este novo telefone é incrível! A qualidade da câmera me surpreende 😍
Preprocessed text: nov telefon incri qual cam surpreend

✅ Expected: POSITIVE
🤖 Predicted: POSITIVE
-------------------------------------------------------------------------------------------------------

📩 Tweet #2:
ENGLISH: So disappointed Bad customer service ever! Waited 2 hours just to get disconnected 👎

🔍 Analysis:
Translated Portuguese text: Tão decepcionado o mau atendimento ao cliente de todos os tempos! Esperou 2 horas apenas para se desconectar 👎
Preprocessed text: tao decepcion mau atend client tod temp esper hor apen desconect

✅ Expected: NEGATIVE
🤖 Predicted: NEGATIVE
-------------------------------------------------------------------------------------------------------

📩 Tweet #3:
ENGLISH: Loving the new park renovation! Perfect place for f