## convert data v·ªÅ ƒë√∫ng c·ªôt 

In [None]:
import pandas as pd

# Load CSV
df = pd.read_csv("Datasets.csv")

# T√°ch product v√† review
def split_sentence(s):
    if " - " in s:
        product, review = s.split(" - ", 1)
        return product.strip(), review.strip()
    return s.strip(), ""   # fallback n·∫øu kh√¥ng c√≥ d·∫•u '-'

df["product"], df["review"] = zip(*df["sentence"].apply(split_sentence))

# N·∫øu mu·ªën b·ªè prefix nh∆∞ "ƒêi·ªán tho·∫°i "
def normalize_product(p):
    prefixes = ["ƒêi·ªán tho·∫°i ", "Smartphone ", "M√°y t√≠nh b·∫£ng "]
    for pre in prefixes:
        if p.startswith(pre):
            return p[len(pre):].strip()
    return p

df["product"] = df["product"].apply(normalize_product)

df.to_csv("clean_reviews.csv", index=False)


In [3]:
"""
Simple Statistics-Based RAG System
- LLM parses query to extract product + aspects
- Filter data using fuzzy matching
- Compute statistics on filtered data
- LLM generates final answer
"""

import pandas as pd
import json
import ast
from typing import List, Dict, Any, Optional, Tuple
from collections import Counter
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()

# ============================================================================
# MODULE 1: DATA LOADER
# ============================================================================

class DataLoader:
    """Load and prepare data"""
    
    def __init__(self, csv_path: str):
        self.csv_path = csv_path
        self.df = None
    
    def load_data(self) -> pd.DataFrame:
        """Load CSV"""
        print("üìÇ Loading data...")
        self.df = pd.read_csv(self.csv_path)
        
        # Parse aspects and sentiments
        self.df['aspects'] = self.df['aspects'].apply(self._safe_parse)
        self.df['sentiments'] = self.df['sentiments'].apply(self._safe_parse)
        
        print(f"‚úÖ Loaded {len(self.df)} reviews")
        print(f"üì± Products: {self.df['product'].nunique()}")
        
        return self.df
    
    def _safe_parse(self, x):
        """Parse string to list/dict"""
        if pd.isna(x):
            return [] if '[' in str(x) else {}
        if isinstance(x, (list, dict)):
            return x
        try:
            return ast.literal_eval(x)
        except:
            return [] if '[' in str(x) else {}


# ============================================================================
# MODULE 2: QUERY PARSER (LLM)
# ============================================================================

class QueryParser:
    """Use LLM to parse user query"""
    
    def __init__(self, client: OpenAI, available_products: List[str]):
        self.client = client
        self.available_products = available_products
    
    def parse_query(self, query: str) -> Dict:
        """Parse query to extract intent, products, aspects"""
        
        system_prompt = f"""B·∫°n l√† tr·ª£ l√Ω ph√¢n t√≠ch c√¢u h·ªèi v·ªÅ ƒë√°nh gi√° ƒëi·ªán tho·∫°i.

Nhi·ªám v·ª•: Ph√¢n t√≠ch c√¢u h·ªèi ƒë·ªÉ tr√≠ch xu·∫•t:
1. products: Danh s√°ch t√™n s·∫£n ph·∫©m (c√≥ th·ªÉ vi·∫øt t·∫Øt/kh√¥ng ch√≠nh x√°c)
2. aspects: Danh s√°ch kh√≠a c·∫°nh ƒë∆∞·ª£c h·ªèi (general, battery, camera, performance, screen, design, price, storage, features, ser&acc)
3. sentiment_focus: Ng∆∞·ªùi d√πng mu·ªën bi·∫øt ∆∞u ƒëi·ªÉm (positive), nh∆∞·ª£c ƒëi·ªÉm (negative), hay t·ªïng quan (null)
4. is_comparison: C√≥ ph·∫£i c√¢u h·ªèi so s√°nh kh√¥ng? (true/false)

ASPECTS c√≥ th·ªÉ c√≥:
- general: ƒë√°nh gi√° chung
- battery/pin: pin
- camera: camera
- performance: hi·ªáu nƒÉng, t·ªëc ƒë·ªô, chip
- screen: m√†n h√¨nh
- design: thi·∫øt k·∫ø, ngo·∫°i h√¨nh
- price: gi√° c·∫£
- storage: b·ªô nh·ªõ, l∆∞u tr·ªØ
- features: t√≠nh nƒÉng
- ser&acc: d·ªãch v·ª•, ph·ª• ki·ªán

Danh s√°ch s·∫£n ph·∫©m c√≥ s·∫µn (ƒë·ªÉ tham kh·∫£o):
{', '.join(self.available_products[:10])}...

Tr·∫£ v·ªÅ JSON v·ªõi format:
{{
  "products": ["product_name1", "product_name2"],
  "aspects": ["aspect1", "aspect2"],
  "sentiment_focus": "positive|negative|null",
  "is_comparison": true|false
}}"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"C√¢u h·ªèi: {query}\n\nPh√¢n t√≠ch:"}
                ],
                temperature=0.1,
                response_format={"type": "json_object"}
            )
            
            parsed = json.loads(response.choices[0].message.content)
            print(f"üîç Parsed: {parsed}")
            return parsed
            
        except Exception as e:
            print(f"‚ö†Ô∏è Parse error: {e}")
            return {
                "products": [],
                "aspects": ["general"],
                "sentiment_focus": None,
                "is_comparison": False
            }


# ============================================================================
# MODULE 3: DATA FILTER
# ============================================================================

class DataFilter:
    """Filter data based on parsed query"""
    
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.all_products = df['product'].unique().tolist()
    
    def keyword_match_product(self, query_product: str) -> List[str]:
        """Match products using keyword extraction with exact priority"""
        query_lower = query_product.lower().strip()
        
        # Extract keywords from query (split and clean)
        keywords = []
        for word in query_lower.split():
            # Remove common words and keep meaningful parts
            if len(word) >= 2 and word not in ['ƒëi·ªán', 'tho·∫°i', 'm√°y', 'chi·∫øc', 'c√°i', 'gb', 'tb']:
                keywords.append(word)
        
        if not keywords:
            return []
        
        exact_matches = []
        partial_matches = []
        
        for product in self.all_products:
            product_lower = product.lower()
            
            # Strategy 1: Exact phrase match (highest priority)
            # Check if query is a substring of product name
            if query_lower in product_lower:
                # But make sure it's a word boundary match
                # e.g., "iphone 17" should not match "iphone 170"
                import re
                # Add word boundaries
                pattern = r'\b' + re.escape(query_lower) + r'\b'
                if re.search(pattern, product_lower):
                    exact_matches.append(product)
                    continue
            
            # Strategy 2: All keywords must exist (partial match)
            if all(kw in product_lower for kw in keywords):
                # Additional check: avoid over-matching
                # If query is "iphone 17", don't match "iphone 17 pro"
                # unless query also has "pro"
                
                # Extract product keywords (significant words)
                product_words = set(re.findall(r'\b\w+\b', product_lower))
                query_words = set(keywords)
                
                # If product has extra significant model keywords not in query, skip
                significant_extras = ['pro', 'max', 'plus', 'ultra', 'mini', 'lite', 'note']
                extra_in_product = product_words.intersection(significant_extras)
                extra_in_query = query_words.intersection(significant_extras)
                
                # If product has model variant keywords that query doesn't have, skip it
                if extra_in_product - extra_in_query:
                    continue
                
                partial_matches.append(product)
        
        # Return exact matches first, then partial if no exact
        if exact_matches:
            return exact_matches
        return partial_matches
    
    def filter_data(self, products: List[str], aspects: List[str]) -> tuple:
        """Filter dataframe based on products and aspects"""
        
        # Step 1: Match products using keywords
        matched_products = []
        
        if not products:
            # No product specified, use all
            matched_products = self.all_products
        else:
            for query_prod in products:
                matched = self.keyword_match_product(query_prod)
                matched_products.extend(matched)
            
            matched_products = list(set(matched_products))
        
        if not matched_products:
            print("‚ö†Ô∏è No products matched query")
            return pd.DataFrame(), [], {}
            # matched_products = self.all_products
        else:
            print(f"‚úÖ Matched {len(matched_products)} products: {matched_products[:3]}{'...' if len(matched_products) > 3 else ''}")
        
        # Filter by products FIRST (don't filter by aspect yet for counting)
        product_filtered = self.df[self.df['product'].isin(matched_products)].copy()
        
        # Count total reviews per product BEFORE aspect filtering
        product_review_counts = {}
        for product in matched_products:
            product_df = product_filtered[product_filtered['product'] == product]
            # Use review_id if available, otherwise count unique reviews
            if 'review_id' in product_df.columns:
                product_review_counts[product] = product_df['review_id'].nunique()
            else:
                product_review_counts[product] = product_df['review'].nunique()
        
        # Step 2: Filter by aspects for retrieval (but keep counts from above)
        if aspects and aspects != ['general']:
            def has_aspect_match(review_aspects):
                if not isinstance(review_aspects, list):
                    return False
                return any(asp in aspects for asp in review_aspects)
            
            aspect_filtered = product_filtered[product_filtered['aspects'].apply(has_aspect_match)].copy()
        else:
            aspect_filtered = product_filtered.copy()
        
        print(f"üìä Filtered to {len(aspect_filtered)} review entries")
        
        return aspect_filtered, matched_products, product_review_counts


# ============================================================================
# MODULE 4: STATISTICS COMPUTER
# ============================================================================

class StatisticsComputer:
    """Compute statistics on filtered data"""
    
    @staticmethod
    def compute_stats(df: pd.DataFrame, aspects: List[str], product_review_counts: Dict[str, int]) -> Dict:
        """Compute comprehensive statistics
        
        Args:
            df: Filtered dataframe (may be filtered by aspect)
            aspects: List of aspects to analyze
            product_review_counts: Dict of {product: total_review_count} BEFORE aspect filtering
        """
        
        if len(df) == 0:
            return {
                'total_reviews': 0,
                'message': 'Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ph√π h·ª£p'
            }
        
        # Total reviews across all matched products (from pre-computed counts)
        total_reviews = sum(product_review_counts.values())
        
        stats = {
            'total_reviews': total_reviews,
            'products': {},
            'aspects': {},
            'overall_sentiment': {}
        }
        
        # Overall sentiment distribution (count from filtered df, but calculate % based on total)
        all_sentiments = []
        for sentiments_dict in df['sentiments']:
            if isinstance(sentiments_dict, dict):
                all_sentiments.extend(sentiments_dict.values())
        
        sentiment_counts = Counter(all_sentiments)
        
        stats['overall_sentiment'] = {
            'positive': sentiment_counts.get('Positive', 0),
            'neutral': sentiment_counts.get('Neutral', 0),
            'negative': sentiment_counts.get('Negative', 0),
            'positive_pct': sentiment_counts.get('Positive', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'negative_pct': sentiment_counts.get('Negative', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'neutral_pct': sentiment_counts.get('Neutral', 0) / total_reviews * 100 if total_reviews > 0 else 0
        }
        
        # Per-product stats
        for product, product_total_reviews in product_review_counts.items():
            product_df = df[df['product'] == product]
            stats['products'][product] = StatisticsComputer._compute_product_stats(
                product_df, 
                product_total_reviews
            )
        
        # Per-aspect stats
        for aspect in aspects:
            aspect_reviews = []
            aspect_sentiments = []
            
            for idx, row in df.iterrows():
                if isinstance(row['aspects'], list) and aspect in row['aspects']:
                    aspect_reviews.append(row['sentence'])
                    if isinstance(row['sentiments'], dict) and aspect in row['sentiments']:
                        aspect_sentiments.append(row['sentiments'][aspect])
            
            sentiment_counts = Counter(aspect_sentiments)
            positive_count = sentiment_counts.get('Positive', 0)
            negative_count = sentiment_counts.get('Negative', 0)
            neutral_count = sentiment_counts.get('Neutral', 0)
            
            # Calculate based on TOTAL reviews
            implicit_neutral = total_reviews - len(aspect_reviews)
            total_neutral = neutral_count + implicit_neutral
            
            stats['aspects'][aspect] = {
                'count': len(aspect_reviews),
                'positive': positive_count,
                'neutral': total_neutral,
                'negative': negative_count,
                'positive_pct': positive_count / total_reviews * 100 if total_reviews > 0 else 0,
                'negative_pct': negative_count / total_reviews * 100 if total_reviews > 0 else 0,
                'neutral_pct': total_neutral / total_reviews * 100 if total_reviews > 0 else 0,
                'mentioned_pct': len(aspect_reviews) / total_reviews * 100 if total_reviews > 0 else 0,
                'sample_reviews': aspect_reviews[:3]
            }
        
        return stats
    
    @staticmethod
    def _compute_product_stats(product_df: pd.DataFrame, total_reviews: int) -> Dict:
        """Compute stats for a single product
        
        Args:
            product_df: Filtered dataframe for this product (may be aspect-filtered)
            total_reviews: ACTUAL total reviews for this product (before aspect filtering)
        """
        all_sentiments = []
        for sentiments_dict in product_df['sentiments']:
            if isinstance(sentiments_dict, dict):
                all_sentiments.extend(sentiments_dict.values())
        
        sentiment_counts = Counter(all_sentiments)
        
        return {
            'review_count': total_reviews,
            'positive': sentiment_counts.get('Positive', 0),
            'neutral': sentiment_counts.get('Neutral', 0),
            'negative': sentiment_counts.get('Negative', 0),
            'positive_pct': sentiment_counts.get('Positive', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'negative_pct': sentiment_counts.get('Negative', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'neutral_pct': sentiment_counts.get('Neutral', 0) / total_reviews * 100 if total_reviews > 0 else 0
        }


# ============================================================================
# MODULE 5: CONTEXT BUILDER
# ============================================================================

class ContextBuilder:
    """Build context for LLM from statistics"""
    
    @staticmethod
    def build_context(stats: Dict, df: pd.DataFrame, parsed_query: Dict) -> str:
        """Build comprehensive context"""
        
        if stats.get('total_reviews', 0) == 0:
            return "Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ƒë√°nh gi√° ph√π h·ª£p v·ªõi c√¢u h·ªèi."
        
        parts = []
        
        # Header
        parts.append(f"=== T·ªîNG QUAN ===")
        parts.append(f"T·ªïng s·ªë ƒë√°nh gi√°: {stats['total_reviews']}")
        
        # Overall sentiment
        if 'overall_sentiment' in stats:
            s = stats['overall_sentiment']
            parts.append(f"\nPh√¢n b·ªë c·∫£m x√∫c t·ªïng th·ªÉ:")
            parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {s['positive_pct']:.1f}% ({s['positive']} ƒë√°nh gi√°)")
            parts.append(f"  ‚Ä¢ Trung l·∫≠p: {s['neutral_pct']:.1f}% ({s['neutral']} ƒë√°nh gi√°)")
            parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {s['negative_pct']:.1f}% ({s['negative']} ƒë√°nh gi√°)")
        
        # Per-product breakdown
        if len(stats['products']) > 1 or parsed_query.get('is_comparison'):
            parts.append(f"\n=== SO S√ÅNH S·∫¢N PH·∫®M ===")
            for product, pstats in stats['products'].items():
                parts.append(f"\n{product}:")
                parts.append(f"  ‚Ä¢ S·ªë ƒë√°nh gi√°: {pstats['review_count']}")
                parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {pstats['positive_pct']:.1f}% ({pstats['positive']})")
                parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {pstats['negative_pct']:.1f}% ({pstats['negative']})")
        
        # Per-aspect breakdown
        if stats['aspects']:
            parts.append(f"\n=== PH√ÇN T√çCH THEO KH√çA C·∫†NH ===")
            for aspect, astats in stats['aspects'].items():
                parts.append(f"\n{aspect.upper()}:")
                parts.append(f"  ‚Ä¢ S·ªë ƒë√°nh gi√° ƒë·ªÅ c·∫≠p: {astats['count']}/{stats['total_reviews']} ({astats['mentioned_pct']:.1f}%)")
                parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {astats['positive_pct']:.1f}% ({astats['positive']} ƒë√°nh gi√°)")
                parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {astats['negative_pct']:.1f}% ({astats['negative']} ƒë√°nh gi√°)")
                parts.append(f"  ‚Ä¢ Trung l·∫≠p/Kh√¥ng ƒë·ªÅ c·∫≠p: {astats['neutral_pct']:.1f}% ({astats['neutral']} ƒë√°nh gi√°)")
                
                if astats['sample_reviews']:
                    parts.append(f"  ‚Ä¢ V√≠ d·ª•:")
                    for i, review in enumerate(astats['sample_reviews'][:2], 1):
                        parts.append(f"    {i}. {review[:150]}")
        
        # Sample reviews
        parts.append(f"\n=== M·∫™U ƒê√ÅNH GI√Å ===")
        sample_df = df.head(5)
        for i, row in enumerate(sample_df.iterrows(), 1):
            idx, r = row
            parts.append(f"\n{i}. [{r['product']}]")
            parts.append(f"   Kh√≠a c·∫°nh: {', '.join(r['aspects']) if isinstance(r['aspects'], list) else 'N/A'}")
            parts.append(f"   N·ªôi dung: {r['sentence'][:200]}")
        
        return "\n".join(parts)


# ============================================================================
# MODULE 6: RAG SYSTEM
# ============================================================================

class SimpleRAG:
    """Main RAG system"""
    
    def __init__(self, csv_path: str, openai_api_key: Optional[str] = None):
        # Load API key
        api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("‚ùå OpenAI API key not found! Add to .env file")
        
        self.client = OpenAI(api_key=api_key)
        
        # Load data
        loader = DataLoader(csv_path)
        self.df = loader.load_data()
        
        # Initialize modules
        self.parser = QueryParser(self.client, self.df['product'].unique().tolist())
        self.filter = DataFilter(self.df)
        
        print("‚úÖ System ready!")
    
    def answer(self, query: str) -> str:
        """Main method: answer user query"""
        
        print(f"\n{'='*80}")
        print(f"‚ùì Query: {query}")
        print(f"{'='*80}")
        
        # Step 1: Parse query with LLM
        print("\nüîç Step 1: Parsing query...")
        parsed = self.parser.parse_query(query)
        
        # Step 2: Filter data
        print("\nüìä Step 2: Filtering data...")
        filtered_df, matched_products, product_review_counts = self.filter.filter_data(
            parsed.get('products', []),
            parsed.get('aspects', ['general'])
        )
        
        if len(filtered_df) == 0:
            return "‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ph√π h·ª£p v·ªõi c√¢u h·ªèi c·ªßa b·∫°n."
        
        # Step 3: Compute statistics
        print("\nüìà Step 3: Computing statistics...")
        stats = StatisticsComputer.compute_stats(
            filtered_df,
            parsed.get('aspects', ['general']),
            product_review_counts
        )
        
        # Step 4: Build context
        print("\nüìù Step 4: Building context...")
        context = ContextBuilder.build_context(stats, filtered_df, parsed)
        
        # Step 5: Generate answer with LLM
        print("\nü§ñ Step 5: Generating answer...")
        answer = self._generate_answer(query, context, parsed)
        
        return answer
    
    def _generate_answer(self, query: str, context: str, parsed_query: Dict) -> str:
        """Generate final answer using LLM"""
        
        system_prompt = """B·∫°n l√† tr·ª£ l√Ω ph√¢n t√≠ch ƒë√°nh gi√° ƒëi·ªán tho·∫°i chuy√™n nghi·ªáp.

NHI·ªÜM V·ª§:
D·ª±a tr√™n d·ªØ li·ªáu th·ªëng k√™ ƒë∆∞·ª£c cung c·∫•p, h√£y tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng m·ªôt c√°ch:
- Ch√≠nh x√°c, d·ª±a tr√™n s·ªë li·ªáu c·ª• th·ªÉ
- C√¢n b·∫±ng, kh√¥ng thi√™n v·ªã
- D·ªÖ hi·ªÉu, s√∫c t√≠ch
- Tr√≠ch d·∫´n % v√† s·ªë l∆∞·ª£ng review

C·∫§U TR√öC TR·∫¢ L·ªúI:
1. T√≥m t·∫Øt ng·∫Øn g·ªçn (1-2 c√¢u)
2. Ph√¢n t√≠ch s·ªë li·ªáu chi ti·∫øt
3. ƒê∆∞a ra v√≠ d·ª• t·ª´ review (n·∫øu c√≥)
4. K·∫øt lu·∫≠n

L∆ØU √ù:
- KH√îNG b·ªãa ƒë·∫∑t th√¥ng tin ngo√†i context
- N·∫øu d·ªØ li·ªáu kh√¥ng ƒë·ªß, n√≥i r√µ h·∫°n ch·∫ø
- ∆Øu ti√™n s·ªë li·ªáu th·ªëng k√™ h∆°n review ƒë∆°n l·∫ª"""

        user_prompt = f"""D·ªØ li·ªáu th·ªëng k√™:
{context}

C√¢u h·ªèi: {query}

H√£y tr·∫£ l·ªùi d·ª±a tr√™n d·ªØ li·ªáu tr√™n:"""
        
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.3,
                max_tokens=1000
            )
            
            answer = response.choices[0].message.content
            
            # Log token usage
            usage = response.usage
            print(f"üí∞ Tokens: {usage.total_tokens} (prompt: {usage.prompt_tokens}, completion: {usage.completion_tokens})")
            
            return answer
            
        except Exception as e:
            return f"‚ö†Ô∏è L·ªói khi generate answer: {e}\n\nContext:\n{context}"


# ============================================================================
# MAIN
# ============================================================================

def main():
    """Run the system"""
    
    CSV_PATH = "clean_reviews.csv"
    
    print("="*80)
    print("üöÄ SIMPLE STATISTICS RAG SYSTEM")
    print("="*80)
    
    # Initialize RAG
    rag = SimpleRAG(CSV_PATH)
    
    print("\n" + "="*80)
    print("‚úÖ READY TO USE!")
    print("="*80)
    print("\nüìù Example usage:")
    print("answer = rag.answer('Pin Xiaomi 15T c√≥ t·ªët kh√¥ng?')")
    print("answer = rag.answer('So s√°nh camera Xiaomi 15T v√† 15T Pro')")
    print("answer = rag.answer('ƒê√°nh gi√° chung v·ªÅ Xiaomi 15T Pro')")
    
    return rag


def demo():
    """Interactive demo"""
    rag = main()
        
    print("\n" + "="*80)
    print("üéÆ INTERACTIVE DEMO")
    print("="*80)
    print("Type 'exit' to quit")
    print("-"*80)
    
    while True:
        try:
            user_input = input("\nüí¨ Your question: ").strip()
            
            if user_input.lower() in ['exit', 'quit', 'q']:
                print("üëã Goodbye!")
                break
            
            if not user_input:
                continue
            
            answer = rag.answer(user_input)
            print(f"\nü§ñ Answer:\n{answer}\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Goodbye!")
            break
        except Exception as e:
            print(f"\n‚ö†Ô∏è Error: {e}")


if __name__ == "__main__":
    # Run setup
    rag = main()
    
    # Uncomment to run interactive demo
    demo()

üöÄ SIMPLE STATISTICS RAG SYSTEM
üìÇ Loading data...
‚úÖ Loaded 2429 reviews
üì± Products: 2322
‚úÖ System ready!

‚úÖ READY TO USE!

üìù Example usage:
answer = rag.answer('Pin Xiaomi 15T c√≥ t·ªët kh√¥ng?')
answer = rag.answer('So s√°nh camera Xiaomi 15T v√† 15T Pro')
answer = rag.answer('ƒê√°nh gi√° chung v·ªÅ Xiaomi 15T Pro')
üöÄ SIMPLE STATISTICS RAG SYSTEM
üìÇ Loading data...
‚úÖ Loaded 2429 reviews
üì± Products: 2322
‚úÖ System ready!

‚úÖ READY TO USE!

üìù Example usage:
answer = rag.answer('Pin Xiaomi 15T c√≥ t·ªët kh√¥ng?')
answer = rag.answer('So s√°nh camera Xiaomi 15T v√† 15T Pro')
answer = rag.answer('ƒê√°nh gi√° chung v·ªÅ Xiaomi 15T Pro')

üéÆ INTERACTIVE DEMO
Type 'exit' to quit
--------------------------------------------------------------------------------

‚ùì Query: iphone 17 pro pin th·∫ø n√†o

üîç Step 1: Parsing query...
üîç Parsed: {'products': ['iphone 17 pro'], 'aspects': ['battery'], 'sentiment_focus': 'null', 'is_comparison': False}

üìä Step 2

In [13]:
# ============================================================================
# MODULE 6: CONTEXT BUILDER
# ============================================================================
"""
Simple Statistics-Based RAG System
- LLM parses query to extract product + aspects
- Filter data using fuzzy matching
- Compute statistics on filtered data
- LLM generates final answer
"""

import pandas as pd
import json
import ast
from typing import List, Dict, Any, Optional, Tuple
from collections import Counter
from openai import OpenAI
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
import base64

load_dotenv()

# Set style for better looking plots
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'

# ============================================================================
# MODULE 1: DATA LOADER
# ============================================================================

class DataLoader:
    """Load and prepare data"""
    
    def __init__(self, csv_path: str):
        self.csv_path = csv_path
        self.df = None
    
    def load_data(self) -> pd.DataFrame:
        """Load CSV"""
        print("üìÇ Loading data...")
        self.df = pd.read_csv(self.csv_path)
        
        # Parse aspects and sentiments
        self.df['aspects'] = self.df['aspects'].apply(self._safe_parse)
        self.df['sentiments'] = self.df['sentiments'].apply(self._safe_parse)
        
        print(f"‚úÖ Loaded {len(self.df)} reviews")
        print(f"üì± Products: {self.df['product'].nunique()}")
        
        return self.df
    
    def _safe_parse(self, x):
        """Parse string to list/dict"""
        if pd.isna(x):
            return [] if '[' in str(x) else {}
        if isinstance(x, (list, dict)):
            return x
        try:
            return ast.literal_eval(x)
        except:
            return [] if '[' in str(x) else {}


# ============================================================================
# MODULE 2: QUERY PARSER (LLM)
# ============================================================================

class QueryParser:
    """Use LLM to parse user query"""
    
    def __init__(self, client: OpenAI, available_products: List[str]):
        self.client = client
        self.available_products = available_products
    
    def parse_query(self, query: str) -> Dict:
        """Parse query to extract intent, products, aspects"""
        
        system_prompt = f"""B·∫°n l√† tr·ª£ l√Ω ph√¢n t√≠ch c√¢u h·ªèi v·ªÅ ƒë√°nh gi√° ƒëi·ªán tho·∫°i.

Nhi·ªám v·ª•: Ph√¢n t√≠ch c√¢u h·ªèi ƒë·ªÉ tr√≠ch xu·∫•t:
1. products: Danh s√°ch t√™n s·∫£n ph·∫©m (c√≥ th·ªÉ vi·∫øt t·∫Øt/kh√¥ng ch√≠nh x√°c)
2. aspects: Danh s√°ch kh√≠a c·∫°nh ƒë∆∞·ª£c h·ªèi (general, battery, camera, performance, screen, design, price, storage, features, ser&acc)
3. sentiment_focus: Ng∆∞·ªùi d√πng mu·ªën bi·∫øt ∆∞u ƒëi·ªÉm (positive), nh∆∞·ª£c ƒëi·ªÉm (negative), hay t·ªïng quan (null)
4. is_comparison: C√≥ ph·∫£i c√¢u h·ªèi so s√°nh kh√¥ng? (true/false)

ASPECTS c√≥ th·ªÉ c√≥:
- general: ƒë√°nh gi√° chung
- battery/pin: pin
- camera: camera
- performance: hi·ªáu nƒÉng, t·ªëc ƒë·ªô, chip
- screen: m√†n h√¨nh
- design: thi·∫øt k·∫ø, ngo·∫°i h√¨nh
- price: gi√° c·∫£
- storage: b·ªô nh·ªõ, l∆∞u tr·ªØ
- features: t√≠nh nƒÉng
- ser&acc: d·ªãch v·ª•, ph·ª• ki·ªán

Danh s√°ch s·∫£n ph·∫©m c√≥ s·∫µn (ƒë·ªÉ tham kh·∫£o):
{', '.join(self.available_products[:10])}...

Tr·∫£ v·ªÅ JSON v·ªõi format:
{{
  "products": ["product_name1", "product_name2"],
  "aspects": ["aspect1", "aspect2"],
  "sentiment_focus": "positive|negative|null",
  "is_comparison": true|false
}}"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"C√¢u h·ªèi: {query}\n\nPh√¢n t√≠ch:"}
                ],
                temperature=0.1,
                response_format={"type": "json_object"}
            )
            
            parsed = json.loads(response.choices[0].message.content)
            print(f"üîç Parsed: {parsed}")
            return parsed
            
        except Exception as e:
            print(f"‚ö†Ô∏è Parse error: {e}")
            return {
                "products": [],
                "aspects": ["general"],
                "sentiment_focus": None,
                "is_comparison": False
            }


# ============================================================================
# MODULE 3: DATA FILTER
# ============================================================================

class DataFilter:
    """Filter data based on parsed query"""
    
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.all_products = df['product'].unique().tolist()
    
    def keyword_match_product(self, query_product: str) -> List[str]:
        """Match products using keyword extraction with exact priority"""
        query_lower = query_product.lower().strip()
        
        # Extract keywords from query (split and clean)
        keywords = []
        for word in query_lower.split():
            # Remove common words and keep meaningful parts
            if len(word) >= 2 and word not in ['ƒëi·ªán', 'tho·∫°i', 'm√°y', 'chi·∫øc', 'c√°i', 'gb', 'tb']:
                keywords.append(word)
        
        if not keywords:
            return []
        
        exact_matches = []
        partial_matches = []
        
        for product in self.all_products:
            product_lower = product.lower()
            
            # Strategy 1: Exact phrase match (highest priority)
            # Check if query is a substring of product name
            if query_lower in product_lower:
                # But make sure it's a word boundary match
                # e.g., "iphone 17" should not match "iphone 170"
                import re
                # Add word boundaries
                pattern = r'\b' + re.escape(query_lower) + r'\b'
                if re.search(pattern, product_lower):
                    exact_matches.append(product)
                    continue
            
            # Strategy 2: All keywords must exist (partial match)
            if all(kw in product_lower for kw in keywords):
                # Additional check: avoid over-matching
                # If query is "iphone 17", don't match "iphone 17 pro"
                # unless query also has "pro"
                
                # Extract product keywords (significant words)
                product_words = set(re.findall(r'\b\w+\b', product_lower))
                query_words = set(keywords)
                
                # If product has extra significant model keywords not in query, skip
                significant_extras = ['pro', 'max', 'plus', 'ultra', 'mini', 'lite', 'note']
                extra_in_product = product_words.intersection(significant_extras)
                extra_in_query = query_words.intersection(significant_extras)
                
                # If product has model variant keywords that query doesn't have, skip it
                if extra_in_product - extra_in_query:
                    continue
                
                partial_matches.append(product)
        
        # Return exact matches first, then partial if no exact
        if exact_matches:
            return exact_matches
        return partial_matches
    
    def filter_data(self, products: List[str], aspects: List[str]) -> tuple:
        """Filter dataframe based on products and aspects"""
        
        # Step 1: Match products using keywords
        matched_products = []
        
        if not products:
            # No product specified, use all
            matched_products = self.all_products
        else:
            for query_prod in products:
                matched = self.keyword_match_product(query_prod)
                matched_products.extend(matched)
            
            matched_products = list(set(matched_products))
        
        if not matched_products:
            print("‚ö†Ô∏è No products matched, using all products")
            matched_products = self.all_products
        else:
            print(f"‚úÖ Matched {len(matched_products)} products: {matched_products[:3]}{'...' if len(matched_products) > 3 else ''}")
        
        # Filter by products FIRST (don't filter by aspect yet for counting)
        product_filtered = self.df[self.df['product'].isin(matched_products)].copy()
        
        # Count total reviews per product BEFORE aspect filtering
        product_review_counts = {}
        for product in matched_products:
            product_df = product_filtered[product_filtered['product'] == product]
            # Use review_id if available, otherwise count unique reviews
            if 'review_id' in product_df.columns:
                product_review_counts[product] = product_df['review_id'].nunique()
            else:
                product_review_counts[product] = product_df['review'].nunique()
        
        # Step 2: Filter by aspects for retrieval (but keep counts from above)
        if aspects and aspects != ['general']:
            def has_aspect_match(review_aspects):
                if not isinstance(review_aspects, list):
                    return False
                return any(asp in aspects for asp in review_aspects)
            
            aspect_filtered = product_filtered[product_filtered['aspects'].apply(has_aspect_match)].copy()
        else:
            aspect_filtered = product_filtered.copy()
        
        print(f"üìä Filtered to {len(aspect_filtered)} review entries")
        
        return aspect_filtered, matched_products, product_review_counts


# ============================================================================
# MODULE 4: STATISTICS COMPUTER
# ============================================================================

class StatisticsComputer:
    """Compute statistics on filtered data"""
    
    @staticmethod
    def compute_stats(df: pd.DataFrame, aspects: List[str], product_review_counts: Dict[str, int]) -> Dict:
        """Compute comprehensive statistics
        
        Args:
            df: Filtered dataframe (may be filtered by aspect)
            aspects: List of aspects to analyze
            product_review_counts: Dict of {product: total_review_count} BEFORE aspect filtering
        """
        
        if len(df) == 0:
            return {
                'total_reviews': 0,
                'message': 'Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ph√π h·ª£p'
            }
        
        # Total reviews across all matched products (from pre-computed counts)
        total_reviews = sum(product_review_counts.values())
        
        stats = {
            'total_reviews': total_reviews,
            'products': {},
            'aspects': {},
            'overall_sentiment': {}
        }
        
        # Overall sentiment distribution (count from filtered df, but calculate % based on total)
        all_sentiments = []
        for sentiments_dict in df['sentiments']:
            if isinstance(sentiments_dict, dict):
                all_sentiments.extend(sentiments_dict.values())
        
        sentiment_counts = Counter(all_sentiments)
        
        stats['overall_sentiment'] = {
            'positive': sentiment_counts.get('Positive', 0),
            'neutral': sentiment_counts.get('Neutral', 0),
            'negative': sentiment_counts.get('Negative', 0),
            'positive_pct': sentiment_counts.get('Positive', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'negative_pct': sentiment_counts.get('Negative', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'neutral_pct': sentiment_counts.get('Neutral', 0) / total_reviews * 100 if total_reviews > 0 else 0
        }
        
        # Per-product stats
        for product, product_total_reviews in product_review_counts.items():
            product_df = df[df['product'] == product]
            stats['products'][product] = StatisticsComputer._compute_product_stats(
                product_df, 
                product_total_reviews
            )
        
        # Per-aspect stats
        for aspect in aspects:
            aspect_reviews = []
            aspect_sentiments = []
            
            for idx, row in df.iterrows():
                if isinstance(row['aspects'], list) and aspect in row['aspects']:
                    aspect_reviews.append(row['sentence'])
                    if isinstance(row['sentiments'], dict) and aspect in row['sentiments']:
                        aspect_sentiments.append(row['sentiments'][aspect])
            
            sentiment_counts = Counter(aspect_sentiments)
            positive_count = sentiment_counts.get('Positive', 0)
            negative_count = sentiment_counts.get('Negative', 0)
            neutral_count = sentiment_counts.get('Neutral', 0)
            
            # Calculate based on TOTAL reviews
            implicit_neutral = total_reviews - len(aspect_reviews)
            total_neutral = neutral_count + implicit_neutral
            
            stats['aspects'][aspect] = {
                'count': len(aspect_reviews),
                'positive': positive_count,
                'neutral': total_neutral,
                'negative': negative_count,
                'positive_pct': positive_count / total_reviews * 100 if total_reviews > 0 else 0,
                'negative_pct': negative_count / total_reviews * 100 if total_reviews > 0 else 0,
                'neutral_pct': total_neutral / total_reviews * 100 if total_reviews > 0 else 0,
                'mentioned_pct': len(aspect_reviews) / total_reviews * 100 if total_reviews > 0 else 0,
                'sample_reviews': aspect_reviews[:3]
            }
        
        return stats
    
    @staticmethod
    def _compute_product_stats(product_df: pd.DataFrame, total_reviews: int) -> Dict:
        """Compute stats for a single product
        
        Args:
            product_df: Filtered dataframe for this product (may be aspect-filtered)
            total_reviews: ACTUAL total reviews for this product (before aspect filtering)
        """
        all_sentiments = []
        for sentiments_dict in product_df['sentiments']:
            if isinstance(sentiments_dict, dict):
                all_sentiments.extend(sentiments_dict.values())
        
        sentiment_counts = Counter(all_sentiments)
        
        return {
            'review_count': total_reviews,
            'positive': sentiment_counts.get('Positive', 0),
            'neutral': sentiment_counts.get('Neutral', 0),
            'negative': sentiment_counts.get('Negative', 0),
            'positive_pct': sentiment_counts.get('Positive', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'negative_pct': sentiment_counts.get('Negative', 0) / total_reviews * 100 if total_reviews > 0 else 0,
            'neutral_pct': sentiment_counts.get('Neutral', 0) / total_reviews * 100 if total_reviews > 0 else 0
        }


# ============================================================================
# MODULE 5: VISUALIZATION
# ============================================================================

class ChartGenerator:
    """Generate visualization charts from statistics"""
    
    @staticmethod
    def create_sentiment_pie(stats: Dict, title: str = "Ph√¢n b·ªë c·∫£m x√∫c") -> str:
        """Create pie chart for sentiment distribution"""
        if 'overall_sentiment' not in stats:
            return None
        
        s = stats['overall_sentiment']
        labels = ['T√≠ch c·ª±c', 'Trung l·∫≠p', 'Ti√™u c·ª±c']
        sizes = [s['positive'], s['neutral'], s['negative']]
        colors = ['#4CAF50', '#FFC107', '#F44336']
        explode = (0.05, 0, 0.05)  # Explode positive and negative
        
        fig, ax = plt.subplots(figsize=(8, 6))
        wedges, texts, autotexts = ax.pie(
            sizes, 
            explode=explode,
            labels=labels, 
            colors=colors,
            autopct='%1.1f%%',
            startangle=90,
            textprops={'fontsize': 12, 'weight': 'bold'}
        )
        
        # Make percentage text white
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontsize(14)
        
        ax.set_title(title, fontsize=16, weight='bold', pad=20)
        plt.tight_layout()
        
        # Convert to base64
        buffer = BytesIO()
        plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        buffer.seek(0)
        img_base64 = base64.b64encode(buffer.read()).decode()
        plt.close()
        
        return img_base64
    
    @staticmethod
    def create_product_comparison(stats: Dict) -> str:
        """Create bar chart comparing products"""
        if 'products' not in stats or len(stats['products']) < 2:
            return None
        
        products = []
        positive = []
        negative = []
        
        for product, pstats in stats['products'].items():
            products.append(product.split()[0:3])  # Shorten name
            positive.append(pstats['positive_pct'])
            negative.append(pstats['negative_pct'])
        
        # Shorten product names
        products = [' '.join(p) for p in products]
        
        x = range(len(products))
        width = 0.35
        
        fig, ax = plt.subplots(figsize=(10, 6))
        bars1 = ax.bar([i - width/2 for i in x], positive, width, 
                       label='T√≠ch c·ª±c', color='#4CAF50', alpha=0.8)
        bars2 = ax.bar([i + width/2 for i in x], negative, width,
                       label='Ti√™u c·ª±c', color='#F44336', alpha=0.8)
        
        # Add value labels on bars
        for bar in bars1:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.1f}%', ha='center', va='bottom', fontsize=10)
        
        for bar in bars2:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.1f}%', ha='center', va='bottom', fontsize=10)
        
        ax.set_xlabel('S·∫£n ph·∫©m', fontsize=12, weight='bold')
        ax.set_ylabel('Ph·∫ßn trƒÉm (%)', fontsize=12, weight='bold')
        ax.set_title('So s√°nh ƒë√°nh gi√° gi·ªØa c√°c s·∫£n ph·∫©m', fontsize=14, weight='bold', pad=20)
        ax.set_xticks(x)
        ax.set_xticklabels(products, rotation=15, ha='right')
        ax.legend(fontsize=11)
        ax.grid(axis='y', alpha=0.3)
        
        plt.tight_layout()
        
        buffer = BytesIO()
        plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        buffer.seek(0)
        img_base64 = base64.b64encode(buffer.read()).decode()
        plt.close()
        
        return img_base64
    
    @staticmethod
    def create_aspect_breakdown(stats: Dict) -> str:
        """Create horizontal bar chart for aspect breakdown"""
        if 'aspects' not in stats or not stats['aspects']:
            return None
        
        aspects = []
        positive = []
        negative = []
        mentioned = []
        
        for aspect, astats in stats['aspects'].items():
            aspects.append(aspect.upper())
            positive.append(astats['positive_pct'])
            negative.append(astats['negative_pct'])
            mentioned.append(astats['mentioned_pct'])
        
        # Sort by mentioned percentage
        sorted_data = sorted(zip(aspects, positive, negative, mentioned), 
                           key=lambda x: x[3], reverse=True)
        aspects, positive, negative, mentioned = zip(*sorted_data)
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        
        # Chart 1: Sentiment breakdown
        y_pos = range(len(aspects))
        
        ax1.barh(y_pos, positive, color='#4CAF50', alpha=0.8, label='T√≠ch c·ª±c')
        ax1.barh(y_pos, [-n for n in negative], color='#F44336', alpha=0.8, label='Ti√™u c·ª±c')
        
        ax1.set_yticks(y_pos)
        ax1.set_yticklabels(aspects)
        ax1.set_xlabel('Ph·∫ßn trƒÉm (%)', fontsize=11, weight='bold')
        ax1.set_title('C·∫£m x√∫c theo kh√≠a c·∫°nh', fontsize=13, weight='bold')
        ax1.legend(loc='lower right')
        ax1.axvline(x=0, color='black', linewidth=0.8)
        ax1.grid(axis='x', alpha=0.3)
        
        # Chart 2: Mention percentage
        bars = ax2.barh(y_pos, mentioned, color='#2196F3', alpha=0.8)
        
        # Add value labels
        for i, (bar, val) in enumerate(zip(bars, mentioned)):
            ax2.text(val + 1, i, f'{val:.1f}%', va='center', fontsize=9)
        
        ax2.set_yticks(y_pos)
        ax2.set_yticklabels(aspects)
        ax2.set_xlabel('Ph·∫ßn trƒÉm ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p (%)', fontsize=11, weight='bold')
        ax2.set_title('T·ª∑ l·ªá ƒë·ªÅ c·∫≠p kh√≠a c·∫°nh', fontsize=13, weight='bold')
        ax2.grid(axis='x', alpha=0.3)
        
        plt.tight_layout()
        
        buffer = BytesIO()
        plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        buffer.seek(0)
        img_base64 = base64.b64encode(buffer.read()).decode()
        plt.close()
        
        return img_base64
    
    @staticmethod
    def generate_all_charts(stats: Dict, parsed_query: Dict) -> Dict[str, str]:
        """Generate all relevant charts based on query type"""
        charts = {}
        
        # Always create sentiment pie chart
        pie_chart = ChartGenerator.create_sentiment_pie(stats)
        if pie_chart:
            charts['sentiment_pie'] = pie_chart
        
        # Create product comparison if multiple products
        if len(stats.get('products', {})) > 1 or parsed_query.get('is_comparison'):
            comparison_chart = ChartGenerator.create_product_comparison(stats)
            if comparison_chart:
                charts['product_comparison'] = comparison_chart
        
        # Create aspect breakdown if aspects analyzed
        if stats.get('aspects') and len(stats['aspects']) > 0:
            aspect_chart = ChartGenerator.create_aspect_breakdown(stats)
            if aspect_chart:
                charts['aspect_breakdown'] = aspect_chart
        
        return charts

class ContextBuilder:
    """Build context for LLM from statistics"""
    
    @staticmethod
    def build_context(stats: Dict, df: pd.DataFrame, parsed_query: Dict) -> str:
        """Build comprehensive context"""
        
        if stats.get('total_reviews', 0) == 0:
            return "Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ƒë√°nh gi√° ph√π h·ª£p v·ªõi c√¢u h·ªèi."
        
        parts = []
        
        # Header
        parts.append(f"=== T·ªîNG QUAN ===")
        parts.append(f"T·ªïng s·ªë ƒë√°nh gi√°: {stats['total_reviews']}")
        
        # Overall sentiment
        if 'overall_sentiment' in stats:
            s = stats['overall_sentiment']
            parts.append(f"\nPh√¢n b·ªë c·∫£m x√∫c t·ªïng th·ªÉ:")
            parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {s['positive_pct']:.1f}% ({s['positive']} ƒë√°nh gi√°)")
            parts.append(f"  ‚Ä¢ Trung l·∫≠p: {s['neutral_pct']:.1f}% ({s['neutral']} ƒë√°nh gi√°)")
            parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {s['negative_pct']:.1f}% ({s['negative']} ƒë√°nh gi√°)")
        
        # Per-product breakdown
        if len(stats['products']) > 1 or parsed_query.get('is_comparison'):
            parts.append(f"\n=== SO S√ÅNH S·∫¢N PH·∫®M ===")
            for product, pstats in stats['products'].items():
                parts.append(f"\n{product}:")
                parts.append(f"  ‚Ä¢ S·ªë ƒë√°nh gi√°: {pstats['review_count']}")
                parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {pstats['positive_pct']:.1f}% ({pstats['positive']})")
                parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {pstats['negative_pct']:.1f}% ({pstats['negative']})")
        
        # Per-aspect breakdown
        if stats['aspects']:
            parts.append(f"\n=== PH√ÇN T√çCH THEO KH√çA C·∫†NH ===")
            for aspect, astats in stats['aspects'].items():
                parts.append(f"\n{aspect.upper()}:")
                parts.append(f"  ‚Ä¢ S·ªë ƒë√°nh gi√° ƒë·ªÅ c·∫≠p: {astats['count']}/{stats['total_reviews']} ({astats['mentioned_pct']:.1f}%)")
                parts.append(f"  ‚Ä¢ T√≠ch c·ª±c: {astats['positive_pct']:.1f}% ({astats['positive']} ƒë√°nh gi√°)")
                parts.append(f"  ‚Ä¢ Ti√™u c·ª±c: {astats['negative_pct']:.1f}% ({astats['negative']} ƒë√°nh gi√°)")
                parts.append(f"  ‚Ä¢ Trung l·∫≠p/Kh√¥ng ƒë·ªÅ c·∫≠p: {astats['neutral_pct']:.1f}% ({astats['neutral']} ƒë√°nh gi√°)")
                
                if astats['sample_reviews']:
                    parts.append(f"  ‚Ä¢ V√≠ d·ª•:")
                    for i, review in enumerate(astats['sample_reviews'][:2], 1):
                        parts.append(f"    {i}. {review[:150]}")
        
        # Sample reviews
        parts.append(f"\n=== M·∫™U ƒê√ÅNH GI√Å ===")
        sample_df = df.head(5)
        for i, row in enumerate(sample_df.iterrows(), 1):
            idx, r = row
            parts.append(f"\n{i}. [{r['product']}]")
            parts.append(f"   Kh√≠a c·∫°nh: {', '.join(r['aspects']) if isinstance(r['aspects'], list) else 'N/A'}")
            parts.append(f"   N·ªôi dung: {r['sentence'][:200]}")
        
        return "\n".join(parts)


# ============================================================================
# MODULE 7: RAG SYSTEM
# ============================================================================

class SimpleRAG:
    """Main RAG system"""
    
    def __init__(self, csv_path: str, openai_api_key: Optional[str] = None):
        # Load API key
        api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("‚ùå OpenAI API key not found! Add to .env file")
        
        self.client = OpenAI(api_key=api_key)
        
        # Load data
        loader = DataLoader(csv_path)
        self.df = loader.load_data()
        
        # Initialize modules
        self.parser = QueryParser(self.client, self.df['product'].unique().tolist())
        self.filter = DataFilter(self.df)
        
        print("‚úÖ System ready!")
    
    def answer(self, query: str, show_charts: bool = True) -> Dict[str, Any]:
        """Main method: answer user query
        
        Args:
            query: User question
            show_charts: Whether to generate visualization charts
            
        Returns:
            Dict with 'answer' and 'charts' (base64 encoded images)
        """
        
        print(f"\n{'='*80}")
        print(f"‚ùì Query: {query}")
        print(f"{'='*80}")
        
        # Step 1: Parse query with LLM
        print("\nüîç Step 1: Parsing query...")
        parsed = self.parser.parse_query(query)
        
        # Step 2: Filter data
        print("\nüìä Step 2: Filtering data...")
        filtered_df, matched_products, product_review_counts = self.filter.filter_data(
            parsed.get('products', []),
            parsed.get('aspects', ['general'])
        )
        
        if len(filtered_df) == 0:
            return {
                'answer': "‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ph√π h·ª£p v·ªõi c√¢u h·ªèi c·ªßa b·∫°n.",
                'charts': {}
            }
        
        # Step 3: Compute statistics
        print("\nüìà Step 3: Computing statistics...")
        stats = StatisticsComputer.compute_stats(
            filtered_df,
            parsed.get('aspects', ['general']),
            product_review_counts
        )
        
        # Step 4: Build context
        print("\nüìù Step 4: Building context...")
        context = ContextBuilder.build_context(stats, filtered_df, parsed)
        
        # Step 5: Generate charts
        charts = {}
        if show_charts:
            print("\nüìä Step 5: Generating charts...")
            charts = ChartGenerator.generate_all_charts(stats, parsed)
            print(f"‚úÖ Generated {len(charts)} chart(s)")
        
        # Step 6: Generate answer with LLM
        print("\nü§ñ Step 6: Generating answer...")
        answer = self._generate_answer(query, context, parsed)
        
        return {
            'answer': answer,
            'charts': charts,
            'stats': stats
        }
    
    def _generate_answer(self, query: str, context: str, parsed_query: Dict) -> str:
        """Generate final answer using LLM"""
        
        system_prompt = """B·∫°n l√† tr·ª£ l√Ω ph√¢n t√≠ch ƒë√°nh gi√° ƒëi·ªán tho·∫°i chuy√™n nghi·ªáp.

NHI·ªÜM V·ª§:
D·ª±a tr√™n d·ªØ li·ªáu th·ªëng k√™ ƒë∆∞·ª£c cung c·∫•p, h√£y tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng m·ªôt c√°ch:
- Ch√≠nh x√°c, d·ª±a tr√™n s·ªë li·ªáu c·ª• th·ªÉ
- C√¢n b·∫±ng, kh√¥ng thi√™n v·ªã
- D·ªÖ hi·ªÉu, s√∫c t√≠ch
- Tr√≠ch d·∫´n % v√† s·ªë l∆∞·ª£ng review

C·∫§U TR√öC TR·∫¢ L·ªúI:
1. T√≥m t·∫Øt ng·∫Øn g·ªçn (1-2 c√¢u)
2. Ph√¢n t√≠ch s·ªë li·ªáu chi ti·∫øt
3. ƒê∆∞a ra v√≠ d·ª• t·ª´ review (n·∫øu c√≥)
4. K·∫øt lu·∫≠n

L∆ØU √ù:
- KH√îNG b·ªãa ƒë·∫∑t th√¥ng tin ngo√†i context
- N·∫øu d·ªØ li·ªáu kh√¥ng ƒë·ªß, n√≥i r√µ h·∫°n ch·∫ø
- ∆Øu ti√™n s·ªë li·ªáu th·ªëng k√™ h∆°n review ƒë∆°n l·∫ª"""

        user_prompt = f"""D·ªØ li·ªáu th·ªëng k√™:
{context}

C√¢u h·ªèi: {query}

H√£y tr·∫£ l·ªùi d·ª±a tr√™n d·ªØ li·ªáu tr√™n:"""
        
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.3,
                max_tokens=1000
            )
            
            answer = response.choices[0].message.content
            
            # Log token usage
            usage = response.usage
            print(f"üí∞ Tokens: {usage.total_tokens} (prompt: {usage.prompt_tokens}, completion: {usage.completion_tokens})")
            
            return answer
            
        except Exception as e:
            return f"‚ö†Ô∏è L·ªói khi generate answer: {e}\n\nContext:\n{context}"


# ============================================================================
# MAIN
# ============================================================================

def main():
    """Run the system"""
    
    CSV_PATH = "clean_reviews.csv"
    
    print("="*80)
    print("üöÄ SIMPLE STATISTICS RAG SYSTEM WITH VISUALIZATION")
    print("="*80)
    
    # Initialize RAG
    rag = SimpleRAG(CSV_PATH)
    
    print("\n" + "="*80)
    print("‚úÖ READY TO USE!")
    print("="*80)
    print("\nüìù Example usage:")
    print("result = rag.answer('Pin Xiaomi 15T c√≥ t·ªët kh√¥ng?')")
    print("print(result['answer'])")
    print("# Save charts:")
    print("for chart_name, img_base64 in result['charts'].items():")
    print("    with open(f'{chart_name}.png', 'wb') as f:")
    print("        f.write(base64.b64decode(img_base64))")
    
    return rag


def demo():
    """Interactive demo with chart display"""
    rag = main()
    
    print("\n" + "="*80)
    print("üéÆ INTERACTIVE DEMO")
    print("="*80)
    print("Type 'exit' to quit")
    print("-"*80)
    
    while True:
        try:
            user_input = input("\nüí¨ Your question: ").strip()
            
            if user_input.lower() in ['exit', 'quit', 'q']:
                print("üëã Goodbye!")
                break
            
            if not user_input:
                continue
            
            # Get result
            result = rag.answer(user_input)
            
            # Display answer
            print(f"\nü§ñ Answer:\n{result['answer']}\n")
            
            # Save charts
            if result['charts']:
                print(f"üìä Generated {len(result['charts'])} chart(s):")
                for chart_name, img_base64 in result['charts'].items():
                    filename = f"{chart_name}.png"
                    with open(filename, 'wb') as f:
                        f.write(base64.b64decode(img_base64))
                    print(f"  ‚úÖ Saved: {filename}")
                print(f"\nüí° Tip: Open the PNG files to view charts!\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Goodbye!")
            break
        except Exception as e:
            print(f"\n‚ö†Ô∏è Error: {e}")


def display_charts_html(charts: Dict[str, str], output_file: str = "charts.html"):
    """Generate HTML file to display all charts in browser"""
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>Review Analysis Charts</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                max-width: 1400px;
                margin: 0 auto;
                padding: 20px;
                background: #f5f5f5;
            }
            h1 {
                color: #333;
                text-align: center;
                margin-bottom: 30px;
            }
            .chart-container {
                background: white;
                padding: 20px;
                margin: 20px 0;
                border-radius: 8px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
            .chart-title {
                color: #555;
                font-size: 18px;
                font-weight: bold;
                margin-bottom: 15px;
            }
            img {
                max-width: 100%;
                height: auto;
                display: block;
                margin: 0 auto;
            }
        </style>
    </head>
    <body>
        <h1>üìä Bi·ªÉu ƒë·ªì ph√¢n t√≠ch ƒë√°nh gi√°</h1>
    """
    
    chart_titles = {
        'sentiment_pie': 'ü•ß Ph√¢n b·ªë c·∫£m x√∫c t·ªïng th·ªÉ',
        'product_comparison': 'üì± So s√°nh s·∫£n ph·∫©m',
        'aspect_breakdown': 'üîç Ph√¢n t√≠ch theo kh√≠a c·∫°nh'
    }
    
    for chart_name, img_base64 in charts.items():
        title = chart_titles.get(chart_name, chart_name)
        html_content += f"""
        <div class="chart-container">
            <div class="chart-title">{title}</div>
            <img src="data:image/png;base64,{img_base64}" alt="{chart_name}">
        </div>
        """
    
    html_content += """
    </body>
    </html>
    """
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)
    
    print(f"üìÑ HTML report saved: {output_file}")
    print(f"üí° Open {output_file} in your browser to view charts!")


if __name__ == "__main__":
    # Run setup
    rag = main()
    
    # Example: Run a query and display charts
    print("\n" + "="*80)
    print("üß™ RUNNING EXAMPLE QUERY")
    print("="*80)
    
    result = rag.answer("So s√°nh pin Xiaomi 15T v√† 15T Pro")
    print(f"\nü§ñ Answer:\n{result['answer']}\n")
    
    if result['charts']:
        # Generate HTML report
        display_charts_html(result['charts'], "review_analysis.html")
    
    # Uncomment to run interactive demo
    demo()

üöÄ SIMPLE STATISTICS RAG SYSTEM WITH VISUALIZATION
üìÇ Loading data...
‚úÖ Loaded 2429 reviews
üì± Products: 195
‚úÖ System ready!

‚úÖ READY TO USE!

üìù Example usage:
result = rag.answer('Pin Xiaomi 15T c√≥ t·ªët kh√¥ng?')
print(result['answer'])
# Save charts:
for chart_name, img_base64 in result['charts'].items():
    with open(f'{chart_name}.png', 'wb') as f:
        f.write(base64.b64decode(img_base64))

üß™ RUNNING EXAMPLE QUERY

‚ùì Query: So s√°nh pin Xiaomi 15T v√† 15T Pro

üîç Step 1: Parsing query...
üîç Parsed: {'products': ['Xiaomi 15T', 'Xiaomi 15T Pro'], 'aspects': ['battery'], 'sentiment_focus': None, 'is_comparison': True}

üìä Step 2: Filtering data...
‚úÖ Matched 2 products: ['Xiaomi 15T 5G 12GB/256GB', 'Xiaomi 15T Pro 5G 12GB/256GB']
üìä Filtered to 12 review entries

üìà Step 3: Computing statistics...

üìù Step 4: Building context...

üìä Step 5: Generating charts...
‚úÖ Generated 3 chart(s)

ü§ñ Step 6: Generating answer...
üí∞ Tokens: 1408 (prom