In [None]:
import pandas as pd
import json
from collections import Counter
import re
import logging

# Root logger setup: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Lowercase brand names searched for in product titles. Order matters: when a
# title mentions several brands, the first entry of this tuple that matches wins.
_KNOWN_BRANDS = (
    'apple', 'dell', 'hp', 'lenovo', 'asus', 'acer', 'microsoft',
    'samsung', 'lg', 'sony', 'toshiba', 'msi', 'razer', 'google',
    'hisense', 'tcl', 'vizio',
)

# One pattern per brand. The look-arounds reject matches that are embedded in a
# longer run of letters, so "touchpad" is not read as "hp" and "spacer" is not
# read as "acer", while "HP15" or "hp-pavilion" still match.
_BRAND_PATTERNS = tuple(
    (brand, re.compile(rf'(?<![a-z]){re.escape(brand)}(?![a-z])'))
    for brand in _KNOWN_BRANDS
)

# "<number> TB|GB|MB" and "<number> inch|"" as they appear in product titles.
_STORAGE_PATTERN = re.compile(r'(\d+\.?\d*)\s*(TB|GB|MB)', re.IGNORECASE)
_SCREEN_PATTERN = re.compile(r'(\d+\.?\d*)\s*(?:inch|")', re.IGNORECASE)


def _print_section(title: str) -> None:
    """Print a section header line of the form ``==== <title> ====``."""
    print("\n" + "="*20 + f" {title} " + "="*20)


def _guess_brand_from_name(name) -> 'str | None':
    """Return the first known brand found in *name*, or None if none matches.

    *name* is converted with ``str()`` first, so missing values (NaN) are
    tolerated; they simply match no brand.
    """
    name_lower = str(name).lower()
    for brand, pattern in _BRAND_PATTERNS:
        if pattern.search(name_lower):
            return brand
    return None


def _count_specification_keys(details_values):
    """Count the keys found under ``specifications`` in each JSON string.

    Returns a ``(error_count, Counter)`` pair. A value counts as an error when
    ``json.loads`` rejects it or when inspecting the decoded value raises
    ``TypeError`` (for example when the JSON document is a bare number).
    """
    error_count = 0
    key_counter = Counter()
    for details_str in details_values:
        try:
            details_json = json.loads(details_str)
            if 'specifications' in details_json and isinstance(details_json['specifications'], dict):
                key_counter.update(details_json['specifications'].keys())
        except (json.JSONDecodeError, TypeError):
            error_count += 1
    return error_count, key_counter


def _find_storage_anomalies(names) -> list:
    """Return messages for titles quoting more than 10 TB or more than 8192 GB."""
    anomalies = []
    for text in names:
        for val, unit in _STORAGE_PATTERN.findall(text):
            # The pattern only captures digits with an optional decimal part,
            # so float() cannot fail here.
            size = float(val)
            unit_upper = unit.upper()
            if (unit_upper == 'TB' and size > 10) or (unit_upper == 'GB' and size > 8192):
                anomalies.append(f"Suspicious storage: '{val} {unit}' in title: '{text[:100]}...'")
    return anomalies


def _find_screen_anomalies(names) -> list:
    """Return messages for titles quoting a screen size outside 10..100 inches."""
    anomalies = []
    for text in names:
        for val in _SCREEN_PATTERN.findall(text):
            if not (10 <= float(val) <= 100):
                anomalies.append(f"Unusual screen size: '{val}\"' in title: '{text[:100]}...'")
    return anomalies


def profile_dataset(filepath: str) -> None:
    """
    Profile the product CSV at *filepath* and print the report to stdout.

    The report covers category / sub-category distributions, brand counts,
    brands guessed from the product name where the brand is missing, the keys
    used inside the JSON ``details`` column, and suspicious storage / screen
    sizes mentioned in product names.

    The CSV is expected to provide the columns ``category``, ``sub_category``,
    ``brand``, ``name`` and ``details``; a missing column raises ``KeyError``.
    If the file does not exist an error message is printed and the function
    returns without raising.
    """
    print("="*60)
    print("      🚀 STARTING PRODUCT DATASET PROFILE 🚀")
    print("="*60)

    try:
        # Assuming the file is in the root of your Colab environment
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"❌ ERROR: File not found at '{filepath}'.")
        print("Please make sure 'products-export.csv' is uploaded to your Colab session.")
        return

    total_rows = len(df)
    print(f"\n✅ Successfully loaded {total_rows} rows from '{filepath}'.\n")

    # --- 1 & 2. Category and Sub-Category Distribution ---
    _print_section("1. Category Distribution")
    print(df['category'].value_counts())
    _print_section("2. Sub-Category Distribution")
    print(df['sub_category'].value_counts())

    # --- 3. Brand Analysis ---
    _print_section("3. Brand Analysis")
    # Drop missing brands and force text before using the .str accessor: a
    # column that is entirely null is read as float64, on which .str raises.
    normalized_brands = df['brand'].dropna().astype(str).str.lower().str.strip()
    brand_counts = normalized_brands.value_counts()
    print("\nTotal List of  brand names from 'brand' column:")
    print(normalized_brands.unique())
    print("\nTotal No of brands from 'brand' column:")
    print(normalized_brands.nunique())
    print("\nTop 20 Brands from 'brand' column:")
    print(brand_counts.head(20))

    print("\nTop 50 Brands from 'brand' column:")
    print(brand_counts.head(50))

    # Only rows without a brand need a guess; the loaded frame is left unmodified.
    names_missing_brand = df.loc[df['brand'].isna(), 'name']
    guessed_brand_counts = (
        names_missing_brand.map(_guess_brand_from_name)
        .rename('guessed_brand')
        .value_counts()
    )
    print("\n\nBrands guessed from 'name' where 'brand' column is NULL:")
    print(guessed_brand_counts)

    # --- 4. 'details' Column JSON Structure Analysis ---
    _print_section("4. 'details' JSON Analysis")
    json_parse_errors, spec_key_counter = _count_specification_keys(df['details'].dropna())

    print(f"\nRows with JSON parsing errors in 'details': {json_parse_errors}")
    # Guard against a header-only CSV, where total_rows is 0.
    error_rate = json_parse_errors / total_rows if total_rows else 0.0
    print(f"Percentage of parse errors: {error_rate:.2%}")
    print("\n\nNo of keys in 'details.specifications':")
    print(len(spec_key_counter))
    print("\n\nTop 100 most common keys in 'details.specifications':")
    for key, count in spec_key_counter.most_common(100):
        print(f"- {key}: {count} times")

    # --- 5. Anomaly Detection in Key Specs ---
    _print_section("5. Potential Data Anomalies")
    # astype(str) keeps the regex scans safe if some names were parsed as numbers.
    product_names = df['name'].dropna().astype(str)

    storage_anomalies = _find_storage_anomalies(product_names)
    print("\nSuspicious Storage Values (e.g., >10TB):")
    if storage_anomalies:
        # Only the first 10 findings are shown to keep the report short.
        for anomaly in storage_anomalies[:10]:
            print(f"- {anomaly}")
    else:
        print("None found in initial scan.")

    screen_size_anomalies = _find_screen_anomalies(product_names)
    print("\n\nSuspicious Screen Sizes (e.g., <10\" or >100\"):")
    print(f"\n\n Length of screen_size_anomalies list:{len(screen_size_anomalies)}")
    if screen_size_anomalies:
        for anomaly in screen_size_anomalies:
            print(f"- {anomaly}")
    else:
        print("None found in initial scan.")

    print("\n" + "="*60)
    print("      🏁 PROFILE COMPLETE 🏁")
    print("="*60)

# --- RUN THE PROFILE ---
# Location of the exported product CSV. Adjust this if the file lives
# somewhere else (for example in a 'data' sub-folder).
input_file_path = '/content/drive/MyDrive/Product_Hierarchy_Classifier/products-export.csv'
profile_dataset(filepath=input_file_path)