# In [None]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import re
from wordcloud import WordCloud
import seaborn as sns
import pandas as pd

# Configuration
# Prefer Arial and fall back to the generic sans-serif family if it is missing.
plt.rcParams["font.family"] = ["Arial", "sans-serif"]
# Render negative tick labels with an ASCII hyphen rather than the Unicode minus.
plt.rcParams["axes.unicode_minus"] = False

# Data file path
# The PORCELAIN_DATA_PATH environment variable overrides the built-in default
# so the script can run on machines other than the original author's.
DATA_PATH = os.environ.get(
    "PORCELAIN_DATA_PATH",
    "/Users/shangliujun/Desktop/visualization/dataset/chinese_porcelain_metadata.json",
)

# Load data
def load_data(file_path):
    """Load the porcelain records from a JSON file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file whose top-level value is
            an array of record objects.

    Returns:
        The parsed list of records. An empty list is returned (and a warning
        is printed) when the file cannot be opened, cannot be decoded or
        parsed, or does not contain a top-level JSON array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, TypeError, ValueError) as e:
        # OSError: missing or unreadable file. TypeError: unusable path value.
        # ValueError: malformed JSON (json.JSONDecodeError) or invalid UTF-8
        # (UnicodeDecodeError).
        print(f"⚠️ Failed to load data: {e}")
        return []

    # The plotting functions iterate over the result and call ``.get`` on each
    # element, so anything other than a list of records cannot be used.
    if not isinstance(data, list):
        print(f"⚠️ Failed to load data: expected a JSON array of records, got {type(data).__name__}")
        return []

    print(f"✅ Successfully loaded {len(data)} records")
    return data

# Porcelain type distribution
def plot_porcelain_type_distribution(data):
    """Plot the distribution of porcelain types as a donut chart.

    Each record is assigned to the first category in ``TARGET_ORDER`` whose
    keywords occur as substrings of its lower-cased, punctuation-stripped
    description and title text. The chart is written to
    ``charts/Porcelain_Type_Distribution.png``.

    Args:
        data: Iterable of record dicts. The optional ``dcDescription`` and
            ``title`` entries are concatenated, so both are expected to be
            lists of strings.

    Returns:
        None. Nothing is drawn or saved when there are no records to count.
    """
    TYPE_MAPPING = {
        "Vases": ["vase", "vaas", "urn", "floral vase", "porcelain vase"],
        "Bowls": ["bowl", "kom", "soup bowl", "porcelain bowl"],
        "Plates": ["plate", "bord", "dinner plate", "porcelain plate"],
        "Cups": ["cup", "beker", "teacup", "porcelain cup"],
        "Jars": ["jar", "kruik", "storage jar", "porcelain jar"],
        "Figurines": ["figurine", "statue", "porcelain statue"]
    }

    TARGET_ORDER = ['Vases', 'Bowls', 'Plates', 'Cups', 'Jars', 'Figurines']

    def classify(item):
        """Return the first category whose keyword appears in the item text."""
        text = ' '.join(item.get('dcDescription', []) + item.get('title', [])).lower()
        # Drop punctuation so keywords still match across hyphens, commas, etc.
        text = re.sub(r'[^\w\s]', '', text)
        for cat in TARGET_ORDER:
            if any(kw in text for kw in TYPE_MAPPING[cat]):
                return cat
        # NOTE(review): records matching no keyword are counted as "Vases",
        # which inflates that wedge — confirm this fallback is intended.
        return "Vases"

    # Single pass over the records instead of one list.count() per category.
    type_stats = Counter(classify(item) for item in data)
    sizes = [type_stats[cat] for cat in TARGET_ORDER]

    # A pie chart cannot be normalised when every wedge is zero, so skip the
    # chart entirely (same policy as the preservation-condition chart).
    if sum(sizes) == 0:
        return

    plt.figure(figsize=(8, 6))
    ax = plt.subplot(111)

    ax.pie(
        sizes,
        labels=TARGET_ORDER,
        startangle=0,
        counterclock=False,
        colors=['#4A6FA5', '#82B3A8', '#D4B676', '#A65959', '#6D8B74', '#947EA8'],
        # width < 1 hollows out the centre, turning the pie into a donut.
        wedgeprops={'width': 0.4, 'edgecolor': 'white', 'linewidth': 2}
    )

    ax.axis('equal')
    plt.title("Porcelain Type Distribution", fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout(pad=0)
    plt.savefig('charts/Porcelain_Type_Distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

# Preservation condition analysis
def plot_preservation_condition(data):
    """Plot preservation conditions as equal-angle wedges on a polar axis.

    Each record is labelled with the first condition in ``TARGET_ORDER`` whose
    keywords occur as substrings of its lower-cased description text
    (``dcDescription`` plus every language list in ``dcDescriptionLangAware``).
    Records matching no keyword are labelled "Other" and are not drawn. Wedge
    radii are the counts scaled by the largest count. The chart is written to
    ``charts/Preservation_Condition.png``.

    Args:
        data: Iterable of record dicts.

    Returns:
        None. Nothing is drawn or saved when no record matches any condition.
    """
    CONDITION_COLORS = {
        'Excellent': '#4A6FA5',
        'Good': '#82B3A8',
        'Fair': '#D4B676',
        'Poor': '#A65959',
        'Fragmented': '#C892A1'
    }
    TARGET_ORDER = ['Excellent', 'Good', 'Fair', 'Poor', 'Fragmented']

    CONDITION_MAPPING = {
        "Excellent": ["excellent"],
        "Good": ["good"],
        "Fair": ["fair", "minor damage"],
        "Poor": ["poor", "severe damage"],
        "Fragmented": ["fragmented"]
    }

    # Tally once while classifying instead of rebuilding a Counter per category.
    condition_counts = Counter()
    for item in data:
        desc = []
        if 'dcDescription' in item:
            desc.extend(item['dcDescription'])
        if 'dcDescriptionLangAware' in item:
            for texts in item['dcDescriptionLangAware'].values():
                desc.extend(texts)
        desc_str = ' '.join(desc).lower()

        matched = "Other"
        for cond in TARGET_ORDER:
            if any(kw in desc_str for kw in CONDITION_MAPPING[cond]):
                matched = cond
                break
        condition_counts[matched] += 1

    real_counts = [condition_counts[c] for c in TARGET_ORDER]

    # Bail out before allocating a figure: with no matches there is nothing to
    # draw and the radius scaling below would divide by zero.
    if sum(real_counts) == 0:
        return

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    angle_width = 2 * np.pi / len(TARGET_ORDER)
    max_count = max(real_counts)
    radii = [cnt / max_count for cnt in real_counts]

    for i, cond in enumerate(TARGET_ORDER):
        start_angle = i * angle_width
        end_angle = start_angle + angle_width

        theta = np.linspace(start_angle, end_angle, 100, endpoint=False)
        # Label each wedge directly so the legend pairing does not depend on
        # the order in which artists were added to the axes.
        ax.fill_between(theta, 0, radii[i], color=CONDITION_COLORS[cond], alpha=1,
                        edgecolor='white', linewidth=1, label=cond)

    ax.yaxis.grid(True, color='lightgray', linestyle='--', linewidth=0.8, alpha=0.6)
    ax.xaxis.grid(False)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.spines['polar'].set_visible(False)
    # Start at the top and advance clockwise.
    ax.set_theta_zero_location('N')
    ax.set_theta_direction(-1)

    ax.legend(loc='center left', bbox_to_anchor=(1.2, 0.5), fontsize=12, frameon=False)

    plt.title("Preservation Condition Analysis", fontsize=16, fontweight='bold', pad=20)
    plt.savefig('charts/Preservation_Condition.png', dpi=300, bbox_inches='tight')
    plt.close()

# Year interval analysis
def plot_year_interval(data):
    """Draw a bar chart of how many records mention each fixed year interval.

    For every record the first ``YYYY-YYYY`` token found in its joined
    ``dcDescription`` text is taken; only tokens that exactly equal one of the
    predefined interval labels contribute to a bar. The chart is written to
    ``charts/Year_Interval.png``.
    """
    bucket_labels = [
        '1700-1749', '1750-1799',
        '1800-1849', '1850-1899',
        '1900-1949', '1950-1999'
    ]
    bar_color = '#4A6FA5'
    chart_title = 'Count of Entries by Year Interval'

    interval_re = re.compile(r'\b\d{4}-\d{4}\b')

    # First interval-like token per record, tallied in a single pass.
    tally = Counter()
    for record in data:
        joined = ' '.join(record.get('dcDescription', [])).lower()
        hit = interval_re.search(joined)
        if hit:
            tally[hit.group()] += 1
    bucket_counts = [tally[label] for label in bucket_labels]

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(
        x=bucket_labels,
        y=bucket_counts,
        color=bar_color,
        edgecolor='white',
        linewidth=1
    )
    ax.set_title(chart_title, fontsize=14, fontweight='bold', color='#344E6B')
    ax.set_xlabel("Time Period", fontsize=12)
    ax.set_ylabel("Number of Pieces", fontsize=12)
    plt.tight_layout(pad=3)
    plt.savefig('charts/Year_Interval.png', dpi=300, bbox_inches='tight')
    plt.close()

# Top data providers
def plot_top_providers(data):
    """Draw a horizontal bar chart of record counts for a fixed provider list.

    Provider names are read from each record's ``dataProvider`` entry when it
    is a list; other shapes are ignored. Only the providers named in the
    hard-coded list are plotted, in that order. The chart is written to
    ``charts/Top_Providers.png``.
    """
    provider_names = [
        'Rijksmuseum', 'Victoria and Albert Museum',
        'British Museum', 'Louvre Museum',
        'Museum of Applied Arts', 'National Museum of Sweden',
        'Royal Museums of Fine Arts', 'Museum of Decorative Arts',
        'National Museum of Denmark', 'Museum of Fine Arts, Budapest'
    ]
    bar_colors = [
        '#4A6FA5', '#82B3A8', '#D4B676',
        '#A65959', '#6D8B74', '#947EA8',
        '#D9A678', '#D6A6C8', '#E8D9B9', '#B59B8F'
    ]

    # Count every provider name across all records that carry a list of them.
    tally = Counter()
    for record in data:
        names = record.get('dataProvider')
        if isinstance(names, list):
            tally.update(names)

    # One row per predefined provider; absent providers get a zero count.
    frame = pd.DataFrame({
        'Provider': provider_names,
        'Count': [tally.get(name, 0) for name in provider_names]
    })

    plt.figure(figsize=(12, 8))
    # NOTE(review): recent seaborn releases appear to warn when `palette` is
    # given without `hue` — confirm against the installed version.
    ax = sns.barplot(
        data=frame,
        y='Provider',
        x='Count',
        palette=bar_colors,
        edgecolor='white',
        linewidth=1
    )
    ax.set_title('Top Data Providers', fontsize=14, fontweight='bold', color='#344E6B')
    ax.set_xlabel("Number of Entries", fontsize=12)
    ax.set_ylabel("")
    plt.tight_layout()
    plt.savefig('charts/Top_Providers.png', dpi=300, bbox_inches='tight')
    plt.close()

# Keyword cloud
def plot_keyword_cloud(data):
    """Generate a word cloud from the descriptive text of the records.

    Text is collected from each record's ``dcDescription`` list and from every
    language list in ``dcTitleLangAware``, lower-cased, and split into runs of
    ASCII letters. Word frequencies drive the cloud, which is written to
    ``charts/Keyword_Cloud.png``.

    Args:
        data: Iterable of record dicts.

    Returns:
        None. Nothing is drawn or saved when no words are found.
    """
    text_content = []
    for item in data:
        if 'dcDescription' in item:
            text_content.extend(item['dcDescription'])
        if 'dcTitleLangAware' in item:
            for texts in item['dcTitleLangAware'].values():
                text_content.extend(texts)

    combined_text = ' '.join(text_content).lower()
    # Only runs of a-z are kept; digits and accented letters are dropped.
    words = re.findall(r'\b[a-z]+\b', combined_text)
    word_counts = Counter(words)

    # An empty frequency table cannot be rendered as a cloud (this happens,
    # for example, when the dataset failed to load), so skip the chart.
    if not word_counts:
        return

    # NOTE(review): frequencies are passed in directly, so no stop-word
    # filtering is applied here — confirm common words are meant to appear.
    wc = WordCloud(
        width=800, height=400,
        background_color="white",
        max_words=100,
        colormap='viridis'
    ).generate_from_frequencies(word_counts)

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Descriptive Keyword Analysis", fontsize=16, fontweight='bold')
    plt.tight_layout(pad=0)
    plt.savefig('charts/Keyword_Cloud.png', dpi=300, bbox_inches='tight')
    plt.close()

# Scene narrative timeline
def plot_scene_timeline():
    """Render a vertical timeline of narrative scene themes.

    The entries are hard-coded. Each one gets a year marker on a central
    spine plus a title box and a description box, alternating between the
    right and left side of the spine. The figure is written to
    ``charts/Scene_Timeline.png``.
    """
    events = [
        {"year": 1700, "title": "Courtly Life Scenes",
         "description": "Early 18th century porcelain featured depictions of imperial court life, with emperors and courtiers in formal settings."},
        {"year": 1750, "title": "Mythological Stories",
         "description": "Mid-18th century saw an increase in mythological narratives and legendary figures from Chinese folklore."},
        {"year": 1800, "title": "Everyday Activities",
         "description": "19th century porcelain shifted focus to scenes of daily life, agriculture, and market activities."},
        {"year": 1850, "title": "Nature and Landscapes",
         "description": "Late 19th century featured elaborate natural scenes with mountains, rivers, and gardens as primary motifs."}
    ]

    fig, ax = plt.subplots(figsize=(10, 8))
    # One row per event, spaced 1.5 data units apart.
    row_heights = np.arange(len(events)) * 1.5

    # Central spine of the timeline.
    ax.axvline(x=0, ymin=0.05, ymax=0.95, color='#5DADE2', linestyle='-', linewidth=3.5, alpha=0.9, zorder=1)

    # Year markers sitting on the spine.
    for row_y, event in zip(row_heights, events):
        ax.scatter(0, row_y, s=180, color='#3498DB', edgecolors='white', linewidths=2, zorder=3)
        ax.text(0, row_y, str(event["year"]), ha='center', va='center', color='white', fontweight='bold', fontsize=10)

    title_box = {'facecolor': 'white', 'alpha': 0.85, 'edgecolor': '#AED6F1', 'boxstyle': 'round,pad=0.3'}
    body_box = {'facecolor': '#EAF2F8', 'alpha': 0.9, 'edgecolor': '#D4E6F1', 'boxstyle': 'round,pad=0.3'}

    # Connector, title and description; even rows go right of the spine.
    for index, (row_y, event) in enumerate(zip(row_heights, events)):
        if index % 2 == 0:
            x_anchor, align = 0.08, 'left'
        else:
            x_anchor, align = -0.08, 'right'

        ax.plot([0, x_anchor * 0.7], [row_y, row_y],
                color='#85C1E9', linestyle='-', linewidth=1.5, alpha=0.8, zorder=2)

        ax.text(x_anchor, row_y + 0.05, event["title"],
                ha=align, va='bottom', fontsize=13, fontweight='bold', color='#2C3E50',
                bbox=title_box)

        ax.text(x_anchor, row_y - 0.1, event["description"],
                ha=align, va='top', fontsize=9.5, color='#34495E',
                bbox=body_box,
                wrap=True, linespacing=1.3, multialignment=align)

    ax.set_xlim(-0.15, 0.15)
    ax.set_ylim(row_heights.min() - 0.8, row_heights.max() + 0.8)
    ax.axis('off')

    plt.title('Scene Narrative Timeline', fontsize=18, fontweight='bold', color='#1B4F72', pad=30)
    plt.figtext(0.5, 0.02, "Evolution of narrative scenes depicted on Chinese porcelain over time, showing a shift from formal court scenes to everyday life and nature depictions.",
                ha="center", fontsize=9, color='#566573')

    # Leave room at the bottom for the caption and at the top for the title.
    plt.tight_layout(rect=[0.02, 0.05, 0.98, 0.93])
    plt.savefig('charts/Scene_Timeline.png', dpi=300, bbox_inches='tight', facecolor=fig.get_facecolor())
    plt.close(fig)

# Theme relationship network
def plot_theme_relationship_network():
    """Render a bubble chart of theme categories.

    The bubble positions, sizes and colours are hard-coded per category. The
    figure is written to ``charts/Theme_Relationship_Network.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    themes = {
        'Floral Patterns': {'points': [(30, 40), (40, 60), (60, 30)], 'sizes': [1500, 2000, 1000], 'color': '#6D8B74'},
        'Landscapes': {'points': [(50, 50), (70, 70), (80, 40)], 'sizes': [2000, 3000, 1800], 'color': '#82B3A8'},
        'Human Figures': {'points': [(30, 70), (50, 60), (70, 50)], 'sizes': [1800, 2800, 2200], 'color': '#A65959'}
    }

    # One scatter call per category so each gets a single legend entry.
    for theme_name, spec in themes.items():
        xs = [x for x, _ in spec['points']]
        ys = [y for _, y in spec['points']]
        ax.scatter(xs, ys, s=spec['sizes'], color=spec['color'], label=theme_name,
                   alpha=0.75, edgecolors='w', linewidth=0.5)

    ax.set_title('Theme Element Relationship Network', fontsize=15, fontweight='bold', pad=15)
    ax.set_xlabel('Frequency', fontsize=12)
    ax.set_ylabel('Complexity', fontsize=12)
    ax.set_xlim(20, 90)
    ax.set_ylim(25, 80)

    ax.grid(True, linestyle=':', alpha=0.6)
    ax.legend(prop={'size': 10, 'family':'Arial'}, loc='upper right', title='Theme Category')

    plt.tight_layout()
    plt.savefig('charts/Theme_Relationship_Network.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Technique timeline
def plot_technique_timeline():
    """Render a stacked-area chart of decoration techniques per period.

    The values are hard-coded. Each technique is drawn as a stacked area and,
    in the same colour, as a marker line. The figure is written to
    ``charts/Technique_Timeline.png``.
    """
    periods = ['1600-1650', '1650-1700', '1700-1750', '1750-1800', '1800-1850', '1850-1900']
    # (legend label, colour, value per period)
    techniques = [
        ('Underglaze Blue', '#4A6FA5', np.array([30, 65, 75, 70, 60, 55])),
        ('Famille Rose', '#D28A67', np.array([0, 10, 35, 45, 50, 40])),
        ('Famille Verte', '#6D8B74', np.array([15, 25, 30, 20, 15, 10])),
    ]
    names = [name for name, _, _ in techniques]
    shades = [shade for _, shade, _ in techniques]
    series = [values for _, _, values in techniques]

    plt.figure(figsize=(12, 6))
    plt.stackplot(periods, *series, labels=names, colors=shades, alpha=0.7)
    # The marker lines trace each series' own values, not the stacked totals.
    for shade, values in zip(shades, series):
        plt.plot(periods, values, color=shade, marker='o')
    plt.title("Material Technique Timeline", fontsize=16, fontweight='bold')
    plt.xlabel("Time Period", fontsize=12)
    plt.ylabel("Usage (%)", fontsize=12)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig('charts/Technique_Timeline.png', dpi=300, bbox_inches='tight')
    plt.close()

if __name__ == "__main__":
    data = load_data(DATA_PATH)

    # Every plot function saves into this directory.
    os.makedirs("charts", exist_ok=True)

    if data:
        plot_year_interval(data)
        plot_top_providers(data)
        plot_porcelain_type_distribution(data)
        plot_preservation_condition(data)
        plot_keyword_cloud(data)
    else:
        # load_data returns an empty list on failure; the data-driven charts
        # have nothing to draw (and some cannot render an empty dataset).
        print("⚠️ No records available: skipping data-driven charts")

    # These charts use hard-coded values and do not depend on the dataset.
    plot_theme_relationship_network()
    plot_technique_timeline()
    plot_scene_timeline()

    if data:
        print("✅ Analysis complete: All charts have been generated")
    else:
        print("⚠️ Analysis incomplete: only the static charts were generated")