# Edo-Meiji Polysemy Analysis: Exploratory Notebook

This notebook explores and visualizes how word polysemy changed between Japanese literature of the Edo (1603–1868) and Meiji (1868–1912) periods.

## Setup

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add src to path.
# '../src' is resolved against the current working directory, so this assumes
# the kernel is running from the notebook's own folder. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')

# Import our modules (project-local helpers made importable by the path entry)
from utils import load_json, load_pickle

# Set plotting style: seaborn's white grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Setup complete!")

## 1. Load Polysemy Scores

Load the computed polysemy scores for both eras.

In [None]:
# Directory that holds the pipeline outputs (relative to the working directory)
results_dir = '../results'

# Read the per-era polysemy score tables
edo_scores_path = os.path.join(results_dir, 'edo_polysemy_scores.csv')
meiji_scores_path = os.path.join(results_dir, 'meiji_polysemy_scores.csv')
edo_df = pd.read_csv(edo_scores_path)
meiji_df = pd.read_csv(meiji_scores_path)

# Row counts of each table, reported as the number of analyzed words
print(f"Edo period: {len(edo_df)} words analyzed")
print(f"Meiji period: {len(meiji_df)} words analyzed")

# Preview the first rows of each table with the notebook's rich display
print("\nSample Edo data:")
edo_preview = edo_df.head()
display(edo_preview)

print("\nSample Meiji data:")
meiji_preview = meiji_df.head()
display(meiji_preview)

## 2. Descriptive Statistics

In [None]:
# Summary statistics for the three score columns of each era.
# display() renders the describe() table with the notebook's rich repr, the
# same way the data-loading cell shows its sample rows; print() would fall
# back to a plain-text dump.
score_columns = ['n_clusters', 'silhouette', 'polysemy_index']

print("Edo Period Statistics:")
display(edo_df[score_columns].describe())

print("\nMeiji Period Statistics:")
display(meiji_df[score_columns].describe())

## 3. Distribution Visualizations

In [None]:
# Polysemy index distributions: overlaid histograms (left) and box plots (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Drop missing scores up front: NaN would make np.histogram_bin_edges fail and
# would turn the box-plot statistics into NaN. Plain arrays are passed on so
# plotting does not depend on the (now gapped) pandas index.
edo_scores = edo_df['polysemy_index'].dropna().to_numpy()
meiji_scores = meiji_df['polysemy_index'].dropna().to_numpy()

# Histograms: both eras use the same bin edges, computed from the pooled
# values, so overlapping bars cover identical intervals and are comparable.
shared_bins = np.histogram_bin_edges(np.concatenate([edo_scores, meiji_scores]), bins=20)
axes[0].hist(edo_scores, bins=shared_bins, alpha=0.6, label='Edo', color='blue')
axes[0].hist(meiji_scores, bins=shared_bins, alpha=0.6, label='Meiji', color='red')
axes[0].set_xlabel('Polysemy Index')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Polysemy Index Distribution')
axes[0].legend()

# Box plots: tick labels are set on the axis instead of through boxplot's
# `labels` keyword, which newer Matplotlib releases deprecate in favour of
# `tick_labels`; set_xticklabels works on old and new versions alike.
data_to_plot = [edo_scores, meiji_scores]
axes[1].boxplot(data_to_plot)
axes[1].set_xticklabels(['Edo', 'Meiji'])
axes[1].set_ylabel('Polysemy Index')
axes[1].set_title('Polysemy Index by Era')

plt.tight_layout()
plt.show()

## 4. Statistical Comparison

In [None]:
# Load statistical comparison results
# Only the file load is guarded; the report is printed in the else branch so
# the handler covers nothing but the missing-file case.
try:
    stats_results = load_json(os.path.join(results_dir, 'statistical_comparison.json'))
except FileNotFoundError:
    print("Statistical comparison results not found. Run compare_eras.py first.")
else:
    print("Statistical Comparison Results:")
    print("=" * 50)
    print(f"Edo mean: {stats_results['edo_mean']:.3f}")
    print(f"Meiji mean: {stats_results['meiji_mean']:.3f}")
    print(f"Mean difference: {stats_results['mean_difference']:.3f}")
    print(f"\nCohen's d: {stats_results['cohens_d']:.3f} ({stats_results['effect_size_interpretation']})")
    # p-values use 4 significant digits (.4g) rather than 4 decimals (.4f):
    # with .4f any p-value below 0.00005 is printed as a misleading "0.0000".
    print(f"\nT-test p-value: {stats_results['t_pvalue']:.4g}")
    print(f"Significant at α=0.05: {stats_results['t_significant']}")
    print(f"\nMann-Whitney U p-value: {stats_results['mannwhitney_pvalue']:.4g}")
    print(f"Significant at α=0.05: {stats_results['mannwhitney_significant']}")

## 5. Word-Level Analysis

In [None]:
# Load word-level comparison
# Read the per-word Edo/Meiji comparison table and list the ten largest
# changes in each direction; a missing file is reported instead of raised.
try:
    comparison_path = os.path.join(results_dir, 'word_level_comparison.csv')
    comparison_df = pd.read_csv(comparison_path)

    # Columns shown for both rankings
    change_columns = ['word', 'polysemy_index_edo', 'polysemy_index_meiji', 'polysemy_change']

    print(f"Number of common words: {len(comparison_df)}")

    # Largest values of polysemy_change first
    print("\nTop 10 words with largest polysemy increase (Meiji > Edo):")
    top_increases = comparison_df.nlargest(10, 'polysemy_change')
    display(top_increases[change_columns])

    # Smallest values of polysemy_change first
    print("\nTop 10 words with largest polysemy decrease (Edo > Meiji):")
    top_decreases = comparison_df.nsmallest(10, 'polysemy_change')
    display(top_decreases[change_columns])
except FileNotFoundError:
    print("Word-level comparison not found. Run compare_eras.py first.")

## 6. Scatter Plot: Edo vs Meiji Polysemy

In [None]:
# Scatter plot of polysemy scores
# Only the lookup of comparison_df is guarded. Wrapping the whole plotting body
# in `except NameError` would report any misspelled name in it as
# "Comparison data not loaded." and hide the real error.
try:
    scatter_source = comparison_df
except NameError:
    scatter_source = None

if scatter_source is None:
    print("Comparison data not loaded.")
else:
    # One point per row: Edo score on x, Meiji score on y
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(scatter_source['polysemy_index_edo'],
               scatter_source['polysemy_index_meiji'],
               alpha=0.6)

    # Add diagonal line (no change): points above it scored higher in Meiji,
    # points below it scored higher in Edo
    max_val = max(scatter_source['polysemy_index_edo'].max(),
                  scatter_source['polysemy_index_meiji'].max())
    ax.plot([0, max_val], [0, max_val], 'r--', alpha=0.5, label='No change')

    ax.set_xlabel('Edo Polysemy Index')
    ax.set_ylabel('Meiji Polysemy Index')
    ax.set_title('Polysemy Comparison: Edo vs Meiji')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## 7. Cluster Count Analysis

In [None]:
# Cluster count distributions: one bar chart per era, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
edo_ax, meiji_ax = axes

# Number of words per cluster count, ordered by cluster count
edo_cluster_counts = edo_df['n_clusters'].value_counts().sort_index()
meiji_cluster_counts = meiji_df['n_clusters'].value_counts().sort_index()

# Edo panel (left)
edo_ax.bar(edo_cluster_counts.index, edo_cluster_counts.values, color='blue', alpha=0.7)
edo_ax.set(
    xlabel='Number of Clusters',
    ylabel='Number of Words',
    title='Edo Period: Cluster Count Distribution',
)

# Meiji panel (right)
meiji_ax.bar(meiji_cluster_counts.index, meiji_cluster_counts.values, color='red', alpha=0.7)
meiji_ax.set(
    xlabel='Number of Clusters',
    ylabel='Number of Words',
    title='Meiji Period: Cluster Count Distribution',
)

fig.tight_layout()
plt.show()

## 8. Case Study: Specific Words

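Examine specific words in detail. The cell below is a commented-out template: uncomment it and set `target_word` to compare how many contexts a word has in each era's embeddings.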

In [None]:
# Example: Look at a specific word's embeddings
# Uncomment and modify as needed

# target_word = '人'  # Example: 'person'
# 
# # Load embeddings
# edo_embeddings = load_pickle('../data/embeddings/edo_embeddings.pkl')
# meiji_embeddings = load_pickle('../data/embeddings/meiji_embeddings.pkl')
# 
# if target_word in edo_embeddings and target_word in meiji_embeddings:
#     print(f"Analyzing word: {target_word}")
#     print(f"Edo contexts: {edo_embeddings[target_word]['n_contexts']}")
#     print(f"Meiji contexts: {meiji_embeddings[target_word]['n_contexts']}")
#     
#     # Could add t-SNE visualization here
# else:
#     print(f"Word '{target_word}' not found in both corpora")

## 9. Export Results

Save any additional analysis or figures.

In [None]:
# Example: Save a custom figure
# NOTE(review): with the inline backend a figure is typically closed once its
# cell finishes, so this call probably needs to live in the cell that draws
# the figure (before plt.show()) — confirm before relying on it here.
# plt.savefig('../results/custom_analysis.png', dpi=300, bbox_inches='tight')
# Final marker so a full run visibly reaches the end of the notebook.
print("Analysis complete!")