# Roman Numeral Service - Data Exploration

This notebook provides exploratory data analysis (EDA) for the Roman Numeral Service data platform.

## Contents
1. Connect to PostgreSQL (OLTP)
2. Explore conversion_request table
3. Basic statistics and visualizations
4. Data profiling with ydata-profiling


In [None]:
# Import required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text

# Pandas display: never truncate columns, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (both affect the colour cycle, so the order is kept as-is)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

print("Libraries loaded successfully!")


In [None]:
# Database connection
#
# The connection URL is taken from the DATABASE_URL environment variable when
# it is set, so credentials do not have to live in the notebook source. When
# the variable is absent, the previous hard-coded URL is used as the default,
# which keeps existing setups working unchanged.
import os

DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://romannumeral:romannumeral_secret@postgres:5432/romannumeral",
)

engine = create_engine(DATABASE_URL)

# Test connection: run a trivial query and print the server version string.
# The `with` block returns the connection to the pool even if the query fails.
with engine.connect() as conn:
    result = conn.execute(text("SELECT version()"))
    # scalar() yields the first column of the first row (None if no row)
    print(f"Connected to: {result.scalar()}")


In [None]:
# Enumerate the tables in the `public` schema, sorted by name.
# The SQL text is assembled from explicit pieces so that its exact whitespace
# (including trailing spaces) is visible in the source.
tables_query = (
    "\n"
    "SELECT table_name \n"
    "FROM information_schema.tables \n"
    "WHERE table_schema = 'public'\n"
    "ORDER BY table_name;\n"
)

tables = pd.read_sql(sql=tables_query, con=engine)

print("Available tables:")
display(tables)


In [None]:
# Pull the 10,000 most recent rows of conversion_request into a DataFrame
query = """
SELECT 
    id,
    user_id,
    input_value,
    output_value,
    request_timestamp,
    client_ip
FROM conversion_request
ORDER BY request_timestamp DESC
LIMIT 10000;
"""

df = pd.read_sql(sql=query, con=engine)

print(f"Loaded {len(df)} records")

# Last expression of the cell: the notebook renders the first ten rows
df.head(10)


In [None]:
# Summary statistics
# Print headline figures for the loaded sample, or a hint when it is empty.
if df.empty:
    print("No data available. Generate some conversions first!")
else:
    banner = "=" * 50
    print(banner)
    print("DATASET SUMMARY")
    print(banner)

    print(f"Total records: {len(df):,}")
    print(f"Unique users: {df['user_id'].nunique():,}")
    print(f"Date range: {df['request_timestamp'].min()} to {df['request_timestamp'].max()}")

    print("\nInput value statistics:")
    display(df['input_value'].describe())


In [None]:
# Bar chart of the 20 input values that occur most often in the sample.
# Skipped entirely when no rows were loaded.
if not df.empty:
    top_numbers = df['input_value'].value_counts().head(20)

    # Explicit figure/axes pair; the bars are drawn onto `ax`
    fig, ax = plt.subplots(figsize=(12, 6))
    top_numbers.plot.bar(ax=ax, color='steelblue')

    ax.set_title('Top 20 Most Converted Numbers', fontsize=14)
    ax.set_xlabel('Input Value')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)

    fig.tight_layout()
    plt.show()
