# PART 1: POKEMON CARD PRICING ANALYSIS

---

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set consistent style with minimal colors
sns.set_style('whitegrid')
# NOTE: set_palette also installs the palette as matplotlib's default colour
# cycle, so with a single entry every series drawn without an explicit
# color= (e.g. several ax.plot lines on one axes) comes out steelblue.
sns.set_palette(['steelblue'])  # Single consistent color
# Default figure size in inches; cells that pass figsize= override it.
plt.rcParams['figure.figsize'] = (14, 8)

# IPython magic (not Python syntax): render figures inline in the notebook.
%matplotlib inline

In [None]:
# Read the Pokemon price export
df = pd.read_csv('final_dataset.csv')

# Parse the dates once, then derive the calendar helper columns from them
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['YearMonth'] = date_parts.to_period('M')

# Drop sealed product in one pass: rows whose rarity is 'Booster Box', and
# rows whose card name mentions Booster/Pack/Box (case-insensitive; rows with
# a missing card name are kept)
is_booster_rarity = df['Rarity'] == 'Booster Box'
is_sealed_name = df['Card Name'].str.contains('Booster|Pack|Box', case=False, na=False)
df = df[~is_booster_rarity & ~is_sealed_name]

# Price columns analysed by the cells below
price_cols = ['New', 'Used', 'Graded']

# Short load report
earliest = df['Date'].min().strftime('%B %Y')
latest = df['Date'].max().strftime('%B %Y')
print("Dataset loaded successfully!")
print(f"Total records (after filtering Booster Box): {len(df):,}")
print(f"Date range: {earliest} - {latest}")

## 2. Price Distribution by Condition

In [None]:
# One histogram panel per price column, with mean and median markers
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Price Distribution by Condition', fontsize=16, fontweight='bold')

for ax, col in zip(axes, price_cols):
    # Keep only strictly positive prices for this condition
    data = df.loc[df[col] > 0, col]
    mean_price = data.mean()
    median_price = data.median()

    ax.hist(data, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax.set_title(f'{col} (n={len(data):,})')
    ax.set_xlabel('Price ($)')
    ax.set_ylabel('Frequency')

    # Dashed reference lines: mean in red, median in green
    ax.axvline(mean_price, color='red', linestyle='--',
               label=f'Mean: ${mean_price:,.0f}')
    ax.axvline(median_price, color='green', linestyle='--',
               label=f'Median: ${median_price:,.0f}')
    ax.legend(fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Text summary of the strictly positive prices in each price column
print("Price Statistics by Condition (Excluding Zeros):")
print("="*70)
for col in price_cols:
    non_zero = df.loc[df[col] > 0, col]
    if non_zero.empty:
        # Nothing to summarise for this condition
        continue

    dollar_stats = [
        ('Mean', non_zero.mean()),
        ('Median', non_zero.median()),
        ('Std Dev', non_zero.std()),
        ('Min', non_zero.min()),
        ('Max', non_zero.max()),
    ]
    print(f"\n{col}:")
    for label, value in dollar_stats:
        print(f"  {label}: ${value:,.2f}")
    print(f"  Count: {len(non_zero):,}")

## 3. Top 15 Most Valuable Pokemon Cards

In [None]:
# Rank cards by their mean graded price (rows with a positive graded price
# only) and keep the 15 highest
graded_rows = df[df['Graded'] > 0]
mean_graded_by_card = graded_rows.groupby('Card Name')['Graded'].mean()
card_avg_prices = mean_graded_by_card.sort_values(ascending=False).head(15)

# Horizontal bars, most valuable card at the top
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = range(len(card_avg_prices))
ax.barh(bar_positions, card_avg_prices.values,
        color='steelblue', edgecolor='black', alpha=0.8)
ax.set_yticks(bar_positions)
ax.set_yticklabels(card_avg_prices.index)
ax.invert_yaxis()
ax.set_title('Top 15 Most Valuable Pokemon Cards (Graded)')
ax.set_xlabel('Average Graded Price ($)')
# Show the x axis as whole dollars with thousands separators
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'${value:,.0f}'))
plt.tight_layout()
plt.show()

## 4. Top 10 Pokemon Sets by Average Graded Price

In [None]:
# Rank sets by their mean graded price (rows with a positive graded price
# only) and keep the 10 highest
graded_rows = df[df['Graded'] > 0]
mean_graded_by_set = graded_rows.groupby('Set Name')['Graded'].mean()
set_avg_prices = mean_graded_by_set.sort_values(ascending=False).head(10)

# Vertical bars with slanted set names so long labels stay readable
fig, ax = plt.subplots(figsize=(12, 7))
bar_positions = range(len(set_avg_prices))
bars = ax.bar(bar_positions, set_avg_prices.values,
              color='steelblue', edgecolor='black', alpha=0.8)
ax.set_title('Top 10 Pokemon Sets by Average Graded Price')
ax.set_xlabel('Set Name')
ax.set_ylabel('Average Graded Price ($)')
ax.set_xticks(bar_positions)
ax.set_xticklabels(set_avg_prices.index, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
# Show the y axis as whole dollars with thousands separators
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'${value:,.0f}'))

plt.tight_layout()
plt.show()

## 5. Price Distribution by Condition (Boxplot)

In [None]:
# Boxplot comparing the price spread of each condition
fig, ax = plt.subplots(figsize=(14, 7))

# Long-form table: one row per strictly positive price, tagged with the
# price column it came from. melt stacks the columns in price_cols order.
stacked_prices = df[price_cols].melt(var_name='Condition', value_name='Price')
positive_rows = stacked_prices['Price'] > 0
boxplot_df = stacked_prices.loc[positive_rows, ['Price', 'Condition']].reset_index(drop=True)

sns.boxplot(data=boxplot_df, x='Condition', y='Price', ax=ax, color='steelblue')
ax.set_title('Price Distribution by Condition (Excluding Zeros)')
ax.set_xlabel('Condition')
ax.set_ylabel('Price ($)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Log scale because prices span several orders of magnitude
ax.set_yscale('log')
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'${value:,.0f}'))
plt.tight_layout()
plt.show()

---
## Pokemon Card Growth Analysis
---

In [None]:
# Load Pokemon data from disk again so the growth analysis does not depend on
# whatever `df` currently holds.
df_pk = pd.read_csv('final_dataset.csv')
df_pk['Date'] = pd.to_datetime(df_pk['Date'])

# Keep rows with a positive graded price and drop sealed product. Both
# exclusion rules from the pricing analysis are applied (rarity 'Booster Box'
# AND booster/pack/box card names) so the two sections agree on what counts
# as a card; previously only the name rule was applied here.
has_graded_price = df_pk['Graded'] > 0
is_booster_rarity = df_pk['Rarity'] == 'Booster Box'
is_sealed_name = df_pk['Card Name'].str.contains('Booster|Pack|Box', case=False, na=False)
df_pk = df_pk[has_graded_price & ~is_booster_rarity & ~is_sealed_name]

print(f"Total records: {len(df_pk):,}")
print(f"Unique cards: {df_pk['Card Name'].nunique()}")
print(f"Date range: {df_pk['Date'].min().strftime('%Y-%m')} to {df_pk['Date'].max().strftime('%Y-%m')}")

In [None]:
# Calculate growth for each card: percentage change in graded price between
# the card's earliest and latest observation.
# NOTE(review): rows are grouped by 'Card Name' alone; if the same name occurs
# in more than one set, those rows are treated as one card - confirm that is
# intended.
growth_columns = ['Card', 'First Price', 'Last Price', 'Growth %', 'Months']
growth_results = []

# One groupby pass instead of re-filtering df_pk for every card.
# sort=False keeps the cards in first-appearance order.
for card, card_rows in df_pk.groupby('Card Name', sort=False):
    # Stable sort: rows sharing a date keep their file order, so the
    # first/last observation is the same on every run.
    card_data = card_rows.sort_values('Date', kind='mergesort')
    first_obs = card_data.iloc[0]
    last_obs = card_data.iloc[-1]
    first_price = first_obs['Graded']
    last_price = last_obs['Graded']
    elapsed_days = (last_obs['Date'] - first_obs['Date']).days

    # Growth is only meaningful across a positive time span (this also
    # rejects single-row cards and missing dates) and from a positive base
    # price (avoids dividing by zero).
    if not elapsed_days > 0 or not first_price > 0:
        continue

    growth_results.append({
        'Card': card,
        'First Price': first_price,
        'Last Price': last_price,
        'Growth %': ((last_price - first_price) / first_price) * 100,
        'Months': elapsed_days / 30.44  # mean month length in days
    })

# Passing the columns explicitly keeps the sort working when no card qualifies
# (an empty list would otherwise produce a frame without a 'Growth %' column).
pk_growth_df = pd.DataFrame(growth_results, columns=growth_columns).sort_values(
    'Growth %', ascending=False, kind='mergesort')
print(f"\nTop 5 Cards by Growth:")
print(pk_growth_df.head())

In [None]:
# Visualize top 15 growers as horizontal bars (largest growth at the top)
fig, ax = plt.subplots(figsize=(12, 8))

top_15 = pk_growth_df.head(15).sort_values('Growth %')
growth_values = top_15['Growth %']
# Green for gains, red for flat or falling prices
colors = ['green' if x > 0 else 'red' for x in growth_values]

bar_positions = range(len(top_15))
ax.barh(bar_positions, growth_values, color=colors, alpha=0.7, edgecolor='black')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_15['Card'])
ax.set_xlabel('Growth %', fontsize=12)
ax.set_title('Top 15 Pokemon Cards by Price Growth % (Graded)', fontsize=14, fontweight='bold')
ax.axvline(0, color='black', linewidth=0.8)
ax.grid(axis='x', alpha=0.3)

# Add percentage labels just beyond the end of each bar. The gap is 2% of the
# largest bar magnitude so it scales with the data instead of being a fixed
# 5 units; it is computed once, outside the loop.
label_pad = growth_values.abs().max() * 0.02
for i, v in enumerate(growth_values):
    if v > 0:
        # Positive bar: label starts to the right of the bar end
        ax.text(v + label_pad, i, f"{v:.1f}%", ha='left',
                va='center', fontsize=9, fontweight='bold')
    else:
        # Non-positive bar: right-align so the label ends left of the bar
        # instead of being drawn on top of it
        ax.text(v - label_pad, i, f"{v:.1f}%", ha='right',
                va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Price Evolution Over Time - Top 3 Pokemon Cards
fig, ax = plt.subplots(figsize=(14, 8))

# Get top 3 cards by growth
top_3_cards = pk_growth_df.head(3)['Card'].values

# The notebook installs a one-colour palette (sns.set_palette(['steelblue'])),
# which is also matplotlib's default colour cycle. Without an explicit colour
# per line, all three series and their legend entries would be drawn in the
# same steelblue and could not be told apart.
line_colors = ['steelblue', 'darkorange', 'seagreen']

for card, line_color in zip(top_3_cards, line_colors):
    # Chronological graded-price series for this card
    card_data = df_pk[df_pk['Card Name'] == card].sort_values('Date')
    ax.plot(card_data['Date'], card_data['Graded'],
            marker='o', linewidth=2.5, markersize=6,
            color=line_color, label=card, alpha=0.8)

ax.set_xlabel('Date', fontsize=12, fontweight='bold')
ax.set_ylabel('Graded Price ($)', fontsize=12, fontweight='bold')
ax.set_title('Price Evolution: Top 3 Pokemon Cards by Growth',
             fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10, framealpha=0.9)
ax.grid(True, alpha=0.3)
# Whole dollars with thousands separators on the y axis
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

# Rotate x-axis labels so the dates do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()


---

# PART 2: STAR WARS ACTION FIGURES ANALYSIS

---

In [None]:
# Read the Star Wars sales export.
# NOTE: this rebinds `df`; from here on it refers to the Star Wars table.
df = pd.read_csv('starwars_filtered.csv')
record_count = len(df)
print("Dataset loaded successfully!")
print(f"Total records: {record_count:,}")

## Star Wars Price Distribution by Grading

In [None]:
# Side-by-side selling-price histograms: graded/certified vs not graded
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('Star Wars Price Distribution by Grading', fontsize=16, fontweight='bold')

# (authenticity_n value, panel title) in left-to-right panel order
segments = ((1, 'Graded/Certified'), (0, 'Not Graded'))

for ax, (auth, title) in zip(axes, segments):
    # Selling prices for this grading segment
    data = df.loc[df['authenticity_n'] == auth, 'selling_price']
    mean_price = data.mean()
    median_price = data.median()

    ax.hist(data, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Selling Price ($)', fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)

    # Dashed reference lines: mean in red, median in green
    ax.axvline(mean_price, color='red', linestyle='--', linewidth=2,
               label=f'Mean: ${mean_price:.2f}')
    ax.axvline(median_price, color='green', linestyle='--', linewidth=2,
               label=f'Median: ${median_price:.2f}')

    # Sample size in the top-right corner (axes-relative coordinates)
    count_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax.text(0.98, 0.97, f'n = {len(data):,}',
            transform=ax.transAxes, fontsize=10,
            verticalalignment='top', horizontalalignment='right',
            bbox=count_box)

    ax.legend(fontsize=9)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Top 10 Most Valuable Figures by Grading Status

In [None]:
# Ten highest average selling prices per grading segment, restricted to
# figures with at least five price records
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# (authenticity_n value, panel title) in left-to-right panel order
segments = ((1, 'Graded/Certified'), (0, 'Not Graded'))

for ax, (auth, title) in zip(axes, segments):
    subset = df[df['authenticity_n'] == auth]

    # Mean price and number of price records per figure
    figure_stats = subset.groupby('figure')['selling_price'].agg(['mean', 'count'])
    figure_stats.columns = ['avg_price', 'count']
    # Drop thinly traded figures so one sale cannot top the ranking
    figure_stats = figure_stats[figure_stats['count'] >= 5]
    top_10 = figure_stats.nlargest(10, 'avg_price')

    # Horizontal bars, most valuable figure at the top
    y_pos = np.arange(len(top_10))
    ax.barh(y_pos, top_10['avg_price'], color='steelblue', alpha=0.8, edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(top_10.index, fontsize=9)
    ax.invert_yaxis()
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Average Selling Price ($)', fontsize=10)
    ax.grid(axis='x', alpha=0.3)

    # Write the record count next to the end of each bar
    for row_pos, (avg_price, sale_count) in enumerate(zip(top_10['avg_price'], top_10['count'])):
        ax.text(avg_price + 10, row_pos, f"n={int(sale_count)}",
                va='center', fontsize=10, color='black')

fig.suptitle('Top 10 Most Valuable Figures by Grading Status (min 5 sales)',
             fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

## Top 10 Best Selling Figures by Grading Status

In [None]:
# Top 10 Best Selling Figures by Grading Status (by total sales volume)
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Define the two combinations: (authenticity_n value, panel title, axes)
combinations = [
    (1, 'Graded/Certified', axes[0]),
    (0, 'Not Graded', axes[1])
]

for auth, title, ax in combinations:
    # Filter data for this segment
    subset = df[df['authenticity_n'] == auth]

    # One groupby pass yields both the summed 'sales' volume and the mean
    # selling price per figure; the previous version re-filtered `subset`
    # once per figure to get the average price.
    plot_data = (
        subset.groupby('figure')
        .agg(total_sales=('sales', 'sum'), avg_price=('selling_price', 'mean'))
        .sort_values('total_sales', ascending=False)
        .head(10)
        .reset_index()
    )

    # Plot: horizontal bars, best seller at the top
    y_pos = np.arange(len(plot_data))
    ax.barh(y_pos, plot_data['total_sales'], color='steelblue', alpha=0.8, edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(plot_data['figure'], fontsize=9)
    ax.set_xlabel('Total Sales Volume', fontsize=10)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.3)

    # Add average price annotations. The gap (2% of the longest bar) is the
    # same for every label, so it is computed once instead of per iteration.
    label_pad = plot_data['total_sales'].max() * 0.02
    for i, (total_sales, avg_price) in enumerate(zip(plot_data['total_sales'], plot_data['avg_price'])):
        ax.text(total_sales + label_pad, i,
                f"${avg_price:.0f}",
                va='center', fontsize=10, color='black', fontweight='normal')

fig.suptitle('Top 10 Best Selling Figures by Grading Status (by total sales volume)',
             fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

---
## Star Wars Figure Growth Analysis
---

In [None]:
# Read the Star Wars export again for the growth analysis
df_sw = pd.read_csv('starwars_filtered.csv')

# Keep rows flagged authenticity_n == 1 that also have a positive selling price
is_authentic = df_sw['authenticity_n'] == 1
has_price = df_sw['selling_price'] > 0
df_sw = df_sw[is_authentic & has_price]

# Short load report
first_year_seen = df_sw['year'].min()
last_year_seen = df_sw['year'].max()
print(f"Total records: {len(df_sw):,}")
print(f"Unique figures: {df_sw['figure'].nunique()}")
print(f"Year range: {first_year_seen} to {last_year_seen}")

In [None]:
# Calculate growth for each figure: percentage change between the mean selling
# price in the figure's first year and the mean in its last year.
sw_growth_columns = ['Figure', 'First Price', 'Last Price', 'Growth %', 'Years']
sw_growth_results = []

# One groupby pass instead of re-filtering df_sw for every figure.
# sort=False keeps the figures in first-appearance order.
for figure, figure_data in df_sw.groupby('figure', sort=False):
    # Mean selling price per year; groupby returns the years in ascending order
    yearly_avg = figure_data.groupby('year')['selling_price'].mean()

    # Require two DISTINCT years. Counting rows (the previous check) let
    # figures with several sales in a single year through, which produced a
    # meaningless 0% growth over 0 years and diluted the median and counts.
    if len(yearly_avg) < 2:
        continue

    first_year, last_year = yearly_avg.index[0], yearly_avg.index[-1]
    first_price, last_price = yearly_avg.iloc[0], yearly_avg.iloc[-1]

    # A positive base price is needed to avoid dividing by zero
    if not first_price > 0:
        continue

    sw_growth_results.append({
        'Figure': figure,
        'First Price': first_price,
        'Last Price': last_price,
        'Growth %': ((last_price - first_price) / first_price) * 100,
        'Years': last_year - first_year
    })

# Passing the columns explicitly keeps the sort working when no figure
# qualifies (an empty list would otherwise give a frame with no 'Growth %').
sw_growth_df = pd.DataFrame(sw_growth_results, columns=sw_growth_columns).sort_values(
    'Growth %', ascending=False, kind='mergesort')
print(f"\nTop 5 Figures by Growth:")
print(sw_growth_df.head())

In [None]:
# Visualize top 15 growers as horizontal bars (largest growth at the top)
fig, ax = plt.subplots(figsize=(12, 8))

top_15_sw = sw_growth_df.head(15).sort_values('Growth %')
growth_values = top_15_sw['Growth %']
# Green for gains, red for flat or falling prices
colors = ['green' if x > 0 else 'red' for x in growth_values]

bar_positions = range(len(top_15_sw))
ax.barh(bar_positions, growth_values, color=colors, alpha=0.7, edgecolor='black')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_15_sw['Figure'], fontsize=9)
ax.set_xlabel('Growth %', fontsize=12)
ax.set_title('Top 15 Star Wars Figures by Price Growth % (Authentic)', fontsize=14, fontweight='bold')
ax.axvline(0, color='black', linewidth=0.8)
ax.grid(axis='x', alpha=0.3)

# Add percentage labels just beyond the end of each bar. The gap is 2% of the
# largest bar MAGNITUDE, computed once: using the signed maximum (as before)
# flips the offset direction when every value is negative, and it was
# recomputed on every iteration.
label_pad = growth_values.abs().max() * 0.02
for i, v in enumerate(growth_values):
    if v > 0:
        # Positive bar: label starts to the right of the bar end
        ax.text(v + label_pad, i, f"{v:.1f}%", ha='left',
                va='center', fontsize=8, fontweight='bold')
    else:
        # Non-positive bar: right-align so the label ends left of the bar
        # instead of being drawn on top of it
        ax.text(v - label_pad, i, f"{v:.1f}%", ha='right',
                va='center', fontsize=8, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Price Evolution Over Time - Top 3 Star Wars Figures
fig, ax = plt.subplots(figsize=(14, 8))

# Get top 3 figures by growth
top_3_figures = sw_growth_df.head(3)['Figure'].values

# The notebook installs a one-colour palette (sns.set_palette(['steelblue'])),
# which is also matplotlib's default colour cycle. Without an explicit colour
# per line, all three series and their legend entries would be drawn in the
# same steelblue and could not be told apart.
line_colors = ['steelblue', 'darkorange', 'seagreen']

for figure, line_color in zip(top_3_figures, line_colors):
    # Mean selling price per year for this figure
    figure_data = df_sw[df_sw['figure'] == figure].groupby('year')['selling_price'].mean().reset_index()
    ax.plot(figure_data['year'], figure_data['selling_price'],
            marker='s', linewidth=2.5, markersize=7,
            color=line_color, label=figure, alpha=0.8)

ax.set_xlabel('Year', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Selling Price ($)', fontsize=12, fontweight='bold')
ax.set_title('Price Evolution: Top 3 Star Wars Figures by Growth (Authentic)',
             fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10, framealpha=0.9)
ax.grid(True, alpha=0.3)
# Dollars and cents with thousands separators on the y axis
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.2f}'))

plt.tight_layout()
plt.show()

---
## Summary: Cross-Market Comparison
---

In [None]:
# Headline comparison: the top-ranked row of each growth table, then the
# median growth and the share of items with positive growth in each market
print("HIGHEST GROWTH WINNERS")
print("=" * 80)

print("\nPokemon Card Winner:")
pk_winner = pk_growth_df.iloc[0]  # tables are sorted by 'Growth %' descending
pk_name = pk_winner['Card']
pk_pct = pk_winner['Growth %']
pk_months = pk_winner['Months']
print(f"  {pk_name}")
print(f"  Growth: {pk_pct:+.1f}% over {pk_months:.1f} months")
print(f"  ${pk_winner['First Price']:,.0f} → ${pk_winner['Last Price']:,.0f}")

print("\nStar Wars Figure Winner:")
sw_winner = sw_growth_df.iloc[0]
sw_name = sw_winner['Figure']
sw_pct = sw_winner['Growth %']
sw_years = int(sw_winner['Years'])
print(f"  {sw_name}")
print(f"  Growth: {sw_pct:+.1f}% over {sw_years} years")
print(f"  ${sw_winner['First Price']:,.2f} → ${sw_winner['Last Price']:,.2f}")

print("\n" + "=" * 80)
print("\nKey Observations:")
pk_growth = pk_growth_df['Growth %']
sw_growth = sw_growth_df['Growth %']
pk_median = pk_growth.median()
sw_median = sw_growth.median()
print(f"• Pokemon median growth: {pk_median:.1f}%")
print(f"• Star Wars median growth: {sw_median:.1f}%")
pk_gainers = (pk_growth > 0).sum()
print(f"• Pokemon cards with positive growth: {pk_gainers} of {len(pk_growth_df)}")
sw_gainers = (sw_growth > 0).sum()
print(f"• Star Wars figures with positive growth: {sw_gainers} of {len(sw_growth_df)}")