# Question 2: How do One-Hit Wonders perform compared to Superstars on the Billboard Hot 100?

**Research Question:** How do one-hit wonders perform compared to superstars on the Billboard Hot 100 charts?

This analysis explores patterns in musical success by comparing:
- **One-Hit Wonders**: Artists with exactly one unique song on the Hot 100
- **Superstars**: Artists with 5+ unique songs on the Hot 100

Artists with 2–4 unique songs fall into neither group and are excluded from the comparison.

We'll examine chart performance metrics, longevity, and statistical differences.

## 1. Load and Explore the Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter the cell output.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the chart data into a DataFrame.
df = pd.read_csv('../data/Hot Stuff.csv')  # Update path to your data location

# Quick look at the frame: overall size, then schema, then the first ten rows.
print("Dataset Shape:", df.shape)
for heading, preview in (
    ("\nColumn Names and Data Types:", df.dtypes),
    ("\nFirst Few Rows:", df.head(10)),
):
    print(heading)
    print(preview)

## 2. Data Cleaning and Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Parse the chart week into real timestamps.
df_clean['WeekID'] = pd.to_datetime(df_clean['WeekID'])

# Force the chart-metric columns to numeric; anything unparseable becomes NaN.
for column in ('Week Position', 'Peak Position', 'Weeks on Chart', 'Previous Week Position'):
    df_clean[column] = pd.to_numeric(df_clean[column], errors='coerce')

# Calendar helpers for time-based analysis: the year of the chart week and
# the decade it falls in (e.g. 1987 -> 1980).
week_year = df_clean['WeekID'].dt.year
df_clean['Year'] = week_year
df_clean['Decade'] = (week_year // 10) * 10

# Trim stray whitespace so the same performer is not counted twice.
df_clean['Performer'] = df_clean['Performer'].str.strip()

# Discard rows missing any field the analysis relies on.
# 'Previous Week Position' is deliberately not required here.
required_columns = ['Week Position', 'Peak Position', 'Weeks on Chart', 'Performer']
df_clean = df_clean.dropna(subset=required_columns)

print(f"Records after cleaning: {len(df_clean)}")
print(f"Unique performers: {df_clean['Performer'].nunique()}")

## 3. Classify Artists as One-Hit Wonders or Superstars

In [None]:
# One row per performer with the number of distinct song titles they charted.
artist_song_counts = (
    df_clean.groupby('Performer')['Song']
    .nunique()
    .rename('Unique_Songs')
    .reset_index()
)

# Label every performer: exactly 1 song -> 'One-Hit Wonder',
# 5 or more -> 'Superstar', everything in between stays 'Other'.
artist_classification = artist_song_counts.copy()
song_totals = artist_classification['Unique_Songs']
artist_classification['Category'] = 'Other'
artist_classification.loc[song_totals == 1, 'Category'] = 'One-Hit Wonder'
artist_classification.loc[song_totals >= 5, 'Category'] = 'Superstar'

# Report how many performers landed in each category, as counts and as
# a percentage of all performers.
category_counts = artist_classification['Category'].value_counts()

print("Artist Classification Summary:")
print(category_counts)
print("\nPercentage Distribution:")
print((category_counts / len(artist_classification) * 100).round(2))

## 4. Compare Chart Performance Metrics

In [None]:
# Attach each performer's category to every chart row.
# Any 'Category' column left over from a previous run of this cell is dropped
# first; otherwise the merge would produce 'Category_x'/'Category_y' and the
# 'Category' lookups below would raise a KeyError on re-execution.
df_clean = df_clean.drop(columns='Category', errors='ignore').merge(
    artist_classification[['Performer', 'Category']], on='Performer', how='left'
)

# Keep only the two groups being compared ('Other' performers are excluded).
df_analysis = df_clean[df_clean['Category'].isin(['One-Hit Wonder', 'Superstar'])].copy()

is_one_hit = df_analysis['Category'] == 'One-Hit Wonder'
is_superstar = df_analysis['Category'] == 'Superstar'

# NOTE(review): these means are taken over the rows of df_analysis. The data
# looks like one row per song per chart week (WeekID / Week Position columns),
# in which case songs with long chart runs contribute more rows and weigh more
# heavily than short-lived ones. Confirm this row-level weighting is intended,
# or aggregate to one row per song first.

# Longevity: average 'Weeks on Chart' per group and how many times larger the
# superstar average is.
one_hit_avg_weeks = df_analysis.loc[is_one_hit, 'Weeks on Chart'].mean()
superstar_avg_weeks = df_analysis.loc[is_superstar, 'Weeks on Chart'].mean()
longevity_ratio = superstar_avg_weeks / one_hit_avg_weeks

# Peak position: a lower number is a better rank, so a positive difference
# means superstars peak higher on the chart.
one_hit_avg_peak = df_analysis.loc[is_one_hit, 'Peak Position'].mean()
superstar_avg_peak = df_analysis.loc[is_superstar, 'Peak Position'].mean()
peak_position_diff = one_hit_avg_peak - superstar_avg_peak

# Horizontal rule framing the report (the original literal was mis-encoded).
rule = '─' * 43

print("KEY FINDINGS:")
print(rule)
print(f"Superstars' hits stay on chart {longevity_ratio:.2f}× LONGER than one-hit wonders")
print(f"  • One-Hit Wonders avg: {one_hit_avg_weeks:.1f} weeks")
print(f"  • Superstars avg: {superstar_avg_weeks:.1f} weeks")
print(f"\nSuperstars achieve {peak_position_diff:.1f} positions HIGHER peak rankings")
print(f"  • One-Hit Wonders avg peak: {one_hit_avg_peak:.1f}")
print(f"  • Superstars avg peak: {superstar_avg_peak:.1f} (lower is better)")
print(rule)

## 5. Side-by-Side Boxplot Visualization

In [None]:
# Side-by-side boxplots of peak position and chart longevity per category.

# Fix the left-to-right category order explicitly. Without `order=`, seaborn
# places boxes in order of first appearance in the data, while groupby() sorts
# its keys, so mean markers drawn at x=[0, 1] could land on the wrong box.
category_order = ['One-Hit Wonder', 'Superstar']
box_colors = ['#FF6B6B', '#4ECDC4']  # one colour per entry in category_order
marker_x = list(range(len(category_order)))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, metric column, panel title, invert y-axis?)
panels = [
    (axes[0], 'Peak Position', 'Peak Chart Position Comparison\n(Lower is Better)', True),
    (axes[1], 'Weeks on Chart', 'Chart Longevity Comparison\n(Weeks on Chart)', False),
]

for ax, metric, title, invert in panels:
    # NOTE(review): newer seaborn releases warn about `palette` without `hue`;
    # kept as-is here for compatibility with older versions.
    sns.boxplot(
        data=df_analysis,
        x='Category',
        y=metric,
        order=category_order,
        ax=ax,
        palette=box_colors,
    )
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Artist Category', fontsize=11)
    ax.set_ylabel(metric, fontsize=11)
    if invert:
        # Rank 1 is the best position, so put small numbers at the top.
        ax.invert_yaxis()
    ax.grid(axis='y', alpha=0.3)

    # Mean markers, reindexed so each mean sits on its own category's box.
    means = df_analysis.groupby('Category')[metric].mean().reindex(category_order)
    ax.scatter(x=marker_x, y=means.values, color='black', s=100, zorder=3, marker='D', label='Mean')
    ax.legend()

plt.suptitle('One-Hit Wonders vs Superstars — Key Chart Performance Metrics', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("✓ Visualization complete!")

## 6. Statistical Significance Testing

In [None]:
# Two-sided Mann-Whitney U tests comparing one-hit wonders with superstars on
# peak position and on weeks on chart.

is_one_hit = df_analysis['Category'] == 'One-Hit Wonder'
is_superstar = df_analysis['Category'] == 'Superstar'

# Per-group samples for each metric.
one_hit_peak = df_analysis.loc[is_one_hit, 'Peak Position']
superstar_peak = df_analysis.loc[is_superstar, 'Peak Position']

one_hit_weeks = df_analysis.loc[is_one_hit, 'Weeks on Chart']
superstar_weeks = df_analysis.loc[is_superstar, 'Weeks on Chart']

# NOTE(review): the samples are the rows of df_analysis. If those are weekly
# chart entries, rows belonging to the same song are not independent and the
# p-values will be optimistic -- confirm, or test on one row per song.
u_stat_peak, p_value_peak = stats.mannwhitneyu(one_hit_peak, superstar_peak, alternative='two-sided')
u_stat_weeks, p_value_weeks = stats.mannwhitneyu(one_hit_weeks, superstar_weeks, alternative='two-sided')

# Significance threshold applied to both tests.
alpha = 0.05
banner = '=' * 80

print(banner)
print('STATISTICAL SIGNIFICANCE TESTING')
print(banner)

# The check/cross glyphs were mis-encoded in the original literals.
for metric_label, p_value in (('Peak Position', p_value_peak), ('Weeks on Chart', p_value_weeks)):
    verdict = '✓ SIGNIFICANT' if p_value < alpha else '✗ NOT significant'
    print(f'\n{metric_label} - Mann-Whitney U Test')
    print(f'  P-value: {p_value:.2e}')
    print(f'  Result: {verdict}')

print(banner)

## 7. Summary and Conclusion

In [None]:
# Final written summary. Every figure is read from the variables computed in
# the metric and significance cells above, so the text always matches the data.
banner = '=' * 80

print('\n' + banner)
print('FINAL ANALYSIS SUMMARY: ONE-HIT WONDERS VS SUPERSTARS')
print(banner)

print('\n🎵 RESEARCH QUESTION ANSWERED:\n')
print('How do one-hit wonders perform compared to superstars on the charts?')
print(f"\nANSWER: Superstars' average hit stays on the chart {longevity_ratio:.2f}× LONGER")
print(f"\n  • One-hit wonders: ~{one_hit_avg_weeks:.0f} weeks average")
print(f"  • Superstars: ~{superstar_avg_weeks:.0f} weeks average")
# The '+' format flag prints the real sign, so a negative gap is not shown as "+-N".
print(f"  • Difference: {superstar_avg_weeks - one_hit_avg_weeks:+.0f} additional weeks\n")

print('Peak Position Advantage:')
print(f"  • Superstars: {superstar_avg_peak:.1f} average peak position")
print(f"  • One-Hit Wonders: {one_hit_avg_peak:.1f} average peak position")
print(f"  • Superstars achieve {peak_position_diff:.1f} positions HIGHER peak ranking\n")

# Report the computed p-values instead of a hard-coded "p < 0.001 (SIGNIFICANT)",
# which would be printed even if the tests had found no significant difference.
print('Statistical Significance:')
for metric_label, p_value in (('Peak Position', p_value_peak), ('Weeks on Chart', p_value_weeks)):
    p_text = 'p < 0.001' if p_value < 0.001 else f'p = {p_value:.3f}'
    if p_value < 0.05:
        print(f'  ✓ {metric_label} difference: {p_text} (SIGNIFICANT)')
    else:
        print(f'  ✗ {metric_label} difference: {p_text} (NOT significant)')
print()

print(banner)