# Question 5: What Early Signs Predict a Superstar Career?

**Research Question:** What early indicators from a debut can predict a multi-hit superstar career?

## 1. Setup and Data Preparation

In [None]:
# --- Environment -----------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Suppress library warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# Shared plot defaults for every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# --- Load and clean --------------------------------------------------------
df = pd.read_csv('../data/Hot Stuff.csv')

# Clean a copy so the raw frame `df` stays available unchanged.
df_clean = df.copy()
df_clean['WeekID'] = pd.to_datetime(df_clean['WeekID'])

# Coerce the chart metrics to numbers; unparseable values become NaN.
for _metric in ('Week Position', 'Peak Position', 'Weeks on Chart'):
    df_clean[_metric] = pd.to_numeric(df_clean[_metric], errors='coerce')

df_clean['Year'] = df_clean['WeekID'].dt.year
df_clean['Performer'] = df_clean['Performer'].str.strip()

# Keep only rows that carry the fields the analysis relies on.
df_clean = df_clean.dropna(
    subset=['Peak Position', 'Weeks on Chart', 'Performer']
)
print(f'Data loaded: {len(df_clean)} records')

## 2. Classify Artists and Build Predictive Dataset

In [None]:
# --- Classify performers by how many distinct songs they charted -----------
artist_song_counts = df_clean.groupby('Performer')['Song'].nunique().reset_index()
artist_song_counts.columns = ['Performer', 'Unique_Songs']

# Exactly one song -> 'One-Hit Wonder'; five or more -> 'Superstar';
# everything in between stays 'Other'.
artist_classification = artist_song_counts.copy()
artist_classification['Category'] = 'Other'
artist_classification.loc[
    artist_classification['Unique_Songs'] == 1, 'Category'
] = 'One-Hit Wonder'
artist_classification.loc[
    artist_classification['Unique_Songs'] >= 5, 'Category'
] = 'Superstar'

print('Artist Classification:')
print(artist_classification['Category'].value_counts())

# --- Build one "debut" row per performer -----------------------------------
# Earliest chart row of every (performer, song) pair.
# NOTE(review): `.first()` keeps the metric values from each song's earliest
# chart week. If 'Peak Position' / 'Weeks on Chart' are running values in this
# dataset, these are entry-week numbers, not the song's final peak or total
# weeks -- confirm that is what "debut metrics" should mean.
first_appearance = (
    df_clean.sort_values('WeekID')
    .groupby(['Performer', 'Song'])
    .first()
    .reset_index()
)

# The groupby above returns rows ordered by (Performer, Song), i.e. by song
# title within each performer. Re-sort chronologically before taking the
# first row per performer, otherwise the "debut" is the alphabetically first
# song instead of the earliest one. Song title only breaks same-week ties.
first_appearance_artist = (
    first_appearance.sort_values(['WeekID', 'Song'])
    .groupby('Performer')
    .first()
    .reset_index()
)
first_appearance_artist = first_appearance_artist[
    ['Performer', 'Peak Position', 'Weeks on Chart', 'Year']
]
first_appearance_artist.columns = ['Performer', 'First_Peak', 'First_Weeks', 'First_Year']

# Flag performers classified as 'Superstar' (1); all others become 0 after
# the left merge leaves them as NaN.
superstar_flags = artist_classification.loc[
    artist_classification['Category'] == 'Superstar', ['Performer']
].assign(Became_Superstar=1)
first_appearance_artist = first_appearance_artist.merge(
    superstar_flags, on='Performer', how='left'
)
first_appearance_artist['Became_Superstar'] = (
    first_appearance_artist['Became_Superstar'].fillna(0)
)

print(f'Debut dataset: {len(first_appearance_artist)} artists')

## 3. Correlation Analysis

In [None]:
# Correlate each debut metric with the 0/1 superstar flag using Series.corr.
corr_peak, corr_weeks, corr_year = (
    first_appearance_artist[metric].corr(first_appearance_artist['Became_Superstar'])
    for metric in ('First_Peak', 'First_Weeks', 'First_Year')
)

# Emit the report in one call; sep='\n' reproduces one line per entry.
print(
    '=' * 80,
    'CORRELATION: Debut Metrics → Superstar Status',
    '=' * 80,
    f'Peak Position: r = {corr_peak:.3f}',
    f'Weeks on Chart: r = {corr_weeks:.3f}',
    f'Year of Debut: r = {corr_year:.3f}',
    '=' * 80,
    sep='\n',
)

## 4. Conversion Rates by Debut Quality

In [None]:
# Split performers by whether their debut metric reached the top 10.
top10_debuts = first_appearance_artist.loc[first_appearance_artist['First_Peak'].le(10)]
other_debuts = first_appearance_artist.loc[first_appearance_artist['First_Peak'].gt(10)]

# Mean of the 0/1 flag is the share of each group that became superstars.
top10_superstar_rate = top10_debuts['Became_Superstar'].mean()
other_superstar_rate = other_debuts['Became_Superstar'].mean()

# Ratio of the two rates; falls back to infinity when the comparison rate
# is not positive so the division is never by zero.
if other_superstar_rate > 0:
    conversion_multiplier = top10_superstar_rate / other_superstar_rate
else:
    conversion_multiplier = np.inf

print(
    '\n' + '=' * 80,
    'CONVERSION RATES: Top 10 vs Others',
    '=' * 80,
    f'Top 10 Debuts: {top10_superstar_rate:.1%} became superstars',
    f'Below Top 10: {other_superstar_rate:.1%} became superstars',
    f'Multiplier: {conversion_multiplier:.2f}x',
    '=' * 80,
    sep='\n',
)

## 5. Predictive Visualization

In [None]:
# Three-panel figure: headline text, correlation bars, conversion-rate bars.
fig = plt.figure(figsize=(16, 5))

# Panel 1: headline multiplier shown as a boxed text card (no axes drawn).
ax1 = fig.add_subplot(1, 3, 1)
ax1.axis('off')
msg = f'Top 10 Debut = {conversion_multiplier:.1f}x More Likely Superstar'
ax1.text(
    0.5,
    0.5,
    msg,
    ha='center',
    va='center',
    fontsize=13,
    fontweight='bold',
    bbox={
        'boxstyle': 'round',
        'facecolor': '#FFE5B4',
        'edgecolor': '#FF6B35',
        'linewidth': 2,
    },
)

# Panel 2: absolute correlation of each debut metric with superstar status.
ax2 = fig.add_subplot(1, 3, 2)
features = ['Peak Position', 'Weeks on Chart', 'Year']
correlations = [abs(r) for r in (corr_peak, corr_weeks, corr_year)]
colors = ['#FF6B6B', '#FFD93D', '#6BCB77']
ax2.barh(features, correlations, color=colors, edgecolor='black')
ax2.set_xlabel('Correlation Strength')
ax2.set_title('Feature Importance')

# Panel 3: superstar conversion rate (in percent) for each debut tier.
ax3 = fig.add_subplot(1, 3, 3)
tiers = ['Top 10', 'Below Top 10']
rates = [rate * 100 for rate in (top10_superstar_rate, other_superstar_rate)]
ax3.bar(tiers, rates, color=['#4ECDC4', '#FF6B6B'], edgecolor='black')
ax3.set_ylabel('% Became Superstars')
ax3.set_title('Conversion Rate by Debut')

fig.suptitle('Predictive Signals: What Predicts Superstar Success?', fontweight='bold')
fig.tight_layout()
plt.show()

## 6. Summary

In [None]:
# Assemble the closing report line by line, then print it in one call.
# NOTE(review): the implication bullets are fixed text; they are not derived
# from the correlations computed earlier in the notebook.
summary_lines = [
    '\n' + '=' * 80,
    'SUMMARY: Early Signs of Superstar Success',
    '=' * 80,
    '\nKey Finding:',
    f'Top 10 debut peaks are {conversion_multiplier:.2f}x more likely to become superstars',
    '\nConversion Rates:',
    f'  Top 10 debuts: {top10_superstar_rate:.1%}',
    f'  Below Top 10: {other_superstar_rate:.1%}',
    '\nImplications:',
    '  - Focus on strong debut positioning',
    '  - Top 10 debuts warrant long-term investment',
    '  - Peak position more predictive than chart longevity',
    '=' * 80,
]
print('\n'.join(summary_lines))