# Exploratory Data Analysis

This notebook provides a space for exploratory analysis of the artist discovery data.

## Objectives

- Explore data distributions
- Identify patterns in underrated artists
- Validate scoring algorithm
- Generate additional insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
from dotenv import load_dotenv

# Load settings from a .env file (if one is found) before any os.getenv() call
load_dotenv()

# Plot defaults for the whole notebook: seaborn "whitegrid" theme, 12x6 figures
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

In [None]:
# Initialize BigQuery client
# Project and dataset are read from the environment; the dataset falls back to
# 'artist_discovery' when BIGQUERY_DATASET_ID is not set.
project_id = os.getenv('BIGQUERY_PROJECT_ID')
dataset_id = os.getenv('BIGQUERY_DATASET_ID', 'artist_discovery')

# GOOGLE_APPLICATION_CREDENTIALS is already in the environment when it is set,
# so re-assigning it is a no-op. Instead, flag a path that points at a missing
# file so the resulting authentication failure is easier to diagnose.
credentials_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
if credentials_path and not os.path.exists(credentials_path):
    print(
        "Warning: GOOGLE_APPLICATION_CREDENTIALS points to a missing file: "
        f"{credentials_path}"
    )

client = bigquery.Client(project=project_id)

# Without BIGQUERY_PROJECT_ID the client resolves a project on its own. Reuse
# that value so the table references built from project_id in later cells do
# not contain the literal text "None".
if not project_id:
    project_id = client.project

## Load Data

In [None]:
# Read up to 1000 rows of the raw artist table into a DataFrame
raw_table = f"{project_id}.{dataset_id}.artist_raw_data"
query_raw = f"""
SELECT * 
FROM `{raw_table}`
LIMIT 1000
"""

raw_job = client.query(query_raw)
df_raw = raw_job.to_dataframe()
print(f"Loaded {df_raw.shape[0]} rows from raw data")

In [None]:
# Read the full scored-artist table, ordered by priority_score (highest first)
scores_table = f"{project_id}.{dataset_id}.artist_scores"
query_scores = f"""
SELECT * 
FROM `{scores_table}`
ORDER BY priority_score DESC
"""

scores_job = client.query(query_scores)
df_scores = scores_job.to_dataframe()
print(f"Loaded {df_scores.shape[0]} rows from scored data")

## Data Overview

In [None]:
# Preview the first 10 rows of the scored data
df_scores.head(10)

In [None]:
# Descriptive statistics for the scored data, using DataFrame.describe() defaults
df_scores.describe()

## Visualizations

In [None]:
# Histogram of priority_score over all scored artists, using 50 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_scores['priority_score'], bins=50, edgecolor='black')
ax.set_xlabel('Priority Score')
ax.set_ylabel('Number of Artists')
ax.set_title('Distribution of A&R Priority Scores')
plt.show()

In [None]:
# Spotify followers (x) against Instagram followers (y), one point per artist,
# colored by priority score; both axes are logarithmic
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    df_scores['spotify_followers'],
    df_scores['instagram_followers'],
    c=df_scores['priority_score'],
    cmap='viridis',
    alpha=0.6,
    s=50,
)
fig.colorbar(points, ax=ax, label='Priority Score')
ax.set_xlabel('Spotify Followers')
ax.set_ylabel('Instagram Followers')
ax.set_title('Instagram vs Spotify Presence')
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

In [None]:
# Bar chart of how many scored artists fall under each primary genre
genre_counts = df_scores['primary_genre'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.plot.bar(ax=ax)
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Artists')
ax.set_title('Artists by Genre')
# Slant the genre labels and right-align them
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Top Artists Analysis

In [None]:
# First 20 rows of df_scores; as loaded, the frame is ordered by
# priority_score descending, so these are the highest-scoring artists
top_20 = df_scores.iloc[:20]
summary_columns = [
    'artist_name',
    'primary_genre',
    'priority_score',
    'instagram_followers',
    'spotify_followers',
    'growth_potential',
]
print("Top 20 Underrated Artists:")
print(top_20[summary_columns].to_string())

## Additional Analysis

Add your own analysis cells below.