# Spotify Listening Data Analysis

This notebook analyzes your Spotify listening history to uncover patterns, trends, and insights about your music consumption.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from data_loader import load_spotify_data, get_data_summary

# Global plotting configuration for every matplotlib/seaborn figure below.
# Start from matplotlib's built-in 'default' style sheet ...
plt.style.use('default')
# ... then switch the default color cycle to seaborn's "husl" palette.
sns.set_palette("husl")

%matplotlib inline

In [None]:
# Load the listening history and build the summary mapping that is printed
# in the "Data Overview" section.
df = load_spotify_data()
summary = get_data_summary(df)

# Confirm the load and report the frame's (rows, columns) dimensions.
print("Dataset loaded successfully!", f"Shape: {df.shape}", sep="\n")

# Trailing expression so the notebook renders a preview of the first rows.
df.head()

## 📊 Data Overview

In [None]:
# Print every summary entry as "Human Readable Key: value"
# (underscores become spaces, words are title-cased).
for key, value in summary.items():
    label = key.replace('_', ' ').title()
    print(f"{label}: {value}")

## 🎵 Top Artists & Tracks

In [None]:
# Top-10 overview in a 2x2 grid: artists (left column) and tracks (right
# column), ranked by total minutes played (top row) and by number of plays
# (bottom row).
MAX_LABEL_CHARS = 30  # track tick labels longer than this are cut and suffixed with '...'


def _track_tick_labels(track_index, limit=MAX_LABEL_CHARS):
    """Return 'track - artist' labels for a (track_name, track_artist) index.

    Labels longer than `limit` characters are truncated and suffixed with '...'.
    """
    labels = (f"{track} - {artist}" for track, artist in track_index)
    return [label[:limit] + '...' if len(label) > limit else label for label in labels]


def _plot_artist_ranking(ax, ranking, *, color, title, xlabel):
    """Draw a Series indexed by artist as a horizontal bar chart on `ax`."""
    ranking.plot(kind='barh', ax=ax, color=color)
    # barh places the first row at the bottom; flip so rank 1 is on top.
    ax.invert_yaxis()
    ax.set_title(title)
    ax.set_xlabel(xlabel)


def _plot_track_ranking(ax, ranking, *, color, title, xlabel):
    """Draw a Series indexed by (track_name, track_artist) as a horizontal bar chart on `ax`."""
    positions = range(len(ranking))
    ax.barh(positions, ranking.values, color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels(_track_tick_labels(ranking.index))
    # Same orientation as the artist panels: rank 1 on top.
    ax.invert_yaxis()
    ax.set_title(title)
    ax.set_xlabel(xlabel)


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# --- Ranked by total listening time -----------------------------------------
top_artists = df.groupby('track_artist')['minutes_played'].sum().sort_values(ascending=False).head(10)
_plot_artist_ranking(axes[0,0], top_artists, color='skyblue',
                     title='Top 10 Artists by Listening Time', xlabel='Minutes Played')

# Group on (track, artist) so identically named tracks by different artists
# are counted separately.
top_tracks = df.groupby(['track_name', 'track_artist'])['minutes_played'].sum().sort_values(ascending=False).head(10)
_plot_track_ranking(axes[0,1], top_tracks, color='lightcoral',
                    title='Top 10 Tracks by Listening Time', xlabel='Minutes Played')

# --- Ranked by number of plays (one row of df == one play) ------------------
most_played_artists = df['track_artist'].value_counts().head(10)
_plot_artist_ranking(axes[1,0], most_played_artists, color='lightgreen',
                     title='Top 10 Artists by Play Count', xlabel='Number of Plays')

most_played_tracks = df.groupby(['track_name', 'track_artist']).size().sort_values(ascending=False).head(10)
_plot_track_ranking(axes[1,1], most_played_tracks, color='gold',
                    title='Top 10 Tracks by Play Count', xlabel='Number of Plays')

plt.tight_layout()
plt.show()

## ⏰ Listening Patterns Over Time

In [None]:
# Total minutes played per calendar date, flattened to a two-column frame
# ('date', 'minutes_played') so plotly express can address both by name.
daily_listening = (
    df.groupby('date')['minutes_played']
    .sum()
    .reset_index()
)

axis_labels = {'minutes_played': 'Minutes Played', 'date': 'Date'}
fig = px.line(
    daily_listening,
    x='date',
    y='minutes_played',
    title='Daily Listening Time Over Time',
    labels=axis_labels,
)
fig.update_layout(height=400)
fig.show()

In [None]:
# Temporal listening patterns in a 2x2 grid: total minutes by hour, weekday
# and month, plus the skip rate by hour of day.
def _minutes_bar_panel(ax, totals, *, color, title, xlabel, rotation):
    """Draw `totals` (a Series of minutes played) as a bar chart on `ax`."""
    totals.plot(kind='bar', ax=ax, color=color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Minutes Played')
    ax.tick_params(axis='x', rotation=rotation)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))

hourly_listening = df.groupby('hour')['minutes_played'].sum()
_minutes_bar_panel(axes[0,0], hourly_listening, color='purple',
                   title='Listening by Hour of Day', xlabel='Hour', rotation=0)

# Force calendar order (groupby would otherwise sort the day names
# alphabetically). Days without any plays are shown as 0 minutes.
# NOTE(review): assumes 'day_of_week' holds full English day names - confirm.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Deliberately not named `daily_listening`: that name is used for the
# per-date frame behind the daily line chart and must not be rebound to a
# per-weekday Series.
weekday_listening = df.groupby('day_of_week')['minutes_played'].sum().reindex(day_order, fill_value=0)
_minutes_bar_panel(axes[0,1], weekday_listening, color='orange',
                   title='Listening by Day of Week', xlabel='Day', rotation=45)

monthly_listening = df.groupby('month')['minutes_played'].sum()
_minutes_bar_panel(axes[1,0], monthly_listening, color='green',
                   title='Listening by Month', xlabel='Month', rotation=45)

# Mean of the 'is_skip' flag per hour, scaled to a percentage.
skip_by_hour = df.groupby('hour')['is_skip'].mean() * 100
skip_by_hour.plot(kind='line', ax=axes[1,1], color='red', marker='o')
axes[1,1].set_title('Skip Rate by Hour of Day')
axes[1,1].set_xlabel('Hour')
axes[1,1].set_ylabel('Skip Rate (%)')

plt.tight_layout()
plt.show()

## 🎯 Listening Behavior Analysis

In [None]:
# Listening behavior in a 2x2 grid: per-play duration, platform share,
# end reasons of skipped plays, and total minutes per listening session.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_duration, ax_platform), (ax_skips, ax_sessions) = axes

# Histogram of minutes played per row, with the mean marked by a dashed line.
mean_minutes = df['minutes_played'].mean()
df['minutes_played'].hist(bins=50, ax=ax_duration, alpha=0.7, color='blue')
ax_duration.set(
    title='Distribution of Track Listening Duration',
    xlabel='Minutes Played',
    ylabel='Frequency',
)
ax_duration.axvline(mean_minutes, color='red', linestyle='--', label=f'Mean: {mean_minutes:.1f} min')
ax_duration.legend()

# Share of plays per platform; the y-label is blanked so the pie has no side caption.
platform_counts = df['platform'].value_counts()
platform_counts.plot(kind='pie', ax=ax_platform, autopct='%1.1f%%')
ax_platform.set(title='Listening by Platform', ylabel='')

# Ten most frequent 'reason_end' values among rows flagged as skips.
skip_reasons = df.loc[df['is_skip'], 'reason_end'].value_counts().head(10)
skip_reasons.plot(kind='barh', ax=ax_skips, color='salmon')
ax_skips.set(title='Top Skip Reasons', xlabel='Count')

# Total minutes per listening session, again with the mean marked.
session_lengths = df.groupby('listening_session')['minutes_played'].sum()
mean_session = session_lengths.mean()
session_lengths.hist(bins=30, ax=ax_sessions, alpha=0.7, color='green')
ax_sessions.set(
    title='Distribution of Listening Session Lengths',
    xlabel='Session Length (minutes)',
    ylabel='Frequency',
)
ax_sessions.axvline(mean_session, color='red', linestyle='--', label=f'Mean: {mean_session:.1f} min')
ax_sessions.legend()

plt.tight_layout()
plt.show()

## 🔍 Advanced Analysis

In [None]:
# Number of distinct artists played on each date, as a two-column frame
# ('date', 'unique_artists').
artist_diversity = (
    df.groupby('date')['track_artist']
    .nunique()
    .reset_index(name='unique_artists')
)

diversity_labels = {'unique_artists': 'Unique Artists', 'date': 'Date'}
fig = px.scatter(
    artist_diversity,
    x='date',
    y='unique_artists',
    title='Musical Diversity Over Time (Unique Artists per Day)',
    labels=diversity_labels,
)
fig.update_layout(height=400)
fig.show()

In [None]:
# Per (track, artist): total minutes played, number of plays (count of
# non-null 'ts' values) and the average minutes per play.
repeat_listening = (
    df.groupby(['track_name', 'track_artist'])
    .agg(
        minutes_played=('minutes_played', 'sum'),
        play_count=('ts', 'count'),
    )
    .reset_index()
    .assign(avg_minutes_per_play=lambda tracks: tracks['minutes_played'] / tracks['play_count'])
)

# Each marker is one track: x = plays, y = total minutes,
# marker size = average minutes per play.
popularity_labels = {'play_count': 'Number of Plays', 'minutes_played': 'Total Minutes Played'}
fig = px.scatter(
    repeat_listening,
    x='play_count',
    y='minutes_played',
    size='avg_minutes_per_play',
    hover_data=['track_name', 'track_artist'],
    title='Track Popularity vs Total Listening Time',
    labels=popularity_labels,
)
fig.update_layout(height=500)
fig.show()

## 📈 Custom Analysis

Use the cells below for your own custom analysis!

In [None]:
# List every column of the loaded frame as a starting point for custom analysis.
column_names = df.columns.tolist()
print("Available columns for analysis:")
print(column_names)