# Exploratory Data Analysis - Spotify Skip Prediction

This notebook explores the collected listening history data to understand skip patterns.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Apply the matplotlib style sheet first, then set the seaborn colour palette
# so the palette is the last thing to touch the colour cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## Load Data

In [None]:
# Load the last listening-history export found in the processed-data folder.
# NOTE(review): "most recent" assumes the file-name suffix sorts
# chronologically (e.g. a zero-padded timestamp) -- confirm with the collector.
data_dir = Path('../data/processed')
data_files = sorted(data_dir.glob('listening_history_*.csv'))

if not data_files:
    # Later cells check `df is not None` before doing any work.
    print("No data files found. Run data collection first!")
    df = None
else:
    latest_file = data_files[-1]
    df = pd.read_csv(latest_file)
    print(f"Loaded {len(df)} tracks from {latest_file.name}")

In [None]:
# Display basic info: column dtypes / non-null counts, then the first rows.
if df is not None:
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    display(df.head())

## Skip Rate Analysis

In [None]:
# Calculate the overall skip rate and plot skipped vs. completed counts.
if df is not None:
    # Only rows where `is_skip` is present count toward the rate.
    labeled = df['is_skip'].dropna()

    if labeled.empty:
        # Without any labelled rows the rate would be 0/0 and the bar chart
        # would have nothing to draw, so report it and skip the plot.
        skip_rate = float('nan')
        print("No labeled tracks available; cannot compute skip rate.")
    else:
        skip_rate = labeled.sum() / len(labeled)
        print(f"Overall Skip Rate: {skip_rate:.2%}")

        # Visualize
        fig, ax = plt.subplots(figsize=(8, 5))
        labeled.value_counts().plot(kind='bar', ax=ax)
        ax.set_title('Skip vs. Complete Listens')
        ax.set_xlabel('Skipped')
        ax.set_ylabel('Count')
        plt.tight_layout()

## Audio Feature Distributions

In [None]:
# Plot distributions of key audio features, one histogram per feature.
if df is not None:
    # Full list of features of interest; kept under this name because the
    # comparison cell further down reads `audio_features` as well.
    audio_features = ['danceability', 'energy', 'valence', 'tempo',
                      'acousticness', 'instrumentalness', 'speechiness']

    # Plot only the columns that are actually present in the loaded data.
    present_features = [f for f in audio_features if f in df.columns]

    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    axes = axes.ravel()

    for ax, feature in zip(axes, present_features):
        df[feature].hist(bins=30, ax=ax)
        ax.set_title(feature.capitalize())
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # The grid has more cells than features; hide the leftovers so the figure
    # does not show empty frames.
    for ax in axes[len(present_features):]:
        ax.set_visible(False)

    plt.tight_layout()

## Skip Patterns by Audio Features

In [None]:
# Compare mean audio-feature values for skipped vs. completed tracks.
if df is not None and df['is_skip'].notna().sum() > 0:
    # Restrict to feature columns that exist in the data: selecting a missing
    # column after groupby raises KeyError, and the histogram cell already
    # tolerates missing features.
    available_features = [f for f in audio_features if f in df.columns]
    skip_comparison = df.groupby('is_skip')[available_features].mean()
    display(skip_comparison)

## Next Steps

1. Collect more data over time
2. Engineer temporal features (time of day, day of week)
3. Add sequential features (previous track features, listening context)
4. Build baseline models