# Skip Pattern Analysis

This notebook analyzes skip behavior in the parsed Spotify listening history.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')
%matplotlib inline

# Set figure size defaults
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

## Load Parsed Data

In [None]:
# Read the parsed listening history and convert the timestamp column to
# datetime in the same step, so the `.dt` accessors used further down work.
df = pd.read_csv('../data/processed/listening_history_parsed.csv').assign(
    played_at_dt=lambda frame: pd.to_datetime(frame['played_at_dt'])
)

# Sanity summary: row count, covered period and overall skip fraction
print(f"Total records: {len(df):,}")
print(f"Date range: {df['played_at_dt'].min()} to {df['played_at_dt'].max()}")
print(f"\nSkip rate: {df['is_skip'].mean():.2%}")

# Last expression of the cell: previews the first rows
df.head()

## Skip Rate Over Time

In [None]:
# Monthly aggregation: number of skips, number of plays and skip fraction.
# The month key is stored on df as a pandas Period column.
df['year_month'] = df['played_at_dt'].dt.to_period('M')
monthly_skip = (
    df.groupby('year_month')['is_skip']
    .agg(skips='sum', total='count', skip_rate='mean')
    .reset_index()
)
# Periods are turned into plain strings so they can be used as x labels
monthly_skip['year_month'] = monthly_skip['year_month'].astype(str)

month_labels = monthly_skip['year_month']
monthly_rate_pct = monthly_skip['skip_rate'] * 100

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(month_labels, monthly_rate_pct, marker='o', linewidth=2)
ax.set_title('Skip Rate Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Skip Rate (%)', fontsize=12)
ax.grid(True, alpha=0.3)

# Thin out the x-axis: keep every 6th tick position, then tilt the labels
all_tick_positions = ax.get_xticks()
ax.set_xticks(all_tick_positions[::6])
plt.setp(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

## Skip Patterns by Time of Day

In [None]:
# Per-hour aggregation: number of skips, number of plays and skip fraction
hourly_skip = (
    df.groupby('hour_of_day')['is_skip']
    .agg(skips='sum', total='count', skip_rate='mean')
    .rename_axis('hour')
    .reset_index()
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, bar heights, bar colour, y-axis label, title).
# Left panel shows the skip rate in percent, right panel the raw play counts.
hourly_panels = [
    (ax1, hourly_skip['skip_rate'] * 100, 'steelblue', 'Skip Rate (%)', 'Skip Rate by Hour of Day'),
    (ax2, hourly_skip['total'], 'coral', 'Number of Tracks', 'Listening Activity by Hour'),
]
for panel_ax, bar_heights, bar_color, y_label, panel_title in hourly_panels:
    panel_ax.bar(hourly_skip['hour'], bar_heights, color=bar_color, alpha=0.7)
    panel_ax.set_xlabel('Hour of Day', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    # Always label all 24 hours, even those without any plays
    panel_ax.set_xticks(range(24))
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Skip Patterns by Day of Week

In [None]:
# Per-weekday aggregation: number of skips, number of plays and skip fraction
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_skip = (
    df.groupby('day_of_week')['is_skip']
    .agg(skips='sum', total='count', skip_rate='mean')
    .rename_axis('day')
    .reset_index()
)
# The day value is used directly as an index into day_names (0 -> 'Monday')
daily_skip['day_name'] = [day_names[day_index] for day_index in daily_skip['day']]

daily_rate_pct = daily_skip['skip_rate'] * 100

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(daily_skip['day_name'], daily_rate_pct, color='mediumseagreen', alpha=0.7)
ax.set_title('Skip Rate by Day of Week', fontsize=14, fontweight='bold')
ax.set_xlabel('Day of Week', fontsize=12)
ax.set_ylabel('Skip Rate (%)', fontsize=12)
ax.grid(True, alpha=0.3, axis='y')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

## Weekend vs Weekday

In [None]:
# Weekend vs weekday aggregation: number of skips, number of plays and skip fraction
weekend_skip = (
    df.groupby('is_weekend')['is_skip']
    .agg(skips='sum', total='count', skip_rate='mean')
    .reset_index()
)
# Normalise the flag to bool so a 0/1-encoded column behaves the same as
# True/False in the label mapping below.
weekend_skip['is_weekend'] = weekend_skip['is_weekend'].astype(bool)
weekend_skip['period'] = weekend_skip['is_weekend'].map({False: 'Weekday', True: 'Weekend'})

print("Weekend vs Weekday Skip Rates:")
print(weekend_skip[['period', 'skip_rate', 'total']])

# Look the rates up by label instead of positional `.values[0]`, so data that
# contains only weekday (or only weekend) plays reports "n/a" instead of
# raising IndexError.
rate_by_period = weekend_skip.set_index('period')['skip_rate']
print()
for period_label in ('Weekend', 'Weekday'):
    if period_label in rate_by_period.index:
        print(f"{period_label} skip rate: {rate_by_period[period_label]:.2%}")
    else:
        print(f"{period_label} skip rate: n/a (no plays in the data)")

## Listen Duration Analysis

In [None]:
# Compare listen duration for skipped vs completed tracks.
# Both panels are limited to the first five minutes of playback.
max_seconds = 300

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Listen duration in seconds, split by the skip flag
skipped = df[df['is_skip']]['ms_played'] / 1000
completed = df[~df['is_skip']]['ms_played'] / 1000

# Distribution of listen duration.
# `range` confines the 50 bins to the displayed window; without it the bins
# span the full data range, so a few very long plays would leave only a
# handful of coarse bins inside the 0-300 s view.
axes[0].hist(
    [completed, skipped],
    bins=50,
    range=(0, max_seconds),
    label=['Completed', 'Skipped'],
    alpha=0.7,
    color=['green', 'red'],
)
axes[0].set_xlabel('Listen Duration (seconds)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Listen Duration Distribution', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].set_xlim(0, max_seconds)  # Focus on 0-5 minutes

# Box plot comparison (helper columns are stored on df)
df['listen_seconds'] = df['ms_played'] / 1000
df['skip_status'] = df['is_skip'].map({True: 'Skipped', False: 'Completed'})
# Fix the category order so it matches the histogram legend instead of
# depending on which status happens to appear first in the data.
sns.boxplot(
    data=df,
    x='skip_status',
    y='listen_seconds',
    order=['Completed', 'Skipped'],
    ax=axes[1],
)
axes[1].set_ylabel('Listen Duration (seconds)', fontsize=12)
axes[1].set_xlabel('', fontsize=12)
axes[1].set_title('Listen Duration by Skip Status', fontsize=14, fontweight='bold')
axes[1].set_ylim(0, max_seconds)

plt.tight_layout()
plt.show()

# Means are computed over all plays, not only those inside the plotted window
print(f"Average listen time (skipped): {skipped.mean():.1f} seconds")
print(f"Average listen time (completed): {completed.mean():.1f} seconds")

## Most Skipped vs Least Skipped Artists

In [None]:
# Per-artist skip statistics, restricted to artists with at least 20 plays
# and ordered from lowest to highest skip rate.
artist_stats = (
    df.groupby('artist_name')['is_skip']
    .agg(skips='sum', total_plays='count', skip_rate='mean')
    .rename_axis('artist')
    .reset_index()
    .loc[lambda stats: stats['total_plays'] >= 20]
    .sort_values('skip_rate')
)

report_columns = ['artist', 'skip_rate', 'total_plays']

# First ten rows of the ascending ranking = lowest skip rates
print("Top 10 Artists (Lowest Skip Rate):")
print(artist_stats.head(10)[report_columns])

# Last ten rows of the ascending ranking = highest skip rates
print("\nTop 10 Artists (Highest Skip Rate):")
print(artist_stats.tail(10)[report_columns])

## Key Insights for Feature Engineering

Based on this analysis, the following are potential features for the model:

1. **Temporal Features**:
   - Hour of day
   - Day of week
   - Weekend vs weekday
   - Time period (morning/afternoon/evening/night)

2. **Artist/Track Features**:
   - Artist historical skip rate
   - Track popularity (play count)
   - Days since last played

3. **Sequential Features**:
   - Number of tracks from same artist in session
   - Position in listening session
   - Previous track skipped?

4. **Audio Features** (to be added):
   - Danceability, energy, valence, tempo, etc.
   - From Spotify API when available