## Imports

In [None]:
import os
from dotenv import load_dotenv

import polars as pl
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns



## Config

In [None]:
# Fix random state for reproducibility
RANDOM_STATE = 42

# Load environment variables
load_dotenv()

# Get project root (one level up from notebooks/).
# Only step up when the working directory itself is named 'notebooks'; a plain
# substring test on the full path would also trigger for unrelated paths such
# as '/home/me/notebooks-old/project'.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    project_root = os.path.abspath(os.path.join(cwd, '..'))
else:
    project_root = cwd

# Get preprocessed directory from env or use default
preprocessed_dir = os.getenv('PREPROCESSED_DATA_DIR', 'data/preprocessed')

# If path is relative, make it relative to project root
if not os.path.isabs(preprocessed_dir):
    preprocessed_dir = os.path.join(project_root, preprocessed_dir)

# Report the resolved locations
print(f'üìÅ Project root: {project_root}')
print(f'üìÅ Preprocessed dir: {preprocessed_dir}')

# isdir (not exists) so that a stray regular file at this path is not
# mistaken for the data directory
if not os.path.isdir(preprocessed_dir):
    print('\n‚ö†Ô∏è  Warning: Preprocessed directory not found!')
    print('   Run from project root: python3 -m src.main')
else:
    print('\n‚úÖ Preprocessed directory found')


## Preprocessed Data Loading

In [None]:
# ---------- Load datasets ---------- #

# Paths of every file this cell reads
items_path = os.path.join(preprocessed_dir, 'items.parquet')
events_path = os.path.join(preprocessed_dir, 'events.parquet')
tracks_catalog_path = os.path.join(preprocessed_dir, 'tracks_catalog_clean.parquet')

# Check all three inputs up front. The tracks catalog is read below as well,
# so it must be part of the check; otherwise a missing catalog bypasses the
# guidance and fails inside the parquet reader instead.
missing_paths = [
    path
    for path in (items_path, events_path, tracks_catalog_path)
    if not os.path.exists(path)
]

if missing_paths:
    print('‚ùå Preprocessed data not found!')
    print(f'   Looking for files in: {preprocessed_dir}')
    for path in missing_paths:
        print(f'   Missing: {os.path.basename(path)}')
    print('\nüìù To generate preprocessed data, run from terminal:')
    # Use the resolved project root rather than a machine-specific path
    print(f'   cd {project_root}')
    print('   python3 -m src.main')
    print('\n   This will:')
    print('   1. Download raw data (if needed)')
    print('   2. Preprocess and clean the data')
    print('   3. Generate items.parquet and events.parquet')
    raise FileNotFoundError(f'Preprocessed data not found in {preprocessed_dir}')

# Load preprocessed data
print('‚úÖ Loading preprocessed data...')
items = pl.read_parquet(items_path)
events = pl.read_parquet(events_path)
tracks_catalog_clean = pl.read_parquet(tracks_catalog_path)

print('\n‚úÖ Data loaded successfully!')
print(f'   Items: {items.shape}')
print(f'   Events: {events.shape}')
print(f'   Tracks catalog: {tracks_catalog_clean.shape}')

## EDA

In [None]:
# ---------- Check data summary ---------- #

def data_summary(df: pl.DataFrame, name: str):
    """
    Print and display a short overview of a Polars DataFrame.

    The overview consists of, in order: an upper-cased banner built from
    ``name``, the first rows (``df.head()``), the shape, the output of
    ``df.describe()``, each column name with its dtype, and the per-column
    null counts (``df.null_count()``).

    Args:
        df: DataFrame to summarise.
        name: Label shown in the banner.
    """
    banner = name.upper()
    print(f'\n===== {banner} =====')

    # First rows
    print('\nSample rows:')
    display(df.head())

    # Dimensions
    print(f'\nShape: {df.height:,} rows x {df.width} columns')

    # Descriptive statistics as produced by describe()
    print('\nSummary for numeric columns:')
    display(df.describe())

    # Schema: one line per column
    print('\nColumn names and types:')
    for column, dtype in zip(df.columns, df.dtypes):
        print(f'  {column}: {dtype}')

    # Null counts per column
    print('\nMissing values:')
    display(df.null_count())

In [None]:
# ---------- Items data summary ---------- #
# Show sample rows, shape, describe() output, dtypes and null counts for `items`.

data_summary(items, 'items')

In [None]:
# ---------- Events data summary ---------- #
# Show sample rows, shape, describe() output, dtypes and null counts for `events`.

data_summary(events, 'events')

In [None]:
# tracks_catalog_clean was already read in the "Preprocessed Data Loading" cell
# above, so it is only previewed here (first rows).
display(tracks_catalog_clean.head())

In [None]:
# Removed - duplicate of previous cell

In [None]:
#---------- Top tracks by popularity ---------- #

# Summed listen_count for every track_id found in events
listens_per_track = events.group_by('track_id').agg(
    pl.col('listen_count').sum().alias('total_listen_count')
)

# Catalog columns needed to label each track
track_titles = tracks_catalog_clean.select('track_id', 'track_clean')

# Attach the titles, order by total listens (largest first), keep the first ten
top_tracks_by_listen_number = (
    listens_per_track
    .join(track_titles, on='track_id', how='left')
    .sort('total_listen_count', descending=True)
    .head(10)
)

display(top_tracks_by_listen_number)

In [None]:
# Memory-efficient approach: aggregate first (small result), then join for names.
# The per-track aggregation is computed ONCE here and reused by all three
# analyses below instead of re-scanning `events` for each of them.
events_agg = (
    events
    .lazy()
    .group_by('track_id')
    .agg(pl.sum('listen_count').alias('track_listens'))
    .collect()
)

# Same totals under the column name used by the per-track rankings
track_totals = events_agg.rename({'track_listens': 'total_listens'})

# Catalog columns needed to label tracks and map them to their song group
catalog_groups = tracks_catalog_clean.select(['track_id', 'track_clean', 'track_group_id'])

# 1. Most popular individual track versions
print("=== Top Track Versions ===")

# Step 1: Top 10 track IDs by listen count (small DataFrame)
top_track_ids = track_totals.sort('total_listens', descending=True).head(10)

# Step 2: Join only these 10 tracks with catalog (small join); re-sort because
# the join result order is not relied upon
top_track_versions = (
    top_track_ids
    .join(catalog_groups, on='track_id', how='left')
    .sort('total_listens', descending=True)
)
display(top_track_versions)

# 2. Most popular songs (versions aggregated) - memory efficient
print("\n=== Top Songs (All Versions Combined) ===")

# Step 1: Attach the song group to the per-track totals
events_with_groups = events_agg.join(catalog_groups, on='track_id', how='left')

# Step 2: Sum listens and count joined rows per (track_group_id, track_clean)
top_songs = (
    events_with_groups
    .group_by(['track_group_id', 'track_clean'])
    .agg([
        pl.sum('track_listens').alias('total_listens'),
        pl.len().alias('num_versions'),
    ])
    .sort('total_listens', descending=True)
    .head(10)
)
display(top_songs)

# 3. Analysis: How many top tracks are just different versions?
print("\n=== Diversity Analysis ===")

# Up to 100 most listened tracks (fewer if events contain fewer distinct tracks)
top_100_track_ids = track_totals.sort('total_listens', descending=True).head(100)

top_100_with_groups = (
    top_100_track_ids
    .join(tracks_catalog_clean.select(['track_id', 'track_group_id']),
          on='track_id', how='left')
)

# NOTE(review): tracks without a catalog match get a null track_group_id from
# the left join; n_unique presumably counts all of them as one value - confirm
# that this is acceptable for the duplication figure.
unique_groups_in_top_100 = top_100_with_groups['track_group_id'].n_unique()

# Base the rate on the number of tracks actually ranked rather than a
# hard-coded 100, so the figure stays correct for small datasets.
n_top_tracks = top_100_track_ids.height
print(f"Top {n_top_tracks} tracks represent {unique_groups_in_top_100} unique songs")
if n_top_tracks:
    duplication_rate = (n_top_tracks - unique_groups_in_top_100) / n_top_tracks
    print(f"Duplication rate: {duplication_rate:.1%}")
else:
    print("Duplication rate: n/a (no listening events)")

Наиболее популярные жанры

In [None]:
# Top 5 genres by listening number - memory efficient

# Collapse events to one row per track holding its summed listen count
events_by_track = (
    events.lazy()
    .group_by('track_id')
    .agg(pl.col('listen_count').sum().alias('track_listen_count'))
    .collect()
)

# Distinct (track_id, genre_clean) pairs taken from items
track_genres = items.select('track_id', 'genre_clean').unique()

# Attach genres to the per-track totals ...
listens_with_genre = events_by_track.join(track_genres, on='track_id', how='left')

# ... then sum per genre and order from most to least listened
genres_by_listen_count = (
    listens_with_genre
    .group_by('genre_clean')
    .agg(pl.col('track_listen_count').sum().alias('total_listen_count'))
    .sort('total_listen_count', descending=True)
)

top_5_genres = genres_by_listen_count.head(5)
display(top_5_genres)

Треки, которые никто не прослушал

In [None]:
# Tracks that haven't been listened to by anybody

# Step 1: Distinct track ids that occur in events, kept as a Polars frame so
# the comparison below stays inside Polars
listened_tracks = events.select('track_id').unique()

# Python-set view of the same ids, retained in case later cells reference it
listened_track_ids = set(listened_tracks['track_id'])

# Step 2: Get unique tracks from items
unique_tracks = items.select(['track_id', 'track_clean', 'artist_clean', 'album_clean', 'genre_clean']).unique('track_id')

# Step 3: Anti-join keeps only the tracks whose track_id has no match among the
# listened ids. This replaces the Series -> set -> list -> is_in round trip.
# NOTE(review): unlike the is_in filter, rows with a null track_id are kept by
# the anti-join (null keys do not match by default) - confirm items has none.
unlistened_tracks = unique_tracks.join(listened_tracks, on='track_id', how='anti')

print(f'Number of unlistened tracks: {unlistened_tracks.height:,}')
display(unlistened_tracks.head(10))