## Imports

In [None]:
import os
import requests
from dotenv import load_dotenv

import pandas as pd
import polars as pl
import numpy as np
import scipy

import matplotlib.pyplot as plt
import seaborn as sns


from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool

import joblib
import gc
import boto3
from botocore.exceptions import ClientError



## Config

In [None]:
# Fix random state for reproducibility
RANDOM_STATE = 42

# Load environment variables
# (python-dotenv copies key/value pairs from a .env file into the process environment)
load_dotenv()

# Parquet file name -> value of the matching RAW_URL_* environment variable
# (presumably the download URL of the raw dataset; None when the variable is unset).
datasets = {
    'tracks.parquet': os.getenv('RAW_URL_TRACKS'),
    'catalog_names.parquet': os.getenv('RAW_URL_CATALOG_NAMES'),
    'interactions.parquet': os.getenv('RAW_URL_INTERACTIONS'),
}

# Local working directories; each can be overridden through the environment
raw_dir = os.getenv('RAW_DATA_DIR', '../data/raw')
preprocessed_dir = os.getenv('PREPROCESSED_DATA_DIR', '../data/preprocessed')
encoder_dir = os.getenv('ENCODERS_DIR', '../encoders')

# S3 settings and credentials; entries without a default are None when unset
s3_bucket = os.getenv('S3_BUCKET_NAME')
s3_prefix = os.getenv('S3_PREFIX', 'recsys/data/')
s3_region = os.getenv('S3_REGION', 'us-east-1')
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')


## Preprocessed Data Loading

In [None]:
# ---------- Load datasets ---------- #
# Read the preprocessed item and event tables from `preprocessed_dir` into Polars DataFrames
items = pl.read_parquet(os.path.join(preprocessed_dir, 'items.parquet'))
events = pl.read_parquet(os.path.join(preprocessed_dir, 'events.parquet'))

## EDA

In [None]:
# ---------- Check data summary ---------- #

def data_summary(df: 'pl.DataFrame', name: str) -> None:
    '''
        Print a quick overview of a Polars DataFrame.

        Shows, in order: the first rows, the shape, `describe()` statistics,
        value counts for every column and the per-column null counts.

        Parameters:
        - df: Polars DataFrame to summarise (the function relies on the
          Polars-only `null_count()` method, so pandas frames are not supported)
        - name: label printed, upper-cased, in the section header
    '''

    print(f'\n===== {name.upper()} =====')

    # Sample rows
    print('\nSample rows:')
    display(df.head())

    # Shape
    rows, cols = df.shape
    print(f'\nShape: {rows} rows x {cols} columns')

    # Data info
    print('\nSummary for numeric columns:')
    print(df.describe())

    # Unique values (column-wise). Errors are handled per column so that one
    # problematic column does not hide the counts of the columns after it.
    print('\nUnique values (for each column):')
    for col in df.columns:
        print(f'\nColumn: {col}')
        try:
            print(df[col].value_counts())
        except Exception as e:
            print(f'Skipped value_counts due to error: {e}')

    # Missing values
    print('\nMissing values:')
    print(df.null_count())

In [None]:
# ---------- Items data summary ---------- #

#data_summary(items, 'items')

In [None]:
# ---------- Events data summary ---------- #

# Print the overview (sample rows, shape, statistics, value counts, nulls) of the events table
data_summary(events, 'events')

In [None]:
# Load the cleaned track catalog; later cells read its `track_id`, `track_clean` and `track_group_id` columns
tracks_catalog_clean = pl.read_parquet(os.path.join(preprocessed_dir, 'tracks_catalog_clean.parquet'))

In [None]:
# Preview the first rows of the cleaned track catalog
display(tracks_catalog_clean.head())

In [None]:
#---------- Top tracks by popularity ---------- #

# Total listens per track, summed over all users
_listens_per_track = events.group_by('track_id').agg(
    pl.col('listen_count').sum().alias('total_listen_count')
)

# Cleaned titles used to label the ranking
_track_titles = tracks_catalog_clean.select(['track_id', 'track_clean'])

# Attach the titles and keep the ten most-listened tracks
top_tracks_by_listen_number = (
    _listens_per_track
        .join(_track_titles, on='track_id', how='left')
        .sort('total_listen_count', descending=True)
        .head(10)
)

display(top_tracks_by_listen_number)

In [None]:
# Rank every track version once by total listens. Both the top-10 table and the
# top-100 diversity analysis below are slices of this single ranking.
track_versions_ranked = (
    events
    .group_by('track_id')
    .agg(pl.sum('listen_count').alias('total_listens'))
    .join(tracks_catalog_clean.select(['track_id', 'track_clean', 'track_group_id']),
          on='track_id', how='left')
    .sort('total_listens', descending=True)
)

# 1. Most popular track versions (current behavior)
print("=== Top Track Versions ===")
top_track_versions = track_versions_ranked.head(10)
display(top_track_versions)

# 2. Most popular songs (versions aggregated)
print("\n=== Top Songs (All Versions Combined) ===")
top_songs = (
    events
    .join(tracks_catalog_clean.select(['track_id', 'track_clean', 'track_group_id']),
          on='track_id', how='left')
    .group_by(['track_group_id', 'track_clean'])
    .agg([
        pl.sum('listen_count').alias('total_listens'),
        pl.n_unique('track_id').alias('num_versions'),
    ])
    .sort('total_listens', descending=True)
    .head(10)
)
display(top_songs)

# 3. Analysis: How many top tracks are just different versions?
# The slice is taken from the full ranking (not from the 10-row table above),
# and the rate is computed against the number of rows actually available.
print("\n=== Diversity Analysis ===")
top_versions_for_diversity = track_versions_ranked.head(100)
n_top_versions = top_versions_for_diversity.height
# Tracks without a catalog match have a null group id; n_unique counts null as one group
unique_groups_in_top_100 = top_versions_for_diversity['track_group_id'].n_unique()
print(f"Top {n_top_versions} tracks represent {unique_groups_in_top_100} unique songs")
if n_top_versions:
    print(f"Duplication rate: {(n_top_versions - unique_groups_in_top_100) / n_top_versions:.1%}")

Наиболее популярные жанры

In [None]:
# Top 5 genres by listening number
# Listens come from the aggregated `events` table: summing `listen_count` per
# track gives the same totals as counting raw interaction rows, and `events` is
# the table this notebook actually loads (`interactions` is not defined here).
genres_by_listen_count = (
    events
        .group_by('track_id')
        .agg(pl.sum('listen_count').alias('track_listen_count'))
        .join(
            # One row per (track, genre) pair: a track with several genres
            # contributes its listens to each of them
            items.select(['track_id', 'genre_clean']).unique(['track_id', 'genre_clean']),
            on='track_id',
            how='left'
        )
        .group_by('genre_clean')
        .agg(pl.sum('track_listen_count').alias('listen_count'))
        .sort('listen_count', descending=True)
)

top_5_genres = genres_by_listen_count.head(5)
display(top_5_genres)

Треки, которые никто не прослушал

In [None]:
# Tracks that haven't been listened to by anybody
# A track is "listened" when it appears in the aggregated `events` table, which
# holds the same track ids as the raw interactions it was built from
# (`interactions` itself is not defined in this notebook).
unlistened_tracks = (
    items
        .select(['track_id', 'track_clean', 'artist_clean', 'album_clean', 'genre_clean'])
        .unique('track_id')
        .join(
            events.select('track_id').unique(),
            on='track_id',
            how='anti'  # keep only catalog tracks with no matching event
        )
)

print(f'Number of unlistened tracks: {unlistened_tracks.height}')
print(unlistened_tracks.head(10))

# Преобразование данных

Преобразуем данные в формат, более пригодный для дальнейшего использования в расчётах рекомендаций.

In [None]:
# Set up events dataset by aggregating interactions: count listens per user-track pair
# Polars lazy mode with streaming used in order not to crash kernel by dealing with the whole dataset at once
# NOTE(review): `interactions` is not defined in any visible cell of this notebook;
# this cell presumably expects the raw interactions table (with `user_id`, `track_id`,
# `started_at`) to be loaded beforehand - confirm, or skip the cell when `events`
# is already loaded from the preprocessed parquet file.
events = (
    interactions.lazy()
        .group_by(['user_id', 'track_id'])
        .agg([
            pl.len().alias('listen_count'),            # number of interaction rows for the pair
            pl.max('started_at').alias('last_listen')  # latest `started_at` value for the pair
        ])
        # Users ascending; within a user, most-listened tracks first
        .sort(['user_id', 'listen_count'], descending=[False, True])
        .collect(engine='streaming')
)
print(f"Aggregated interactions: {events.shape}")
print(events.head())

In [None]:
# Build label encoders for user_id and track_id
# (ALS needs consecutive integer indices that start at 0)
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()

# fit_transform learns the id vocabulary and returns the encoded indices in one step
_user_idx_values = user_encoder.fit_transform(events['user_id'].to_numpy())
_track_idx_values = track_encoder.fit_transform(events['track_id'].to_numpy())

# Attach the encoded indices next to the original ids
events_data = events.with_columns([
    pl.Series('user_idx', _user_idx_values),
    pl.Series('track_idx', _track_idx_values),
])

print(f"Encoded data shape: {events_data.shape}")
print(f"Unique users: {events_data['user_idx'].n_unique()}")
print(f"Unique tracks: {events_data['track_idx'].n_unique()}")

In [None]:
# Build the user x track sparse matrix
# Implicit-feedback weights are the log1p-transformed listen counts
_weights = np.log1p(events_data['listen_count'].to_numpy())
_row_idx = events_data['user_idx'].to_numpy()
_col_idx = events_data['track_idx'].to_numpy()

# Indices start at 0, so each dimension is the largest index plus one
_matrix_shape = (
    events_data['user_idx'].max() + 1,
    events_data['track_idx'].max() + 1,
)

# Assemble in COO format, then convert to CSR for efficient row operations
user_track_sparse = scipy.sparse.coo_matrix(
    (_weights, (_row_idx, _col_idx)),
    shape=_matrix_shape,
).tocsr()

_n_rows, _n_cols = user_track_sparse.shape
print(f"Sparse matrix shape: {user_track_sparse.shape}")
print(f"Sparsity: {1 - user_track_sparse.nnz / (_n_rows * _n_cols):.4%}")

In [None]:
# Save encoders for decoding predictions back to original ids
# The id -> index dictionaries are derived from `classes_`: a fitted LabelEncoder
# assigns index i to classes_[i], so this cannot go out of sync with the encoder
# (zipping two separate `unique()` calls relied on both returning the same order).
encoder_mappings = {
    'user_encoder': user_encoder,
    'track_encoder': track_encoder,
    'user_id_to_idx': {
        user_id: idx for idx, user_id in enumerate(user_encoder.classes_.tolist())
    },
    'track_id_to_idx': {
        track_id: idx for idx, track_id in enumerate(track_encoder.classes_.tolist())
    },
}

# Save encoder locally
os.makedirs(encoder_dir, exist_ok=True)
joblib.dump(encoder_mappings, os.path.join(encoder_dir, 'encoder_mappings.joblib'))


# Сохранение данных

Сохраним данные в двух файлах в персональном S3-бакете по пути `recsys/data/`:
- `items.parquet` — все данные о музыкальных треках,
- `events.parquet` — все данные о взаимодействиях.

In [None]:
# Write both preprocessed tables to the local preprocessed-data directory
os.makedirs(preprocessed_dir, exist_ok=True)
for _file_name, _frame in (('items.parquet', items), ('events.parquet', events)):
    _frame.write_parquet(os.path.join(preprocessed_dir, _file_name))
print('Files saved locally')

In [None]:
## Save data to S3 bucket
#
## Initialize S3 client
#s3_client = boto3.client(
#    's3',
#    region_name=s3_region,
#    aws_access_key_id=aws_access_key_id,
#    aws_secret_access_key=aws_secret_access_key
#)
#
#def upload_to_s3(local_path, s3_key):
#    '''
#        Upload a file to S3 bucket
#    '''
#    try:
#        s3_client.upload_file(local_path, s3_bucket, s3_key)
#        print(f'Uploaded {local_path} to s3://{s3_bucket}/{s3_key}')
#    except ClientError as e:
#        print(f'Error uploading {local_path}: {e}')
#        raise
#
## Upload to S3
#upload_to_s3(
#    os.path.join(preprocessed_dir, 'items.parquet'),
#    f'{s3_prefix}items.parquet'
#)
#upload_to_s3(
#    os.path.join(preprocessed_dir, 'events.parquet'),
#    f'{s3_prefix}events.parquet'
#)
#
#print(f'All files uploaded to S3 bucket: {s3_bucket}')

# Очистка памяти

Здесь может понадобиться очистка памяти, чтобы высвободить ресурсы для выполнения кода ниже.

Приведите соответствующие код и комментарии, например:
- код для удаления более ненужных переменных,
- комментарий о том, что следует перезапустить kernel, выполнить такие-то начальные секции и продолжить с этапа 3.

In [None]:
# Free memory by dropping intermediate objects that are no longer needed

# Names of the notebook-level variables to remove (missing ones are skipped)
variables_to_delete = [
    'interactions', 'tracks', 'catalog_names', 'tracks_exploded',
    'user_track_interactions',
    'albums_catalog', 'artists_catalog', 'genres_catalog', 'tracks_catalog',
    'album_duplicates', 'artist_duplicates', 'genre_duplicates', 'track_duplicates',
    'album_id_map', 'artist_id_map', 'genre_id_map', 'track_id_map',
    'albums_dedup', 'artists_dedup', 'genres_dedup', 'tracks_dedup',
]

# Remove every listed name that currently exists in the notebook namespace
_notebook_globals = globals()
for var in variables_to_delete:
    if var not in _notebook_globals:
        continue
    _notebook_globals.pop(var)
    print(f"Deleted {var}")

# Ask the garbage collector to reclaim the released objects right away
gc.collect()

# Instructions for a full reset when deleting variables is not enough
for _line in (
    'Memory cleanup complete',
    'To fully free memory, restart the kernel:',
    '  1. Click "Kernel" → "Restart Kernel..."',
    '  2. Re-run initial cells:',
    '     - Cell 3: Imports',
    '     - Cell 5: Config',
    '  3. Load preprocessed data:',
    "     items = pl.read_parquet('../data/preprocessed/items.parquet')",
    "     events = pl.read_parquet('../data/preprocessed/events.parquet')",
    "  4. Continue from Stage 3",
):
    print(_line)

# === ЭТАП 3 ===

# Загрузка данных

Если необходимо, то загружаем items.parquet, events.parquet.

In [None]:
#items = pl.read_parquet('../data/preprocessed/items.parquet')
#events = pl.read_parquet('../data/preprocessed/events.parquet')

# Разбиение данных

Разбиваем данные на тренировочную, тестовую выборки.

In [None]:
# ---------- Split data chronologically ---------- #

# The split date is the 0.8 quantile of `last_listen`.
# The quantile is taken on the physical representation of the dates (days since
# epoch) and converted back afterwards, as Polars cannot handle date quantiles.
_days_since_epoch = pl.col('last_listen').cast(pl.Date).to_physical()
_threshold_days = events.select(
    _days_since_epoch.quantile(0.8).cast(pl.Int32)
).item()

# Integer day number -> date
date_threshold = pl.Series([_threshold_days]).cast(pl.Date).item()
print(f'Split date: {date_threshold}')

# Pairs last listened on or before the split date go to train, later ones to test
train_events = events.filter(pl.col('last_listen') <= date_threshold)
test_events = events.filter(pl.col('last_listen') > date_threshold)

_n_train = train_events.height
_n_test = test_events.height
_n_total = events.height
print(f'Train set: {_n_train:,}')
print(f'Test set: {_n_test:,}')
print(f'Split ratio: {_n_train/_n_total:.1%} / {_n_test/_n_total:.1%}')

# Топ популярных

Рассчитаем рекомендации как топ популярных.

In [None]:
# Find popularity score from training data and get top 100 tracks

def get_popular_tracks(train_events, tracks_catalog, items=None, top_n=100, min_users=10, max_avg_listens=50):
    '''
        Rank tracks by popularity and return the top-N, optionally with metadata.

        Parameters:
        - train_events: DataFrame with `track_id` and `listen_count` columns.
          `user_count` is the number of rows per track, so it equals the number
          of distinct listeners only when the frame holds one row per
          user-track pair (aggregated interactions).
        - tracks_catalog: track catalog with `track_id` and `track_clean` columns
          (e.g. tracks_catalog_clean). It should hold one row per track,
          otherwise the join repeats ranking rows.
        - items: optional DataFrame with `track_id`, `genre_clean`,
          `artist_clean` and `album_clean` columns
        - top_n: number of top tracks to return
        - min_users: minimum `user_count` a track needs to be ranked
        - max_avg_listens: maximum average listens per user (anti-bot protection)

        Anti-bot protection is applied as two filters:
        - minimum user_count,
        - maximum average listens per user.

        Popularity score is the product
        log1p(total_listens) * log1p(user_count),
        so a low user_count works as a penalty and strongly reduces the score.

        Returns:
        - DataFrame sorted by descending popularity_score with the columns
          track_id, total_listens, user_count, avg_per_user, popularity_score,
          track_clean and, when `items` is given, the most frequent
          genre_clean, artist_clean and album_clean of each track.
    '''

    popular_tracks = (
        train_events
            .group_by('track_id')
            .agg([
                pl.sum('listen_count').alias('total_listens'),
                pl.len().alias('user_count')
            ])
            # Calculate average listens per user
            .with_columns([
                (pl.col('total_listens') / pl.col('user_count')).alias('avg_per_user')
            ])
            # Filter suspicious tracks
            .filter(
                (pl.col('user_count') >= min_users) &  # Minimum user diversity
                (pl.col('avg_per_user') <= max_avg_listens)  # Anti-bot filter
            )
            # Multiplicative popularity score (both must be high in order to get a high popularity score)
            .with_columns([
                (pl.col('total_listens').log1p() *
                 pl.col('user_count').log1p()).alias('popularity_score')
            ])
            .sort('popularity_score', descending=True)
            .head(top_n)
    )

    # Join with the track catalog to get the cleaned track names
    popular_tracks_with_info = (
        popular_tracks
            .join(
                tracks_catalog.select(['track_id', 'track_clean']),
                on='track_id',
                how='left'
            )
    )

    # Add most common genre, artist and album for each track
    if items is not None:
        def _most_common(column):
            # mode() can return several tied values in unspecified order;
            # sorting them (nulls last) makes the chosen value reproducible
            return pl.col(column).mode().sort(nulls_last=True).first().alias(column)

        track_meta = (
            items
                # Aggregate metadata only for the selected tracks, not the whole catalog
                .join(popular_tracks.select('track_id'), on='track_id', how='semi')
                .group_by('track_id')
                .agg([
                    _most_common('genre_clean'),
                    _most_common('artist_clean'),
                    _most_common('album_clean')
                ])
        )
        popular_tracks_with_info = popular_tracks_with_info.join(
            track_meta,
            on='track_id',
            how='left'
        )

    return popular_tracks_with_info

In [None]:
# Build the top-100 popularity ranking from the training split
_popularity_settings = {
    'top_n': 100,           # size of the ranking
    'min_users': 10,        # minimum number of listeners per track
    'max_avg_listens': 50,  # anti-bot cap on average listens per user
}
top_popular = get_popular_tracks(
    train_events,
    tracks_catalog_clean,
    items=items,
    **_popularity_settings,
)

print('Top 10 Popular Tracks:')
display(top_popular.head(10))


# Персональные

Рассчитаем персональные рекомендации.

In [None]:
# Cell: Prepare Data for ALS Model

# Fresh encoders whose vocabularies are learned from the training split only
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()

# fit_transform learns the ids and returns their consecutive integer indices
_train_user_idx = user_encoder.fit_transform(train_events['user_id'].to_numpy())
_train_track_idx = track_encoder.fit_transform(train_events['track_id'].to_numpy())

# Training events with the encoded indices attached
train_events_encoded = train_events.with_columns([
    pl.Series('user_idx', _train_user_idx),
    pl.Series('track_idx', _train_track_idx),
])

print(f"Encoded training data shape: {train_events_encoded.shape}")
print(f"Unique users: {train_events_encoded['user_idx'].n_unique()}")
print(f"Unique tracks: {train_events_encoded['track_idx'].n_unique()}")

# Sparse user x track matrix with log1p-scaled listen counts as weights
_weights = np.log1p(train_events_encoded['listen_count'].to_numpy())
_row_idx = train_events_encoded['user_idx'].to_numpy()
_col_idx = train_events_encoded['track_idx'].to_numpy()
_matrix_shape = (
    train_events_encoded['user_idx'].max() + 1,
    train_events_encoded['track_idx'].max() + 1,
)

# Assemble in COO format and convert to CSR for efficiency
user_track_sparse = scipy.sparse.coo_matrix(
    (_weights, (_row_idx, _col_idx)),
    shape=_matrix_shape,
).tocsr()

_n_rows, _n_cols = user_track_sparse.shape
print(f"\nSparse matrix shape: {user_track_sparse.shape}")
print(f"Sparsity: {1 - user_track_sparse.nnz / (_n_rows * _n_cols):.4%}")

In [None]:
# Cell: Train ALS Model

# Initialize ALS model
als_model = AlternatingLeastSquares(
    factors=64,              # Number of latent factors
    regularization=0.01,     # L2 regularization
    iterations=15,           # Number of training iterations
    calculate_training_loss=True,
    random_state=RANDOM_STATE
)

# Train on the users x tracks CSR matrix.
# The recommendation code in this notebook uses the implicit >= 0.5 API
# (`recommend` is given a user's row and returns an `(ids, scores)` tuple), and in
# that API `fit` expects the user-item matrix as is. Fitting on the transpose
# swaps user and item factors, so `recommend` would return user indices in place
# of track indices.
print("Training ALS model...")
als_model.fit(user_track_sparse, show_progress=True)

print("\n✓ ALS model trained successfully")
print(f"Factors: {als_model.factors}")
print(f"User factors shape: {als_model.user_factors.shape}")
print(f"Item factors shape: {als_model.item_factors.shape}")

In [None]:
# Cell: Generate Personal Recommendations

def get_personal_recommendations(
    user_id, 
    als_model, 
    user_encoder, 
    track_encoder, 
    user_track_sparse,
    items,
    n_recommendations=10,
    filter_already_listened=True
):
    """
    Get personalized recommendations for a user using ALS model.

    Parameters:
    - user_id: original user ID
    - als_model: trained ALS model
    - user_encoder: fitted LabelEncoder for users
    - track_encoder: fitted LabelEncoder for tracks
    - user_track_sparse: sparse user-track matrix (rows indexed by encoded user)
    - items: items DataFrame with track information
    - n_recommendations: number of recommendations to return
    - filter_already_listened: whether to filter out already listened tracks

    Returns:
    - DataFrame with one row per recommended track: track_id, score and the
      `albums`, `artists`, `genres` columns of `items`.
      For a user the encoder has never seen (cold start) the popularity ranking
      from `get_popular_tracks` is returned instead; it has different columns.
    """
    try:
        # Encode user_id
        user_idx = user_encoder.transform([user_id])[0]
    except ValueError:
        # User not in training set (cold start)
        print(f"User {user_id} not found in training data. Returning popular tracks.")
        # The fallback reads the notebook-level `train_events` and
        # `tracks_catalog_clean`; `items` is passed as the metadata source,
        # not as the track catalog.
        return get_popular_tracks(
            train_events,
            tracks_catalog_clean,
            items=items,
            top_n=n_recommendations
        )

    # Get recommendations from ALS. The library already excludes listened
    # tracks when asked to, so exactly n_recommendations are requested.
    track_ids, scores = als_model.recommend(
        user_idx,
        user_track_sparse[user_idx],
        N=n_recommendations,
        filter_already_liked_items=filter_already_listened
    )

    # Decode track indices to original track IDs
    recommended_track_ids = track_encoder.inverse_transform(track_ids)

    # Create DataFrame with recommendations
    recommendations = pl.DataFrame({
        'track_id': recommended_track_ids,
        'score': scores
    })

    # Track details for the recommended tracks only, one row per track, so that
    # the join below cannot repeat a recommendation.
    # NOTE(review): other cells read `album_clean` / `artist_clean` / `genre_clean`
    # from `items`; confirm that `albums` / `artists` / `genres` exist as well.
    track_details = (
        items
            .select(['track_id', 'albums', 'artists', 'genres'])
            .join(recommendations.select('track_id'), on='track_id', how='semi')
            .unique(subset='track_id', keep='first', maintain_order=True)
    )

    # Join with items to get track details (left join keeps the ranking order)
    recommendations_with_info = recommendations.join(
        track_details,
        on='track_id',
        how='left'
    )

    return recommendations_with_info


# Smoke-test the recommender on five users taken from the training split
test_user_ids = train_events['user_id'].unique().head(5).to_list()

_separator = '=' * 70

for user_id in test_user_ids:
    # Header for this user
    print(f"\n{_separator}")
    print(f"Recommendations for User ID: {user_id}")
    print(_separator)

    # Ten personalised recommendations from the trained ALS model
    recs = get_personal_recommendations(
        user_id,
        als_model,
        user_encoder,
        track_encoder,
        user_track_sparse,
        items,
        n_recommendations=10,
    )

    print(recs)

In [None]:
# Cell: Generate Recommendations for All Users

def generate_all_recommendations(
    user_ids,
    als_model,
    user_encoder,
    track_encoder,
    user_track_sparse,
    items,
    top_popular,
    n_recommendations=10
):
    """
    Generate recommendations for all users (with fallback to popular for cold start).

    Parameters:
    - user_ids: original user IDs to generate recommendations for
    - als_model: trained ALS model
    - user_encoder: fitted LabelEncoder for users
    - track_encoder: fitted LabelEncoder for tracks
    - user_track_sparse: sparse user-track matrix (rows indexed by encoded user)
    - items: not used by this function; kept so existing calls keep working
    - top_popular: DataFrame with `track_id` and `popularity_score` columns,
      used for users the encoder has never seen
    - n_recommendations: number of recommendations per user

    Returns:
    - DataFrame with user_id, track_id, score, rank (rank starts at 1).
      `score` is the ALS score for known users and the popularity score for
      fallback users, so the two are not on the same scale.
    """
    # Original id -> encoded row index, built once: a fitted LabelEncoder assigns
    # index i to classes_[i]. This replaces one `transform` call per user.
    user_index = {uid: idx for idx, uid in enumerate(user_encoder.classes_.tolist())}

    # Popular fallback is the same for every cold-start user, so compute it once
    fallback_track_ids = top_popular['track_id'].head(n_recommendations).to_list()
    fallback_scores = top_popular['popularity_score'].head(n_recommendations).to_list()

    # Columnar accumulators (cheaper than one dict per recommendation)
    out_user_ids, out_track_ids, out_scores, out_ranks = [], [], [], []

    total_users = len(user_ids)
    failed_users = 0  # known users whose ALS recommendation raised an error

    for i, user_id in enumerate(user_ids):
        if i % 10000 == 0:
            print(f"Processing user {i}/{total_users}...")

        rec_track_ids, rec_scores = fallback_track_ids, fallback_scores

        user_idx = user_index.get(user_id)
        if user_idx is not None:
            try:
                track_idxs, scores = als_model.recommend(
                    user_idx,
                    user_track_sparse[user_idx],
                    N=n_recommendations,
                    filter_already_liked_items=True
                )
                rec_track_ids = track_encoder.inverse_transform(track_idxs).tolist()
                rec_scores = np.asarray(scores).tolist()
            except (ValueError, IndexError):
                # Best effort: keep the popular fallback, but count the failure
                # so that a model/matrix/encoder mismatch does not go unnoticed
                failed_users += 1

        # Same truncation as zip(): use the shorter of the two sequences
        n_recs = min(len(rec_track_ids), len(rec_scores))
        out_user_ids.extend([user_id] * n_recs)
        out_track_ids.extend(rec_track_ids[:n_recs])
        out_scores.extend(rec_scores[:n_recs])
        out_ranks.extend(range(1, n_recs + 1))

    if failed_users:
        print(
            f"Warning: ALS recommendation failed for {failed_users} known users; "
            "popular tracks were used for them. Check that the model, the matrix "
            "and the encoders come from the same training run."
        )

    return pl.DataFrame({
        'user_id': out_user_ids,
        'track_id': out_track_ids,
        'score': out_scores,
        'rank': out_ranks
    })


# Recommendations for every user that appears in the test split
print("Generating recommendations for test set users...")
test_user_ids = test_events['user_id'].unique().sort().to_list()

test_recommendations = generate_all_recommendations(
    test_user_ids,
    als_model,
    user_encoder,
    track_encoder,
    user_track_sparse,
    items,
    top_popular,
    n_recommendations=10,
)

# Size of the result and a sample of it
_n_recommendations_total = len(test_recommendations)
_n_recommended_users = test_recommendations['user_id'].n_unique()
print(f"\n✓ Generated {_n_recommendations_total:,} recommendations")
print(f"For {_n_recommended_users:,} users")
print(test_recommendations.head(20))

In [None]:
# Cell: Save ALS Model and Encoders

# Make sure the target directory for the pickled objects exists
os.makedirs(encoder_dir, exist_ok=True)

# Pickled objects (encoders and the ALS model), stored in the encoder directory
_pickled_artifacts = {
    'user_encoder.pkl': user_encoder,
    'track_encoder.pkl': track_encoder,
    'als_model.pkl': als_model,
}
for _file_name, _artifact in _pickled_artifacts.items():
    joblib.dump(_artifact, os.path.join(encoder_dir, _file_name))

# Parquet tables: popular tracks for cold start and the test recommendations
_parquet_tables = {
    'popular_tracks.parquet': top_popular,
    'test_recommendations.parquet': test_recommendations,
}
for _file_name, _table in _parquet_tables.items():
    _table.write_parquet(os.path.join(preprocessed_dir, _file_name))

# Report what was written
print("✓ Saved:")
for _file_name in (*_pickled_artifacts, *_parquet_tables):
    print(f"  - {_file_name}")

# Похожие

Рассчитаем похожие, они позже пригодятся для онлайн-рекомендаций.

# Построение признаков

Построим три признака, можно больше, для ранжирующей модели.

# Ранжирование рекомендаций

Построим ранжирующую модель, чтобы сделать рекомендации более точными. Отранжируем рекомендации.

# Оценка качества

Оценим качество трёх типов рекомендаций:

- топ популярных,
- персональных, полученных при помощи ALS,
- итоговых
  
по четырем метрикам: recall, precision, coverage, novelty.

# === Выводы, метрики ===

Основные выводы при работе над расчётом рекомендаций, рассчитанные метрики.