# Инициализация

Загружаем библиотеки, необходимые для выполнения кода ноутбука.

### Imports

In [None]:
import os
import requests
from dotenv import load_dotenv

import pandas as pd
import polars as pl
import numpy as np
import scipy

import matplotlib.pyplot as plt
import seaborn as sns


from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool

#import calendar
import joblib
#import s3fs
import gc
import boto3
from botocore.exceptions import ClientError



### Config

In [None]:
# Fixed seed so that results are reproducible between runs
RANDOM_STATE = 42

# Populate the process environment (python-dotenv) before reading settings
load_dotenv()

# Local filename -> source URL; a URL is None when its variable is not set
datasets = {
    filename: os.environ.get(env_var)
    for filename, env_var in (
        ('tracks.parquet', 'RAW_URL_TRACKS'),
        ('catalog_names.parquet', 'RAW_URL_CATALOG_NAMES'),
        ('interactions.parquet', 'RAW_URL_INTERACTIONS'),
    )
}

# Working directories, each overridable through the environment
raw_dir = os.environ.get('RAW_DATA_DIR', '../data/raw')
preprocessed_dir = os.environ.get('PREPROCESSED_DATA_DIR', '../data/preprocessed')
encoder_dir = os.environ.get('ENCODERS_DIR', '../encoders')

# S3 location and credentials; values without a default are None when unset
s3_bucket = os.environ.get('S3_BUCKET_NAME')
s3_prefix = os.environ.get('S3_PREFIX', 'recsys/data/')
s3_region = os.environ.get('S3_REGION', 'us-east-1')
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')


# === ЭТАП 1 ===

# Загрузка первичных данных

Загружаем первичные данные из файлов:
- tracks.parquet
- catalog_names.parquet
- interactions.parquet

In [None]:
# ---------- Download datasets and save locally ---------- #
# Fail early, with a clear message, if any source URL is not configured;
# otherwise requests rejects the None URL with a much less helpful error.
missing_urls = [filename for filename, url in datasets.items() if not url]
if missing_urls:
    raise ValueError(f'No download URL configured for: {", ".join(missing_urls)}')

# Create directory
os.makedirs(raw_dir, exist_ok=True)

# Download and save each dataset
for filename, url in datasets.items():
    save_path = os.path.join(raw_dir, filename)
    # stream=True writes the body to disk chunk by chunk instead of holding
    # the whole file in memory; the (connect, read) timeout stops a stalled
    # server from blocking the notebook indefinitely.
    with requests.get(url, stream=True, timeout=(10, 120)) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print(f'Saved {filename} to {save_path}')

In [None]:
# ---------- Load datasets ---------- #
# Read the three raw parquet files from raw_dir, in a fixed order that
# matches the names on the left-hand side.
tracks, catalog_names, interactions = (
    pl.read_parquet(os.path.join(raw_dir, f'{stem}.parquet'))
    for stem in ('tracks', 'catalog_names', 'interactions')
)

# Обзор данных

Проверяем, есть ли в данных явные проблемы.

In [None]:
# ---------- Check data summary ---------- #
def data_summary(df: 'pl.DataFrame', name: str) -> None:
    '''
        Print a quick overview of a polars DataFrame.

        Shows, in order: the first rows, the shape, the output of
        ``describe()``, the value counts of every column and the null
        count per column.

        Relies on ``display`` being available in the session (it is not
        imported here), as it is in a Jupyter/IPython notebook.

        Args:
            df: Frame to summarise. Must provide ``null_count()``, so a
                pandas DataFrame is not accepted.
            name: Label printed (upper-cased) in the section header.
    '''

    print(f'\n===== {name.upper()} =====')

    # Sample rows
    print('\nSample rows:')
    display(df.head())

    # Shape
    rows, cols = df.shape
    print(f'\nShape: {rows} rows x {cols} columns')

    # Data info
    print('\nSummary for numeric columns:')
    print(df.describe())

    # Unique values. The try sits inside the loop so that a column whose
    # value_counts fails is skipped on its own and the remaining columns
    # are still reported.
    print('\nUnique values (for each column):')
    for col in df.columns:
        print(f'\nColumn: {col}')
        try:
            print(df[col].value_counts())
        except Exception as e:  # best-effort overview: report and carry on
            print(f'Skipped value_counts due to error: {e}')

    # Missing values
    print('\nMissing values:')
    print(df.null_count())

In [None]:
# Overview of the tracks frame
data_summary(df=tracks, name='tracks')

In [None]:
# Overview of the catalog_names frame
data_summary(df=catalog_names, name='catalog_names')

In [None]:
# Overview of the interactions frame
data_summary(df=interactions, name='interactions')

### Main takeaways
1. The tracks dataframe contains lists instead of scalar values. This can cause several problems:
   - It is hard to get insights from the data: counting, merging, or joining on list columns is tricky.
   - Missing values are hidden: a null count (`df.null_count()`) can't detect empty lists, so tracks with no genres or artists do not show up as missing values.
   - It is hard to use with ML models: most algorithms expect scalar values, not lists.

   Thus, it's necessary to explode the lists to get a dataframe with one row per track-per-item (artist, genre, album).

2. The catalog_names dataframe stacks everything (tracks, albums, artists, genres) in one column, and the type column tells what each row represents. This format isn't convenient for further work. Thus, it's necessary to split catalog_names into several dataframes.