In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams['figure.figsize'] = (8,4)

In [None]:
# Path to the raw dataset, relative to the notebook's working directory.
DATA_PATH = 'data/spotify_tracks.csv'

# Fail fast with a readable hint when the CSV has not been put in place yet.
if not os.path.exists(DATA_PATH):
    raise AssertionError(f'Файл {DATA_PATH} не найден. Положите CSV в папку data/')

# Load the whole file into memory and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Overview: column names, dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" under the report.
df.info()

# Summary statistics for all columns (numeric and non-numeric), one row per column.
display(df.describe(include='all').T)

# Missing values: columns with at least one NaN, most affected first (top 20).
miss = df.isna().sum().sort_values(ascending=False)
display(miss[miss > 0].head(20))

In [None]:
# Columns to inspect; only the ones actually present in the dataset are kept.
candidates = ['danceability', 'energy', 'loudness', 'speechiness',
              'acousticness', 'instrumentalness', 'liveness', 'valence',
              'tempo', 'duration_ms', 'popularity']
audio_features = [c for c in candidates if c in df.columns]
# A bare expression is rendered only when it is the last statement of a cell,
# so the selected columns are shown explicitly before plotting.
display(audio_features)

# One histogram per selected column (missing values are excluded from the counts).
for col in audio_features:
    fig, ax = plt.subplots()
    ax.hist(df[col].dropna(), bins=40)
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Count')
    plt.show()
    # Release the figure so a long feature list does not accumulate open figures.
    plt.close(fig)


In [None]:
# Correlation of every numeric column with the target, highest value first.
if 'popularity' not in df.columns:
    print('Нет колонки "popularity" — выберите другую целевую колонку.')
else:
    # Only numeric columns can take part in the correlation matrix.
    num = df.select_dtypes(include=[np.number])
    corr_matrix = num.corr()
    corr_with_target = corr_matrix['popularity'].sort_values(ascending=False)
    display(corr_with_target)


In [None]:
# Feature engineering: derive a few extra numeric columns on a copy of the raw
# frame, so `df` itself stays untouched and this cell can be re-run safely.
# (This cell previously appeared twice, verbatim; the second copy only rebuilt
# the same `df_fe` and displayed the same correlations again, so it was removed.)
df_fe = df.copy()

# Track duration in minutes (60 000 ms per minute).
if 'duration_ms' in df_fe.columns:
    df_fe['duration_min'] = df_fe['duration_ms'] / 60000

# Title length in characters; a missing title counts as length 0.
if 'track_name' in df_fe.columns:
    df_fe['title_len'] = df_fe['track_name'].fillna('').str.len()

# Release year.
if 'album_release_date' in df_fe.columns:
    raw_dates = df_fe['album_release_date']
    release_year = pd.to_datetime(raw_dates, errors='coerce').dt.year
    # pandas >= 2.0 infers a single datetime format from the first value, and with
    # errors='coerce' every value of a different precision (e.g. '1999' or
    # '1999-05' next to '1999-05-12') silently becomes NaT. For those rows fall
    # back to the leading four digits; parsing them with '%Y' keeps placeholder
    # values such as '0000' as missing instead of turning them into year 0.
    leading_digits = raw_dates.astype(str).str.extract(r'^\s*(\d{4})', expand=False)
    fallback_year = pd.to_datetime(leading_digits, format='%Y', errors='coerce').dt.year
    df_fe['release_year'] = release_year.fillna(fallback_year)

# Explicit-content flag as an integer column.
if 'explicit' in df_fe.columns:
    df_fe['is_explicit'] = df_fe['explicit'].astype(int)

# Check how the newly created columns correlate with the target.
new_feats = [c for c in ['duration_min', 'title_len', 'release_year', 'is_explicit'] if c in df_fe.columns]
if 'popularity' in df_fe.columns:
    display(df_fe[new_feats + ['popularity']].corr()['popularity'].sort_values(ascending=False))
else:
    print('Нет popularity')


In [None]:
# Build the feature matrix X and the target vector y.
target = 'popularity'  # change this if your target column has a different name
# The message names the configured target, so it stays correct when `target` is changed.
assert target in df_fe.columns, f'Нужен столбец {target} в датасете'
y = df_fe[target]

# Drop text columns and identifier-like columns from X.
id_like = {'id', 'uri', 'href', 'track_id', 'artist_id', 'name'}
drop_cols = [
    c for c in df_fe.columns
    if df_fe[c].dtype == 'O'
    or c.lower() in id_like
    # read_csv names a header-less column "Unnamed: N" — typically a saved row
    # index, which is an identifier and must not be used as a feature.
    or c.lower().startswith('unnamed:')
]
X = df_fe.drop(columns=drop_cols)

# Make sure the target itself is not among the features.
X = X.drop(columns=[target], errors='ignore')

# Keep numeric columns only.
X = X.select_dtypes(include=[np.number]).copy()
X.shape

In [None]:
# Candidate regressors compared with the same preprocessing and the same CV splits.
# (KFold and cross_val_score are already imported in the first cell.)
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'DecisionTree': DecisionTreeRegressor(max_depth=8, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'HistGB': HistGradientBoostingRegressor(random_state=42)
}

# Rows without a known target cannot be scored. They are excluded rather than
# filled with 0, because 0 would be treated as a real label and would distort
# both training and the reported R2.
has_target = y.notna()
X_cv = X.loc[has_target]
y_cv = y.loc[has_target]

# Shuffled 5-fold split with a fixed seed so every model sees identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# results maps model name -> (mean R2, std of R2) across the folds.
results = {}
for name, model in models.items():
    # Imputation and scaling live inside the pipeline, so they are fitted on the
    # training part of each fold only.
    pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler()),
                     ('model', model)])
    scores = cross_val_score(pipe, X_cv, y_cv, cv=kf, scoring='r2', n_jobs=-1)
    results[name] = (scores.mean(), scores.std())
    print(f'{name}: R2 mean={scores.mean():.4f}, std={scores.std():.4f}')
