In [2]:
import catboost
from catboost import Pool, FeaturesData, CatBoostClassifier, CatBoost
import pandas as pd
import os
import numpy as np
from copy import deepcopy
from sklearn.model_selection import train_test_split
import pickle
np.set_printoptions(precision=4)
print(f"catboost version: {catboost.__version__}")
!python --version

catboost version: 1.0.6
Python 3.10.4


# Reading the data

In [3]:
data_filepath = r"/Users/jamesmoro/Documents/Python/playlist-recommender/playlist-creator/data/playlist_df.pkl"

In [4]:
raw_df = pd.read_pickle(data_filepath)

In [5]:
df = raw_df.copy()

In [6]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_names,track_names,playlist_name
0,0.506,0.805,1,-4.119,1,0.0469,0.00711,0.00193,0.0856,0.383,...,audio_features,46lFttIf5hnUZMGvjK0Wxo,spotify:track:46lFttIf5hnUZMGvjK0Wxo,https://api.spotify.com/v1/tracks/46lFttIf5hnU...,https://api.spotify.com/v1/audio-analysis/46lF...,227074,4,Galantis,Runaway (U & I),briz party songz
1,0.49,0.811,1,-2.524,0,0.044,0.0896,0.0,0.0898,0.547,...,audio_features,5BIgbz2Dy7X2I0bnkKH9Mb,spotify:track:5BIgbz2Dy7X2I0bnkKH9Mb,https://api.spotify.com/v1/tracks/5BIgbz2Dy7X2...,https://api.spotify.com/v1/audio-analysis/5BIg...,200992,4,Tim Berg,Seek Bromance (Avicii Vocal Edit),briz party songz
2,0.527,0.835,6,-5.298,1,0.0433,0.0166,0.0,0.249,0.654,...,audio_features,0ct6r3EGTcMLPtrXHDvVjc,spotify:track:0ct6r3EGTcMLPtrXHDvVjc,https://api.spotify.com/v1/tracks/0ct6r3EGTcML...,https://api.spotify.com/v1/audio-analysis/0ct6...,176658,4,Avicii,The Nights,briz party songz
3,0.633,0.818,1,-5.437,1,0.0376,0.0114,0.0,0.259,0.337,...,audio_features,5r3aYGutXgsxSqB6W3RrzJ,spotify:track:5r3aYGutXgsxSqB6W3RrzJ,https://api.spotify.com/v1/tracks/5r3aYGutXgsx...,https://api.spotify.com/v1/audio-analysis/5r3a...,193481,4,The Magician,Sunlight (feat. Years and Years) - Radio Edit,briz party songz
4,0.662,0.858,2,-4.844,1,0.0428,0.00163,0.0,0.0456,0.295,...,audio_features,6Pgkp4qUoTmJIPn7ReaGxL,spotify:track:6Pgkp4qUoTmJIPn7ReaGxL,https://api.spotify.com/v1/tracks/6Pgkp4qUoTmJ...,https://api.spotify.com/v1/audio-analysis/6Pgk...,181673,4,Avicii,Without You (feat. Sandro Cavazza),briz party songz


# Preparing the data

## Column Removal

In [7]:
df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'artist_names', 'track_names', 'playlist_name'],
      dtype='object')

In [8]:
useless_cols = ["type", "id", "uri", "track_href", "analysis_url"]

In [9]:
df = df.drop(useless_cols, axis = 1)

In [10]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_names,track_names,playlist_name
0,0.506,0.805,1,-4.119,1,0.0469,0.00711,0.00193,0.0856,0.383,126.008,227074,4,Galantis,Runaway (U & I),briz party songz
1,0.49,0.811,1,-2.524,0,0.044,0.0896,0.0,0.0898,0.547,125.965,200992,4,Tim Berg,Seek Bromance (Avicii Vocal Edit),briz party songz
2,0.527,0.835,6,-5.298,1,0.0433,0.0166,0.0,0.249,0.654,125.983,176658,4,Avicii,The Nights,briz party songz
3,0.633,0.818,1,-5.437,1,0.0376,0.0114,0.0,0.259,0.337,121.975,193481,4,The Magician,Sunlight (feat. Years and Years) - Radio Edit,briz party songz
4,0.662,0.858,2,-4.844,1,0.0428,0.00163,0.0,0.0456,0.295,133.993,181673,4,Avicii,Without You (feat. Sandro Cavazza),briz party songz


Label values extraction

In [11]:
y = df['playlist_name']
X = df.drop(['playlist_name'], axis = 1)

Categorical features declaration

In [12]:
cat_features_names = df.select_dtypes(include = ['object']).columns.to_list()
cat_features_names

['artist_names', 'track_names', 'playlist_name']

In [13]:
cat_features = [X.columns.get_loc(c) for c in cat_features_names if c in X]
cat_features

[13, 14]

Looking on label balance in dataset

In [14]:
y.value_counts()

Classic Rock                                                             340
Discovered Weekly                                                        336
Gym                                                                      218
Summer Songs                                                             129
Summer Tunes                                                             115
80's Hits                                                                115
60s/70s                                                                  113
Chill                                                                     97
Koala Kontrol                                                             90
The Beatles                                                               77
Your Top Songs 2019                                                       73
Classical                                                                 73
Guilty Pleasures                                                          66

Create pool

In [15]:
X_prepared = X.values.astype(str).astype(object)
# For FeaturesData class categorial features must have type str

pool = Pool(
    data=FeaturesData(
        cat_feature_data=X_prepared,
        cat_feature_names=list(X)
    ),
    label=y.values
)

In [16]:
print("Column names:")
print(pool.get_feature_names()) 

Column names:
['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'artist_names', 'track_names']


In [17]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.75, 
                                                                stratify = y, 
                                                                shuffle= True)

# Modelling

In [23]:
model = CatBoostClassifier(
    iterations=500,
    random_seed=43,
    loss_function='MultiClass', 
    early_stopping_rounds=20
)
model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fecd7390370>

In [25]:
print(model.tree_count_)

129


In [27]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics

eval_pool = Pool(X_validation, y_validation, cat_features=cat_features)
curve = get_roc_curve(model, eval_pool)
(fpr, tpr, thresholds) = curve
roc_auc = sklearn.metrics.auc(fpr, tpr)

CatBoostError: catboost/private/libs/target/data_providers.cpp:405: Non-Multiclassification and Non-MultiTarget compatible metric (Logloss) specified for a multidimensional model