---
title: "Artist Classification"
subtitle: "DSAN 5300 Final Project"
authors: ["Jorge Bris Moreno", "William McGloin", "Kangheng Liu", "Isfar Baset"]
date: last-modified
date-format: long
format:
  html:
    self-contained: true
    toc: true
    code-overflow: wrap
    code-fold: true
---

# Artists

In [117]:
# import relevant libraries
import numpy as np
import pandas as pd

# load in the data
artists = pd.read_csv('../data/clean_data/artists.csv')

# what does that data look like? 
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key_mode,genre1,genre2,genre3
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.435219,0.895942,-4.404705,0.07197,0.029052,0.02420668,0.2101,0.65843,140.041634,189080.946429,D major,rock,,
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.485341,0.498333,-10.186623,0.04303,0.51877,0.01524729,0.222696,0.565521,116.199674,195254.23913,D major,jazz,,
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.448021,0.516437,-7.487792,0.046327,0.384334,0.04871918,0.136029,0.339523,116.836062,241630.729167,G major,rock,,
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.652385,0.493627,-12.291385,0.302931,0.270838,0.00041714,0.25895,0.610846,102.295308,281465.692308,C# major,soul,,
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.508414,0.786576,-6.192253,0.057951,0.080398,0.0761958,0.181052,0.43214,134.341747,245848.59596,A major,funk,,
5,Farruko,329e4yvIujISKGKz1BZZbO,0.695963,0.745696,-5.090474,0.100282,0.214355,0.0005877183,0.217191,0.5978,123.408267,223879.296296,B minor,latin,,
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.530749,0.787411,-5.798831,0.067306,0.068541,0.05678086,0.170656,0.500953,120.363904,259293.369863,C major,funk,rock,
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.636603,0.555097,-7.84109,0.067583,0.286546,0.02188057,0.191645,0.473921,117.950526,180635.435897,F# minor,pop,,
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.58544,0.65728,-7.71132,0.09074,0.082635,0.390617,0.215492,0.201592,124.436,269980.92,D major,edm,,
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.495437,0.428639,-10.407168,0.039445,0.689061,0.003668141,0.200671,0.532756,117.147185,162275.12605,D# major,jazz,,


In [118]:
# display rows where genre1 = NaN
artists[artists['genre1'].isnull()]

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key_mode,genre1,genre2,genre3
193,Roger Miller,1RP2UpEaRzkF0Id3JigqD8,0.521208,0.431797,-12.92274,0.060275,0.736829,0.130657,0.170462,0.595644,113.374175,161172.032468,D major,,,
196,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,0.498619,0.445967,-12.14511,0.083242,0.381031,0.128259,0.184383,0.550309,120.514409,153459.132992,C major,,,


In [119]:
# delete Roger Miller
artists = artists.drop(artists[artists['artist_name'] == 'Roger Miller'].index)

# set genre1 for 'The Beach Boys' to 'rock'
artists.loc[artists['artist_name'] == 'The Beach Boys', 'genre1'] = 'rock'

# look at rows where artist_name == The Beach Boys
artists[artists['artist_name'] == 'The Beach Boys']

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key_mode,genre1,genre2,genre3
196,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,0.498619,0.445967,-12.14511,0.083242,0.381031,0.128259,0.184383,0.550309,120.514409,153459.132992,C major,rock,,


In [120]:
# concatenate all the genre columns and get value counts
all_genres = pd.concat([artists['genre1'], artists['genre2'], artists['genre3']])
all_genres.value_counts()

rock       54
pop        52
hip hop    50
country    50
soul       49
jazz       48
latin      48
edm        47
funk       46
rap        40
Name: count, dtype: int64

this data is rather balanced

In [121]:
# split 'key_mode' column
artists['key'] = artists['key_mode'].apply(lambda x: x.split(' ')[0])
artists['mode'] = artists['key_mode'].apply(lambda x: x.split(' ')[1])

# drop key_mode
artists.drop('key_mode', axis=1, inplace=True)

# covert mode into binary
artists['mode'] = artists['mode'].apply(lambda x: 1 if x == 'major' else 0)

# convert key into numerical
key_dict = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11}
artists['key'] = artists['key'].apply(lambda x: key_dict[x])

# convert key and mode to factors
artists['key'] = artists['key'].astype('category')
artists['mode'] = artists['mode'].astype('category')

# rename mode to major
artists.rename(columns={'mode': 'major'}, inplace=True)

# what does the data look like now?
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre1,genre2,genre3,key,major
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.435219,0.895942,-4.404705,0.07197,0.029052,0.02420668,0.2101,0.65843,140.041634,189080.946429,rock,,,2,1
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.485341,0.498333,-10.186623,0.04303,0.51877,0.01524729,0.222696,0.565521,116.199674,195254.23913,jazz,,,2,1
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.448021,0.516437,-7.487792,0.046327,0.384334,0.04871918,0.136029,0.339523,116.836062,241630.729167,rock,,,7,1
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.652385,0.493627,-12.291385,0.302931,0.270838,0.00041714,0.25895,0.610846,102.295308,281465.692308,soul,,,1,1
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.508414,0.786576,-6.192253,0.057951,0.080398,0.0761958,0.181052,0.43214,134.341747,245848.59596,funk,,,9,1
5,Farruko,329e4yvIujISKGKz1BZZbO,0.695963,0.745696,-5.090474,0.100282,0.214355,0.0005877183,0.217191,0.5978,123.408267,223879.296296,latin,,,11,0
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.530749,0.787411,-5.798831,0.067306,0.068541,0.05678086,0.170656,0.500953,120.363904,259293.369863,funk,rock,,0,1
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.636603,0.555097,-7.84109,0.067583,0.286546,0.02188057,0.191645,0.473921,117.950526,180635.435897,pop,,,6,0
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.58544,0.65728,-7.71132,0.09074,0.082635,0.390617,0.215492,0.201592,124.436,269980.92,edm,,,2,1
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.495437,0.428639,-10.407168,0.039445,0.689061,0.003668141,0.200671,0.532756,117.147185,162275.12605,jazz,,,3,1


In [122]:
# find all numerical columns and normalize the numerical columns into a bracket from [0,1]
numerical_columns = artists.select_dtypes(include=[np.number]).columns
artists[numerical_columns] = (artists[numerical_columns] - artists[numerical_columns].min()) / (artists[numerical_columns].max() - artists[numerical_columns].min())

# what does the data look like now?
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre1,genre2,genre3,key,major
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.376262,0.890147,0.922345,0.127416,0.014041,0.030719,0.407467,0.75797,0.986405,0.141162,rock,,,2,1
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.446053,0.382602,0.535421,0.044035,0.570726,0.019349,0.44493,0.616964,0.703131,0.159115,jazz,,,2,1
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.394088,0.405712,0.716026,0.053533,0.417907,0.061825,0.187164,0.27397,0.710692,0.29399,rock,,,7,1
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.678651,0.376595,0.39457,0.792875,0.288891,0.000529,0.552755,0.685753,0.537929,0.409841,soul,,,1,1
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.478182,0.750542,0.802723,0.087023,0.072409,0.096694,0.32107,0.414534,0.918682,0.306257,funk,,,9,1
5,Farruko,329e4yvIujISKGKz1BZZbO,0.739331,0.69836,0.876454,0.208992,0.224683,0.000746,0.428556,0.665953,0.788779,0.242365,latin,,,11,0
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.509281,0.751608,0.829051,0.11398,0.05893,0.072056,0.29015,0.51897,0.752608,0.345358,funk,rock,,0,1
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.656676,0.455061,0.692383,0.114778,0.306747,0.027767,0.352576,0.477943,0.723934,0.1166,pop,,,6,0
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.585435,0.585497,0.701067,0.181498,0.074951,0.495699,0.423503,0.064634,0.800989,0.37644,edm,,,2,1
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.460112,0.293637,0.520662,0.033703,0.764303,0.004655,0.379423,0.567237,0.714389,0.063203,jazz,,,3,1


In [123]:
# split the data into training and testing sets, with 80% of the data for training and 20% for testing
from sklearn.model_selection import train_test_split

train, test = train_test_split(artists, test_size=0.25, random_state=42)

# split the data into features and target, target being three columns ['genre1','genre2','genre3']
X_train = train.drop(['artist_name','artist_id','genre1','genre2','genre3'], axis=1)
y_train = train[['genre1','genre2','genre3']]
X_test = test.drop(['artist_name','artist_id','genre1','genre2','genre3'], axis=1)
y_test = test[['genre1','genre2','genre3']]

print("Shape of X_train: ", X_train.shape, "\nShape of X_test: ", X_test.shape, "\nShape of y_train: ", y_train.shape, "\nShape of y_test: ", y_test.shape)

Shape of X_train:  (297, 12) 
Shape of X_test:  (99, 12) 
Shape of y_train:  (297, 3) 
Shape of y_test:  (99, 3)


In [124]:
# concatenate all the genre columns and get value counts for train data
y_train_genres = pd.concat([y_train['genre1'], y_train['genre2'], y_train['genre3']])
y_train_genres.value_counts()

soul       39
country    38
edm        38
funk       37
latin      37
pop        37
jazz       37
hip hop    36
rock       33
rap        29
Name: count, dtype: int64

In [125]:
# concatenate all the genre columns and get value counts for test data
y_test_genres = pd.concat([y_test['genre1'], y_test['genre2'], y_test['genre3']])
y_test_genres.value_counts()

rock       21
pop        15
hip hop    14
country    12
jazz       11
latin      11
rap        11
soul       10
edm         9
funk        9
Name: count, dtype: int64

In [126]:
# create a new column for each genre in y_train and y_test and set it to 1 if the genre is present in genre1, genre2, or genre3
for genre in y_train_genres.unique():
    y_train[genre] = y_train.apply(lambda x: 1 if genre in x.values else 0, axis=1)
    y_test[genre] = y_test.apply(lambda x: 1 if genre in x.values else 0, axis=1)

# drop genre1, genre2, genre3 from y_train and y_test
y_train.drop(['genre1','genre2','genre3'], axis=1, inplace=True)
y_test.drop(['genre1','genre2','genre3'], axis=1, inplace=True)

# what does the data look like now?
y_train.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train[genre] = y_train.apply(lambda x: 1 if genre in x.values else 0, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test[genre] = y_test.apply(lambda x: 1 if genre in x.values else 0, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train[genre] = y_train.apply(lambda x: 1 if g

Unnamed: 0,country,funk,latin,pop,edm,rock,hip hop,soul,jazz,rap,NaN
16,1,0,0,0,0,0,0,0,0,0,0
66,0,1,0,0,0,0,0,1,0,0,0
148,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,0
19,0,0,0,0,1,0,0,0,0,0,0
303,0,0,0,0,0,1,0,0,0,0,0
352,0,0,0,0,0,0,1,0,0,1,0
346,0,0,1,0,0,0,0,0,0,0,0
118,0,0,0,0,0,0,0,1,0,0,0
180,0,1,0,0,0,0,0,0,0,0,0


In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Assuming you have already split your dataset into X_train, X_test, y_train, and y_test

# Initialize the Logistic Regression classifier
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)

# Wrap the classifier with OneVsRestClassifier to support multi-label classification
ovr_classifier = OneVsRestClassifier(log_reg)

# Fit the model to the training data
ovr_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = ovr_classifier.predict(X_test)

# You can also use the predict_proba method to get the probability estimates for each class
y_pred_prob = ovr_classifier.predict_proba(X_test)

# After this, you can evaluate your model using various metrics
from sklearn.metrics import accuracy_score, hamming_loss

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Hamming Loss: ", hamming_loss(y_test, y_pred))

# Note that accuracy_score and hamming_loss may not be the best metrics for all multi-label classification problems.
# It's often useful to look at more detailed metrics like the classification report:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

# Make sure that y_train and y_test are in the right format:
# They should have a separate label column for each class, with each element being binary (0 or 1).


Accuracy Score:  0.1111111111111111
Hamming Loss:  0.10284664830119375
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00        15
           4       0.75      0.33      0.46         9
           5       0.00      0.00      0.00        21
           6       1.00      0.29      0.44        14
           7       0.00      0.00      0.00        10
           8       0.75      0.55      0.63        11
           9       1.00      0.18      0.31        11
          10       0.00      0.00      0.00         0

   micro avg       0.79      0.12      0.21       123
   macro avg       0.32      0.12      0.17       123
weighted avg       0.33      0.12      0.17       123
 samples avg       0.13      0.12      0.12       123



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [128]:
# look at each genre and print the number of times it was predicted above
for genre in y_train_genres.unique():
    print("Number of times", genre, "was predicted above: ", sum(y_pred[:, y_train_genres.unique().tolist().index(genre)] > 0))

Number of times country was predicted above:  0
Number of times funk was predicted above:  0
Number of times latin was predicted above:  1
Number of times pop was predicted above:  0
Number of times edm was predicted above:  4
Number of times rock was predicted above:  0
Number of times hip hop was predicted above:  4
Number of times soul was predicted above:  0
Number of times jazz was predicted above:  8
Number of times rap was predicted above:  2
Number of times nan was predicted above:  0


In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import make_scorer, accuracy_score, hamming_loss

# Assuming you have already preprocessed your dataset into X (features) and Y (multi-label targets)

# Initialize the Logistic Regression classifier
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)

# Wrap the classifier with OneVsRestClassifier to support multi-label classification
ovr_classifier = OneVsRestClassifier(log_reg)

# Define the k-fold cross-validation procedure
kfold = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)

# Custom scorer for accuracy and hamming loss
accuracy_scorer = make_scorer(accuracy_score)
hamming_scorer = make_scorer(hamming_loss, greater_is_better=False)

# Calculate cross-validation scores for accuracy
accuracy_scores = cross_val_score(ovr_classifier, X_test, y_test, scoring=accuracy_scorer, cv=kfold)

# Calculate cross-validation scores for hamming loss
hamming_scores = cross_val_score(ovr_classifier, X_test, y_test, scoring=hamming_scorer, cv=kfold)

# Report average cross-validation accuracy and hamming loss
print("Average CV Accuracy Score: ", accuracy_scores.mean())
print("Average CV Hamming Loss: ", -hamming_scores.mean())

# Note: Since hamming_loss should be minimized (the lower, the better), 
# we are negating the scores because cross_val_score function interprets higher score as better.




Average CV Accuracy Score:  0.04052631578947369
Average CV Hamming Loss:  0.11191387559808612




In [None]:
# look at each genre and print the number of times it was predicted above
for genre in y_train_genres.unique():
    print("Number of times", genre, "was predicted above: ", sum(y_pred[:, y_train_genres.unique().tolist().index(genre)] > 0))

In [52]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fill NaN values in the genre columns
artists['genre1'].fillna('Unknown', inplace=True)
artists['genre2'].fillna('Unknown', inplace=True)
artists['genre3'].fillna('Unknown', inplace=True)
print("Filled NaN")

# Combine the genres into a single multi-label column
artists['genres'] = artists[['genre1', 'genre2', 'genre3']].values.tolist()

# Preprocess categorical features
categorical_features = ['key', 'major']  # Assuming these are the only categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough')

# Binarize the target labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(artists['genres'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    artists.drop(['artist_name', 'artist_id', 'genre1', 'genre2', 'genre3', 'genres'], axis=1),
    y,
    test_size=0.2,
    random_state=42,
    # stratify=y
)

# Create the One-vs-Rest classification pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


TypeError: '<' not supported between instances of 'float' and 'str'