In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned audio-feature table and discard the 'popularity' column,
# which is not used anywhere further down in this notebook.
df = pd.read_csv('data/audio_features_4.0_clean.csv').drop(columns=['popularity'])

In [3]:
# Replace missing artist / song names with a placeholder string so the
# TF-IDF vectorizers further down always receive text rather than NaN.
for text_col in ('artist_name', 'song_name'):
    df[text_col] = df[text_col].fillna('missing_text')

In [4]:
# Two independent TF-IDF vectorizers, one per text column, each limited to a
# 500-term vocabulary (max_features=500).
vectorizer1, vectorizer2 = (
    TfidfVectorizer(max_features=500),
    TfidfVectorizer(max_features=500),
)

In [5]:
# Fit each vectorizer on its own column and keep the resulting TF-IDF matrices:
# "text1" is the song name, "text2" is the artist name.
text1_tfidf, text2_tfidf = (
    vectorizer1.fit_transform(df['song_name']),
    vectorizer2.fit_transform(df['artist_name']),
)

In [6]:
# Turn each TF-IDF matrix into a dense DataFrame. Column names are the
# vocabulary terms prefixed with 'text1_' (song name) or 'text2_' (artist name)
# so the two vocabularies stay distinguishable after they are joined.
text1_tfidf_df = pd.DataFrame(
    text1_tfidf.toarray(),
    columns=[f'text1_{term}' for term in vectorizer1.get_feature_names_out()],
)
text2_tfidf_df = pd.DataFrame(
    text2_tfidf.toarray(),
    columns=[f'text2_{term}' for term in vectorizer2.get_feature_names_out()],
)

# Place the TF-IDF columns side by side with the original columns.
# The frames are aligned on their row index by pd.concat.
df_extended = pd.concat([df, text1_tfidf_df, text2_tfidf_df], axis=1)


In [7]:
# Target is the genre label; features are every remaining column of df except
# the identifier / free-text columns.
# NOTE(review): this trains on df, not on df_extended, so the TF-IDF columns
# built earlier are not part of the model -- confirm that is intentional.
y = df['genre']
X = df.drop(columns=['genre', 'artist_name', 'song_name', 'spotify_track_id'])

# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=538
)

# Scale with RobustScaler, then classify with a default-configured
# k-nearest-neighbours model. Fitting happens in place on the pipeline object.
pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('classifier', KNeighborsClassifier()),
])
pipe.fit(X_train, y_train);

In [34]:
import numpy as np

# Draw one random track from df and strip the identifier / label columns so the
# remaining columns are the same ones the pipeline was fitted on.
# NOTE(review): df also contains the rows used for training, so this is a spot
# check rather than a held-out evaluation. No random_state is passed, so every
# run draws a different row.
sample = df.sample()
sample_drop = sample.drop([
            'spotify_track_id',
            'artist_name',
            'song_name',
            'genre'
            ], axis=1)

# Per-class probabilities: one row per sampled track, one column per class
proba_array = pipe.predict_proba(sample_drop)

# Class labels known to the fitted classifier (last step of the pipeline);
# predict_proba columns follow this order
class_labels = pipe.named_steps['classifier'].classes_

# Find the indices of the top 3 probabilities in each row (ascending order)
top3_proba_indices = np.argsort(proba_array, axis=1)[:, -3:]

# Reverse the order to have the highest probability first
top3_proba_indices = np.fliplr(top3_proba_indices)

# Get the corresponding class labels
predicted_top3_classes = class_labels[top3_proba_indices]

# Print the results
for i, (proba, top_classes) in enumerate(zip(proba_array, predicted_top3_classes)):
    top_class, runner_up_class, third_class = top_classes
    # Show only the row this prediction belongs to. Rows of proba_array are in
    # the same order as the rows of sample, so position i lines them up. For the
    # default one-row sample this prints exactly what print(sample) would.
    print(sample.iloc[[i]])
    print(f"Sample {i+1}:")
    print(f"Predicted probabilities: {proba}")
    print(f"Top predicted class: {top_class}")
    print(f"Runner-up class: {runner_up_class}")
    print(f"Third-ranked class: {third_class}")
    print()



       danceability  energy  key  loudness  mode  speechiness  acousticness  \
55037         0.373   0.895  2.0    -7.169   0.0       0.0889      0.000019   

       instrumentalness  liveness  valence    tempo  duration_ms  \
55037             0.854     0.145    0.197  158.009     197365.0   

       time_signature artist_name        song_name        genre  \
55037             4.0   Whispered  Boomer Kuwanger  Death Metal   

             spotify_track_id  
55037  5xpHAqKjoX5IMFW1aVBfA7  
Sample 1:
Predicted probabilities: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.2
 0.  0.  0.  0.  0.2 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.2 0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.2 0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  