In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the BTS audio-features CSV (from the Kaggle dataset) into a DataFrame
# and print a plain-text preview of its first five rows.
# NOTE(review): the path is absolute; it looks like a Colab location — confirm
# before running in another environment.
csv_path = '/content/Spotify_BTS_AudioFeatures.csv'
data = pd.read_csv(csv_path)
print(data.head(n=5))

   Unnamed: 0                    Title Artist    Release  danceability  \
0           1                     Like    BTS  2013/6/12         0.582   
1           2            No More Dream    BTS  2013/6/12         0.436   
2           3  We Are Bulletproof Pt.2    BTS  2013/6/12         0.754   
3           4        Attack on Bangtan    BTS  2013/9/11         0.722   
4           5                   Coffee    BTS  2013/9/11         0.652   

   energy  key  loudness  mode  speechiness  acousticness  instrumentalness  \
0   0.726    0    -4.174     0       0.1160       0.27100          0.000000   
1   0.869    2    -5.174     1       0.4720       0.01240          0.000002   
2   0.952    8    -5.110     0       0.1530       0.00945          0.000006   
3   0.961   11    -2.548     0       0.2670       0.20300          0.000000   
4   0.794    0    -6.604     1       0.0957       0.00808          0.000000   

   liveness  valence    tempo                      id  duration_ms  
0     0.227

In [4]:
# Create a synthetic popularity label: 1 when danceability, energy and valence
# are ALL strictly greater than the threshold, otherwise 0.
# The three columns are compared as whole columns instead of calling a Python
# lambda once per row; the labels produced are the same (a missing value
# compares as False, so such a row is labelled 0, exactly as before).
POPULARITY_THRESHOLD = 0.7
data['popularity_class'] = (
    data[['danceability', 'energy', 'valence']]
    .gt(POPULARITY_THRESHOLD)   # element-wise "strictly greater than"
    .all(axis=1)                # True only when every one of the three passes
    .astype('int64')            # booleans -> 0/1 integers
)
data['popularity_class'].head()

Unnamed: 0,popularity_class
0,0
1,0
2,1
3,0
4,0


In [5]:
from sklearn.model_selection import train_test_split

# Model inputs are four audio features; the target is the synthetic 0/1
# popularity label.
# NOTE(review): the label is defined from thresholds on danceability, energy
# and valence, which are also used as features here — confirm this is intended.
feature_columns = ['danceability', 'energy', 'valence', 'tempo']
X = data[feature_columns]
y = data['popularity_class']

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable. No stratify argument is passed, so the class ratio is not
# enforced across the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both splits so the test rows play no part in the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with the library's default settings
# and fit it on the scaled training features against the training labels.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [8]:
# Run the fitted model on the scaled test features to obtain one predicted
# popularity class (0 or 1) per held-out row.
predictions = model.predict(X=X_test_scaled)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute the three evaluation artefacts up front, comparing the true test
# labels with the model's predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
report = classification_report(y_true=y_test, y_pred=predictions)

# Report them in a fixed order: overall accuracy (as a percentage), then the
# confusion matrix, then the per-class precision/recall/F1 table.
print(f'Accuracy: {accuracy * 100:.2f}%')
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', report)

Accuracy: 86.67%
Confusion Matrix:
 [[26  2]
 [ 2  0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        28
           1       0.00      0.00      0.00         2

    accuracy                           0.87        30
   macro avg       0.46      0.46      0.46        30
weighted avg       0.87      0.87      0.87        30

