In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

In [5]:
# NOTE(review): the star import hides which names this module contributes.
# `data` below is presumably supplied by spotify_data — confirm, and prefer an
# explicit import (e.g. `from spotify_data import data`) so a fresh-kernel run
# makes the dependency obvious.
from spotify_data import *
# Bare last expression: the notebook renders a (truncated) preview of the frame.
data

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,is_acoustic,is_punk-rock,is_progressive-house,is_power-pop,is_pop,is_pop-film,is_piano,is_party,is_pagode,is_opera
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.4610,1,...,1,0,0,0,0,0,0,0,0,0
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.420,0.1660,1,...,1,0,0,0,0,0,0,0,0,0
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.3590,0,...,1,0,0,0,0,0,0,0,0,0
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,...,1,0,0,0,0,0,0,0,0,0
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.4430,2,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113994,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.2350,5,...,0,0,0,0,0,0,0,0,0,0
113995,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.1170,0,...,0,0,0,0,0,0,0,0,0,0
113996,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.3290,0,...,0,0,0,0,0,0,0,0,0,0
113997,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.5060,7,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def logistic_regression_analysis(data, top_genres, predictors):
    """Fit one logistic regression per genre and summarise its test metrics.

    For every genre in ``top_genres`` the column ``data[f'is_{genre}']`` is
    used as the binary target and ``data[predictors]`` as the features. Each
    model is trained on 70 % of the rows and evaluated on the remaining 30 %.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``predictors`` and an
        ``is_<genre>`` column for every entry of ``top_genres``.
    top_genres : iterable of str
        Genre names; each one is looked up as ``is_<genre>`` in ``data``.
    predictors : list of str
        Names of the feature columns.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per genre holding per-class precision / recall / F1,
        accuracy and the macro averages from ``classification_report``.
    feature_importance : dict
        Maps each genre to ``model.coef_[0]``; coefficients are ordered like
        ``predictors`` and refer to the standardized features.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``, or if the report has
        no ``'0'`` / ``'1'`` entry (targets are expected to be 0/1 integers).
    """
    results = []
    feature_importance = {}

    # The feature matrix is identical for every genre; only the target changes.
    X = data[predictors]

    for genre in top_genres:
        y = data[f'is_{genre}']

        # Split the raw (unbalanced) data first. Stratifying keeps the share of
        # positives the same in both partitions, which matters because a single
        # genre is presumably a small minority of all tracks.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        # Fit the scaler on the training rows only, then apply it to the test
        # rows, so that no test-set statistics leak into the model.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train logistic regression
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Store coefficients for feature importance
        feature_importance[genre] = model.coef_[0]

        # Make predictions on the held-out rows
        y_pred = model.predict(X_test_scaled)

        # zero_division=0 yields the same values as the default but without a
        # warning when the model never predicts one of the classes.
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )
        results.append({
            'Genre': genre,
            'Precision (0)': report['0']['precision'],
            'Recall (0)': report['0']['recall'],
            'F1-Score (0)': report['0']['f1-score'],
            'Precision (1)': report['1']['precision'],
            'Recall (1)': report['1']['recall'],
            'F1-Score (1)': report['1']['f1-score'],
            'Accuracy': report['accuracy'],
            'Macro Precision': report['macro avg']['precision'],
            'Macro Recall': report['macro avg']['recall'],
            'Macro F1-Score': report['macro avg']['f1-score'],
        })

    # Convert to a DataFrame for easier handling
    results_df = pd.DataFrame(results)
    return results_df, feature_importance

In [None]:
def plot_feature_importance(feature_importance, predictors):
    """Draw one horizontal bar chart of model coefficients per genre.

    ``feature_importance`` maps a genre name to a sequence of coefficients,
    one per entry of ``predictors`` and in the same order. Every genre gets
    its own 10x6 figure, which is shown immediately.
    """
    for genre in feature_importance:
        weights = feature_importance[genre]

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(predictors, weights, color='skyblue')
        ax.set_title(f'Feature Importance for Genre: {genre}')
        ax.set_xlabel('Coefficient Value')
        ax.set_ylabel('Features')
        # Vertical guide lines only, to make bar lengths easier to compare.
        ax.grid(axis='x', alpha=0.5)
        fig.tight_layout()
        plt.show()

In [None]:
def plot_roc_curves(data, top_genres, predictors):
    """Plot the ROC curve of a per-genre logistic regression on one figure.

    For every genre the standardized predictors and the ``is_<genre>`` target
    are passed through ``balance_classes``, split 70/30, and a logistic
    regression is fitted on the training part. The ROC curve and its AUC on
    the test part are added to a shared set of axes together with a diagonal
    "Random Chance" baseline.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``predictors`` and an
        ``is_<genre>`` column for every entry of ``top_genres``.
    top_genres : iterable of str
        Genre names; each one is looked up as ``is_<genre>`` in ``data``.
    predictors : list of str
        Names of the feature columns.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    NOTE(review): ``balance_classes`` is not defined in the visible cells;
    presumably it comes from ``from spotify_data import *`` — confirm that it
    exists on a fresh kernel.
    NOTE(review): the scaler is fitted on all rows and balancing happens
    before the split. If ``balance_classes`` oversamples with replacement,
    copies of the same row can land in both train and test — verify.
    """
    fig, ax = plt.subplots(figsize=(15, 10))

    # Standardize once: the predictors do not depend on the genre. The original
    # index is kept so the rows stay label-aligned with each target Series.
    X = data[predictors]
    scaler = StandardScaler()
    X_scaled_all = pd.DataFrame(
        scaler.fit_transform(X), columns=predictors, index=X.index
    )

    for genre in top_genres:
        y = data[f'is_{genre}']

        # Hand the helper a fresh copy so anything it might change in place
        # cannot carry over to the next genre.
        X_balanced, y_balanced = balance_classes(X_scaled_all.copy(), y)

        # Split balanced data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_balanced, y_balanced, test_size=0.3, random_state=42
        )

        # Train logistic regression
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        # Probability of the positive class (second column of predict_proba)
        y_prob = model.predict_proba(X_test)[:, 1]

        # Calculate ROC curve and AUC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Plot each ROC curve
        ax.plot(fpr, tpr, label=f'{genre} (AUC = {roc_auc:.2f})')

    # Plot baseline
    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Chance')

    # Customize plot
    ax.set_title('ROC Curves for Logistic Regression by Genre')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc='lower right')
    ax.grid(alpha=0.3)
    plt.show()

In [None]:
# End-to-end run: build the targets, fit the models, then report and plot.

# Genres that each get their own binary classifier
top_genres = [
    'acoustic', 'punk-rock', 'progressive-house', 'power-pop', 'pop',
    'pop-film', 'piano', 'party', 'pagode', 'opera',
]

# Feature columns fed to every model
predictors = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'time_signature',
]

# Add a 0/1 indicator column per genre (1 where track_genre equals the genre).
# This writes into `data` in place.
for genre in top_genres:
    data[f'is_{genre}'] = data['track_genre'].eq(genre).astype(int)

# Fit the per-genre models and collect metrics plus coefficients
results_df, feature_importance = logistic_regression_analysis(
    data, top_genres, predictors
)

# Text summary of the metrics
print("Classification Report Summary:")
print(results_df)

# One shared ROC figure for all genres
plot_roc_curves(data, top_genres, predictors)

# One coefficient bar chart per genre
plot_feature_importance(feature_importance, predictors)