<a href="https://colab.research.google.com/github/liamaaaa/BopBot/blob/main/final_bopbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored
# there (such as the .env file loaded in the next cell) can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install python-dotenv

from dotenv import load_dotenv
import os

# Example path — adjust for your folder structure.
# load_dotenv copies the KEY=value pairs from this file into the process
# environment, where os.getenv can read them.
load_dotenv('/content/drive/MyDrive/secrets/.env')

# Spotify API credentials; both are None when the variable is not set.
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

# Print only whether each value was found, never the secret itself.
print("Client ID loaded:", bool(client_id))
print("Client Secret loaded:", bool(client_secret))

# Data Collection + Cleaning


In [None]:
import requests
import pandas as pd
import base64
import random
from urllib.parse import quote

# creating Spotify API token
def create_token():
    """Request a Spotify access token using the client-credentials grant.

    Reads the module-level ``client_id`` and ``client_secret``, sends them as
    HTTP Basic credentials to the Spotify accounts service, and extracts the
    token from the JSON reply.

    Returns:
        The access token string, or ``None`` when the request fails, the
        service answers with a non-200 status, or the reply carries no
        ``access_token`` field.
    """
    url = 'https://accounts.spotify.com/api/token'

    credentials = f'{client_id}:{client_secret}'.encode()

    headers = {
        'Authorization': 'Basic ' + base64.b64encode(credentials).decode(),
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    data = {
        'grant_type': 'client_credentials',
    }

    try:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.post(url, headers=headers, data=data, timeout=10)
    except requests.RequestException as exc:
        print(f"Error: token request failed ({exc})")
        return None

    # Report failures the same way the search helpers do instead of silently
    # handing back None (or failing while decoding a non-JSON error body).
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    return response.json().get('access_token')


def get_track_info(track_name, artist_name, access_token):
    """Search Spotify for a track and return the hit credited to the given artist.

    Args:
        track_name: Title to search for (URL-quoted before use).
        artist_name: Artist to search for; also matched case-insensitively
            against every artist credited on each returned track.
        access_token: Bearer token for the Spotify Web API.

    Returns:
        The first returned track item whose credited artists include
        ``artist_name``, or ``None`` when nothing matches or the request does
        not answer with status 200.
    """
    # %20 is a URL-encoded space separating the track: and artist: filters.
    query_url = f'https://api.spotify.com/v1/search?q=track:{quote(track_name)}%20artist:{quote(artist_name)}&type=track&limit=1'
    response = requests.get(query_url, headers={'Authorization': f'Bearer {access_token}'})

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    wanted = artist_name.lower()
    for candidate in response.json()['tracks']['items']:
        # Check every credited artist, not only the first one.
        credited = {performer['name'].lower() for performer in candidate['artists']}
        if wanted in credited:
            return candidate

    print(f"No track found with the name '{track_name}' by '{artist_name}'.")
    return None


def get_artist_info(artist_name, access_token):
    """Search Spotify for an artist and return the top search hit.

    Args:
        artist_name: Artist name to search for. It is URL-quoted so names
            containing spaces, ``&``, ``#`` or non-ASCII characters do not
            corrupt the query string.
        access_token: Bearer token for the Spotify Web API.

    Returns:
        The first artist item of the search response, or ``None`` when the
        request does not answer with status 200 or the search has no results.
    """
    search_url = f'https://api.spotify.com/v1/search?q={quote(artist_name)}&type=artist&limit=1'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    # An empty result list is a normal outcome for obscure names; indexing
    # [0] unconditionally would raise IndexError and abort the caller's loop.
    items = response.json().get('artists', {}).get('items', [])
    if not items:
        print(f"No artist found with the name '{artist_name}'.")
        return None
    return items[0]


def get_track_popularity(track):
    """Print and return the popularity score stored on a track item.

    Args:
        track: Mapping with a ``'name'`` key and, normally, a ``'popularity'``
            key.

    Returns:
        The popularity value, including ``0``, or ``None`` when the key is
        absent or its value is ``None``.
    """
    track_popularity = track.get('popularity')
    # Compare against None explicitly: 0 is a legitimate score and must not be
    # reported as "no data".
    if track_popularity is None:
        print(f"No popularity data available for '{track['name']}'.")
        return None
    print(f"The popularity of '{track['name']}' is: {track_popularity}")
    return track_popularity

def get_artist_popularity(artist):
    """Print and return the popularity score stored on an artist item.

    Args:
        artist: Mapping with a ``'name'`` key and, normally, a
            ``'popularity'`` key.

    Returns:
        The popularity value, including ``0``, or ``None`` when the key is
        absent or its value is ``None``.
    """
    artist_popularity = artist.get('popularity')
    artist_label = artist.get('name', 'unknown artist')
    # Compare against None explicitly: 0 is a legitimate score and must not be
    # reported as "no data".
    if artist_popularity is None:
        # Show the artist's name rather than dumping the whole item.
        print(f"No popularity data available for '{artist_label}'.")
        return None
    print(f"The popularity of '{artist_label}' is: {artist_popularity}")
    return artist_popularity


# Quick manual check of the helpers above: fetch a token, then look up one
# known track and its artist. Either result is None if the lookup fails.
apiToken = create_token()
track_stored = get_track_info('Dance the Night Away', 'TWICE', apiToken) # The song title has to be exact
artist_stored = get_artist_info('TWICE', apiToken)

In [None]:
import pandas as pd
from tabulate import tabulate
# SECURITY: the Spotify client id and secret were previously written here as
# string literals. Credentials committed to a notebook are exposed to everyone
# who can read it and should be rotated in the Spotify developer dashboard.
# They are read from the environment instead (the setup cell fills it from the
# .env file through load_dotenv).
import os

client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

def build_dataframe(tracks_and_artists, token):
    """Look up each (track, artist) pair on Spotify and tabulate the popularity scores.

    Args:
        tracks_and_artists: Iterable of ``(track_name, artist_name)`` pairs.
        token: Bearer token passed through to the lookup helpers.

    Returns:
        A DataFrame with one row per pair for which both the track and the
        artist lookup returned a result. Pairs with a failed lookup are left
        out.
    """
    rows = []
    for title, performer in tracks_and_artists:
        track_hit = get_track_info(title, performer, token)
        artist_hit = get_artist_info(performer, token)
        if not (track_hit and artist_hit):
            continue
        row = {
            'track_name': track_hit['name'],
            'track_artist': track_hit['artists'][0]['name'],
            'track_popularity': track_hit['popularity'],
            'artist_popularity': artist_hit['popularity'],
        }
        rows.append(row)
    return pd.DataFrame(rows)

# Hand-picked (track title, artist) pairs used to exercise build_dataframe.
songs_to_check = [
    ('BIRDS OF A FEATHER', 'Billie Eilish'),
    ('That’s So True', 'Gracie Abrams'),
    ('Taste', 'Sabrina Carpenter'),
    ('Bhaja Govindam', 'M. S. Subbulakshmi'),
    ('Varnam', 'Jayanthi Kumaresh')
]

# Fetch a fresh token and build the popularity table; songs whose track or
# artist lookup fails are omitted from df.
api_token = create_token()
df = build_dataframe(songs_to_check, api_token)

# Display the dataframe
display(df)

In [None]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
import re
import os
from IPython.display import display
import pickle

combined_file = "combined_spotify_data.csv"

# Build the combined CSV only when it is not already on disk; later runs reuse it.
if not os.path.exists(combined_file):
    # dataset_load fetches the requested file itself, so no separate
    # dataset_download call is needed.

    # ANDREA - high
    dfHigh = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, "solomonameh/spotify-music-dataset/versions/1", "high_popularity_spotify_data.csv")

    # EMILY - low
    dfLow = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, "solomonameh/spotify-music-dataset/versions/1", "low_popularity_spotify_data.csv")

    # Combining high + low dataframes
    dfKaggle = pd.concat([dfHigh, dfLow])
    dfKaggle.to_csv(combined_file, index=False)

dfCombined = pd.read_csv(combined_file)

# Each feature is listed exactly once. Selecting 'tempo' and 'loudness' twice
# produced duplicate column names, which turn into 'tempo.1' / 'loudness.1'
# after a CSV round trip and make dfTrackFeatures['tempo'] return two columns.
feature_columns = [
    'track_id', 'track_name', 'track_artist', 'energy', 'tempo',
    'danceability', 'loudness', 'playlist_genre', 'speechiness',
    'duration_ms', 'instrumentalness', 'valence', 'key', 'acousticness',
    'liveness',
]
dfTrackFeatures = dfCombined[feature_columns]
dfTrackFeatures = dfTrackFeatures.set_index('track_id')

display(dfTrackFeatures) # displays dataframe

In [None]:
from IPython.display import display
import time
import pandas as pd
import io
import re
import pickle

# Separators between credited artists: comma, ampersand, slash, or a standalone
# "feat" / "feat." / "featuring". The word boundaries keep names that merely
# contain the letters "feat" in one piece, and the optional dot prevents
# "feat." from leaving a stray "." in front of the next name.
_artist_sep = re.compile(r',|&|/|\bfeat(?:uring)?\b\.?', flags=re.IGNORECASE)
# Parenthesised or bracketed title suffixes, e.g. "(remastered)" or "[live]".
_bracketed = re.compile(r"\(.*?\)|\[.*?\]")


def _normalize_artists(raw):
    """Return the artists credited in *raw* as a sorted, ', '-joined string."""
    names = (part.strip() for part in _artist_sep.split(raw))
    # Empty fragments (e.g. from a trailing separator) are dropped.
    return ', '.join(sorted(name for name in names if name))


# Rows without a title or artist cannot be searched or merged, and the string
# helpers below would raise on NaN, so they are removed first.
dfTrackFeatures = dfTrackFeatures.dropna(subset=['track_name', 'track_artist']).reset_index(drop=True)
dfTrackFeatures['track_artist'] = (
    dfTrackFeatures['track_artist'].str.lower().str.strip().apply(_normalize_artists)
)
dfTrackFeatures['track_name'] = dfTrackFeatures['track_name'].str.lower().apply(
    lambda title: _bracketed.sub("", title).strip()
)

# One search per distinct (title, artists) pair.
unique_songs = dfTrackFeatures[['track_name', 'track_artist']].drop_duplicates()
api_token = create_token()

# One dict per song for which both the track and at least one artist resolved.
results = []

for index, row in unique_songs.iterrows():
    track_name = row['track_name']
    artist_name_raw = row['track_artist']

    # track_artist was normalised above into a ', '-joined list, so a plain
    # comma split recovers the individual names. Split once and reuse it.
    artist_names_to_check = [a.strip() for a in artist_name_raw.split(',') if a.strip()]
    if not artist_names_to_check:
        continue
    artist_name_search = artist_names_to_check[0]

    # print(f"Searching for '{track_name}' by '{artist_name_search}'")

    track = get_track_info(track_name, artist_name_search, api_token)

    popularity_scores = []
    for name in artist_names_to_check:
        try:
            artist_info = get_artist_info(name, api_token)
        except (IndexError, KeyError):
            # A search without results must not abort the whole crawl.
            artist_info = None
        if artist_info:
            popularity_scores.append(artist_info['popularity'])

    # Throttle after every song, including misses; sleeping only after a
    # successful append let runs of failed lookups hit the API unthrottled.
    time.sleep(0.1)

    if not track or not popularity_scores:
        continue

    # Re-derive the merge keys from Spotify's own spelling of title and artists.
    track_name_clean = re.sub(r"\(.*?\)|\[.*?\]", "", track['name']).lower().strip()
    spotify_artists = [artist['name'].lower().strip() for artist in track['artists']]
    artist_name_clean = ', '.join(sorted(spotify_artists))

    results.append({
        'track_name': track_name_clean,
        'track_artist': artist_name_clean,
        'track_popularity': track['popularity'],
        'artist_popularities': popularity_scores
    })

# Tabulate the Spotify lookups and normalise the two merge keys.
dfSpotify = pd.DataFrame(results)
for key_column in ('track_name', 'track_artist'):
    dfSpotify[key_column] = dfSpotify[key_column].str.lower().str.strip()

# Left merge: every Spotify row is kept, with Kaggle audio features attached
# wherever title and artist match.
dfCombined = dfSpotify.merge(
    dfTrackFeatures,
    how='left',
    on=['track_name', 'track_artist'],
)

# Remove bookkeeping columns if present, then keep one row per song.
leftover = [c for c in ['level_0', 'index', 'merge_key'] if c in dfCombined.columns]
dfCombined = dfCombined.drop(columns=leftover)
dfCombined = dfCombined.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

display(dfCombined)
# DataFrame.to_csv returns None when it is given a path, so write the file
# and build the CSV text in two separate calls. Pickling the return value of
# the path form stored None instead of the data.
dfCombined.to_csv("spotify_kaggle_combined_sample.csv", index=False)
combo = dfCombined.to_csv(index=False)
with open('combined_df_csv_.pkl', 'wb') as f:
    pickle.dump(combo, f)

# Load back the CSV string (safe to unpickle: the file was written just above).
with open('combined_df_csv_.pkl', 'rb') as f:
    loaded_combo = pickle.load(f)

# pd.compat.StringIO does not exist in current pandas; io.StringIO is the
# standard in-memory text buffer.
loaded_combo_df = pd.read_csv(io.StringIO(loaded_combo))

In [None]:
# ... previous code ...
import io
import pickle

'''
Save the DataFrame to CSV, then load it back as a DataFrame
to demonstrate saving and loading
'''
# Write the merged table without the index column, then read it straight back.
# NOTE(review): the modelling cells below read "dfCombined.csv", while this
# cell writes "combined_df_csv.csv" — confirm which file name is intended.
dfCombined.to_csv('combined_df_csv.csv', index=False)
loaded_combo_df = pd.read_csv('combined_df_csv.csv')


# Print the DataFrame
print(loaded_combo_df)

# Creating Multi-Layer Perceptron

In [None]:
# Let's create our MLP model
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

import ast  # parses the stringified popularity lists stored in the CSV

dfCombined = pd.read_csv("dfCombined.csv")  # Replace with actual dataset path


def _mean_artist_popularity(value):
    """Average one 'artist_popularities' cell.

    The column is written as a Python list per song, so after a CSV round
    trip each cell is text such as "[81, 64]". Returns NaN for cells that
    cannot be parsed or hold an empty list.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return float('nan')
    if isinstance(value, (list, tuple)):
        return sum(value) / len(value) if value else float('nan')
    return value


numeric_features = ['artist_popularities', 'tempo', 'loudness', 'energy', 'danceability']
x = dfCombined[numeric_features].copy()
y = dfCombined['track_popularity']

# Calling pd.to_numeric directly on the list text coerced every value to NaN
# (and then to 0), which erased the feature. Parse and average first.
x['artist_popularities'] = pd.to_numeric(
    x['artist_popularities'].apply(_mean_artist_popularity), errors='coerce'
)

# One-hot encode the genre and append the indicator columns to the features.
encoded_genres = pd.get_dummies(dfCombined['playlist_genre'], dtype=int, prefix='genre')
x = pd.concat([x, encoded_genres], axis=1)

# Keep only rows where every feature and the target are present.
complete_rows = x.notna().all(axis=1) & y.notna()
x = x[complete_rows].astype('float32')
y = y[complete_rows]

print(x.isnull().sum())  # Shows NaN count per column
print(y.isnull().sum())  # Check target too
display(x)

# test_size=0.3 holds out 30% of the rows for testing; 70% are used for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

model = Sequential()

# add layers to the model
model.add(Dense(6, activation="softplus"))  # first hidden layer
model.add(Dense(4, activation="softplus"))  # second hidden layer
model.add(Dense(1, activation="linear"))  # output layer: one regression value (track popularity)

model.compile(loss="MSE", metrics=["mae", "mse"])
model.fit(x_train, y_train, epochs=25, batch_size=5)

# evaluate returns [loss, mae, mse] in the order given to compile; these are
# regression errors, not an accuracy.
score = model.evaluate(x_test, y_test)
print("TEST [loss, mae, mse]", score)

In [None]:
# Let's create our MLP model
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

import ast  # parses the stringified popularity lists stored in the CSV

dfCombined = pd.read_csv("dfCombined.csv")


def _mean_artist_popularity(value):
    """Average one 'artist_popularities' cell (text such as "[81, 64]" after the CSV round trip)."""
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return float('nan')
    if isinstance(value, (list, tuple)):
        return sum(value) / len(value) if value else float('nan')
    return value


numeric_features = ['artist_popularities', 'tempo', 'loudness', 'energy', 'danceability']
x = dfCombined[numeric_features].copy()
y = dfCombined['track_popularity']

# Parse and average the popularity lists; pd.to_numeric on the raw text turned
# every value into NaN and then 0.
x['artist_popularities'] = pd.to_numeric(
    x['artist_popularities'].apply(_mean_artist_popularity), errors='coerce'
)

# One-hot encode the genre and append the indicator columns to the features.
encoded_genres = pd.get_dummies(dfCombined['playlist_genre'], dtype=int, prefix='genre')
x = pd.concat([x, encoded_genres], axis=1)

# Rows with a missing feature or target would turn the loss into NaN.
complete_rows = x.notna().all(axis=1) & y.notna()
x = x[complete_rows].astype('float32')
y = y[complete_rows]

# test_size=0.3 holds out 30% of the rows for testing; 70% are used for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

model = Sequential()

# add layers to the model
model.add(Dense(6, activation="softplus"))  # first hidden layer
model.add(Dense(4, activation="softplus"))  # second hidden layer
model.add(Dense(1, activation="linear"))  # output layer: one regression value (track popularity)

# "accuracy" is a classification metric and is meaningless for a continuous
# target; report mean absolute error instead.
model.compile(loss="MSE", metrics=["mae"])
model.fit(x_train, y_train, epochs=25, batch_size=5)

# evaluate returns [loss, mae] in the order given to compile.
score = model.evaluate(x_test, y_test)
print("TEST [loss, mae]", score)

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv("dfCombined.csv")  # Replace with actual dataset path

# Select features and target
features = ["loudness", "energy", "speechiness", "tempo"]
target = "track_popularity"

# Rows with a missing feature or target would propagate NaN through the
# scaler and the loss, so drop them up front.
data = data.dropna(subset=features + [target])

X = data[features].values
y = data[target].values

# Train-test split (done before scaling so the test rows stay unseen)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features (important for NN). The scaler is fitted on the training
# rows only; fitting on all rows leaks test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full scaled matrix, kept under its original name

# Build the model
model = Sequential([
    Dense(16, activation="softplus", input_shape=(len(features),)),  # First hidden layer
    Dense(12, activation="softplus"),
    Dropout(0.2),  # Dropout to prevent overfitting
    Dense(8, activation="softplus"),
    Dense(1, activation="linear")  # Output layer for regression
])

# Compile the model
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Train the model (the held-out rows are used only to monitor validation loss)
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_test, y_test), verbose=1)

# Evaluate performance
loss, mae = model.evaluate(X_test, y_test)
print(f"Mean Absolute Error: {mae}")

# Refining MLP (reducing MAE)

In [None]:
!pip install IntegratedGradients

In [None]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.activations import softplus
from tensorflow.keras.optimizers import Adam
from captum.attr import IntegratedGradients

# Load and preprocess the dataset
df = pd.read_csv('dfCombined.csv')

# Convert artist_popularities from string to list, then average
df['artist_popularities'] = df['artist_popularities'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
df['artist_popularities'] = df['artist_popularities'].apply(
    lambda x: sum(x)/len(x) if isinstance(x, list) else x
)

# Impute missing values first. LabelEncoder cannot handle a column mixing
# genre strings with NaN, so the genre gaps have to be filled before encoding.
imputer = SimpleImputer(strategy='most_frequent')
for col in df.columns:
    if df[col].isnull().any():
        df[col] = imputer.fit_transform(df[[col]]).ravel()

# Encode categorical columns
le = LabelEncoder()
df['playlist_genre'] = le.fit_transform(df['playlist_genre'])

# Feature and target separation
X = df.drop(['track_name', 'track_artist', 'track_popularity'], axis=1)
y = df['track_popularity']

# Normalize input features.
# NOTE(review): the scaler is fitted on all rows, as the UI cell does, so the
# same scaling is used for training and for prediction.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Build the MLP model
model = Sequential([
    Dense(12, activation=softplus, input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(8, activation=softplus),
    Dropout(0.3),
    Dense(1)  # Output layer: linear activation by default for regression
])

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=200,
    batch_size=64,
    verbose=1
)

# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("\nMLP Model Performance:")
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


def _integrated_gradients(keras_model, samples, steps=50):
    """Integrated-gradients attributions for a single-output Keras model.

    captum's IntegratedGradients works on PyTorch modules, so the attribution
    is computed here with tf.GradientTape. The baseline is the all-zero
    input, i.e. the feature means once the inputs are standardised.

    Args:
        keras_model: Model mapping (n, d) inputs to (n, 1) outputs.
        samples: Array-like of shape (n, d).
        steps: Number of intervals of the path integral approximation.

    Returns:
        (attributions, delta): attributions has shape (n, d); delta has shape
        (n,) and is the gap between the summed attributions and the change in
        model output from baseline to sample (close to 0 when converged).
    """
    samples = tf.convert_to_tensor(samples, dtype=tf.float32)
    baseline = tf.zeros_like(samples)
    alphas = tf.reshape(tf.linspace(0.0, 1.0, steps + 1), (-1, 1, 1))
    # Straight-line path from baseline to each sample: (steps + 1, n, d).
    path = baseline[tf.newaxis, ...] + alphas * (samples - baseline)[tf.newaxis, ...]
    flat_path = tf.reshape(path, (-1, samples.shape[-1]))
    with tf.GradientTape() as tape:
        tape.watch(flat_path)
        outputs = keras_model(flat_path, training=False)
    gradients = tf.reshape(tape.gradient(outputs, flat_path), tf.shape(path))
    # Trapezoidal average of the gradients along the path.
    mean_gradients = tf.reduce_mean((gradients[:-1] + gradients[1:]) / 2.0, axis=0)
    attributions = (samples - baseline) * mean_gradients
    output_change = tf.squeeze(
        keras_model(samples, training=False) - keras_model(baseline, training=False), axis=-1
    )
    delta = tf.reduce_sum(attributions, axis=-1) - output_change
    return attributions.numpy(), delta.numpy()


# Feature attribution runs after training, on a sample of held-out rows. The
# earlier call used the undefined names `inputs` and `target_class` and ran
# before the model had been fitted.
attributions, delta = _integrated_gradients(model, X_test[:100])
feature_importance = pd.Series(
    np.abs(attributions).mean(axis=0), index=X.columns
).sort_values(ascending=False)
print("\nMean absolute integrated-gradients attribution per feature:")
print(feature_importance)


# Simple UI + Predictor

In [None]:
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.activations import softplus
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load and preprocess the dataset (same steps as the model-refinement cell)
df = pd.read_csv('dfCombined.csv')

# Convert artist_popularities from string to list, then average
df['artist_popularities'] = df['artist_popularities'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
df['artist_popularities'] = df['artist_popularities'].apply(
    lambda x: sum(x)/len(x) if isinstance(x, list) else x
)

# Impute missing values first. LabelEncoder cannot handle a column mixing
# genre strings with NaN, so the genre gaps have to be filled before encoding.
imputer = SimpleImputer(strategy='most_frequent')
for col in df.columns:
    if df[col].isnull().any():
        df[col] = imputer.fit_transform(df[[col]]).ravel()

# Encode categorical columns
le = LabelEncoder()
df['playlist_genre'] = le.fit_transform(df['playlist_genre'])

# Feature and target separation (needed for scaler fitting)
X_for_scaler = df.drop(['track_name', 'track_artist', 'track_popularity'], axis=1)

# Normalize input features
scaler = StandardScaler()
X_for_scaler_scaled = scaler.fit_transform(X_for_scaler)

# Fallback network with the same architecture as the refined MLP. It is
# untrained, so it is only used when no suitable trained model is available.
mlp = tf.keras.models.Sequential([
    tf.keras.layers.Dense(12, activation=softplus, input_shape=(X_for_scaler_scaled.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(8, activation=softplus),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1)
])


def _input_width(candidate):
    """Return the number of input features a Keras model expects, or None if unknown."""
    try:
        return candidate.input_shape[-1]
    except (AttributeError, ValueError, TypeError, IndexError):
        return None


# Reuse the model trained in an earlier cell when one exists and expects
# exactly the features prepared here. Several cells assign `model`, and some of
# those networks take a different number of inputs.
_trained_model = globals().get('model')
if _trained_model is not None and _input_width(_trained_model) == X_for_scaler_scaled.shape[1]:
    mlp = _trained_model
else:
    # Say so loudly instead of silently predicting with random weights.
    print(
        "WARNING: no trained model matching the prepared features was found; "
        "predictions will come from an untrained network. "
        "Run the model-refinement cell first."
    )


def style_text(text, color="lightgray", weight="normal", size="15px", italic=False):
    """Wrap *text* in a ``<span>`` carrying inline colour, weight, size and font-style CSS.

    Args:
        text: Content placed inside the span, inserted as-is.
        color: CSS colour value.
        weight: CSS font-weight value.
        size: CSS font-size value.
        italic: When true the font-style is ``italic``, otherwise ``normal``.

    Returns:
        The HTML string for the styled span.
    """
    declarations = (
        f"color:{color}",
        f"font-weight:{weight}",
        f"font-size:{size}",
        f"font-style:{'italic' if italic else 'normal'}",
    )
    css = "; ".join(declarations) + ";"
    return f"<span style='{css}'>{text}</span>"

def section_box(content_html):
    """Return an HTML widget showing *content_html* inside a dark, bordered, rounded panel.

    Args:
        content_html: HTML fragment placed inside the panel, inserted as-is.
    """
    markup = f"<div style='background-color:#1e1e1e; border:1px solid #444; padding:15px; border-radius:10px; margin:10px 0;'>{content_html}</div>"
    panel = widgets.HTML(markup)
    return panel

def predict_popularity_gui(b):
    """Button callback: look up the typed track and render its predicted popularity.

    Redraws the header and input widgets, finds the first row of ``df`` whose
    title equals the typed text (case-insensitively), scales its features with
    ``scaler``, predicts with ``mlp`` and shows the track details, a progress
    bar and a verdict message.

    Args:
        b: The button instance passed by ``on_click``; unused.
    """
    import html  # local imports: escape user text and detect NaN predictions
    import math

    clear_output(wait=True)
    display(widgets.HTML("<h2 style='font-family:sans-serif; color: lightgray;'>BopBot</h2>"))
    display(widgets.HTML(style_text("Enter a track name to predict its popularity.", "gray", size="14px", italic=True)))
    display(input_box, predict_button)

    song_name = input_box.value.strip()
    # The typed text is embedded in HTML below, so escape it first.
    shown_name = html.escape(song_name)
    matches = df[df['track_name'].str.lower() == song_name.lower()]

    if matches.empty:
        display(section_box(style_text(f"Song '{shown_name}' not found in the dataset.", "crimson", "bold", "16px")))
        return

    # Several rows can share a title; the first match is the one displayed.
    song_data = matches.iloc[[0]]

    # df was fully preprocessed when the cell loaded it (popularity lists
    # averaged, genre label-encoded, gaps imputed), so the row only needs the
    # non-feature columns dropped and the fitted scaler applied. Running
    # le.transform again on the already-encoded genre codes raises, because
    # the encoder only knows the original genre labels.
    song_features = song_data.drop(['track_name', 'track_artist', 'track_popularity'], axis=1)
    song_features_scaled = scaler.transform(song_features)
    prediction = float(mlp.predict(song_features_scaled)[0][0])

    if math.isnan(prediction):
        display(section_box(style_text("The model returned no usable prediction for this song.", "crimson", "bold", "16px")))
        return

    # The regression output is unbounded; clamp it to the 0-100 progress bar range.
    score = max(0, min(100, int(prediction)))

    # One place decides both the bar colour and the verdict message.
    if score > 80:
        bar_color = 'limegreen'
        message = style_text("This song might be a hit!", "lightgreen", "bold", "16px")
    elif score > 50:
        bar_color = 'orange'
        message = style_text("Could trend with the right exposure.", "orange", "bold", "16px")
    else:
        bar_color = 'crimson'
        message = style_text("Time to go back to the studio...", "crimson", "bold", "16px", italic=True)

    # Progress bar
    score_bar = widgets.IntProgress(
        value=score,
        min=0,
        max=100,
        step=1,
        description='Score:',
        style={'bar_color': bar_color},
        layout=widgets.Layout(width='80%')
    )

    # Details about the song; the stored genre code is mapped back to its label.
    artist = html.escape(str(song_data['track_artist'].values[0]))
    genre_code = int(song_data['playlist_genre'].values[0])
    genre_name = html.escape(str(le.inverse_transform([genre_code])[0]))

    meta_info_html = f"""
        <b style='color:#ccc;'>Track:</b> <span style='color:white'>{shown_name}</span><br>
        <b style='color:#ccc;'>Artist:</b> <span style='color:white'>{artist}</span><br>
        <b style='color:#ccc;'>Genre:</b> <span style='color:white'>{genre_name}</span><br>
        <b style='color:#ccc;'>Predicted Popularity:</b> <span style='color:white'>{prediction:.2f}%</span>
    """

    display(section_box(meta_info_html))
    display(score_bar)
    display(section_box(message))
    display(widgets.HTML(style_text("Created by Emily Freeman, Andrea Ayon, Lia Mathews, and Jillian Russell", "gray", size="14px", italic=True)))

# Input and button widgets. predict_popularity_gui reads both as globals, so
# they must exist before the button is clicked.
input_box = widgets.Text(
    placeholder='Enter song name here...',
    layout=widgets.Layout(width='60%'),
    style={'description_width': 'initial'}
)
predict_button = widgets.Button(
    description='Predict Popularity',
    button_style='info',
    layout=widgets.Layout(width='30%')
)
# Run the predictor whenever the button is clicked.
predict_button.on_click(predict_popularity_gui)

# Launch the GUI: initial render of title, subtitle, prompt, input widgets and
# credits (the click handler clears and redraws the output area).
display(widgets.HTML("<h2 style='font-family:sans-serif; color: lightgray;'>BopBot</h2>"))
display(widgets.HTML(style_text("Song Popularity Predictor", "gray", size="16px", italic=False)))
display(widgets.HTML(style_text("Enter a track name to predict its popularity.", "gray", size="14px", italic=True)))
display(input_box, predict_button)
display(widgets.HTML(style_text("Created by Emily Freeman, Andrea Ayon, Lia Mathews, and Jillian Russell", "gray", size="14px", italic=True)))