In [14]:
import pandas as pd
import numpy as np

# Load the dataset from the specified path
file_path = r"C:\Users\Jonathan Gonzalez\Final Project\P4-main\P4-main\resources\spotify_songs.csv"
songs_df = pd.read_csv(file_path)

# Remove duplicates based on track_id
songs_df = songs_df.drop_duplicates(subset='track_id')

# Keep necessary columns for final recommendation
songs_df_pp = songs_df[['track_id', 'track_name', 'track_artist', 'track_album_release_date', 'danceability', 
                        'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                        'instrumentalness', 'liveness', 'track_popularity']]

# Converting and extracting date information
songs_df_pp['track_album_release_date'] = pd.to_datetime(songs_df_pp['track_album_release_date'], format='mixed')
songs_df_pp['release_month'] = songs_df_pp['track_album_release_date'].dt.month

# Encoding categorical variables using one-hot encoding
songs_df_pp_encoded = pd.get_dummies(songs_df_pp)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_df_pp['track_album_release_date'] = pd.to_datetime(songs_df_pp['track_album_release_date'], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_df_pp['release_month'] = songs_df_pp['track_album_release_date'].dt.month


Explanation:

Load the dataset: Read the CSV file into a pandas DataFrame.
Remove duplicates: Ensure each track appears only once in the dataset.
Select columns: Keep relevant columns needed for analysis and recommendations.
Convert date: Change the track_album_release_date to a datetime format and extract the month.
One-hot encoding: Convert categorical variables into numerical format for machine learning compatibility.

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define the features and target based on the available columns
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']
target = 'track_popularity'  # Assuming 'track_popularity' is the target variable indicating your preference

# Create feature matrix X and target vector y
X = songs_df_pp_encoded[features]
y = songs_df_pp_encoded[target]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Explanation:

Define features and target: Specify which columns are predictors (features) and which column is the response (target).
Split data: Divide the data into training and testing sets to evaluate the model's performance.
Normalize data: Scale the features so they contribute equally to the model.

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Define the neural network model
model = Sequential([
    Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Linear activation for regression
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=50, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 1168.5179 - mae: 27.9829 - val_loss: 554.0170 - val_mae: 19.6728
Epoch 2/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 552.6563 - mae: 19.7141 - val_loss: 549.9847 - val_mae: 19.3852
Epoch 3/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 533.1457 - mae: 19.3021 - val_loss: 543.2710 - val_mae: 19.5314
Epoch 4/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 534.8322 - mae: 19.3670 - val_loss: 544.5415 - val_mae: 19.3044
Epoch 5/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 527.9398 - mae: 19.1815 - val_loss: 540.1624 - val_mae: 19.5020
Epoch 6/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 529.6072 - mae: 19.1969 - val_loss: 540.3128 - val_mae: 19.4074
Epoch 7/50
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━

Explanation:

Define model: Create a neural network with two hidden layers.
Compile model: Specify the optimizer and loss function for regression tasks.
Train model: Fit the model on the training data, using the testing data for validation, over 50 epochs.

In [17]:
# Function to get user preferences on a scale of "low", "medium", and "high"
def get_user_preferences():
    scale_mapping = {"low": 0.3, "medium": 0.6, "high": 0.9}
    
    preferences = {}
    preferences['danceability'] = scale_mapping[input("Enter your preference for Danceability (low, medium, high): ").lower()]
    preferences['energy'] = scale_mapping[input("Enter your preference for Energy (low, medium, high): ").lower()]
    preferences['key'] = scale_mapping[input("Enter your preference for Key (low, medium, high): ").lower()]
    preferences['loudness'] = scale_mapping[input("Enter your preference for Loudness (low, medium, high): ").lower()]
    preferences['mode'] = scale_mapping[input("Enter your preference for Mode (low, medium, high): ").lower()]
    preferences['speechiness'] = scale_mapping[input("Enter your preference for Speechiness (low, medium, high): ").lower()]
    preferences['acousticness'] = scale_mapping[input("Enter your preference for Acousticness (low, medium, high): ").lower()]
    preferences['instrumentalness'] = scale_mapping[input("Enter your preference for Instrumentalness (low, medium, high): ").lower()]
    preferences['liveness'] = scale_mapping[input("Enter your preference for Liveness (low, medium, high): ").lower()]
    return preferences

def recommend_songs(model, scaler, preferences, original_df, features, X_scaled):
    # Convert preferences to DataFrame
    user_df = pd.DataFrame([preferences])
    
    # Scale the user preferences
    user_scaled = scaler.transform(user_df)
    
    # Predict the popularity for the user's preferences
    predicted_popularity = model.predict(user_scaled)
    
    # Calculate the distance between user preferences and song features
    distances = np.sqrt(np.sum((X_scaled - user_scaled)**2, axis=1))
    
    # Combine with the original dataset
    similar_songs = original_df.copy()
    similar_songs['distance'] = distances
    
    # Drop duplicates based on track_name and track_artist
    similar_songs = similar_songs.drop_duplicates(subset=['track_name', 'track_artist'])
    
    # Get the top 10 similar songs
    top_10_songs = similar_songs.sort_values(by='distance').head(10)
    return top_10_songs[['track_name', 'track_artist', 'distance']]


Explanation:

Get user preferences: Collect user input for each feature and map it to numerical values.
Scale preferences: Use the same scaler to normalize user preferences as used for the dataset.
Calculate distances: Compute the Euclidean distance between user preferences and each song's features to find the most similar songs.
Find similar songs: Identify and return the top 10 songs that are closest to the user's preferences.

In [22]:
# Get user preferences
user_preferences = get_user_preferences()

# Recommend songs based on user preferences
top_10_songs = recommend_songs(model, scaler, user_preferences, songs_df_pp, features, X_scaled)

# Print the recommended songs
print("Top 10 Recommended Songs Based on Your Preferences:")
print(top_10_songs)


Enter your preference for Danceability (low, medium, high):  low
Enter your preference for Energy (low, medium, high):  low
Enter your preference for Key (low, medium, high):  medium
Enter your preference for Loudness (low, medium, high):  high
Enter your preference for Mode (low, medium, high):  high
Enter your preference for Speechiness (low, medium, high):  high
Enter your preference for Acousticness (low, medium, high):  high
Enter your preference for Instrumentalness (low, medium, high):  low
Enter your preference for Liveness (low, medium, high):  medium


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Top 10 Recommended Songs Based on Your Preferences:
                      track_name                     track_artist  distance
6530              fortune cookie                            B0nds  5.326185
9875                     D(R)Own                       Ghostemane  5.385548
25705                   I'M DEAD                         Duckwrth  5.548274
8820    Shut Up - Studio Version                          Stormzy  5.583872
8256         True Colors/It's On                         The Game  5.704529
8331   I'm Going to Live My Life  DJ Screw & The Screwed Up Click  5.921740
8875                 Kranichstil                          Olexesh  5.923949
10289               Disconnected                          Lucchii  5.943659
21890       Dreams Money Can Buy                            Drake  6.059498
7355                     Trylogy                           Kurupt  6.085347
