Load Spotify Dataset And Necessary Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the Spotify features CSV into a DataFrame, then preview the first rows
# and report the (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
csv_path = r"E:\30 day datascience\SpotifyFeatures.csv"
df = pd.read_csv(csv_path)
print(df.head())
print(df.shape)

Check For Missing Values

In [1]:
# Count the missing entries in every column and print the per-column totals.
missing_data = df.isna().sum()
print(missing_data)

NameError: name 'df' is not defined

Fill the Null Values

In [None]:
# Replace missing track names with the most frequent track name in the column.
most_common_track_name = df['track_name'].mode()[0]
df['track_name'] = df['track_name'].fillna(most_common_track_name)

In [None]:
# Per-column count of values that are still missing.
df.isna().sum()

Detecting Outliers

In [None]:
# Columns inspected for outliers. A separate name is used here (instead of
# `features`) because other cells bind `features` to a list of column names;
# re-running this cell must not silently turn that list into a DataFrame.
outlier_columns = ['acousticness', 'danceability', 'energy',
                   'instrumentalness', 'liveness', 'loudness',
                   'speechiness', 'tempo', 'valence']
feature_values = df[outlier_columns]

# Per-column quartiles and interquartile range over the full dataset
Q1 = feature_values.quantile(0.25)
Q3 = feature_values.quantile(0.75)
IQR = Q3 - Q1

# Count, per column, the values lying more than 1.5 * IQR outside the quartiles
below_fence = feature_values < (Q1 - 1.5 * IQR)
above_fence = feature_values > (Q3 + 1.5 * IQR)
outliers = (below_fence | above_fence).sum()
print(outliers)
print(df.shape)


Filtering Dataset From Outliers

In [None]:
# Numeric audio features that are screened for outliers
features = ['acousticness', 'danceability', 'energy',
            'instrumentalness', 'liveness', 'loudness',
            'speechiness', 'tempo', 'valence']


def _drop_iqr_outliers(frame, columns, whisker=1.5):
    """Drop rows outside the IQR fences, one column at a time.

    Columns are processed in the given order. For each one, the quartiles are
    computed on the rows that survived the previous columns, and only rows
    whose value lies within [Q1 - whisker * IQR, Q3 + whisker * IQR]
    (bounds inclusive) are kept. The input frame itself is not modified.
    """
    kept = frame.copy()
    for column in columns:
        q1 = kept[column].quantile(0.25)
        q3 = kept[column].quantile(0.75)
        spread = q3 - q1
        in_range = kept[column].between(q1 - whisker * spread, q3 + whisker * spread)
        kept = kept[in_range]
    return kept


filtered_df = _drop_iqr_outliers(df, features)

# Report how many rows were present before and after filtering
print(f"Original dataset size: {df.shape[0]}")
print(f"Dataset size after removing outliers: {filtered_df.shape[0]}")


Scaling The Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise (the same feature set used for outlier removal)
features = ['acousticness', 'danceability', 'energy',
            'instrumentalness', 'liveness', 'loudness',
            'speechiness', 'tempo', 'valence']

# Standardise each feature column of the outlier-filtered rows
scaler = StandardScaler()
scaled_features = scaler.fit_transform(filtered_df[features])

# Wrap the scaled array in a DataFrame that reuses filtered_df's row labels.
# filtered_df keeps the original (now gapped) index after outlier rows were
# dropped, so the default 0..n-1 index would not line up with it in any
# index-aligned operation (assignment of a Series, join, concat, ...).
df_scaled = pd.DataFrame(scaled_features, columns=features, index=filtered_df.index)


Splitting Data into Training And Testing

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the scaled rows; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df_scaled,
    test_size=0.2,
    random_state=42,
)

Finding the Optimal K Using an Elbow Method Plot

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit K-Means on the training split for K = 1..10 and record each model's
# inertia.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; confirm the installed version gives the intended
# number of initialisations.
k_values = range(1, 11)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(train_data)
    inertia.append(kmeans.inertia_)

# Plot inertia against K
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel("Number of Cluster (K)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method For Optimal K")
plt.show()

In [None]:
# Applying KMeans

In [None]:
# Cluster every scaled row with K fixed at 5 and store each row's label in a
# new "cluster" column of filtered_df (labels are assigned by row position).
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(df_scaled)
filtered_df["cluster"] = cluster_labels


In [None]:
# Number of rows assigned to each cluster label
cluster_sizes = filtered_df["cluster"].value_counts()
cluster_sizes

In [None]:
from sklearn.decomposition import PCA
# Project the scaled features onto their first two principal components so the
# cluster assignment can be drawn in two dimensions.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    pca_result[:, 0],
    pca_result[:, 1],
    # Plain array: colours are paired with points by row position
    c=filtered_df['cluster'].to_numpy(),
    cmap='viridis',
)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("K-Means Clusters")
# Legend maps each colour back to its cluster label so the figure stands alone
ax.legend(*scatter.legend_elements(), title="Cluster")
plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_songs(song_name, filtered_df, numerical_features, num_recommendations=5):
    """Recommend the songs most similar to ``song_name`` within its cluster.

    The first row whose ``track_name`` equals ``song_name`` is used as the
    query. Candidates are the rows sharing its ``cluster`` label, ranked by
    cosine similarity over ``numerical_features``.

    Features are standardised (mean 0, standard deviation 1, statistics taken
    over all rows of ``filtered_df``) before the similarity is computed.
    Cosine similarity on the raw columns is dominated by whichever columns
    have the largest magnitudes, which makes almost every pair look alike.

    Args:
        song_name: Exact track name to look up (case-sensitive equality).
        filtered_df: DataFrame containing 'track_name', 'artist_name',
            'genre', 'cluster' and every column in ``numerical_features``.
        numerical_features: List of numeric column names to compare on.
        num_recommendations: Maximum number of rows to return; values below
            zero are treated as zero.

    Returns:
        DataFrame with columns 'track_name', 'artist_name', 'genre' and
        'similarity', most similar first, or ``None`` (after printing a
        message) when no row matches ``song_name``.
    """
    # Find song(s) matching the track name
    song_data = filtered_df[filtered_df['track_name'] == song_name]

    if song_data.empty:
        print(f"Song '{song_name}' not found.")
        return None

    if len(song_data) > 1:
        print(f"Warning: Multiple songs found for '{song_name}'. Using the first one.")

    # Use the first matching song as the query
    song_row = song_data.iloc[0]
    song_cluster = song_row['cluster']

    # Candidates: every row in the same cluster (copied so a column can be added)
    same_cluster_songs = filtered_df[filtered_df['cluster'] == song_cluster].copy()

    # Standardisation statistics over the whole frame. A constant column has
    # std 0; dividing by 1 instead leaves it at 0 rather than producing NaN.
    all_features = filtered_df[numerical_features].astype(float)
    feature_mean = all_features.mean()
    feature_std = all_features.std(ddof=0).replace(0, 1.0)

    cluster_features = (
        same_cluster_songs[numerical_features].astype(float) - feature_mean
    ) / feature_std

    # song_row is a mixed-type row, so cast the selected values to float
    # explicitly before building the (1, n_features) query vector.
    input_features = (
        (song_row[numerical_features].astype(float) - feature_mean) / feature_std
    ).to_numpy().reshape(1, -1)

    # Similarity between the query song and every candidate
    similarity_scores = cosine_similarity(input_features, cluster_features).flatten()
    same_cluster_songs['similarity'] = similarity_scores

    # Drop rows carrying the query's own title, rank, and keep only the best
    # row per (track, artist) so one song cannot fill several slots.
    recommendations = (
        same_cluster_songs[same_cluster_songs['track_name'] != song_name]
        .sort_values(by='similarity', ascending=False)
        .drop_duplicates(subset=['track_name', 'artist_name'])
        .head(max(num_recommendations, 0))
    )

    return recommendations[['track_name', 'artist_name', 'genre', 'similarity']]


In [None]:
# Request up to five recommendations for one track title, comparing on the
# feature columns currently bound to `features`.
input_song = "Don't Let Me Be Lonely Tonight"
numerical_features = features  # Adjust to your feature set

recommended = recommend_songs(
    input_song,
    filtered_df,
    numerical_features,
    num_recommendations=5,
)

# recommend_songs returns None when the title is not found; print only on a hit
if recommended is not None:
    print(f"Songs similar to '{input_song}':", recommended, sep="\n")


In [None]:
# Show the distinct track names present in filtered_df
unique_track_names = filtered_df['track_name'].unique()
print(unique_track_names)
