In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import random

In [2]:
data = pd.read_csv("dataset.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
nonNumeric = ['song_name', 'artist_name', 'track_id']
dataClean = data.drop(columns=nonNumeric, errors='ignore')
dataClean = dataClean.select_dtypes(include=[np.number])

In [5]:
scaler = StandardScaler()
dataStandardized = scaler.fit_transform(dataClean)

In [6]:
# knn
def knnRecommend(userSong, data, k=5):
    similarities = cosine_similarity(data[userSong].reshape(1,-1), data).flatten()
    recommended = np.argsort(-similarities)[1:k+1]
    return recommended

In [7]:
# hash-based recommendation
class LSH:
    def __init__(self, numHashes, numBands):
        self.buckets = {}
        self.numHashes = numHashes
        self.numBands = numBands
    def fit(self, data):
        rows, features = data.shape
        for i in range(self.numHashes):
            randProjection = np.random.randn(self.numHashes, data.shape[1])
            hashVals = np.sign(np.dot(data, randProjection.T))
            for j in range(self.numBands):
                startID = j * (self.numHashes // self.numBands)
                endID = startID + (self.numHashes // self.numBands)
                bHash = tuple(hashVals[:, startID:endID].flatten())
                if bHash not in self.buckets:
                    self.buckets[bHash] = []
                self.buckets[bHash].append(i)
    def query(self, songID):
        similarSongs = []
        for bucket in self.buckets.values():
            if songID in bucket:
                similarSongs.extend(bucket)
        return list(set(similarSongs) - {songID})

In [8]:
userSongID = random.randint(0, dataStandardized.shape[0]-1)

In [9]:
# knn trial
k = 5
knnRecommendations = knnRecommend(userSongID, dataStandardized, k)
print("KNN Recommendations", knnRecommendations)

KNN Recommendations [91020 83162 85863 53963 83488]


In [10]:
# hash-based trial - FIXME
lsh = LSH(numHashes=20, numBands=5)
lsh.fit(dataStandardized)
lshRecommendations = lsh.query(userSongID)
print("Hash-Based Recommendations", lshRecommendations[:k])

Hash-Based Recommendations []


In [11]:
# Naive Bayes
class NaiveBayes:
    def __init__(self):
        self.classes = None
        self.means = {}
        self.variances = {}
        self.priors = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        for cls in self.classes:
            X_c = X[y == cls]
            self.means[cls] = np.mean(X_c, axis=0)
            self.variances[cls] = np.var(X_c, axis=0)
            self.priors[cls] = X_c.shape[0] / X.shape[0]

    def calcLikelihood(self, cls, x):
        mean = self.means[cls]
        variance = self.variances[cls]
        numerator = np.exp(-((x - mean) ** 2) / (2 * variance))
        denominator = np.sqrt(2 * np.pi * variance)
        return numerator / denominator

    def calcPosterior(self, x):
        posteriors = {}
        for cls in self.classes:
            prior = self.priors[cls]
            likelihood = np.prod(self.calcLikelihood(cls, x))
            posteriors[cls] = prior * likelihood
        return posteriors

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = self.calcPosterior(x)
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

In [12]:
# Train NB
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(dataStandardized)
nb = NaiveBayes()
nb.fit(dataStandardized, labels)

In [13]:
# Predict NB
predictions = nb.predict(dataStandardized)
print("Predicted Labels:", predictions)

Predicted Labels: [4 1 1 ... 2 1 2]


In [30]:
# Recommend with NB
def recommendNB(userSongID, X, labels, model, k=5):
    userLabel = model.predict(X[userSongID:userSongID+1])[0]
    recommendations = np.where(labels == userLabel)[0]
    return recommendations[:k]

recommendations = recommendNB(userSongID, dataStandardized, labels, nb, k=5)
print("Recommended Songs", recommendations)

Recommended Songs [ 0 24 27 52 61]
