In [None]:
from collections import Counter

import numpy as np
import pandas as pd
from google.colab import drive
# Make Google Drive visible to this Colab runtime under /content/drive.
drive.mount('/content/drive')

# Read the Spotify Million Song Dataset CSV from the mounted drive into `df`.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/spotify_millsongdata.csv"
)

In [None]:
# Preprocess the data: lower-case the lyrics, then replace every non-letter
# character with a space. `regex=True` is required for the pattern to be
# interpreted as a regular expression; pandas >= 2.0 defaults to a literal
# string match, which would leave punctuation and digits in place.
df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.replace("[^a-zA-Z]", " ", regex=True)

# Tokenize the data. Each song is split exactly once; the per-song token
# lists are reused below for the document-frequency count.
song_tokens = [text.split() for text in df["text"]]
tokens = [token for words in song_tokens for token in words]

# Corpus-wide number of occurrences of each token (numerator of tf).
# Counter does this in a single pass instead of calling tokens.count() once
# per vocabulary word.
term_counts = Counter(tokens)

# Number of songs that contain each token as a whole word (denominator of
# idf). Counting over split tokens avoids substring matches such as "a"
# matching every lyric that merely contains the letter "a".
document_counts = Counter()
for words in song_tokens:
    document_counts.update(set(words))

# Create a dictionary to store the TF-IDF scores.
# tf  = occurrences of the token / total number of tokens in the corpus
# idf = log(number of songs / number of songs containing the token)
# Every token comes from at least one song, so document_counts[token] >= 1.
# Keys are inserted in order of first appearance in the corpus, which gives
# the vocabulary a reproducible order.
total_tokens = len(tokens)
n_songs = len(df)
tf_idf = {
    token: (count / total_tokens) * np.log(n_songs / document_counts[token])
    for token, count in term_counts.items()
}

In [None]:
# Create one vocabulary-sized count vector for each song.

# Give every vocabulary word its own fixed position, following the key order
# of `tf_idf`. The score itself is a float and cannot serve as an array
# index: truncating it with int() does not give each word a distinct slot.
word_index = {word: position for position, word in enumerate(tf_idf)}

# NOTE: this allocates one dense float vector of length len(tf_idf) per song,
# so memory use grows with (number of songs) x (vocabulary size).
song_vectors = []
for text in df['text']:
    song_vector = np.zeros(len(word_index))
    for word in text.split():
        position = word_index.get(word)
        # Words outside the vocabulary are skipped, matching how
        # get_top_n_similar_songs treats unknown query words; adding them to
        # slot 0 would inflate the count of whichever word owns that slot.
        if position is not None:
            song_vector[position] += 1
    # Append once per song (not once per word) so that song_vectors[i]
    # corresponds to the i-th row of df.
    song_vectors.append(song_vector)

In [None]:
# Persist the per-song vectors to 'song_vectors.npy' in the current working
# directory (NumPy binary format).
np.save(file='song_vectors.npy', arr=song_vectors)

In [None]:
# Define a function to get the top N most similar songs to a given text query
def get_top_n_similar_songs(query, song_vectors, n=1):
    """Return the lyrics of the songs most similar to a free-text query.

    The query is turned into a word-count vector over the vocabulary of the
    module-level ``tf_idf`` dictionary and compared with every song vector
    using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query. It is lower-cased before lookup; words that are not
        keys of ``tf_idf`` are ignored.
    song_vectors : sequence of 1-D numpy arrays (or a 2-D array)
        Expected to hold one vector per row of the module-level ``df``, in
        row order, where component ``j`` counts the ``j``-th key of
        ``tf_idf``.
    n : int, default 1
        Maximum number of songs to return.

    Returns
    -------
    list
        Up to ``n`` entries of ``df['text']``, most similar first. Empty when
        ``n <= 0``, when there are no song vectors, or when no query word is
        in the vocabulary.

    Raises
    ------
    ValueError
        If ``song_vectors`` does not have exactly one entry per row of
        ``df``, because positions could then not be mapped back to songs.
    """
    # Each vocabulary word gets a fixed position given by tf_idf's key order.
    # The float score itself must not be used as an index.
    word_index = {word: position for position, word in enumerate(tf_idf)}

    # Build the count vector for the query.
    query_vector = np.zeros(len(word_index))
    for word in query.lower().split():
        position = word_index.get(word)
        if position is not None:
            query_vector[position] += 1

    query_norm = np.linalg.norm(query_vector)
    if n <= 0 or len(song_vectors) == 0 or query_norm == 0:
        # Nothing to rank, or the query shares no word with the vocabulary
        # (cosine similarity would be 0/0).
        return []

    if len(song_vectors) != len(df):
        raise ValueError(
            "song_vectors must contain exactly one vector per row of df "
            f"(got {len(song_vectors)} vectors for {len(df)} rows)"
        )

    # Cosine similarity between the query and each song. Songs whose vector
    # is all zeros keep a similarity of 0 instead of producing NaN.
    similarities = np.zeros(len(song_vectors))
    for i, song_vector in enumerate(song_vectors):
        song_norm = np.linalg.norm(song_vector)
        if song_norm > 0:
            similarities[i] = np.dot(query_vector, song_vector) / (query_norm * song_norm)

    # Rank song *positions* by descending similarity. The stable sort keeps
    # the original row order among ties. Slicing caps the result at the
    # number of available songs when n is larger.
    ranked_positions = np.argsort(-similarities, kind="stable")[:n]

    # Map the ranked positions back to the corresponding rows of df.
    song_texts = df['text']
    return [song_texts.iloc[int(position)] for position in ranked_positions]


In [None]:
# Run the similarity search for the query "sad songs".
# Despite the variable name, n=1 is passed, so at most one result is requested.
user_query = "sad songs"
top_10_song_names = get_top_n_similar_songs(
    query=user_query,
    song_vectors=song_vectors,
    n=1,
)

# Show what the search returned.
print(top_10_song_names)