### ***This code implements a content-based music recommendation system using TF-IDF (Term Frequency-Inverse Document Frequency) and cosine similarity. It loads a dataset of songs, preprocesses the text data, builds a TF-IDF matrix, calculates cosine similarities between songs, generates a similarity dictionary, defines a ContentBasedRecommender class for making recommendations, instantiates the class, and makes recommendations for two sample songs based on their similarity to other songs in the dataset.***

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Function to load the dataset
def load_dataset():
    return pd.read_csv("songdata.csv")

In [24]:
# Function to preprocess text data
def preprocess_text(data):
    data['text'] = data['text'].str.replace(r'\n', '')
    return data

In [25]:
# Function to build TF-IDF matrix
def build_tfidf_matrix(data):
    tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
    lyrics_matrix = tfidf.fit_transform(data['text'])
    return tfidf, lyrics_matrix

In [26]:
# Function to calculate cosine similarities
def calculate_cosine_similarity(matrix):
    return cosine_similarity(matrix)

In [27]:
# Function to generate similarity dictionary
def generate_similarity_dict(data, cosine_similarities):
    similarities = {}
    for i in range(len(cosine_similarities)):
        similar_indices = cosine_similarities[i].argsort()[:-50:-1]
        similarities[data['song'].iloc[i]] = [(cosine_similarities[i][x], data['song'][x], data['artist'][x]) for x in similar_indices][1:]
    return similarities


In [28]:

# Define the ContentBasedRecommender class
class ContentBasedRecommender:
    def __init__(self, matrix, songs_data):
        self.matrix_similar = matrix
        self.songs_data = songs_data

    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)
        print(f'The {rec_items} recommended songs for "{song}" are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} by {recom_song[i][2]} with {round(recom_song[i][0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation):
        song = recommendation['song']
        number_songs = recommendation['number_songs']
        if song not in self.matrix_similar:
            print(f"Sorry, {song} is not found in the dataset.")
            return
        recom_song = self.matrix_similar[song][:number_songs]
        self._print_message(song=song, recom_song=recom_song)

In [29]:
# Load dataset
songs = load_dataset()

In [30]:
# Sample a subset of the dataset
songs = songs.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [31]:
# Preprocess text data
songs = preprocess_text(songs)


  data['text'] = data['text'].str.replace(r'\n', '')


In [32]:
# Build TF-IDF matrix
tfidf, lyrics_matrix = build_tfidf_matrix(songs)

# Calculate cosine similarities
cosine_similarities = calculate_cosine_similarity(lyrics_matrix)

# Generate similarity dictionary
similarities = generate_similarity_dict(songs, cosine_similarities)

# Instantiate the recommender class
recommender = ContentBasedRecommender(similarities, songs)

In [33]:
# Make recommendations
recommendation = {"song": songs['song'].iloc[10], "number_songs": 4}
recommender.recommend(recommendation)

recommendation2 = {"song": songs['song'].iloc[120], "number_songs": 4}
recommender.recommend(recommendation2)

The 4 recommended songs for "It's Only A Paper Moon" are:
Number 1:
Hello, Little Girl by The Beatles with 0.536 similarity score
--------------------
Number 2:
Till The Sun Comes Up Again by America with 0.365 similarity score
--------------------
Number 3:
Heart Of A Fool by Eddie Cochran with 0.223 similarity score
--------------------
Number 4:
Tell It Like It Is by Engelbert Humperdinck with 0.218 similarity score
--------------------
The 4 recommended songs for "Love U Better" are:
Number 1:
At This Moment by Tom Jones with 0.236 similarity score
--------------------
Number 2:
Don't Think They Know by Chris Brown with 0.235 similarity score
--------------------
Number 3:
Blow It In The Wind by Chris Brown with 0.23 similarity score
--------------------
Number 4:
The Hurt by Cat Stevens with 0.218 similarity score
--------------------
