In [2]:
import os

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [36]:
def get_similar_song(track_name, artist_name, lastfm_api_key, limit=10, timeout=10):
	"""
	Look up tracks similar to a given song via Last.fm's ``track.getsimilar`` method.

	Parameters:
		track_name (str): Title of the seed track.
		artist_name (str): Name of the seed track's artist.
		lastfm_api_key (str): The Last.fm API key.
		limit (int): Maximum number of similar tracks to request.
		timeout (float): Seconds to wait for Last.fm before giving up on the request.

	Returns:
		list | None: The entries found under ``similartracks -> track`` in the
		JSON response (an empty list when that section is missing), or None
		when the request fails, the HTTP status is not 200, or the body is
		not a JSON object.
	"""
	# HTTPS keeps the API key (sent as a query parameter) out of cleartext traffic
	lastfm_url = 'https://ws.audioscrobbler.com/2.0/'
	params = {
		'method': 'track.getsimilar',
		'track': track_name,
		'artist': artist_name,
		'api_key': lastfm_api_key,
		'format': 'json',
		'autocorrect': 1,  # Enable auto-correction of spelling mistakes
		'limit': limit
	}

	# Without a timeout a single stalled connection would block the caller forever;
	# transport errors are folded into the same "no result" signal as a bad status.
	try:
		lastfm_response = requests.get(lastfm_url, params=params, timeout=timeout)
	except requests.RequestException:
		return None

	if lastfm_response.status_code != 200:
		return None

	# A 200 response is not guaranteed to carry a parseable JSON body
	try:
		similar_tracks_data = lastfm_response.json()
	except ValueError:
		return None

	if not isinstance(similar_tracks_data, dict):
		return None

	container = similar_tracks_data.get('similartracks')
	if not isinstance(container, dict):
		# e.g. an error payload that has no 'similartracks' section
		return []

	similar_tracks = container.get('track', [])
	if isinstance(similar_tracks, dict):
		# NOTE(review): guards against a lone match arriving as a bare object
		# instead of a one-element list - confirm whether Last.fm ever does this.
		return [similar_tracks]
	if not isinstance(similar_tracks, list):
		# Anything else (None, placeholder string, ...) cannot be iterated as tracks
		return []
	return similar_tracks

In [37]:
def process_similar_songs(dataset, api_key, num_rows=100):
	"""
	Expand a song dataset with the similar tracks reported for each row.

	Every processed row is looked up with ``get_similar_song``. A row is
	repeated once per similar track returned; a row whose lookup returns
	nothing (None or an empty list) is kept once with NaN in the
	similar-track columns.

	Parameters:
		dataset (pd.DataFrame): The input dataset; 'title' and 'artist' columns are read.
		api_key (str): The Last.fm API key.
		num_rows (int | None): Number of leading rows to process. None processes
			every row of the dataset.

	Returns:
		pd.DataFrame: The original columns plus 'song', 'similar_title',
		'similar_artist' and 'similar_song'.
	"""
	# Slice once, positionally, so the progress-bar total always equals the number
	# of rows actually iterated (also valid for num_rows=None or a negative value).
	subset = dataset.iloc[:num_rows]

	new_rows = []

	for _, row in tqdm(subset.iterrows(), total=len(subset)):
		track_name = row['title']
		artist_name = row['artist']

		# Columns shared by every output row derived from this input row
		base_row = row.to_dict()
		base_row['song'] = f"{track_name} by {artist_name}"

		similar_tracks = get_similar_song(track_name, artist_name, api_key)

		if not similar_tracks:
			# Keep the song in the output even when nothing similar was found
			new_rows.append({
				**base_row,
				'similar_title': np.nan,
				'similar_artist': np.nan,
				'similar_song': np.nan,
			})
			continue

		for track in similar_tracks:
			similar_title = track['name']
			similar_artist = track['artist']['name']
			new_rows.append({
				**base_row,
				'similar_title': similar_title,
				'similar_artist': similar_artist,
				'similar_song': f"{similar_title} by {similar_artist}",
			})

	# Build the frame in one go from the collected records
	return pd.DataFrame(new_rows)


In [38]:
# Importing large Kaggle dataset (only the first 10,000 rows are read, decoded as latin1)
data = pd.read_csv("data/ds2.csv.zip",encoding='latin1', nrows=10000)

# Last.fm API credentials: read from the environment so the key is never stored
# in the notebook itself.
# NOTE(review): the key that used to be hard-coded here should be treated as
# exposed and rotated.
lastfm_api_key = os.environ.get("LASTFM_API_KEY")
if not lastfm_api_key:
	raise RuntimeError(
		"Set the LASTFM_API_KEY environment variable to your Last.fm API key before running this notebook."
	)

In [42]:
# Enrich the first 100 loaded rows with their Last.fm similar tracks
# (one lookup per input row, so this cell is network-bound).
processed_data = process_similar_songs(
	dataset=data,
	api_key=lastfm_api_key,
	num_rows=100,
)

100%|██████████| 100/100 [00:34<00:00,  2.90it/s]


In [44]:
# Persist the enriched table as CSV, leaving the DataFrame index out of the file
processed_data.to_csv(path_or_buf="data/processed_data.csv", index=False)