# Get Top Tracks from Spotify

In [2]:
# Import necessary packages
import os

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [3]:
# Set up Spotify API authentication.
# The client credentials are read from environment variables instead of being
# written into the notebook, so they are never saved or shared with it.
# A missing variable raises KeyError right here, before any OAuth request.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    redirect_uri="http://localhost:8888/callback",
    scope="user-top-read"
))

# Function to get top tracks (up to 2000)
def get_top_tracks(limit=50, total=2000, time_range="long_term", client=None):
    """Collect the current user's top tracks page by page.

    Parameters
    ----------
    limit : int
        Page size requested per call to ``current_user_top_tracks``.
    total : int
        Maximum number of tracks to return. The final page is shrunk so the
        result never exceeds this value, even when ``total`` is not a
        multiple of ``limit``.
    time_range : str
        Passed through unchanged to ``current_user_top_tracks``.
    client : object, optional
        Any object exposing ``current_user_top_tracks(limit=, offset=,
        time_range=)``. Defaults to the notebook-level ``sp`` client.

    Returns
    -------
    list
        The concatenated ``"items"`` entries of every page fetched.
    """
    if client is None:
        # Fall back to the client created in the authentication cell.
        client = sp

    all_tracks = []
    for offset in range(0, total, limit):
        # Ask only for what is still missing so we never overshoot `total`.
        page_size = min(limit, total - offset)
        results = client.current_user_top_tracks(
            limit=page_size, offset=offset, time_range=time_range
        )
        items = results["items"]
        if not items:
            # An empty page means there is nothing further to fetch;
            # stop instead of issuing the remaining requests.
            break
        all_tracks.extend(items)
    return all_tracks

# Pull the current user's top tracks with the default settings
top_tracks = get_top_tracks()


def _track_to_record(track):
    """Flatten one track dict into a single table row.

    Artist names and artist IDs are each collapsed into one
    comma-separated string.
    """
    performers = track["artists"]
    return {
        "name": track["name"],
        "artist": ", ".join(p["name"] for p in performers),
        "artist_ids": ", ".join(p["id"] for p in performers),
        "album": track["album"]["name"],
        "id": track["id"],
    }


# One row per track, artist IDs included
df_top_tracks = pd.DataFrame(list(map(_track_to_record, top_tracks)))

# Write the table to CSV, then preview the first rows
df_top_tracks.to_csv("top_2000_tracks.csv", index=False)
df_top_tracks.head()

Unnamed: 0,name,artist,artist_ids,album,id
0,Good Life,Sammy Rae & The Friends,3lFDsTyYNPQc8WzJExnQWn,The Good Life,6sOK9LnvTTj4E2ZGzhKTQo
1,לצאת מדיכאון,יגל אושרי,6Kn2CjfEButrWXszsA0E6Q,לצאת מדיכאון,0E9S1H3TktlpHmGyuKEfkL
2,Off My Face,Justin Bieber,1uNFoZAHBGtllmzznpCI3s,Justice,3T03rPwlL8NVk1yIaxeD8U
3,Hakol Letova,Static & Ben El,0xHa28taiElkcQf9o3z76g,Hakol Letova,5JDMaE2pFTToZ7spTl9svP
4,TEXAS HOLD 'EM,Beyoncé,6vWDO969PvNqNYHIOW5v0m,COWBOY CARTER,7wLShogStyDeZvL0a6daN5


In [5]:
# Summarise the top-tracks table: row count, columns, non-null counts, dtypes
df_top_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        2000 non-null   object
 1   artist      2000 non-null   object
 2   artist_ids  2000 non-null   object
 3   album       2000 non-null   object
 4   id          2000 non-null   object
dtypes: object(5)
memory usage: 78.3+ KB


In [8]:
# Load the track-features CSV and summarise its columns
df_songs = pd.read_csv(
    "tracks_features.csv"
)
df_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204022 non-null  object 
 2   album             1204014 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode              1204025 non-null  int64  
 14  speechiness       1204025 non-null  float64
 15  acousticness      1204025 non-null  float64
 16  

In [13]:
# Collect the IDs of the user's top tracks (a set removes duplicates)
top_song_ids = set(df_top_tracks["id"])

# Add an "in_top_tracks" column to df_songs: 1 when the row's id is one of the
# top-track IDs, 0 otherwise. Series.isin is vectorized, so it avoids calling a
# Python lambda once per row on this large table.
df_songs["in_top_tracks"] = df_songs["id"].isin(top_song_ids).astype("int64")

# Save the updated dataset, then preview the first rows
df_songs.to_csv("data/songs.csv", index=False)
df_songs.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,in_top_tracks
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,0
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,0
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,0
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,0
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,0


In [14]:
# Count how many rows are flagged as top tracks (1) versus not (0)
df_songs["in_top_tracks"].value_counts()

in_top_tracks
0    1203884
1        141
Name: count, dtype: int64