In [5]:
from spotipy import Spotify
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import concurrent.futures
import glob
import os
from dotenv import load_dotenv

# Load variables from a .env file (if any) into the process environment
# so the credential lookups below can find them.
load_dotenv()

# Spotify authentication: read the app credentials from the environment
# and build one module-level API client used by the rest of the script.
client_id = os.environ.get('CLIENT_ID')
client_secret = os.environ.get('CLIENT_SECRET')
auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
spotify = Spotify(auth_manager=auth_manager)

# Column order of the per-year CSV written by crawl_year.
_TRACK_COLUMNS = [
    'id', 'name_song', 'name_album', 'name_artist', 'release_date',
    'duration_ms', 'popularity', 'explicit', 'playback',
    'available_market', 'is_playable', 'is_local',
]


def _scrape_playcount(driver, url):
    """Load *url* in *driver* and return the play count on the page, or None.

    The count is read from the ``<span data-testid="playcount">`` element;
    every non-digit character (thousands separators etc.) is stripped first.
    None is returned when the element is missing or contains no digits.
    """
    driver.get(url)
    # NOTE(review): page_source is read right after get() returns, with no
    # explicit wait -- a count rendered later by page scripts would be
    # missed and recorded as None. Confirm whether a wait is needed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    playcount_span = soup.find('span', {'data-testid': 'playcount'})
    if playcount_span is None:
        return None
    digits = re.sub(r'\D', '', playcount_span.text)
    # An element without digits would make int('') raise; treat as unknown.
    return int(digits) if digits else None


def _track_row(track, playback):
    """Flatten one search-result track dict into a row in _TRACK_COLUMNS order."""
    album = track['album']
    return [
        track['id'],
        track['name'],
        album['name'],
        # First artist credited on the album (not the track's artist list).
        album['artists'][0]['name'],
        album['release_date'],
        track['duration_ms'],
        track['popularity'],
        track['explicit'],
        playback,
        # These two keys are not guaranteed to be present in every track
        # object, so a missing key becomes an empty cell instead of a
        # KeyError that aborts the whole year.
        track.get('available_markets'),
        track.get('is_playable'),
        track['is_local'],
    ]


# Function to crawl a specific year
def crawl_year(year):
    """Crawl Spotify tracks for *year* and save them to ``tracks_{year}.csv``.

    Queries the module-level ``spotify`` client with ``year:{year}`` in pages
    of 50 tracks (offsets 0 through 950), opens each track's Spotify URL in a
    Chrome webdriver to scrape its play count, and writes one CSV row per
    track.

    Args:
        year: Year to search for; interpolated into the query and file name.

    Returns:
        None. The result is the CSV file plus a summary line on stdout.
    """
    rows = []
    driver = webdriver.Chrome()
    try:
        for offset in range(0, 1000, 50):  # Adjust range if needed
            songs = spotify.search(q=f'year:{year}', type='track', limit=50, offset=offset)
            items = songs['tracks']['items']
            if not items:
                # An empty page means the results are exhausted; later
                # offsets would only cost more API calls.
                break
            for track in items:
                playback = _scrape_playcount(driver, track['external_urls']['spotify'])
                rows.append(_track_row(track, playback))
    finally:
        # Always close the browser, even when a request or lookup fails,
        # so a failed crawl does not leave a Chrome process behind.
        driver.quit()

    # Rows are collected in a list (linear time) rather than inserted into
    # the DataFrame one by one. The CSV has always listed the most recently
    # fetched track first, so reverse to keep that layout.
    rows.reverse()
    # dtype=object keeps values as fetched (e.g. integer play counts are not
    # turned into floats when some rows have no play count).
    data = pd.DataFrame(rows, columns=_TRACK_COLUMNS, dtype=object)

    # Save data for the year
    data.to_csv(f'tracks_{year}.csv', index=False)
    print(f'Done crawling year {year} with {len(data)} records.')

# List of years to crawl
years = ['2020', '2021', '2022', '2023', '2024']

# Using ThreadPoolExecutor in Jupyter Notebook: one crawl_year call per year,
# run concurrently. Each future is inspected explicitly because an exception
# raised inside a worker is otherwise stored on the future and never shown.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(crawl_year, year): year for year in years}
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            # Report and carry on, so the years that did succeed still get combined.
            print(f'Crawling year {futures[future]} failed: {error!r}')

# Check if CSV files were created (sorted, so the combined row order does not
# depend on the order in which the file system lists the files).
csv_files = sorted(glob.glob('tracks_*.csv'))

# Combine all yearly CSV files into one
if csv_files:
    combined_data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    combined_data.to_csv('all_tracks.csv', index=False)
    print("All data combined into all_tracks.csv!")
else:
    # pd.concat raises on an empty list; say what happened instead.
    print('No tracks_*.csv files found; nothing to combine.')


Done crawling year 2023 with 1000 records.
Done crawling year 2021 with 1000 records.
Done crawling year 2022 with 1000 records.
Done crawling year 2024 with 1000 records.
Done crawling year 2020 with 1000 records.
All data combined into all_tracks.csv!
