In [81]:
import numpy as np
from pyyoutube import Api
from pyyoutube import Client
import requests as req
import pandas as pd
from datetime import datetime, timezone
import re
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import musicbrainzngs
import yaml



In [82]:
# Load API credentials from the local config file and build the three API clients
# used by the rest of the notebook (YouTube, Spotify, MusicBrainz).
# safe_load is used because the file only needs plain scalars/mappings; it never
# constructs arbitrary Python objects from YAML tags.
with open("config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)
youtube_api_key = cfg["youtube_api_key"]

spotify_client_id = cfg["spotify_client_id"]
spotify_client_secret = cfg["spotify_client_secret"]

# YouTube Data API client (API-key auth).
yt = Api(api_key=youtube_api_key)

# Spotify client using the client-credentials flow.
client_credentials_manager = SpotifyClientCredentials(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Register a user agent for MusicBrainz requests.
musicbrainzngs.set_useragent("scraper", "0.1", "@j")


In [83]:
def get_rating(video):
    """Extract the numeric score (0-10) from a review video's description.

    Looks for the first ``N/10`` token in ``video.snippet.description`` where
    ``N`` is a whole number from 0 to 10.

    Args:
        video: Object exposing ``snippet.description`` and ``snippet.title``.

    Returns:
        int | None: The score, or ``None`` (after printing a notice) when the
        description is missing or contains no such token.
    """
    description = video.snippet.description or ""
    # The look-arounds stop the pattern from grabbing the tail of a longer
    # number or a decimal (e.g. "2010/10" or "7.5/10"), and from accepting
    # "7/100"; only whole scores 0..10 immediately followed by "/10" count.
    match = re.search(r"(?<![\d.])(10|\d)/10(?!\d)", description)
    if match:
        return int(match.group(1))
    print("no rating: " + video.snippet.title)
    return None
    
            

In [84]:
def get_album_info(video):
    """Split a review video's title into lower-cased artist and album names.

    The title is expected to look like ``"<artist> - <album> ALBUM REVIEW"``.

    Args:
        video: Object exposing ``snippet.title``.

    Returns:
        tuple[str, str] | None: ``(artist_name, album_name)``, both lower-cased
        and stripped of surrounding whitespace, or ``None`` when the title does
        not contain "album review" (a notice is printed) or has no
        ``" - "`` separator.
    """
    marker = "album review"
    title = video.snippet.title
    lowered = title.lower()

    marker_pos = lowered.rfind(marker)
    if marker_pos == -1:
        # Not an album review (EP/mixtape review, list video, ...): reject it
        # outright instead of slicing a fixed number of characters off a title
        # that does not end with the marker.
        print("invalid title: " + title)
        return None

    # Keep only the text before the marker, so trailing text or whitespace
    # after "ALBUM REVIEW" cannot leak into the album name.
    parts = lowered[:marker_pos].split(" - ", 1)
    if len(parts) != 2:
        return None

    artist_name, album_name = (part.strip() for part in parts)
    if not artist_name or not album_name:
        return None
    return artist_name, album_name
            
    
    
    

In [85]:
def get_album_id(artist, album):
    """Look up an album on Spotify and return the ID of the top search hit.

    The query is built from the artist name (square brackets removed,
    collaboration separators " x " and " & " turned into spaces) followed by
    the album name. When the album name contains "self-titled", the artist
    name is used in place of the album name.

    Args:
        artist: Artist name as parsed from the video title.
        album: Album name as parsed from the video title.

    Returns:
        str | None: Spotify album ID of the first result, or ``None`` (after
        printing a notice) when the search returns no albums.
    """
    def _strip_brackets(text):
        return text.replace("[", "").replace("]", "")

    # Names coming from get_album_info are lower-cased, so the " X " collab
    # separator has to be matched case-insensitively to have any effect.
    artist_query = re.sub(r"\s+x\s+", " ", _strip_brackets(artist), flags=re.IGNORECASE)
    artist_query = artist_query.replace(" & ", " ")

    if "self-titled" in album.lower():
        album_query = artist_query
    else:
        album_query = _strip_brackets(album)

    # Collapse repeated whitespace left over from the substitutions above.
    search_term = " ".join(f"{artist_query} {album_query}".split())

    album_on_spotify = sp.search(q=search_term, type="album", limit=1)
    items = album_on_spotify["albums"]["items"]
    if items:
        return items[0]["id"]

    # Single-quoted f-string so the literal double quotes around the names are
    # valid syntax on every Python version.
    print(f'No album found for "{album}" by "{artist}"')
    return None


# Smoke test against a known album.
print(get_album_id("Charli XCX", "brat"))

2lIZef4lzdvZkiiCzvPKj7


In [86]:
def get_spotify_info(id):
    """Fetch release date, track count and popularity for a Spotify album.

    Args:
        id: Spotify album ID passed straight to ``sp.album``.

    Returns:
        tuple | None: ``(release_date, total_tracks, popularity)`` read from
        the album payload, or ``None`` when the lookup or any field access
        raises (the error is printed).
    """
    try:
        payload = sp.album(id)
        wanted_fields = ("release_date", "total_tracks", "popularity")
        return tuple(payload[field] for field in wanted_fields)
    except Exception as e:
        print(f"Error retrieving album info: {str(e)}")
        return None
        

In [87]:

def _parse_partial_date(value):
    """Parse 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' into a date; None if unusable.

    Missing month/day components default to 1.
    """
    if not value:
        return None
    pieces = str(value).split("-")
    if not 1 <= len(pieces) <= 3:
        return None
    try:
        year, month, day = ([int(piece) for piece in pieces] + [1, 1])[:3]
        return datetime(year, month, day).date()
    except ValueError:
        return None


def _completed_years(start, end):
    """Return the number of whole calendar years elapsed from start to end."""
    not_yet_reached = (end.month, end.day) < (start.month, start.day)
    return end.year - start.year - int(not_yet_reached)


def get_artist_info(artist_name, release):
    """Look up an artist on MusicBrainz and derive gender, age and country.

    Args:
        artist_name: Name used for the MusicBrainz artist search; only the top
            hit is considered.
        release: Album release date as 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'
            (may be ``None``); used to compute the artist's age at release.

    Returns:
        tuple: ``(gender, age, country)``.
        * For artists whose search hit has type "Group": ``("Group", None, country)``.
        * Otherwise ``gender`` and ``country`` come from the artist record and
          ``age`` is the number of whole years between the record's life-span
          begin date and ``release`` (``None`` if either date is missing or
          cannot be parsed).
        * ``(None, None, None)`` when no artist is found or the lookup fails
          (the error is printed).
    """
    try:
        result = musicbrainzngs.search_artists(artist=artist_name, limit=1)
        matches = result.get("artist-list") or []
        if not matches:
            return None, None, None
        artist = matches[0]
        # NOTE(review): none of the sub-resources requested via `includes` are
        # read below; they are kept only to leave the request unchanged.
        artist_info = musicbrainzngs.get_artist_by_id(
            artist["id"],
            includes=["aliases", "tags", "ratings", "release-groups"],
        )
        details = artist_info["artist"]
    except Exception as exc:
        # Best effort: one failed lookup should not abort a whole scrape, but
        # it should be visible. `Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        print(f"Error retrieving artist info: {exc}")
        return None, None, None

    country = details.get("country")

    # `.get` so that a search hit without a "type" key does not discard the
    # country/gender that were already fetched.
    if artist.get("type") == "Group":
        return "Group", None, country

    gender = details.get("gender")
    birth_date = _parse_partial_date(details.get("life-span", {}).get("begin"))
    release_date = _parse_partial_date(release)

    age = None
    if birth_date and release_date:
        # Calendar-based difference; dividing the day count by 365 overstates
        # the age by one just before a birthday because of leap days.
        age = _completed_years(birth_date, release_date)

    return gender, age, country


# Demo: the release date comes from a known Spotify album ID, then the artist lookup is run.
release_date=get_spotify_info("2lIZef4lzdvZkiiCzvPKj7")[0]
print(release_date)
print(get_artist_info("Radiohead", release_date))

2024-06-07
('Group', None, 'GB')


In [88]:
# Exploratory cell: inspect the raw MusicBrainz search hit for a known group
# to see which keys are available on it.
result = musicbrainzngs.search_artists(artist="Radiohead", limit=1)
# Only the top search hit is examined.
artist = result["artist-list"][0]
artist_id = artist["id"]
# Full artist record fetched by ID; it is not read anywhere else in this cell.
artist_info = musicbrainzngs.get_artist_by_id(artist_id, includes=["aliases", "tags", "ratings", "release-groups"])
print(artist.keys())
# For groups, also show the alias list carried by the search hit itself.
if artist["type"]=="Group":
    print(artist["alias-list"])


dict_keys(['id', 'type', 'ext:score', 'name', 'sort-name', 'country', 'area', 'begin-area', 'isni-list', 'life-span', 'alias-list', 'tag-list'])
[{'sort-name': 'r/head', 'type': 'Search hint', 'alias': 'r/head'}, {'locale': 'zh', 'sort-name': '电台司令', 'type': 'Artist name', 'primary': 'primary', 'alias': '电台司令'}, {'locale': 'ja', 'sort-name': 'れでぃおへっど', 'type': 'Artist name', 'primary': 'primary', 'alias': 'レディオヘッド'}, {'sort-name': 'Radiohead & Thom Yorke', 'type': 'Search hint', 'alias': 'Radiohead & Thom Yorke'}, {'sort-name': 'Radio Head', 'type': 'Search hint', 'alias': 'Radio Head'}]


In [89]:
# Scrape album reviews year by year from one YouTube channel, enrich each one
# with Spotify and MusicBrainz data, and write the result to a CSV file.
# Channel whose reviews are scraped (presumably theneedledrop - confirm).
tnd_id="UCt7fwAhXDy3oNFTAzF2o8Pw"
albums=[]
# Search pages can return the same video more than once; remember what has
# already been processed so each review yields at most one row.
seen_video_ids = set()
for yr in range(2020,2024+1):
    # One search window per calendar year: [Jan 1 of yr, Jan 1 of yr + 1), in UTC.
    time1 = datetime(yr,1,1,tzinfo=timezone.utc).isoformat()
    time2 = datetime(yr+1,1,1,tzinfo=timezone.utc).isoformat()
    review_vids = yt.search(search_type="video",
                            channel_id=tnd_id,
                            q="ALBUM REVIEW",
                            count=400,
                            # The YouTube Data API documents 50 as the maximum
                            # number of search results per request; `count`
                            # still bounds the total across pages.
                            limit=50,
                            published_after=time1,
                            published_before=time2)
    for item in review_vids.items:
        vid_id = item.id.videoId
        if vid_id in seen_video_ids:
            continue
        seen_video_ids.add(vid_id)

        # Fetch the full video resource; its description is what get_rating parses.
        vid_items = yt.get_video_by_id(video_id=vid_id).items
        if not vid_items:
            # Video no longer retrievable (deleted/private): nothing to parse.
            continue
        vid = vid_items[0]

        album_info = get_album_info(vid)
        if album_info is None:
            continue
        artist_name, album_name = album_info

        rating=get_rating(vid)
        if rating is None:
            continue

        spotify_id=get_album_id(artist_name, album_name)
        if spotify_id is None:
            continue

        spotify_info=get_spotify_info(spotify_id)
        if spotify_info is None:
            continue
        release_date, tracks_amt, popularity = spotify_info

        artist_info=get_artist_info(artist_name, release_date)
        if artist_info is None:
            continue
        gender, age, country = artist_info

        # Key order defines the CSV column order.
        album = {
            "spotify_id": spotify_id,
            "album": album_name,
            "artist": artist_name,
            "tracks": tracks_amt,
            "artist_gender": gender,
            "artist_age": age,
            "release_date": release_date,
            "artist_country": country,
            "popularity": popularity,
            "rating": rating,
        }
        albums.append(album)

# Build the frame once from the collected records and save it.
albums_df=pd.DataFrame(albums)

albums_df.to_csv("20-24.csv")
print(albums_df)




no rating: Gunna - Wunna ALBUM REVIEW
no rating: My Chemical Romance - The Black Parade ALBUM REVIEW
no rating: Clipse - Hell Hath No Fury ALBUM REVIEW
no rating: Janet Jackson - The Velvet Rope ALBUM REVIEW
no rating: Converge - Jane Doe ALBUM REVIEW
no rating: Talk Talk - Laughing Stock ALBUM REVIEW
invalid title: Joyner Lucas' ADHD: NOT GOOD
invalid title: Top 200 Albums of the 2010s
invalid title: 15 Worst Albums of the 2010s
invalid title: Kanye West - My Beautiful Dark Twisted Fantasy REDUX REVIEW
invalid title: Top 200 Albums of the 2010s
invalid title: Glass Animals' Dreamland: NOT GOOD
no rating: Trapt - Shadow Work ALBUM REVIEW
invalid title: Bring Me the Horizon - POST HUMAN: SURVIVAL HORROR EP REVIEW
invalid title: Machine Gun Kelly's Tickets to My Downfall: NOT GOOD
invalid title: Nav's Emergency Tsunami: NOT GOOD
invalid title: Drake - Dark Lane Demo Tapes MIXTAPE REVIEW
invalid title: Nav's Good Intentions & Brown Boy 2: NOT GOOD
invalid title: 20 Worst Singles of the 20