In [38]:
#get dependencies
import numpy as np
from pyyoutube import Api
import requests as req
import re
import pandas as pd
import glob
from datetime import datetime, timezone
import logging as log
from progress.bar import Bar
import yaml

with open("config.yaml") as config_file:
    config = yaml.load(config_file.read())

spotify_auth = (config["spotify"]["client-id"],config["spotify"]["client-secret"])
youtube_api_key = config["youtube"]["api-key"]

VERBOSE = False
START_YEAR = 2017
END_YEAR = 2021


In [39]:
token_res = req.post(   'https://accounts.spotify.com/api/token', 
                        auth=spotify_auth,
                        headers={
                            'content-type': 'application/x-www-form-urlencoded'
                        },
                        data={'grant_type': 'client_credentials'}
                    )
spotify_token = token_res.json()['access_token']

In [40]:



yt = Api(api_key=youtube_api_key)

res = yt.get_channel_info(channel_name="theneedledrop")
print(res)
tnd_id = res.items[0].id

#note: youtubes api limits to 500 videos per search.


albums = pd.DataFrame(
    {'spotify_id':[],
    'youtube_id':[],
    'project_name':[],
    'artist':[],
    'project_type':[],
    'tracks':[],
    'project_art':[],
    'year':[],
    'rating':[]})

#print(albums.columns)

tracks = pd.DataFrame(
    {'spotify_id':[],
        'album_id':[],
        'youtube_id':[],
        'name':[],
        'duration':[],
        'explicit':[],
        'preview':[],
        'key':[],
        'mode':[],
        'acousticness':[],
        'danceability':[],
        'energy':[],
        'instrumentalness':[],
        'liveness':[],
        'loudness':[],
        'speechiness':[],
        'valence':[],
        'tempo':[]})

for yr in range(START_YEAR,END_YEAR+1):

    print("collecting data for " + str(yr))
    

    published_before_ts = datetime(yr+1,1,1,tzinfo=timezone.utc).isoformat()
    published_after_ts = datetime(yr,1,1,tzinfo=timezone.utc).isoformat()

    review_vids = yt.search(parts='id,snippet',
                            search_type='video',
                            channel_id=tnd_id, 
                            q='REVIEW',
                            count=500,
                            limit=500,
                            published_after=published_after_ts,
                            published_before=published_before_ts)

    bar = Bar("Processing videos", max=len(review_vids.items))
    for item in review_vids.items:

        album = {}

        vid_id = item.id.videoId
        vid = yt.get_video_by_id(video_id=vid_id).items[0]
        description = vid.snippet.description
        vid_title = vid.snippet.title.lower().split(" ")
        if 'review' not in vid_title:
            print("invalid title: " + vid.snippet.title)
            continue
        
        review_index = len(vid_title) - vid_title[::-1].index('review') - 1

        if vid_title[vid_title.index('review') - 1] in ['album','mixtape','ep','redux','complimation','soundtrack','track']:
            project_type = vid_title[review_index - 1]

            type_index = review_index - 1

            project_info = " ".join(vid_title[:type_index]).split("- ")
        else:
            project_info = " ".join(vid_title[:review_index]).split("- ")

        if len(project_info) < 2:
            print("invalid title: " + vid.snippet.title)
            continue
        artist = project_info[0].strip()
        project_name = "-".join(project_info[1::]).strip()
        rating_regex = re.search(r'\d+/10',description)
        if rating_regex:
            rating = int(rating_regex.group(0)[0:1])
        else:
            print("no rating: " + vid.snippet.title)
            continue

        

        # spotify has trouble picking up artists with ampersands in their names as well as 
        # self-titled albums, so fix the query to eliminate those errors
        search_term = ""

        search_term += " ".join(artist.replace("[","").replace("]","").replace(" X "," ").split(" & ")) + " "

        if "self-titled" in project_name:
            search_term += " " + artist
        else:
            search_term += project_name.replace("[","").replace("]","")

        if VERBOSE:
            print("searching for: " + search_term)
        

        search_res = req.get(   'https://api.spotify.com/v1/search',
                            params={
                                'q': search_term,
                                "type": "album",
                                "limit": 1,
                                'include_external': True
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
    
        album_on_spotify =  'albums' in search_res.json() and len(search_res.json()['albums']['items']) is not 0

        if not album_on_spotify:
            print("not on spotify: " + artist + " - " + project_name + " search term: " + search_term)

        if album_on_spotify:
            spotify_album = search_res.json()['albums']['items'][0]
            tracks_amt = spotify_album['total_tracks']
            release_year = int(spotify_album['release_date'][0:4])
            spotify_id = spotify_album['id']
            project_art = spotify_album['images'][-1]['url']
        
        



        
        album["spotify_id"] = spotify_id if album_on_spotify else ""
        album["youtube_id"] = vid_id
        album["project_name"] = project_name
        album["artist"] = artist
        album["project_type"] = project_type
        album["tracks"] = tracks_amt if album_on_spotify else -1
        album["project_art"] = project_art if album_on_spotify else ""
        album["year"] = release_year if album_on_spotify else -1
        album["rating"] = rating

        albums = albums.append(album,ignore_index=True)

        if VERBOSE:
            print("\033[1;30;43martist:\033[0m ", end='')
            print(artist, end=' ')
            print("\033[1;30;43mproject name:\033[0m ", end='')
            print(project_name, end=' ')
            print("\033[1;30;43mproject type:\033[0m ", end='')
            print(project_type, end=' ')


            print("\033[1;30;43mrelease year:\033[0m ", end='')
            print(release_year, end=' ')

            print("\033[1;30;43mrating:\033[0m ", end='')
            print(rating)


    

        if not album_on_spotify: continue

        #get tracks
        search_res = req.get(   'https://api.spotify.com/v1/albums/' + spotify_id + '/tracks',
                            params={
                                "limit": 50
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
        spotify_tracks = search_res.json()['items']
        

        
        for st in spotify_tracks:
            
            spotify_track_id = st['id']
            #get audio features of track
            search_res = req.get(   'https://api.spotify.com/v1/audio-features/' + spotify_track_id,
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
            has_audio_features = search_res is not None and  len(search_res.text) > 0 

            if has_audio_features:
                spotify_analysis = search_res.json()
            else:
                print("spotify audio features query failed: " + project_name)

            track = {}
            track["spotify_id"] = spotify_track_id
            track["album_id"] = spotify_id
            track["youtube_id"] = vid_id
            track["name"] = st['name']
            track["duration"] = st['duration_ms']
            track["explicit"] = st['explicit']
            track["preview"] = st['preview_url']
            track["key"] = spotify_analysis['key'] if has_audio_features and 'key' in spotify_analysis else -1
            track["mode"] = spotify_analysis['mode'] if has_audio_features and 'mode' in spotify_analysis else -1
            track["acousticness"] = spotify_analysis['acousticness'] if has_audio_features and 'acousticness' in spotify_analysis else -1
            track['danceability'] = spotify_analysis['danceability'] if has_audio_features and 'danceability' in spotify_analysis else -1
            track['energy'] = spotify_analysis['energy'] if has_audio_features and 'energy' in spotify_analysis else -1
            track['instrumentalness'] = spotify_analysis['instrumentalness'] if has_audio_features and 'instrumentalness' in spotify_analysis else -1
            track['liveness'] = spotify_analysis['liveness'] if has_audio_features and 'liveness' in spotify_analysis else -1
            track['loudness'] = spotify_analysis['loudness'] if has_audio_features and 'loudness' in spotify_analysis else -1
            track['speechiness'] = spotify_analysis['speechiness'] if has_audio_features and 'speechiness' in spotify_analysis else -1
            track['valence'] = spotify_analysis['valence'] if has_audio_features and 'valence' in spotify_analysis else -1
            track['tempo'] = spotify_analysis['tempo'] if has_audio_features and 'tempo' in spotify_analysis else -1
            

            tracks = tracks.append(track,ignore_index=True)
        
            bar.update()
            if VERBOSE:

                print("\t\033[1;30;43mtracks (" + str(tracks_amt) + "):\033[0m ")
            
                for track in tracks:
                    print("\t" + track[3], end=' ')
                    t_min_str = str(track[4] // 60000)
                    t_sec_str = "0" + str((track[4] // 1000) % 60) if (track[4] // 1000) % 60 < 10 else str((track[4] // 1000) % 60)
                    print("(" + t_min_str + ":" + t_sec_str + ")", end=' ')
                    
                    track_key = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"][track[7]] if track[7] != -1 else "unknown"
                    print("\033[1;30;43mkey:\033[0m ",end='')
                    print(track_key, end=' ')

                    track_mode = "major" if track[8] == 1 else "minor"
                    print("\033[1;30;43mmode:\033[0m ",end='')
                    print(track_mode)
            
    bar.finish()
    #save to tracks.csv
    print(albums)
    print(tracks)
    tracks.to_csv('./data/tracks' + str(yr) + '.csv')
    albums.to_csv('./data/albums' + str(yr) + '.csv')





ChannelListResponse(kind='youtube#channelListResponse')
collecting data for 2017
invalid title: Anthony Fantano HUMAN BEING REVIEW (1 Million Subscriber Special)
no rating: Pink Guy - Pink Season ALBUM REVIEW
no rating: Kendrick Lamar - HUMBLE. TRACK REVIEW
no rating: Captain Beefheart - Trout Mask Replica ALBUM REVIEW
no rating: Joji - In Tongues EP REVIEW
no rating: Gorillaz - Humanz ALBUM REVIEW
no rating: Playboi Carti - Self-Titled MIXTAPE REVIEW
no rating: B.o.B - Ether ALBUM REVIEW
no rating: Talking Heads - Remain In Light ALBUM REVIEW
no rating: Outkast - ATLiens ALBUM REVIEW
not on spotify: neil cicierega - mouth moods search term: neil cicierega mouth moods
no rating: Wale - Shine ALBUM REVIEW
no rating: Emperor - In the Nightside Eclipse ALBUM REVIEW
no rating: Calvin Harris - Funk Wav Bounces Vol. 1 ALBUM REVIEW
no rating: Gucci Mane & Metro Boomin - Drop Top Wop MIXTAPE REVIEW
invalid title: Kanye West: Worst To Best
no rating: Bleachers - Gone Now ALBUM REVIEW
no rating:

In [None]:
#strategy for scraping data:
#The Needle Drop website has list of reviews which includes tags for rating, as well as genre & artist
#cycle through reviews to get features


#alternatively, cycle through youtube videos with REVIEW keyword and get artist, genre, & rating that way

In [42]:
#recombine the csvs into a dataframe 


path = './data' # use your path
track_files = glob.glob(path + "/tracks20*.csv")
album_files = glob.glob(path + "/albums20*.csv")

li = []

for filename in album_files:
    df = pd.read_csv(filename, index_col=0, header=0, dtype=dict(albums.dtypes))
    li.append(df)

album_df = pd.concat(li, axis=0, ignore_index=True)

for filename in track_files:
    df = pd.read_csv(filename, index_col=0, header=0,dtype=dict(tracks.dtypes))
    li.append(df)

track_df = pd.concat(li, axis=0, ignore_index=True)

album_df.to_csv('./data/albums.csv')
track_df.to_csv('./data/tracks.csv')
print(album_df)
print(track_df)


                  spotify_id   youtube_id                       project_name  \
0     20r762YmB5HeofjMCiPMLv  Jo4S2qlQGs0  my beautiful dark twisted fantasy   
1     4hnqM0JK4CM1phwfq1Ldyz  3Hbhb5CGEB0                  this is happening   
2                        NaN  _2RPGAA6p4E                            bastard   
3     70TgSPZ7rkVPH3KnJlgCYS  qMsnKVtohy0                      public strain   
4     5xFZ4iElFbUFtOGX4lvdTM  xBxvMXOjHM4                come around sundown   
...                      ...          ...                                ...   
7260  3vRrTXRKeqF9eotYWzpOlZ  T2KxQVMzLUM        i've seen all i need to see   
7261                     NaN  CWqSCn0SG10   imperative imperceptible impulse   
7262  6luveNSKdzgv80ZqQYsIgA  8L9s6emx0Rc                we are always alone   
7263  3hufhPvd2yMXONUeQjyrT3  xBKHJshpMuE                            charmed   
7264  5lVgjLzgPHy8wuFYJOxKWA  VvCRyCMCJpE                          bob's son   

                         artist project

In [67]:
from keras import Input, Model
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [70]:


ratings = to_categorical(album_df["rating"])

album_no_rating = album_df.drop(columns="rating")

album_input = Input(shape=(len(album_df.columns) - 1,))
model = Dense(64,activation='relu')(album_input)
model = Dense(256,activation='relu')(model)
model = Dense(256,activation='relu')(model)
output = Dense(10,activation='softmax')(model)


model = Model(inputs=album_input,outputs=output)
model.compile()

model.fit(x=album_no_rating,y=ratings)
#rough design:
#features:
#artist:
    #transfer to artist id?
#genre
    #encode
#album length (songs & total runtime)
    #tracks scaled to 0-1
#spotify metrics
    #
#mix quality (not sure how to measure)
#number of previous releases
#country of origin


#output: eleven units for rating 0-10


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).