In [2]:
#get dependencies
import numpy as np
from pyyoutube import Api
import requests as req
import re
import pandas as pd
import glob
from datetime import datetime, timezone
import logging as log
from progress.bar import Bar
import yaml
from lyricsgenius import Genius

with open("config.yaml") as config_file:
    config = yaml.load(config_file.read())

VERBOSE = False
START_YEAR = 2017
END_YEAR = 2021


  config = yaml.load(config_file.read())


In [3]:
# authenticate for spotify
spotify_auth = (config["spotify"]["client-id"],config["spotify"]["client-secret"])
token_res = req.post(   'https://accounts.spotify.com/api/token', 
                        auth=spotify_auth,
                        headers={
                            'content-type': 'application/x-www-form-urlencoded'
                        },
                        data={'grant_type': 'client_credentials'}
                    )
spotify_token = token_res.json()['access_token']

# authenticate for youtube 
youtube_api_key = config["youtube"]["api-key"]
yt = Api(api_key=youtube_api_key)

genius_access_token = config["genius"]["access-token"]
genius = Genius(genius_access_token)



In [4]:
albums = pd.DataFrame(
    {'spotify_id':[],
    'youtube_id':[],
    'project_name':[],
    'artist':[],
    'project_type':[],
    'tracks':[],
    'project_art':[],
    'year':[],
    'rating':[]})

#print(albums.columns)

tracks = pd.DataFrame(
    {'spotify_id':[],
        'album_id':[],
        'youtube_id':[],
        'name':[],
        'duration':[],
        'explicit':[],
        'preview':[],
        'key':[],
        'mode':[],
        'acousticness':[],
        'danceability':[],
        'energy':[],
        'instrumentalness':[],
        'liveness':[],
        'loudness':[],
        'speechiness':[],
        'valence':[],
        'tempo':[]})

In [6]:





res = yt.get_channel_info(channel_name="theneedledrop")
print(res)
tnd_id = res.items[0].id

#note: youtubes api limits to 500 videos per search.



for yr in range(START_YEAR,END_YEAR+1):

    print("collecting data for " + str(yr))
    

    published_before_ts = datetime(yr+1,1,1,tzinfo=timezone.utc).isoformat()
    published_after_ts = datetime(yr,1,1,tzinfo=timezone.utc).isoformat()

    review_vids = yt.search(parts='id,snippet',
                            search_type='video',
                            channel_id=tnd_id, 
                            q='REVIEW',
                            count=500,
                            limit=500,
                            published_after=published_after_ts,
                            published_before=published_before_ts)

    bar = Bar("Processing videos", max=len(review_vids.items))
    for item in review_vids.items:

        album = {}

        vid_id = item.id.videoId
        vid = yt.get_video_by_id(video_id=vid_id).items[0]
        description = vid.snippet.description
        vid_title = vid.snippet.title.lower().split(" ")
        if 'review' not in vid_title:
            print("invalid title: " + vid.snippet.title)
            continue
        
        review_index = len(vid_title) - vid_title[::-1].index('review') - 1

        if vid_title[vid_title.index('review') - 1] in ['album','mixtape','ep','redux','complimation','soundtrack','track']:
            project_type = vid_title[review_index - 1]

            type_index = review_index - 1

            project_info = " ".join(vid_title[:type_index]).split("- ")
        else:
            project_info = " ".join(vid_title[:review_index]).split("- ")

        if len(project_info) < 2:
            print("invalid title: " + vid.snippet.title)
            continue
        artist = project_info[0].strip()
        project_name = "-".join(project_info[1::]).strip()
        rating_regex = re.search(r'\d+/10',description)
        if rating_regex:
            rating = int(rating_regex.group(0)[0:1])
        else:
            print("no rating: " + vid.snippet.title)
            continue

        

        # spotify has trouble picking up artists with ampersands in their names as well as 
        # self-titled albums, so fix the query to eliminate those errors
        search_term = ""

        search_term += " ".join(artist.replace("[","").replace("]","").replace(" X "," ").split(" & ")) + " "

        if "self-titled" in project_name:
            search_term += " " + artist
        else:
            search_term += project_name.replace("[","").replace("]","")

        if VERBOSE:
            print("searching for: " + search_term)
        

        search_res = req.get(   'https://api.spotify.com/v1/search',
                            params={
                                'q': search_term,
                                "type": "album",
                                "limit": 1,
                                'include_external': True
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
    
        album_on_spotify =  'albums' in search_res.json() and len(search_res.json()['albums']['items']) != 0

        if not album_on_spotify:
            print("not on spotify: " + artist + " - " + project_name + " search term: " + search_term)

        if album_on_spotify:
            spotify_album = search_res.json()['albums']['items'][0]
            tracks_amt = spotify_album['total_tracks']
            release_year = int(spotify_album['release_date'][0:4])
            spotify_id = spotify_album['id']
            project_art = spotify_album['images'][-1]['url']
        
        



        
        album["spotify_id"] = spotify_id if album_on_spotify else ""
        album["youtube_id"] = vid_id
        album["project_name"] = project_name
        album["artist"] = artist
        album["project_type"] = project_type
        album["tracks"] = tracks_amt if album_on_spotify else -1
        album["project_art"] = project_art if album_on_spotify else ""
        album["year"] = release_year if album_on_spotify else -1
        album["rating"] = rating

        albums = albums.append(album,ignore_index=True)

        if VERBOSE:
            print("\033[1;30;43martist:\033[0m ", end='')
            print(artist, end=' ')
            print("\033[1;30;43mproject name:\033[0m ", end='')
            print(project_name, end=' ')
            print("\033[1;30;43mproject type:\033[0m ", end='')
            print(project_type, end=' ')


            print("\033[1;30;43mrelease year:\033[0m ", end='')
            print(release_year, end=' ')

            print("\033[1;30;43mrating:\033[0m ", end='')
            print(rating)


    

        if not album_on_spotify: continue

        #get tracks
        search_res = req.get(   'https://api.spotify.com/v1/albums/' + spotify_id + '/tracks',
                            params={
                                "limit": 50
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
        spotify_tracks = search_res.json()['items']
        

        
        for st in spotify_tracks:
            
            spotify_track_id = st['id']
            #get audio features of track
            search_res = req.get(   'https://api.spotify.com/v1/audio-features/' + spotify_track_id,
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
            has_audio_features = search_res != None and len(search_res.text) > 0 

            if has_audio_features:
                spotify_analysis = search_res.json()
            else:
                print("spotify audio features query failed: " + project_name)

            track = {}
            track["spotify_id"] = spotify_track_id
            track["album_id"] = spotify_id
            track["youtube_id"] = vid_id
            track["name"] = st['name']
            track["duration"] = st['duration_ms']
            track["explicit"] = st['explicit']
            track["preview"] = st['preview_url']
            track["key"] = spotify_analysis['key'] if has_audio_features and 'key' in spotify_analysis else -1
            track["mode"] = spotify_analysis['mode'] if has_audio_features and 'mode' in spotify_analysis else -1
            track["acousticness"] = spotify_analysis['acousticness'] if has_audio_features and 'acousticness' in spotify_analysis else -1
            track['danceability'] = spotify_analysis['danceability'] if has_audio_features and 'danceability' in spotify_analysis else -1
            track['energy'] = spotify_analysis['energy'] if has_audio_features and 'energy' in spotify_analysis else -1
            track['instrumentalness'] = spotify_analysis['instrumentalness'] if has_audio_features and 'instrumentalness' in spotify_analysis else -1
            track['liveness'] = spotify_analysis['liveness'] if has_audio_features and 'liveness' in spotify_analysis else -1
            track['loudness'] = spotify_analysis['loudness'] if has_audio_features and 'loudness' in spotify_analysis else -1
            track['speechiness'] = spotify_analysis['speechiness'] if has_audio_features and 'speechiness' in spotify_analysis else -1
            track['valence'] = spotify_analysis['valence'] if has_audio_features and 'valence' in spotify_analysis else -1
            track['tempo'] = spotify_analysis['tempo'] if has_audio_features and 'tempo' in spotify_analysis else -1
            

            tracks = tracks.append(track,ignore_index=True)
        
            bar.update()
            if VERBOSE:

                print("\t\033[1;30;43mtracks (" + str(tracks_amt) + "):\033[0m ")
            
                for track in tracks:
                    print("\t" + track[3], end=' ')
                    t_min_str = str(track[4] // 60000)
                    t_sec_str = "0" + str((track[4] // 1000) % 60) if (track[4] // 1000) % 60 < 10 else str((track[4] // 1000) % 60)
                    print("(" + t_min_str + ":" + t_sec_str + ")", end=' ')
                    
                    track_key = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"][track[7]] if track[7] != -1 else "unknown"
                    print("\033[1;30;43mkey:\033[0m ",end='')
                    print(track_key, end=' ')

                    track_mode = "major" if track[8] == 1 else "minor"
                    print("\033[1;30;43mmode:\033[0m ",end='')
                    print(track_mode)
            
    bar.finish()
    #save to tracks.csv
    print(albums)
    print(tracks)
    tracks.to_csv('./data/tracks' + str(yr) + '.csv')
    albums.to_csv('./data/albums' + str(yr) + '.csv')





ChannelListResponse(kind='youtube#channelListResponse')


TypeError: Invalid token

In [None]:
#strategy for scraping data:
#The Needle Drop website has list of reviews which includes tags for rating, as well as genre & artist
#cycle through reviews to get features


#alternatively, cycle through youtube videos with REVIEW keyword and get artist, genre, & rating that way

In [9]:
#recombine the csvs into a dataframe 


path = './data' # use your path
track_files = glob.glob(path + "/tracks20*.csv")
album_files = glob.glob(path + "/albums20*.csv")

li = []

for filename in album_files:
    df = pd.read_csv(filename, index_col=0, header=0, dtype=dict(albums.dtypes))
    li.append(df)

album_df = pd.concat(li, axis=0, ignore_index=True)

for filename in track_files:
    df = pd.read_csv(filename, index_col=0, header=0,dtype=dict(tracks.dtypes))
    li.append(df)

track_df = pd.concat(li, axis=0, ignore_index=True)

album_df.to_csv('./data/albums.csv')

track_df.to_csv('./data/tracks.csv')


album_df = album_df.sample(frac=1).reset_index(drop=True)
track_df = track_df.sample(frac=1).reset_index(drop=True)
print(album_df)
print(track_df)


                  spotify_id   youtube_id  \
0     42oQv8fbI5CbeuAasf2ILC  osgjyexGTEo   
1                        NaN  dfzAuvtWVYk   
2     36nDwdvBHoJMIZn0wzYWrD  quu1-xHf4EM   
3     5lnQLEUiVDkLbFJHXHQu9m  IfnHn2hWWzA   
4     6Iu8toVsvCc3I4INxYiVIy  Gc0Jpf_25a4   
...                      ...          ...   
7260  5GOo2Xk33SMeP05fZZirfz  kbmrV7CIa28   
7261  3MiiF9utmtGnLVITgl0JP7  v7F2qoZkNIo   
7262  3vnTHiTvlOcNFg2wCK0Uyl  VCGwCQDMEi8   
7263  3LHOep5q4yGo96ui8fpSST  zxs4h0ABJMI   
7264  2XhQwji1ixgjca0XzkiTek  hCy3_Fosq2g   

                                project_name          artist project_type  \
0                         my name is my name         pusha t        album   
1     angels of darkness, demons of light ii           earth        album   
2                 last year was weird vol. 2     tkay maidza           ep   
3                              wasting light    foo fighters        album   
4                            ravedeath, 1972      tim hecker        album 

In [8]:
from keras import Input, Model
from keras.models import Sequential
from keras.layers import Dense, Concatenate
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from math import floor
from random import sample


In [10]:




#set up encoders
artists = album_df["artist"].to_numpy().reshape(-1,1)
artist_encoder = OrdinalEncoder().fit(artists)

project_types = album_df["project_type"].to_numpy().reshape(-1,1)
project_type_encoder = OneHotEncoder().fit(project_types)

def create_tracks_input(row):
    out = []

    #get all tracks for a single album
    tracks = track_df[track_df['album_id'] == row['spotify_id']]
    tracks_count = len(tracks)
    for i, track in tracks.iterrows():
        print(f"Processing track: {i}/{tracks_count}")
        out.append(track[['key':'tempo']])
    return out


def create_input_row(row):
    out = []
    #out.append(row.name) # need to have a row id included to relate to tracks
    out.append(artist_encoder.transform([[row['artist']]])[0][0])
    project_type_encoded = project_type_encoder.transform([[row['project_type']]]).toarray()
    out.extend(project_type_encoded[0])
    out.append(row['year'])
    out.append(int(row['tracks']))
    return out

ratings = to_categorical(album_df["rating"])

album_in = album_df.apply(create_input_row,axis=1,result_type='expand')
album_in = MinMaxScaler().fit_transform(album_in)

track_in = album_df.apply(create_tracks_input,axis=1,result_type='expand')
track_in = MinMaxScaler().fit_transform(track_in)

test_ratings = ratings[:test_amt]
test_in = album_in[:test_amt]

print(len(album_in))
print(len(track_in))



SyntaxError: invalid syntax (<ipython-input-10-c25f2f31f3bb>, line 16)

In [15]:
test_split = 0.2
validation_split = 0.4
test_amt = floor(test_split * len(album_df))

print(album_in)
print(track_in)

track_input = Input(shape=(len(track_in[0])*album_df["tracks"],))


album_input = Input(shape=(len(album_in[0]),))
model = Dense(256,activation='relu')(album_input)
model = Dense(256,activation='relu')(model)
output = Dense(len(ratings[0]),activation='softmax')(model)


model = Model(inputs=album_input,outputs=output)
model.summary()
opt = Adam(learning_rate=0.000001)
model.compile(optimizer=opt,loss='categorical_crossentropy',metrics=['accuracy'])

model.fit(album_in,ratings,batch_size=32, epochs=25, validation_split=validation_split,shuffle=True,verbose=1)



#evaluate

model.evaluate(test_in,test_ratings)


#output: eleven units for rating 0-10


[[0.39386973 1.         0.         ... 0.         0.99554896 0.19607843]
 [0.85977011 1.         0.         ... 0.         0.99455984 0.25490196]
 [0.23831418 1.         0.         ... 0.         0.99455984 0.17647059]
 ...
 [0.89425287 1.         0.         ... 0.         0.9950544  0.25490196]
 [0.62605364 1.         0.         ... 0.         0.99851632 0.39215686]
 [0.45977011 1.         0.         ... 0.         0.99604352 0.21568627]]


NameError: name 'track_in' is not defined