In [8]:
#get dependencies
import numpy as np
from pyyoutube import Api
import requests as req
import re
import pandas as pd
import glob
from datetime import datetime, timezone
import logging as log
from progress.bar import Bar
import yaml
from lyricsgenius import Genius

with open("config.yaml") as config_file:
    config = yaml.load(config_file.read())

VERBOSE = False
START_YEAR = 2014
END_YEAR = 2016

  config = yaml.load(config_file.read())


In [9]:
# authenticate for spotify
spotify_auth = (config["spotify"]["client-id"],config["spotify"]["client-secret"])
token_res = req.post(   'https://accounts.spotify.com/api/token', 
                        auth=spotify_auth,
                        headers={
                            'content-type': 'application/x-www-form-urlencoded'
                        },
                        data={'grant_type': 'client_credentials'}
                    )
spotify_token = token_res.json()['access_token']

# authenticate for youtube 
youtube_api_key = config["youtube"]["api-key"]
yt = Api(api_key=youtube_api_key)

genius_access_token = config["genius"]["access-token"]
genius = Genius(genius_access_token)



In [10]:





res = yt.get_channel_info(channel_name="theneedledrop")
print(res)
tnd_id = res.items[0].id

#note: youtubes api limits to 500 videos per search.



for yr in range(START_YEAR,END_YEAR+1):

    print("collecting data for " + str(yr))
    albums = pd.DataFrame(
    {'spotify_id':[],
    'youtube_id':[],
    'project_name':[],
    'artist':[],
    'project_type':[],
    'tracks':[],
    'project_art':[],
    'year':[],
    'rating':[]})


    tracks = pd.DataFrame(
        {'spotify_id':[],
            'album_id':[],
            'youtube_id':[],
            'name':[],
            'duration':[],
            'explicit':[],
            'preview':[],
            'key':[],
            'mode':[],
            'acousticness':[],
            'danceability':[],
            'energy':[],
            'instrumentalness':[],
            'liveness':[],
            'loudness':[],
            'speechiness':[],
            'valence':[],
            'tempo':[]})

    published_before_ts = datetime(yr+1,1,1,tzinfo=timezone.utc).isoformat()
    published_after_ts = datetime(yr,1,1,tzinfo=timezone.utc).isoformat()

    review_vids = yt.search(parts='id,snippet',
                            search_type='video',
                            channel_id=tnd_id, 
                            q='REVIEW',
                            count=500,
                            limit=500,
                            published_after=published_after_ts,
                            published_before=published_before_ts)

    bar = Bar("Processing videos", max=len(review_vids.items))
    for item in review_vids.items:

        album = {}

        vid_id = item.id.videoId
        vid = yt.get_video_by_id(video_id=vid_id).items[0]
        description = vid.snippet.description
        vid_title = vid.snippet.title.lower().split(" ")
        if 'review' not in vid_title:
            print("invalid title: " + vid.snippet.title)
            continue
        
        review_index = len(vid_title) - vid_title[::-1].index('review') - 1

        if vid_title[vid_title.index('review') - 1] in ['album','mixtape','ep','redux','complimation','soundtrack','track']:
            project_type = vid_title[review_index - 1]

            type_index = review_index - 1

            project_info = " ".join(vid_title[:type_index]).split("- ")
        else:
            project_info = " ".join(vid_title[:review_index]).split("- ")

        if len(project_info) < 2:
            print("invalid title: " + vid.snippet.title)
            continue
        artist = project_info[0].strip()
        project_name = "-".join(project_info[1::]).strip()
        rating_regex = re.search(r'\d+/10',description)
        if rating_regex:
            rating = int(rating_regex.group(0)[0:1])
        else:
            print("no rating: " + vid.snippet.title)
            continue

        

        # spotify has trouble picking up artists with ampersands in their names as well as 
        # self-titled albums, so fix the query to eliminate those errors
        search_term = ""

        search_term += " ".join(artist.replace("[","").replace("]","").replace(" X "," ").split(" & ")) + " "

        if "self-titled" in project_name:
            search_term += " " + artist
        else:
            search_term += project_name.replace("[","").replace("]","")

        if VERBOSE:
            print("searching for: " + search_term)
        

        search_res = req.get(   'https://api.spotify.com/v1/search',
                            params={
                                'q': search_term,
                                "type": "album",
                                "limit": 1,
                                'include_external': True
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
    
        album_on_spotify =  'albums' in search_res.json() and len(search_res.json()['albums']['items']) != 0

        if not album_on_spotify:
            print("not on spotify: " + artist + " - " + project_name + " search term: " + search_term)

        if album_on_spotify:
            spotify_album = search_res.json()['albums']['items'][0]
            tracks_amt = spotify_album['total_tracks']
            release_year = int(spotify_album['release_date'][0:4])
            spotify_id = spotify_album['id']
            if spotify_album['images'] and len(spotify_album['images']) > 0:
                project_art = spotify_album['images'][-1]['url']
        
        



        
        album["spotify_id"] = spotify_id if album_on_spotify else ""
        album["youtube_id"] = vid_id
        album["project_name"] = project_name
        album["artist"] = artist
        album["project_type"] = project_type
        album["tracks"] = tracks_amt if album_on_spotify else -1
        album["project_art"] = project_art if album_on_spotify else ""
        album["year"] = release_year if album_on_spotify else -1
        album["rating"] = rating

        albums = albums.append(album,ignore_index=True)

        if VERBOSE:
            print("\033[1;30;43martist:\033[0m ", end='')
            print(artist, end=' ')
            print("\033[1;30;43mproject name:\033[0m ", end='')
            print(project_name, end=' ')
            print("\033[1;30;43mproject type:\033[0m ", end='')
            print(project_type, end=' ')


            print("\033[1;30;43mrelease year:\033[0m ", end='')
            print(release_year, end=' ')

            print("\033[1;30;43mrating:\033[0m ", end='')
            print(rating)


    

        if not album_on_spotify: continue

        #get tracks
        search_res = req.get(   'https://api.spotify.com/v1/albums/' + spotify_id + '/tracks',
                            params={
                                "limit": 50
                            },
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
        spotify_tracks = search_res.json()['items']
        

        
        for st in spotify_tracks:
            
            spotify_track_id = st['id']
            #get audio features of track
            search_res = req.get(   'https://api.spotify.com/v1/audio-features/' + spotify_track_id,
                            headers={
                                'authorization': "Bearer " + spotify_token
                            }
                        )
            has_audio_features = search_res != None and len(search_res.text) > 0 

            if has_audio_features:
                spotify_analysis = search_res.json()
            else:
                print("spotify audio features query failed: " + project_name)

            track = {}
            track["spotify_id"] = spotify_track_id
            track["album_id"] = spotify_id
            track["youtube_id"] = vid_id
            track["name"] = st['name']
            track["duration"] = st['duration_ms']
            track["explicit"] = st['explicit']
            track["preview"] = st['preview_url']
            track["key"] = spotify_analysis['key'] if has_audio_features and 'key' in spotify_analysis else -1
            track["mode"] = spotify_analysis['mode'] if has_audio_features and 'mode' in spotify_analysis else -1
            track["acousticness"] = spotify_analysis['acousticness'] if has_audio_features and 'acousticness' in spotify_analysis else -1
            track['danceability'] = spotify_analysis['danceability'] if has_audio_features and 'danceability' in spotify_analysis else -1
            track['energy'] = spotify_analysis['energy'] if has_audio_features and 'energy' in spotify_analysis else -1
            track['instrumentalness'] = spotify_analysis['instrumentalness'] if has_audio_features and 'instrumentalness' in spotify_analysis else -1
            track['liveness'] = spotify_analysis['liveness'] if has_audio_features and 'liveness' in spotify_analysis else -1
            track['loudness'] = spotify_analysis['loudness'] if has_audio_features and 'loudness' in spotify_analysis else -1
            track['speechiness'] = spotify_analysis['speechiness'] if has_audio_features and 'speechiness' in spotify_analysis else -1
            track['valence'] = spotify_analysis['valence'] if has_audio_features and 'valence' in spotify_analysis else -1
            track['tempo'] = spotify_analysis['tempo'] if has_audio_features and 'tempo' in spotify_analysis else -1
            

            tracks = tracks.append(track,ignore_index=True)
            
        
            bar.update()
            if VERBOSE:
                    
                print("\t" + track['name'], end=' ')
                t_min_str = str(track['duration'] // 60000)
                t_sec_str = "0" + str((track['duration'] // 1000) % 60) if (track['duration'] // 1000) % 60 < 10 else str((track['duration'] // 1000) % 60)
                print("(" + t_min_str + ":" + t_sec_str + ")", end=' ')
                
                track_key = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"][track['key']] if track['key'] != -1 else "unknown"
                print("\033[1;30;43mkey:\033[0m ",end='')
                print(track_key, end=' ')

                track_mode = "major" if track['mode'] == 1 else "minor"
                print("\033[1;30;43mmode:\033[0m ",end='')
                print(track_mode)
            
    bar.finish()
    #save to tracks.csv
    print(albums)
    print(tracks)
    tracks.to_csv('./data/tracks' + str(yr) + '.csv')
    albums.to_csv('./data/albums' + str(yr) + '.csv')





ChannelListResponse(kind='youtube#channelListResponse')
collecting data for 2011
no rating: Lil B- I'm Gay (I'm Happy) ALBUM REVIEW
no rating: Wu-Tang Clan- Enter the Wu-Tang: 36 Chambers ALBUM REVIEW
invalid title: Marvin Gaye- What's Going On
no rating: Frank Zappa- Hot Rats ALBUM REVIEW
no rating: Limp Bizkit- Gold Cobra ALBUM REVIEW
no rating: Godspeed You! Black Emperor- Lift Yr. Skinny Fists Like Antennas to Heaven! ALBUM REVIEW
no rating: Lady Gaga- Born This Way ALBUM REVIEW
no rating: The Drums- Portamento ALBUM REVIEW
no rating: 9th Wonder- The Wonder Years ALBUM REVIEW
no rating: The Roots- Undun ALBUM REVIEW
invalid title: Do messages matter?
no rating: Deerhoof- Deerhoof vs Evil ALBUM REVIEW
invalid title: YOU'RE NOT REVIEWING: Blink-182, The Game, Foster the People, and More
invalid title: Boris- Heavy Rocks / Attention Please ALBUM REVIEWS
invalid title: Top-10 Favorite Misfits Songs
invalid title: 4 Reviews: The Weeknd / Gotye / Blood Orange / All Pigs Must Die
invalid 



no rating: Madvillain- Madvillainy ALBUM REVIEW
no rating: Neutral Milk Hotel - In the Aeroplane Over the Sea ALBUM REVIEW
no rating: Actress- R.I.P. ALBUM REVIEW
no rating: Gorillaz- "DoYaThing" TRACK REVIEW
invalid title: Earl Sweatshirt, Ghost, A$AP Rocky, Starkey T.I., Wiz Khalifa, Yo La Tengo, and more w/the Quickness
invalid title: Y U NO REVIEW: JUNE '12
no rating: Various Artists- Cruel Summer COMPILATION REVIEW
invalid title: Y U NO REVIEW: JUNE '12
invalid title: Y U NO REVIEW: August '12
no rating: Mac Miller- Macadelic ALBUM REVIEW
no rating: Lil B- California Boy RARE TRACK REVIEW
invalid title: Top-10 Rage Against the Machine Songs
no rating: Muse- "The 2nd Law: Unsustainable" TRACK REVIEW
invalid title: Big Boi- Vicious Lies and Dangerous Rumors
invalid title: Y U NO REVIEW: MARCH '12
invalid title: Do we respect the past? (Tupac Hologram Reaction)
no rating: Animal Collective- Today's Supernatural TRACK REVIEW
invalid title: Ty Segall Band- Slaughterhouse / Ty Segall & 



no rating: Eminem - "Rap God" TRACK REVIEW ("Berzerk," too!)
no rating: My Bloody Valentine- Loveless ALBUM REVIEW
no rating: Miles Davis- Bitches Brew ALBUM REVIEW
no rating: King Crimson- In the Court of the Crimson King ALBUM REVIEW
no rating: Johnny Cash - At Folsom Prison ALBUM REVIEW
no rating: The Velvet Underground & Nico - Self-Titled ALBUM REVIEW
no rating: Mazzy Star - Seasons Of Your Day ALBUM REVIEW (QUICK)
no rating: Arca - &&&&& MIX REVIEW
no rating: Yung Lean - Unknown Death 2002 MIXTAPE REVIEW
no rating: Lorde - Pure Heroine ALBUM REVIEW (QUICK)
invalid title: Farewell, Lou Reed. (Re:flection #9)
not on spotify: mr. muthafuckin' exquire - kismet search term: mr. muthafuckin' exquire kismet
invalid title: The Strokes, Matmos, Biffy Clyro, Joe Budden, Mister Lies, Sannhet, Inc., and more w/the Quickness
no rating: Dead Kennedys- Plastic Surgery Disasters ALBUM REVIEW
no rating: Black Sabbath - 13 ALBUM REVIEW
no rating: Big Sean - "Control" ft. Kendrick Lamar, Jay Electr

In [10]:
#recombine the csvs into a dataframe 
RECOMBINE_FILES = True

path = './data' # use your path
track_files = glob.glob(path + "/tracks20*.csv")
album_files = glob.glob(path + "/albums20*.csv")

album_li = []
track_li = []
if RECOMBINE_FILES:
    for filename in album_files:
        df = pd.read_csv(filename, index_col=0, header=0, dtype=dict(albums.dtypes))
        album_li.append(df)

    album_df = pd.concat(album_li, axis=0, ignore_index=True)


    for filename in track_files:
        df = pd.read_csv(filename, index_col=0, header=0,dtype=dict(tracks.dtypes))
        track_li.append(df)

    track_df = pd.concat(track_li, axis=0, ignore_index=True)

    album_df.to_csv('./data/albums.csv')

    track_df.to_csv('./data/tracks.csv')
else:
    album_df = pd.read_csv('./data/albums.csv')
    track_df = pd.read_csv('./data/tracks.csv')

album_df = album_df.sample(frac=1).reset_index(drop=True)
track_df = track_df.sample(frac=1).reset_index(drop=True)
print(album_df)
print(track_df)


                  spotify_id   youtube_id             project_name  \
0     5fwYiohuGFqJx34Z4s26jI  42RML92tDyQ             life is good   
1     2PPvDD3t985MvMphfSwzgr  sVKjoQMZCfU              self-titled   
2     5RA5hhLlbw31QfQRX11pwo  t70fklNoFBI             innerspeaker   
3     2ZjlfQheUJ8Q4MxoPruYLB  iE5p3GCzoxI              self-titled   
4                        NaN  lMMpw5VCywc     instrumental tourist   
...                      ...          ...                      ...   
6973  7uDfs1lmoDlKJTGjLaipEg  p6T46TrMB80             gumboot soup   
6974  5JXnMCR2UcTUBeKbgwNxIr  FGaO0divFf0                        v   
6975  2xsmS7sSEtbTYv73akrN6i  ftt175Djodk      tomorrow, in a year   
6976  2dMmcXlG8xtRJNlsjIrPWe  3CrD91zHkLg                   virtue   
6977  3izETQ8Engz8v6Zkg0aOyX  -jvtyvMWE88  when fish ride bicycles   

                                artist project_type  tracks  \
0                                  nas        album    18.0   
1                             cza