In [42]:
# --- 2020-01 CAU
# --- Natural Language Processing and Information Retrieval
# --- Sentiment Analysis Team Project
# --- 
# --- 

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urlparse, parse_qs
import xlsxwriter
import time
import pandas as pd

# webdriver dir
browser = webdriver.Chrome('./chromedriver.exe') # watch out the version

In [105]:
# --- Saving results as .xlsx file
def savexlsx(xlsxfilename, array):
    with xlsxwriter.Workbook(xlsxfilename) as workbook:
        worksheet = workbook.add_worksheet()

        # Add a bold format to use to highlight cells
        bold = workbook.add_format({'bold': True})

        for row_num, data in enumerate(array):
            if row_num == 0:
                worksheet.write('A1','title',bold)
                worksheet.write('B1','artist',bold)
                worksheet.write('C1','lyric',bold)
                continue
            worksheet.write_row(row_num, 0, data)
    workbook.close()

In [111]:
for year in range(2006, 2020):
    print("\n=====", year)
    
    # -------------------------------------------------------------------------------------
    # ----- collecting top 100 songs over years ----- 
    chart = []
    songs = 100
    
    chart_url = "https://www.billboard.com/charts/year-end/"+str(year)+"/hot-100-songs"
    browser.get(chart_url)
    
    for rank in range(1, 101): # html code starts with 1 not 0 so,
        try:
            title = browser.find_element_by_xpath("(//div[@class='ye-chart-item__title'])["+str(rank)+"]").text
            artist = browser.find_element_by_xpath("(//div[@class='ye-chart-item__artist'])["+str(rank)+"]").text
            #print("[{}] {} - {}".format(rank, title, artist))
            chart.append([title, artist, ''])
        except:
            print("... song #", rank, " has collecting problem")
            songs -= 1   # sometimes, billboard website doesn't show exactly 100 songs
                         # ex. 2011, there's no #7 song ^^;, 2016 #87
    # -------------------------------------------------------------------------------------
    
    
    # -------------------------------------------------------------------------------------
    # ----------- Search lyrics for songs ----------- 
    ### Be careful! You could be BLOCKED by the website ###
    for rank in range(0, songs):
        org_title = chart[rank][0].replace('/', ' ') # error_handling (ex.cupid's chokehead/breakfast in america)
        org_artist = chart[rank][1].replace(' Featuring','') # remove featuring artists _error handling
        org_artist = org_artist.split(' (Featuring',1)[0]
        org_artist = org_artist.split(' Or ',1)[0]
        org_artist = org_artist.replace(' Duet With','').replace(' With', '').replace(' X','').replace(' x ',' ')
        org_artist = org_artist.replace('*',' ').replace('#','').replace('/ ','') # these can cause query problem
        print(rank, org_title, org_artist)
        
        lyrics_url = "https://www.musixmatch.com/search/"+org_title+' '+org_artist 
        browser.get(lyrics_url)
        time.sleep(2) # wait while loading
        
        # Need to check which searched result is matched to original info over webpage but we passed this process
        # because [Best results] song is nearly perfectly same
        """
        title = browser.find_element_by_xpath("(//h2[@class='media-card-title'])[1]")
        artist = browser.find_element_by_xpath("(//h3[@class='media-card-subtitle'])[1]").text.split('feat',1)[0]
        #print(org_title,'-',org_artist, '/', title.text,'-',artist)
        if title.text == org_title and artist == org_artist:
            title.click()
            # ...
        """
        
        try:
            browser.find_element_by_xpath("//a[@class='title']").click() # click Best result
            time.sleep(1)

            # -------------------------------------------------------------------------------------
            # Sometimes, there're some lyrics which are judged as errored or warning by the website
            # But we ignored this notice and collect them no matter what state they are
            # -------------------------------------------------------------------------------------
            # ___case1. lyrics state: OK
            try: 
                lyric = browser.find_element_by_xpath("(//span[@class='lyrics__content__ok'])[1]").text
                # if the content is divided, concatenate them
                try: lyric += browser.find_element_by_xpath("(//span[@class='lyrics__content__ok'])[2]").text
                except: continue
            except:
            # ___case2. lyrics state: Error
                try: 
                    lyric = browser.find_element_by_xpath("(//span[@class='lyrics__content__error'])[1]").text
                    try: lyric += browser.find_element_by_xpath("(//span[@class='lyrics__content__error'])[2]").text
                    except: continue
                except:
            # ___case3. lyrics state: Warning
                    try: 
                        lyric = browser.find_element_by_xpath("(//span[@class='lyrics__content__warning'])[1]").text
                        try: lyric += browser.find_element_by_xpath("(//span[@class='lyrics__content__warning'])[2]").text
                        except:continue
            # ___case 4. lyrics state: Unavailable (ex. restricted)
                    except: continue 
            # -------------------------------------------------------------------------------------
            #print(lyric)
            chart[rank][2] = lyric
            
        except NoSuchElementException: # No searching results ___ this lyrics cell going to be blank
            print("!")
    # -------------------------------------------------------------------------------------
    #print(chart)
    
    savexlsx('./'+str(year)+'_data.xlsx', chart)


===== 2019
0 Old Town Road Lil Nas Billy Ray Cyrus
1 Sunflower (Spider-Man: Into The Spider-Verse) Post Malone & Swae Lee
2 Without Me Halsey
3 Bad Guy Billie Eilish
4 Wow. Post Malone
5 Happier Marshmello & Bastille
6 7 Rings Ariana Grande
7 Talk Khalid
8 Sicko Mode Travis Scott
9 Sucker Jonas Brothers
10 High Hopes Panic! At The Disco
11 Thank U, Next Ariana Grande
12 Truth Hurts Lizzo
13 Dancing With A Stranger Sam Smith & Normani
14 Senorita Shawn Mendes & Camila Cabello
15 I Don't Care Ed Sheeran & Justin Bieber
16 Eastside benny blanco, Halsey & Khalid
17 Going Bad Meek Mill Drake
18 Shallow Lady Gaga & Bradley Cooper
19 Better Khalid
20 No Guidance Chris Brown Drake
21 Girls Like You Maroon 5 Cardi B
22 Sweet But Psycho Ava Max
23 Suge DaBaby
24 Middle Child J. Cole
25 Drip Too Hard Lil Baby & Gunna
26 Someone You Loved Lewis Capaldi
27 Ran$om Lil Tecca
28 If I Can't Have You Shawn Mendes
29 Goodbyes Post Malone Young Thug
30 ZEZE Kodak Black Travis Scott & Offset
31 Better Now

In [76]:
%%time
# !!!!! This cell can be merged with above cell !!!!

# Collect songs' duration information from Youtube Music
# and insert that column to data file
for year in range(2006, 2020):
    path = './'+str(year)+'_data.xlsx'
    df = pd.read_excel(path) # make dataframe about .xlsx file
    print("\n=====", year, "-- total:", len(df))
    
    col = []
    for rank in range(0, len(df)):
        query = df['title'][rank]+' '+df['artist'][rank]
        query = query.replace('#','') # this can cause query problem

        duration_url = "https://music.youtube.com/search?q="+query
        browser.get(duration_url)
        time.sleep(1.5) # wait while loading

        def cal_len(i): # select i-th element of searched result (it could be best/video/song/playlist ...)
            duration = browser.find_element_by_xpath("(//ytmusic-responsive-list-item-renderer)["+str(i)+"]").text
            duration = duration.rsplit('\n',1)[1] # extract duration element only
            # transform duration form to seconds ... ex) 4:03 -> 243
            length = int(duration.split(':')[0])*60 + int(duration.split(':')[1])
            return length
            
        try: length = cal_len(2) # mostly this element is songs
        except: length = cal_len(1) #if second element is playlist, just try the best result one
            
        col.append(length)
        print(rank,') ', query, ':',length)
    df['duration'] = col
    df.to_excel(path)



===== 2016 -- total: 98
0 )  Sorry Justin Bieber : 201
1 )  One Dance Drake Featuring WizKid & Kyla : 167
2 )  Work Rihanna Featuring Drake : 220
3 )  Stressed Out twenty one pilots : 226
4 )  Panda Desiigner : 248
5 )  Hello Adele : 296
6 )  Don't Let Me Down The Chainsmokers Featuring Daya : 208
7 )  Can't Stop The Feeling! Justin Timberlake : 286
8 )  Closer The Chainsmokers Featuring Halsey : 244
9 )  Cheap Thrills Sia Featuring Sean Paul : 212
10 )  7 Years Lukas Graham : 238
11 )  Needed Me Rihanna : 192
12 )  My House Flo Rida : 192
13 )  I Took A Pill In Ibiza Mike Posner : 198
14 )  Work From Home Fifth Harmony Featuring Ty Dolla $ign : 214
15 )  This Is What You Came For Calvin Harris Featuring Rihanna : 222
16 )  Cake By The Ocean DNCE : 220
17 )  Me, Myself & I G-Eazy x Bebe Rexha : 252
18 )  Ride twenty one pilots : 215
19 )  Heathens twenty one pilots : 218
20 )  Pillowtalk Zayn : 202
21 )  Stitches Shawn Mendes : 207
22 )  Hotline Bling Drake : 296
23 )  Cold Water Majo

84 )  Slippery Migos Featuring Gucci Mane : 304
85 )  Sign Of The Times Harry Styles : 340
86 )  Water Under The Bridge Adele : 241
87 )  Malibu Miley Cyrus : 228
88 )  Down Marian Hill : 198
89 )  No Promises Cheat Codes Featuring Demi Lovato : 223
90 )  Treat You Better Shawn Mendes : 188
91 )  I Get The Bag Gucci Mane Featuring Migos : 233
92 )  Small Town Boy Dustin Lynch : 205
93 )  Everyday We Lit YFN Lucci Featuring PnB Rock : 197
94 )  Havana Camila Cabello Featuring Young Thug : 218
95 )  What Lovers Do Maroon 5 Featuring SZA : 200
96 )  Do Re Mi blackbear : 213
97 )  Look At Me! XXXTENTACION : 128
98 )  The Fighter Keith Urban Featuring Carrie Underwood : 183

===== 2018 -- total: 99
0 )  Perfect Ed Sheeran : 264
1 )  Meant To Be Bebe Rexha & Florida Georgia Line : 163
2 )  Havana Camila Cabello Featuring Young Thug : 218
3 )  Rockstar Post Malone Featuring 21 Savage : 219
4 )  Psycho Post Malone Featuring Ty Dolla $ign : 222
5 )  I Like It Cardi B, Bad Bunny & J Balvin : 254

78 )  Trip Ella Mai : 213
79 )  Rumor Lee Brice : 223
80 )  Swervin A Boogie Wit da Hoodie Featuring 6ix9ine : 190
81 )  How Do You Sleep? Sam Smith : 202
82 )  Baby Lil Baby & DaBaby : 143
83 )  Look What God Gave Her Thomas Rhett : 169
84 )  Good As You Kane Brown : 193
85 )  Clout Offset Featuring Cardi B : 201
86 )  Love Lies Khalid & Normani : 201
87 )  One Thing Right Marshmello & Kane Brown : 181
88 )  Cash Shit Megan Thee Stallion Featuring DaBaby : 192
89 )  Tequila Dan + Shay : 196
90 )  Shotta Flow NLE Choppa : 160
91 )  Hot Girl Summer Megan Thee Stallion, Nicki Minaj & Ty Dolla $ign : 199
92 )  Talk You Out Of It Florida Georgia Line : 203
93 )  Beautiful Bazzi Featuring Camila Cabello : 180
94 )  Eyes On You Chase Rice : 183
95 )  All To Myself Dan + Shay : 169
96 )  Boyfriend Ariana Grande & Social House : 187
97 )  Walk Me Home P!nk : 177
98 )  Robbery Juice WRLD : 241
Wall time: 16min 32s


In [9]:
# For reducing the required time, I tried to use Beautifulsoup for duration crawling,
# but it failed due to youtube music's browser problem
"""
import requests
from bs4 import BeautifulSoup

duration_url = "https://music.youtube.com/search?q="+"bad+guy+billie"

## HTTP GET Request
req = requests.get(duration_url)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
print(soup)
"""


<!DOCTYPE html>
<html dir="ltr" lang="ko-KR"><head><title>사용되지 않는 브라우저입니다. 업그레이드하세요.</title><meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/><link href="https://s.ytimg.com/yts/img/music/web/ytm_favicon-vflgs6l3r.ico" rel="icon" type="image/x-icon"/><link href="//s.ytimg.com/yts/img/music/web/ytm_favicon_32-vfl2lNWut.png" rel="icon" sizes="32x32"/><link href="//s.ytimg.com/yts/img/music/web/ytm_favicon_48-vflq0vN_b.png" rel="icon" sizes="48x48"/><link href="//s.ytimg.com/yts/img/music/web/ytm_favicon_96-vflBf1BFh.png" rel="icon" sizes="96x96"/><link href="//s.ytimg.com/yts/img/music/web/ytm_favicon_144-vfltAH-Lj.png" rel="icon" sizes="144x144"/><style>body {display: flex;height: 100vh;width: 100vw;align-items: center;justify-content: center;margin: 0;background-color: #131313;overflow: hidden;font-family: 'Roboto', arial, sans-serif}.content {display: flex;flex-direction: column;align-items: center;}.logo {width: 160px;}.messa