In [305]:
'''
Name: soundcloud_scraper.ipynb
Created: 2018.11.12
Author: Kenji Sakuramoto
Purpose: Scraping data from SoundCloud - Followers, Likes, Thumbnails, Plays, etc.
Version: 1.0
'''
# Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
import urllib.request as req
import pandas as pd
from time import sleep, strftime
from IPython.display import display, clear_output, Image
import os, re

# Timestamp used as the prefix of every exported CSV file name.
# NOTE: the ':' in this format is not a legal file-name character on Windows.
now = strftime('%Y%m%d %H:%M')

# Selenium Webdriver
options = Options()
# Pass the Firefox flag directly instead of assigning options.headless:
# newer Selenium 4 releases dropped that property, so the assignment would
# silently do nothing there, while add_argument works on old and new versions.
options.add_argument('-headless')
assert '-headless' in options.arguments
driver = webdriver.Firefox(options=options)

'''
scroll_to_end(url, expected, option='likes')
    -> load_page(url)

profile_info(username) : followers, following, tracks, likes, comments, image, profile
    -> load_page(url)
    -> convert_to_integer(a, b=None)
    -> download_img(url, username, overwrite=False)

follow_csv(username, option='following') : CSV & JPG
    -> profile_info(username)
    -> scroll_to_end(url, expected, option='likes')
    -> download_img(url, username, overwrite=False)    

liked_tracks(username) : CSV & JPG
    -> profile_info(username)
    -> scroll_to_end(url, expected, option='likes')
    -> convert_to_integer(a, b=None)
    -> download_img(url, username, overwrite=False)
'''

clear_output()

In [None]:
# Export the liked tracks (CSV + thumbnails) of one account.
# Requires that the cell defining liked_tracks() has been run and that `driver` is a live session;
# the definitions cell ends by shutting the driver down, so the setup cell must be re-run before this call.
liked_tracks('glut_it')

scroll_to_end >>>
Retrieved: 170, Likes: 5038 Retries: 50


In [304]:
def load_page(url):
    # Load a soundcloud page in the shared driver and wait (up to 10 s) for its footer to appear
    # url: STR full url (http:// or https://) or a bare username, which is appended to https://soundcloud.com/
    # raises: Exception if the footer never showed up and the page contains an 'errorPage' element
    # Only a scheme prefix marks a full url; a substring test would misroute usernames that contain 'http'
    if url.startswith(('http://', 'https://')):
        target = url
    else:
        target = 'https://soundcloud.com/' + url
    driver.get(target)
    try:
        footer = (By.XPATH, "//div[contains(@class,'footer')]")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(footer))
        return
    except Exception as e:
        # Best effort: report the wait failure and fall through to the error-page check
        print(e)
    # find_elements returns an empty list (it does not raise) when nothing matches
    if driver.find_elements(By.XPATH, "//div[contains(@class, 'errorPage')]"):
        raise Exception("{} not found".format(url))

def scroll_to_end(url, expected, option='likes', pause=1, delta=20, retries=50):
    # Scroll to the end of an infinitely scrolling list page
    # url: STR
    # expected: INT amount of elements to expect from the loaded website
    # option: STR 'likes' or 'following' or 'followers'
    # pause: seconds to wait after each scroll so new items can load
    # delta: INT tolerance; scrolling may stop once more than (expected - delta) items are loaded
    # retries: INT how many times an unchanged page height is tolerated before giving up
    assert option=='following' or option=='likes' or option=='followers'
    load_page(url)

    # Liked tracks and user badges are rendered with different list-item markup
    if option == 'likes':
        item_xpath = "//li[@class='soundList__item']"
    else:
        item_xpath = "//div[@class='userBadgeListItem']"
    # Progress label matches the list being scrolled: 'Likes', 'Following' or 'Followers'
    label = option.capitalize()

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while retries > 0:
        # Display count of loaded items
        clear_output(wait = True)
        total = len(driver.find_elements(By.XPATH, item_xpath))
        print("scroll_to_end >>>")
        print("Retrieved: {}, {}: {} Retries: {}".format(total, label, expected, retries))
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(pause)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height did not change: done if enough items are loaded, otherwise use up one retry
            if total > (expected - delta):
                break
            retries -= 1
        last_height = new_height

def convert_to_integer(a, b = None):
    # Convert a displayed count such as '5,038', '170 likes', '1.2K' or '3M' to an integer
    # a: STR field descriptor (only used in the error messages)
    # b: STR string to convert
    # returns: INT
    # raises: Exception if b is None, ValueError if b contains no digits
    if b is None:
        raise Exception("No " + a + " Found!")
    # Abbreviated counts: a number directly followed by K (thousand) or M (million).
    # str.replace returns a new string, so the suffix has to be handled explicitly;
    # stripping non-digits alone would turn '1.2K' into 12.
    abbreviated = re.search(r"(\d[\d,]*(?:\.\d+)?)([KM])\b", b)
    if abbreviated:
        number, suffix = abbreviated.groups()
        multiplier = 1000 if suffix == 'K' else 1000000
        # round() guards against float representation error (e.g. 3.45 * 1000000)
        return int(round(float(number.replace(',', '')) * multiplier))
    # Plain counts: drop separators and any surrounding text, keep the digits
    digits = re.sub(r"\D", "", b)
    if not digits:
        raise ValueError("No digits found for " + a + " in " + repr(b))
    return int(digits)

def download_img(url, username, overwrite=False):
    # Fetch the image behind url and store it as image/<username>.jpg
    # url: STR location of the image
    # username: STR base name of the saved file
    # overwrite: boolean, an existing file is only kept when this is exactly False
    folder = 'image'
    # Create the target folder on first use
    if not os.path.exists(folder):
        os.makedirs(folder)
    target = folder + '/' + username + ".jpg"
    # Download unless the file is already there and must be kept
    if overwrite is not False or not os.path.exists(target):
        req.urlretrieve(url, target)
    
def profile_info(username):
    # Returns the user stats and thumbnail url for the username, and downloads the thumbnail into the image folder
    # username: STR soundcloud username
    # returns: INT followers, following, tracks, likes, comments, STR image (None when no artwork url is set), LIST profile
    load_page('https://soundcloud.com/' + username)

    # Lookups use find_element(By.XPATH, ...): the find_element_by_* helpers were removed in Selenium 4.3
    followers = driver.find_element(By.XPATH, "//*[starts-with(@class,'infoStats__stat')]//*[text()='Followers']//following::div")
    following = driver.find_element(By.XPATH, "//*[starts-with(@class,'infoStats__stat')]//*[text()='Following']//following::div")
    tracks = driver.find_element(By.XPATH, "//*[starts-with(@class,'infoStats__stat')]//*[text()='Tracks']//following::div")
    likes = driver.find_element(By.XPATH, "//*[starts-with(@class,'sidebarHeader') and contains(text(), 'likes')]")
    comments = driver.find_element(By.XPATH, "//*[starts-with(@class,'sidebarHeader') and contains(text(), 'comments')]")
    # Text of the profile header, one list entry per displayed line
    profile = driver.find_element(By.XPATH, "//*[starts-with(@class,'profileHeaderInfo')]").text.splitlines()
    # The avatar url is read from the inline style of the artwork span
    image = driver.find_element(By.XPATH, "//*[starts-with(@class,'profileHeader')]//span[starts-with(@class,'sc-artwork')]").get_attribute('style')
    if 'url' in image:
        # The url is the first double-quoted part of the style string
        image = image.split('"')[1]
        download_img(image, username, overwrite=False)
    else:
        image = None

    followers = convert_to_integer('followers', followers.text)
    following = convert_to_integer('following', following.text)
    tracks = convert_to_integer('tracks', tracks.text)
    likes = convert_to_integer('likes', likes.text)
    comments = convert_to_integer('comments', comments.text)
    print("profile_info >>>")
    print("{} followers:{} following:{} tracks:{} likes:{} comments:{}".format(username, followers, following, tracks, likes, comments))
    return followers, following, tracks, likes, comments, image, profile

def follow_csv(username, option='following'):
    # Exports the list of following or follower users to CSV, downloads their thumbnails to the folder image
    # username: STR soundcloud username
    # option: STR 'following' or 'followers' (surrounding whitespace and case are ignored)
    # returns: CSV, JPG
    # str methods return a new string, so the normalised value has to be assigned back
    option = option.strip().lower()
    assert option=='following' or option=='followers'
    followers, following, tracks, likes, comments, image, profile = profile_info(username)
    # Expect the count that belongs to the list actually being scrolled
    expected = following if option == 'following' else followers
    scroll_to_end('https://soundcloud.com/' + username + '/' + option, expected, option)

    # find_elements(By.XPATH, ...) replaces the find_elements_by_* helpers removed in Selenium 4.3
    badges = driver.find_elements(By.XPATH, "//div[@class='userNetwork']//a[starts-with(@class,'userBadgeListItem__heading')]")
    artworks = driver.find_elements(By.XPATH, "//div[@class='userNetwork']//span[starts-with(@class,'sc-artwork')]")
    # Do not continue if the profile and image is of different size
    assert len(badges) == len(artworks)

    display_names, profile_urls, usernames, image_urls = [], [], [], []
    for x, (badge, artwork) in enumerate(zip(badges, artworks)):
        clear_output(wait = True)
        # Display name and profile url; the username is the first path segment of the url
        href = badge.get_attribute('href')
        handle = href.split('/')[3]
        display_names.append(badge.text)
        profile_urls.append(href)
        usernames.append(handle)
        style = artwork.get_attribute('style')
        # Check if there is thumbnail
        if 'url' in style:
            image_url = style.split('"')[1]
            image_urls.append(image_url)
            download_img(image_url, handle, False)
            print("follow_csv >>>")
            print("Actual {} Available {} Current {} Downloading... {}".format(expected, len(artworks), x, handle))
        else:
            image_urls.append(None)

    # Convert list to DataFrame
    df = pd.DataFrame({'displayname':display_names, 'username':usernames, 'profile':profile_urls, 'image-url':image_urls})

    # Export to CSV
    df.to_csv(now + ' ' + username + ' ' + option + '.csv', index=True, header=True)

def liked_tracks(username):
    # Navigates to the list of liked tracks and exports the information of the liked tracks (and thumbnails)
    # username : STR soundcloud username
    # returns: CSV, JPG
    followers, following, tracks, likes, comments, image, profile = profile_info(username)
    scroll_to_end('https://soundcloud.com/' + username + '/likes', likes, 'likes')

    def find_all(xpath):
        # find_elements(By.XPATH, ...) replaces the find_elements_by_* helpers removed in Selenium 4.3
        return driver.find_elements(By.XPATH, xpath)

    elem = len(find_all("//div[@class='soundTitle__usernameTitleContainer']"))
    title = find_all("//a[starts-with(@class, 'soundTitle__title')]")
    creator = find_all("//span[@class='soundTitle__usernameText']")
    upload_time = find_all("//div[contains(@class, 'soundTitle__uploadTime')]//following::time")
    favs = find_all("//div[starts-with(@class, 'sc-button-group')]//*[contains(@title,'Like')]")
    reps = find_all("//div[starts-with(@class, 'sc-button-group')]//*[contains(@title,'Repost')]")
    image = find_all("//div[@class='sound__artwork']//span[starts-with(@class,'sc-artwork')]")
    tag = find_all("//div[contains(@class, 'soundTitle__tagContainer')]")
    stats = find_all("//div[@class='sound__soundStats']")

    print(elem, len(title),len(creator),len(upload_time),len(favs),len(reps), len(image),len(tag), len(stats))
    # Loop through the minimum to prevent list overflow
    minimum = min(elem, len(title),len(creator),len(upload_time),len(favs),len(reps), len(image),len(tag), len(stats))

    titles, creators, upload_dates, like_counts, repost_counts = [], [], [], [], []
    urls, usernames, play_stats, image_urls, tags = [], [], [], [], []

    for x in range(minimum):
        clear_output(wait = True)
        titles.append(title[x].text) # Song title
        creators.append(creator[x].text) # Song creator display name
        upload_dates.append(upload_time[x].get_attribute('datetime')) # Song upload date
        like_counts.append(convert_to_integer('favourites',favs[x].text)) # Song likes/favourites
        repost_counts.append(convert_to_integer('reposts',reps[x].text)) # Song Reposts
        html = title[x].get_attribute('href') # Song url
        handle = html.split("/")[3] # Song creator username
        urls.append(html)
        usernames.append(handle)
        play_stats.append(stats[x].text.split('\n')) # Song plays & comments
        img = image[x].get_attribute('style') # Song thumbnail style attribute
        # Check if there is thumbnail
        if 'url' in img:
            image_url = img.split('"')[1]
            image_urls.append(image_url)
            # NOTE: the file is named after the creator and existing files are not overwritten,
            # so only the first artwork seen per creator is stored
            download_img(image_url, handle, False)
            print("liked_tracks >>>")
            print("Loaded {}/{} Now at: {} {}".format(minimum, elem, x, handle))
        else:
            image_urls.append(None)
            print("liked_tracks >>>")
            print("Loaded {}/{} Now at: {} NO IMAGE FOUND".format(minimum, elem, x))
        # Check if the tag name exists
        tag_html = tag[x].get_attribute('innerHTML')
        if 'soundTitle__tagContent' in tag_html: # Song tag
            # Positional parse of the markup: the tag text follows the 6th '>' of the container html
            tags.append(tag_html.split(">")[6].split("<")[0])
        else:
            tags.append(None)

    # Convert list to DataFrame
    df = pd.DataFrame({'title':titles, 'likes':like_counts, 'reposts':repost_counts, 'creator':creators, 'username':usernames, 'image-url':image_urls, 'url':urls, 'upload-date':upload_dates, 'tag':tags, 'stats':play_stats})

    # Export to CSV
    df.to_csv(now + ' ' + username + ' liked' + '.csv', index=True, header=True)

# End the webdriver session once scraping is finished.
# quit() closes every window and stops the driver process; close() would only close the current window.
driver.quit()