In [217]:
'''
Name: soundcloud_scraper.ipynb
Created: 2018.11.12
Author: Kenji Sakuramoto
Purpose: Scraping data from SoundCloud - Followers, Likes, Thumbnails, Plays, etc.
Version: 1.0
'''
# Libraries
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import urllib.request as req
import pandas as pd
from time import sleep, strftime
from IPython.display import display, clear_output, Image
import os, re

now = strftime('%Y-%m-%d %H:%M:%S')

# Selenium Webdriver
options = Options()
options.headless = True
assert options.headless
driver = webdriver.Firefox(options=options)

'''
Summary

scroll_to_end(url)
    -> load_page(url)

profile_info(username)
    -> load_page(url)
    -> convert_to_integer(a, b=None)
    -> download_img(url, username, overwrite=False)

follow_csv(username, option='following')
    -> scroll_to_end(url)
    -> download_img(url, username, overwrite=False)
    
'''
;

''

In [218]:
profile_info('hacoon')

hacoon followers:8941 following:520 tracks:3 likes:524 comments:163


('hacoon',
 8941,
 520,
 3,
 524,
 163,
 'https://i1.sndcdn.com/avatars-000524573757-3ygydz-t200x200.jpg',
 ['HACOONPro', 'Raphael Cabral', 'São Paulo'])

In [225]:
likes
# follow_csv('whethan','following')

5037

In [222]:
def load_page(url):
    # Load soundcloud page with error catching
    # url: STR url or username to load
    if 'http' in url:
        driver.get(url)
    else:
        driver.get('https://soundcloud.com/' + url)
    if len(driver.find_elements_by_xpath("//div[contains(@class, 'errorPage')]")) > 0:
        raise Exception("{} not found".format(url))
    else:
        pass

def scroll_to_end(url):
    # Scroll to the end for infinite dynamic scrolling
    # url: STR
    load_page(url)
    SCROLL_PAUSE_TIME = 0.5
    DELTA = 20
    RETRY = 10

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    # Loop if new scroll height is not equal to current scroll height or elements on page < following minus delta
    while True and RETRY > 0:
        RETRY -= 1
        # Display count of loaded profiles
        clear_output(wait = True)
        total = len(driver.find_elements_by_xpath("//div[@class='userBadgeListItem']"))
        print("Retrieved: {}, Following: {}".format(total, following))
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if (new_height == last_height) and (total > (following - DELTA)):
            break
        last_height = new_height    

def convert_to_integer(a, b = None):
    # Convert string with comma or other non-numbers to integer 
    # a: STR field descriptor
    # b: STR string to convert
    # returns: INT
    if b is not None:
        b = int(re.sub("\D", "", b))
        return b
    else:
        raise Exception("No " + a + " Found!")

def download_img(url, username, overwrite=False):
    # Download image from url and place inside the folder image
    # url: STR
    # username: STR
    # overwrite: boolean
    if not os.path.exists('image'):
        os.makedirs('image')
    if os.path.exists('image/' + username + ".jpg") and overwrite is False:
        return
    req.urlretrieve(url, "image/" + username + ".jpg")       
    
def profile_info(username):
    # Returns the user stats and thumbnail html from the username, and downloads the thumbnail into image folder
    # username: STR soundcloud username
    # returns: INT followers, following, tracks, likes, comments, STR image, LIST profile
    load_page('https://soundcloud.com/' + username)

    a = driver.find_element_by_xpath("//*[starts-with(@class,'infoStats__stat')]//*[text()='Followers']//following::div")
    b = driver.find_element_by_xpath("//*[starts-with(@class,'infoStats__stat')]//*[text()='Following']//following::div")
    c = driver.find_element_by_xpath("//*[starts-with(@class,'infoStats__stat')]//*[text()='Tracks']//following::div")
    d = driver.find_element_by_xpath("//*[starts-with(@class,'sidebarHeader') and contains(text(), 'likes')]")
    e = driver.find_element_by_xpath("//*[starts-with(@class,'sidebarHeader') and contains(text(), 'comments')]")
    profile = driver.find_element_by_xpath("//*[starts-with(@class,'profileHeaderInfo')]").text.splitlines()
    image =  driver.find_element_by_xpath("//*[starts-with(@class,'profileHeader')]//span[starts-with(@class,'sc-artwork')]").get_attribute('style')
    if 'url' in image:
        image = image.split('"')[1]
        download_img(image, username, overwrite=False)
    else:
        image = None
    
    followers = convert_to_integer('followers', a.text)
    following = convert_to_integer('following', b.text)
    tracks = convert_to_integer('tracks', c.text)
    likes = convert_to_integer('likes', d.text)
    comments = convert_to_integer('comments', e.text)
    print("{} followers:{} following:{} tracks:{} likes:{} comments:{}".format(username, followers, following, tracks, likes, comments))
    return followers, following, tracks, likes, comments, image, profile

def follow_csv(username, option='following'):
    # Exports the list of following and followed users to CSV, downloads the thumbnail to the folder image
    # username: STR soundcloud username
    # option: STR 'following' or 'followers'
    # returns: CSV, JPG
    option.strip().lower()
    assert option=='following' or option=='followers'
    scroll_to_end('https://soundcloud.com/' + username + '/' + option)
    
    profile = driver.find_elements_by_xpath("//div[@class='userNetwork']//a[starts-with(@class,'userBadgeListItem__heading')]")
    image = driver.find_elements_by_xpath("//div[@class='userNetwork']//span[starts-with(@class,'sc-artwork')]")
    # Do not continue if the profile and image is of different size
    assert len(profile) == len(image)

    # Returns dataframe from webdriver with text info
    temp1, temp2, temp3, temp4 = [], [], [], []
    # Returns html for profile thumbnail
    for x in range(len(image)):
        # Returns display name and profile html
        temp2.append(profile[x].text)
        html = profile[x].get_attribute('href')
        temp3.append(html)
        temp4.append(html.split('/')[3])
        img = image[x].get_attribute('style')
        # Error check if there is thumbnail
        if 'url' in img:
            temp1.append(img.split('"')[1])
            download_img(img.split('"')[1], html.split('/')[3], False)
        else:
            temp1.append(None)

    # Convert list to DataFrame    
    df = pd.DataFrame({'displayname':temp2, 'username':temp4, 'profile':temp3, 'image-url':temp1})

    # Export to CSV
    df.to_csv(now + ' ' + username + ' ' + option + '.csv', index=True, header=True)#, mode='a')

In [179]:
# Close the webdriver session
driver.close()

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=58035): Max retries exceeded with url: /session/1dc9fb1d-ea0d-4963-b45d-fc62f13fd296/window (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fdd417f5eb8>: Failed to establish a new connection: [Errno 111] Connection refused',))