In [3]:
from selenium import webdriver
import pandas as pd 
from selenium.webdriver.common.keys import Keys
import time
import numpy as np
import re
from datetime import datetime
import scipy.sparse

In [4]:
#Scrolls to the bottom of the page given by url, at most 50 times, stopping early once the page
#height stops growing. Assumes a chromedriver window is already open (uses the global `driver`).
#stop_scroll does nothing

def get_html_scroll(url, stop_scroll):
    """Load ``url`` in the global ``driver`` and scroll down until the page stops growing.

    Args:
        url: Address of the page to open.
        stop_scroll: Unused; accepted only so existing callers keep working.

    Returns:
        The page source reported by the driver after scrolling has finished.
    """
    driver.get(url)

    pause_seconds = .5
    height_script = "return document.body.scrollHeight"

    # Height of the document before any scrolling has happened.
    previous_height = driver.execute_script(height_script)

    scrolls_left = 50
    while scrolls_left > 0:
        scrolls_left -= 1

        # Jump to the current bottom of the page and give it a moment to load more.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        current_height = driver.execute_script(height_script)
        if current_height != previous_height:
            # The page grew, so keep scrolling.
            previous_height = current_height
            continue

        # No growth yet: wait longer once and measure again before giving up.
        time.sleep(3)
        current_height = driver.execute_script(height_script)
        if current_height == previous_height:
            break
        previous_height = current_height

    return driver.page_source

In [5]:
#Returns a list of artist names (more precisely soundcloud's internal marker for them) contained in the html of a likes page. 
#The list is stacked with itself into two identical columns before returning; the duplication serves no real purpose
#beyond giving add_seeds a 'names' column (column 0) and a 'links' column (column 1).

def parse_likes(html):
    """Extract the artist link behind every cover-art anchor on a likes page.

    Args:
        html: Page source of a user's likes page.

    Returns:
        A NumPy array with one row per anchor found and two identical
        columns, each holding the first path segment of the anchor's href.
        The array has shape (0, 2) when nothing matches.
    """
    # Raw string: "\S" inside a normal string literal is an invalid escape
    # sequence. The capture group is the href path without its leading and
    # trailing "/", which replaces the old fixed-offset slicing.
    paths = re.findall(r'<a class="sound__coverArt" href="/(\S{,50})/', html)

    # Only the first path segment (the account part of the link) is kept.
    links = [path.split("/")[0] for path in paths]
    print(len(links))
    return np.column_stack((links, links))

In [6]:
#Same thing but for reposts

def parse_reposts(html):
    """Collect the artist link behind every cover-art anchor on a reposts page.

    Returns a two-column NumPy array whose columns are identical; each row
    holds the first path segment of one matched href.
    """
    anchor_pattern = '<a class="sound__coverArt" href="/.{,100}/'
    prefix_len = 34  # number of characters in the fixed text before the path

    artists = []
    for anchor in re.findall(anchor_pattern, html):
        # Drop the fixed prefix and the closing "/", then keep the first segment.
        path = anchor[prefix_len:-1]
        artists.append(path.partition("/")[0])

    return np.column_stack((artists, artists))
    

In [7]:
#Same thing but for comments

def parse_comments(html):
    """Collect the account link behind every "sc-link-light" anchor on a comments page.

    Returns a two-column NumPy array whose columns are identical; each row
    holds the first path segment of one matched href.
    """
    anchor_pattern = 'a class="sc-link-light" href="/.{,100}/'
    prefix_len = 31  # number of characters in the fixed text before the path

    accounts = []
    for anchor in re.findall(anchor_pattern, html):
        # Drop the fixed prefix and the closing "/", then keep the first segment.
        path = anchor[prefix_len:-1]
        accounts.append(path.partition("/")[0])

    return np.column_stack((accounts, accounts))

In [8]:
#Same thing but for follows

def parse_follows(html):
    """Collect the account link behind every user-badge anchor on a following page.

    Returns a two-column NumPy array whose columns are identical; each row
    holds the href path of one matched badge anchor.
    """
    badge_pattern = '<a href="/.{,100}" class="userBadgeListItem__image">'
    head_len = 10  # characters of fixed text before the path
    tail_len = 35  # characters of fixed text after the path

    followed = []
    for badge in re.findall(badge_pattern, html):
        followed.append(badge[head_len:-tail_len])

    return np.column_stack((followed, followed))

In [9]:
#Returns the number of followers a user has from the html of a user's base account page.

def parse_followers(html):
    """Read the follower count embedded in a user's base account page.

    Args:
        html: Page source of the account page.

    Returns:
        The first "followers_count" value found, as an int.

    Raises:
        IndexError: If no "followers_count" entry with at least one digit is
            present. The main scraping loop treats this as a broken link.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence. At least one digit is required so an empty value counts as
    # "not found" (IndexError) instead of failing inside int().
    counts = re.findall(r'"followers_count":(\d{1,50}),', html)
    return int(counts[0])

In [10]:
#Returns a user's display name from the html of a user's base account page.

def parse_name(html):
    """Read a user's display name from the html of their base account page.

    Args:
        html: Page source of the account page.

    Returns:
        The text of the first "username" value, up to its first double quote.

    Raises:
        IndexError: If the page contains no "username" entry.
    """
    # Search the html that was passed in; the previous version read the
    # global html_followers and ignored its argument.
    name = re.findall('"username":".{,200}"', html)[0]
    name = name[12:]  # drop the leading '"username":"'
    name = name.split('"')[0]  # the greedy match may run past the closing quote
    return name


In [11]:
#Returns the "emerge date" from the html of a user's tracks page. "Emerge date" is defined as the 
#15th percentile (0.15 quantile) of a user's unique track upload dates.

def parse_tracks(html):
    """Compute a user's "emerge date" from the html of their tracks page.

    The emerge date is the 0.15 quantile of the unique upload times found in
    the page's ``datetime="..."`` attributes.

    Args:
        html: Page source of the tracks page.

    Returns:
        The emerge date formatted as 'MM/DD/YYYY' (UTC).

    Raises:
        TypeError: If the page contains no upload dates. The main scraping
            loop catches this to skip accounts without tracks.
    """
    # The capture group is the attribute value without the surrounding text.
    stamps = re.findall('datetime="(.{,25})">', html)
    if not stamps:
        # Raised explicitly so the "no tracks" signal does not depend on how
        # pandas treats an empty, timezone-naive index further down.
        raise TypeError("no track upload dates found in html")

    dates = pd.to_datetime(stamps)

    #convert upload dates to unix time
    epoch = pd.Timestamp("1970-01-01 00:00:00+00:00")
    unix_times = ((dates - epoch) // pd.Timedelta('1s')).to_numpy()
    unix_times = np.unique(unix_times) #reduce effects of album with multiple tracks

    emerge_unix = np.quantile(unix_times, .15)

    #convert from unix time to string; pd.Timestamp replaces the deprecated
    #datetime.utcfromtimestamp and also interprets the value as UTC seconds
    return pd.Timestamp(emerge_unix, unit='s').strftime('%m/%d/%Y')


In [12]:
#add newly encountered accounts to the "seeds" dataframe. will only add accounts that have not been 
#encountered before. the seeds dataframe is used to label the columns of the sparse_matrix data.

def add_seeds(arr, seeds):
    """Append accounts from ``arr`` that are not yet listed in ``seeds``.

    Args:
        arr: 2-D array; column 0 is used as the account name and column 1 as
            the account link.
        seeds: DataFrame with a 'links' column. New rows get 'names',
            'links' and a 'followers' value of 0.

    Returns:
        ``seeds`` itself when there is nothing new, otherwise a new DataFrame
        with the unseen accounts appended in order of first appearance and
        the index renumbered.
    """
    known_links = set(seeds['links'])
    new_rows = []

    for name, link in zip(arr[:, 0], arr[:, 1]):
        if link in known_links:
            continue
        # Remember the link right away so repeats inside arr are added once.
        known_links.add(link)
        new_rows.append({'names': name, 'links': link, 'followers': 0})

    if not new_rows:
        return seeds

    # DataFrame.append no longer exists in pandas 2.x; one concat also avoids
    # re-copying the whole frame for every new account.
    return pd.concat([seeds, pd.DataFrame(new_rows)], ignore_index=True)

In [13]:
#Handles the routine of collecting every interaction data type for a given account

def scrape_data(sparse_matrix, seeds, i, followers):
    """Collect every interaction type for one account and record it in the matrix.

    Args:
        sparse_matrix: Matrix being built; it is resized and updated in place.
        seeds: DataFrame whose 'links' column labels the matrix columns.
        i: Link of the account to scrape. It must appear in the global
            ``remaining_seeds['links']``, which labels the matrix rows.
        followers: Unused; accepted so existing callers keep working.

    Returns:
        A ``(sparse_matrix, seeds)`` tuple with the updated matrix and the
        seeds frame extended by any newly encountered accounts.
    """
    # Page name -> parser for that page, in the order the pages are visited.
    parsers = {
        "likes": parse_likes,
        "comments": parse_comments,
        "following": parse_follows,
        "reposts": parse_reposts,
    }

    #data from likes, reposts, follows, and comments are combined into a single sparse matrix
    #with these weights rather than being stored separately and then combined later.
    weights = {"likes": 1, "comments": 3, "following": 3, "reposts": 3}

    sparse_matrix.resize((sparse_matrix.shape[0]+1, seeds.shape[0])) #add row to sparse matrix

    #a slightly more roundabout way of using .loc functions on a sparse matrix.
    #remaining_seeds specifies row labels and seeds specifies column labels.
    #The row does not change between pages, so it is looked up once.
    row_ind = pd.Index(remaining_seeds['links']).get_loc(str(i))

    for to_collect, parse in parsers.items():
        url = "https://soundcloud.com/" + i + "/" + to_collect
        html = get_html_scroll(url, False) #scroll down up to 50 times on page specified by url
        data = parse(html)

        # Newly seen accounts become new columns before anything is counted.
        seeds = add_seeds(data, seeds)
        sparse_matrix.resize((sparse_matrix.shape[0], seeds.shape[0]))

        col_indexer = pd.Index(seeds['links'])
        increment = weights[to_collect]

        for link in data[:, 1]:
            col_ind = col_indexer.get_loc(str(link))
            sparse_matrix[row_ind, col_ind] += increment

    return (sparse_matrix, seeds)

In [148]:
#RUN THIS BLOCK WITH CAUTION IT WILL DELETE ALL YOUR PROGRESS

#Reinitialize a run. Use this to start a run from zero. 

num = 0 #num specifies the index of remaining_seeds the scraper is currently at

seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\seeds.csv")

remaining_seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\remaining_seeds.csv")

#lil_matrix format is used for incrementally constructing a sparse matrix
sparse_matrix = scipy.sparse.lil_matrix((0,0))

#num is saved to a simple text file; the with-block closes the file even if the write fails
with open("C:\\soundcloud_project\\v3.0\\tracking_num.txt","w") as file:
    file.write(str(num))

(10, 3)

In [14]:
#Reload an in-progress run

# The matrix is stored on disk in a compressed format; convert it back to
# lil format so rows and columns can be added incrementally.
sparse_matrix = scipy.sparse.load_npz("C:\\soundcloud_project\\v3.0\\sparse_matrix.npz")
sparse_matrix = sparse_matrix.tolil() 

seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\seeds.csv") #specifies order of COLUMN labels

#specifies order of ROW labels. also includes display name, followers, and emerge date
remaining_seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\remaining_seeds.csv") 

# num is the index of remaining_seeds at which scraping resumes; the
# with-block closes the file even if parsing the number fails.
with open("C:\\soundcloud_project\\v3.0\\tracking_num.txt","r") as file:
    num = int(file.read())


In [15]:
#This function returns the account most interacted with by the accounts in remaining_seeds, not
#including the accounts already in remaining_seeds

def sort_interactions(lil_matrix, seeds, remaining_seeds, num):
    """Return the link of the most interacted-with account not yet in ``remaining_seeds``.

    Args:
        lil_matrix: Sparse interaction matrix with one column per row of ``seeds``.
        seeds: DataFrame whose 'links' column labels the matrix columns.
        remaining_seeds: DataFrame whose 'links' column lists the accounts to exclude.
        num: Unused; accepted so existing callers keep working.

    Returns:
        The 'links' label of the column with the largest total among the
        columns that are not listed in ``remaining_seeds``.
    """
    #convert to csc to make summation along columns more efficient
    column_totals = np.asarray(lil_matrix.tocsc().sum(axis=0)).ravel()
    totals = pd.Series(column_totals, index=seeds['links'])

    # errors='ignore': an account in remaining_seeds may never have been
    # recorded as a column label, and that must not abort the search.
    candidates = totals.drop(labels=remaining_seeds['links'], errors='ignore')

    return candidates.idxmax()

In [None]:
#Main control loop for scraper

driver = webdriver.Chrome('C:\\Users\\pswjt\\Documents\\chromedriver\\chromedriver.exe')

while num < 15000: 
    
    #If there are accounts listed in remaining_seeds, use those for scraping.
    if remaining_seeds.shape[0] > num: 
        i = remaining_seeds.iloc[num]['links']
        
    #If you reached the end of remaining_seeds, find the most interacted with account not already
    #scraped, add it to remaining_seeds and scrape that account. Allows program to continually find
    #new accounts without specifying them in a list. Susceptible to eventually moving to more 
    #"mainstream" artists which may not be desired.
    else: 
        i = sort_interactions(sparse_matrix,seeds,remaining_seeds,num)
        new_line = pd.DataFrame(data={'names': i, 'links': i, 'followers': 0, 'emerge_date': 0}, index=[0])
        #DataFrame.append no longer exists in pandas 2.x; pd.concat does the same job
        remaining_seeds = pd.concat([remaining_seeds, new_line], ignore_index=True)
    
    url = "https://soundcloud.com/" + i
    driver.get(url)
    html_followers = driver.page_source
    
    try: #If the url is broken for whatever reason, this try block will catch errors thrown
        followers = parse_followers(html_followers)
        remaining_seeds.at[num, 'followers'] = followers

        name = parse_name(html_followers)
        remaining_seeds.at[num, 'names'] = name

        #get_html_scroll loads the url itself, so no separate driver.get is needed here
        url = "https://soundcloud.com/" + i + "/tracks"
        html_tracks = get_html_scroll(url, False)
        emerge_date = parse_tracks(html_tracks)
        remaining_seeds.at[num, 'emerge_date'] = emerge_date

        sparse_matrix, seeds = scrape_data(sparse_matrix, seeds, i, followers)  
        
    except IndexError:
        print('Link to account %s does not work. Skipping this account.' % i)
        #keep the matrix rows aligned with remaining_seeds even for skipped accounts
        sparse_matrix.resize((sparse_matrix.shape[0]+1, seeds.shape[0]))
        
    except TypeError:
        print('%s has no tracks. Skipping this account.' % i)
        #keep the matrix rows aligned with remaining_seeds even for skipped accounts
        sparse_matrix.resize((sparse_matrix.shape[0]+1, seeds.shape[0]))
        
    finally:
        #runs on every exit from the try block, including a KeyboardInterrupt
        print(remaining_seeds.iloc[num])
        num += 1


In [None]:
#run this cell after terminating program to save progress

#included cause num is incremented in the finally block when the keyboardinterrupt
#exception is thrown. NOTE: run this cell only once per interruption; every run moves
#num back by one more. The value is kept at 0 or above so the saved index stays valid.
num = max(num - 1, 0)

sparse_matrix_saveable = sparse_matrix.tocsr()
scipy.sparse.save_npz('C:\\soundcloud_project\\v3.0\\sparse_matrix.npz', sparse_matrix_saveable)

remaining_seeds.to_csv("C:\\soundcloud_project\\v3.0\\remaining_seeds.csv", index=False)

seeds.to_csv("C:\\soundcloud_project\\v3.0\\seeds.csv", index=False)

#the with-block closes the file even if the write fails
with open("C:\\soundcloud_project\\v3.0\\tracking_num.txt","w") as file:
    file.write(str(num))