In [1]:
from selenium import webdriver
import pandas as pd 
from selenium.webdriver.common.keys import Keys
import time
import numpy as np
import re

In [2]:
#Scrolls to the bottom of the page given by url until no new content loads (at most 50 scrolls), then
#returns the page's HTML. Assumes a chromedriver window is already open. stop_scroll is currently unused.

def get_html_scroll(url, stop_scroll):
    """Open ``url`` in the global ``driver`` and scroll down until the page stops growing.

    The page is scrolled to the bottom at most 50 times. After every scroll
    the document height is re-read; if it did not change, the function waits
    three more seconds and checks once more before giving up.

    Parameters
    ----------
    url : str
        Address to load.
    stop_scroll
        Accepted but never read.

    Returns
    -------
    The ``page_source`` of the driver after scrolling has finished.
    """
    driver.get(url)

    # Pause between jumps to the bottom of the page; could be lowered on a
    # particularly fast connection.
    SCROLL_PAUSE_TIME = .5
    height_query = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_query)

    scrolls_left = 50
    while scrolls_left > 0:
        scrolls_left -= 1

        # Jump to the bottom and give the page time to load more content.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        current_height = driver.execute_script(height_query)

        if current_height == previous_height:
            # Possibly just slow to load: wait three seconds and look again.
            time.sleep(3)
            current_height = driver.execute_script(height_query)
            if current_height == previous_height:
                break

        previous_height = current_height

    return driver.page_source

In [3]:
#Returns a list of artist names (more precisely soundcloud's internal marker for them) contained in the HTML 
#of a likes page e.g. https://soundcloud.com/pswjt/likes The list is returned as two identical columns; 
#the duplication serves no real purpose.

def parse_likes(html):
    """Extract the artist markers of every liked track in a likes page.

    Parameters
    ----------
    html : str
        Source of a likes page, e.g. https://soundcloud.com/pswjt/likes

    Returns
    -------
    numpy.ndarray
        Two identical columns holding the first path segment of every
        ``sound__coverArt`` link, in document order. Shape is ``(0, 2)``
        when nothing matches.
    """
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    # The capture group returns the path between the leading and the last
    # matched slash, replacing the hand-counted [34:len-1] slice.
    paths = re.findall(r'<a class="sound__coverArt" href="/(\S{,50})/', html)
    artists = [path.split("/")[0] for path in paths]
    return np.column_stack((artists, artists))

In [4]:
#Same thing but for reposts

def parse_reposts(html):
    """Extract the artist markers of every reposted track in a reposts page.

    Returns a two-column array in which both columns hold the first path
    segment of each ``sound__coverArt`` link found in ``html``.
    """
    artists = []
    for anchor in re.findall('<a class="sound__coverArt" href="/.{,100}/', html):
        # Strip the 34-character '<a class=... href="/' prefix and the final slash.
        path = anchor[34:-1]
        artists.append(path.split("/")[0])
    return np.column_stack((artists, artists))
    

In [5]:
#Same thing but for comments

def parse_comments(html):
    """Extract the user markers linked from a comments page.

    Returns a two-column array in which both columns hold the first path
    segment of each ``sc-link-light`` link found in ``html``.
    """
    hits = re.findall('a class="sc-link-light" href="/.{,100}/', html)
    # 31 characters of 'a class=... href="/' precede the path; the match
    # always ends with a slash, which is dropped as well.
    commenters = [hit[31:-1].partition("/")[0] for hit in hits]
    return np.column_stack((commenters, commenters))

In [6]:
#Same thing but for follows

def parse_follows(html):
    """Extract the user markers listed on a following page.

    Returns a two-column array in which both columns hold the href path of
    each ``userBadgeListItem__image`` link found in ``html``.
    """
    followed = []
    for anchor in re.findall('<a href="/.{,100}" class="userBadgeListItem__image">', html):
        # Keep what lies between the 10-character '<a href="/' prefix and
        # the 35-character '" class="userBadgeListItem__image">' suffix.
        followed.append(anchor[10:-35])
    return np.column_stack((followed, followed))

In [7]:
#Returns the number of followers a user has. Takes in the HTML of an artists base profile.

def parse_followers(html):
    """Return the follower count embedded in an artist's base profile page.

    Parameters
    ----------
    html : str
        Source of the profile page; expected to contain
        ``"followers_count":<digits>,``.

    Returns
    -------
    int
        The first follower count found.

    Raises
    ------
    ValueError
        If ``html`` contains no ``"followers_count"`` field with at least
        one digit.
    """
    # Raw string avoids the invalid '\d' escape. '\d+' (rather than '\d{,50}')
    # refuses an empty number, which would otherwise reach int('').
    match = re.search(r'"followers_count":(\d+),', html)
    if match is None:
        raise ValueError('no "followers_count" field found in profile HTML')
    return int(match.group(1))

In [8]:
#Returns a user's display name, as opposed to soundcloud's internal marker for them. HTML is for the base profile.

def parse_name(html):
    """Return the display name embedded in an artist's base profile page.

    Parameters
    ----------
    html : str
        Source of the profile page; expected to contain
        ``"username":"<name>"``.

    Returns
    -------
    str
        The text between ``"username":"`` and the next double quote.

    Raises
    ------
    ValueError
        If ``html`` contains no ``"username"`` field.
    """
    # Search the html that was passed in. The previous version read the
    # global `html_followers` and silently ignored its argument.
    found = re.findall('"username":".{,200}"', html)
    if not found:
        raise ValueError('no "username" field found in profile HTML')

    # Drop the 12-character '"username":"' prefix; the greedy match may run
    # past the closing quote, so cut at the first quote that follows.
    return found[0][12:].split('"')[0]

In [9]:
#Checks if the artists in arr are already in the seeds dataframe, and adds them if not.
#arr is an array of artist url markers returned by the parse functions.

def add_seeds(arr, seeds):
    """Append artists from ``arr`` that are not yet listed in ``seeds``.

    Parameters
    ----------
    arr : numpy.ndarray
        Two-column array as returned by the parse functions; column 0 is
        stored as the name and column 1 as the link.
    seeds : pandas.DataFrame
        Must have a ``links`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``seeds`` itself when nothing is new. Otherwise a new frame with one
        extra row per unseen link, in order of first appearance, with
        ``followers`` set to 0 and a fresh 0..n-1 index.
    """
    # A set gives constant-time membership checks instead of scanning the
    # whole 'links' column once per candidate.
    known = set(seeds['links'])
    new_rows = []

    for row in arr:
        link = str(row[1])
        if link in known:
            continue
        known.add(link)
        # 'names' holds soundcloud's internal marker for now; it is replaced
        # when the account is actually scraped.
        new_rows.append({'names': str(row[0]), 'links': link, 'followers': 0})

    if not new_rows:
        return seeds

    # DataFrame.append was removed in pandas 2.0. Concatenating once also
    # avoids copying the whole frame for every new artist.
    additions = pd.DataFrame(new_rows, columns=['names', 'links', 'followers'])
    return pd.concat([seeds, additions], ignore_index=True)

In [10]:
#takes in a given dataframe and artist name, and appends that artist's data to the dataframe. df can be any of likes_df, 
#comments_df, reposts_df, or follows_df. i is internal soundcloud signifier for the artist (bad variable name ik),
#to_collect is a string representing the type of data to be collected. 

def scrape_data(df, to_collect, seeds, i, followers):
    """Add one row for artist ``i`` to ``df`` and count their interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        One of the interaction matrices (likes, comments, follows, reposts).
    to_collect : str
        Page to scrape: 'likes', 'comments' or 'following'; any other value
        is parsed as a reposts page.
    seeds : pandas.DataFrame
        Seed list whose 'links' column labels the columns of ``df``.
    i : str
        Soundcloud's internal marker for the artist.
    followers : int
        Follower count of the artist; below 250 nothing is scraped.

    Returns
    -------
    tuple
        ``(df, seeds)``: the matrix with a new row labelled ``i`` (and extra
        zero columns for newly found artists), and the possibly extended
        seed list.
    """
    worth_scraping = followers >= 250

    if worth_scraping:
        # Build the page url, scroll it to the bottom and grab the source.
        page_html = get_html_scroll("https://soundcloud.com/" + i + "/" + to_collect, False)

        # Pick the parser for this page type; reposts is the fallback.
        parsers = {
            'likes': parse_likes,
            'comments': parse_comments,
            'following': parse_follows,
        }
        data = parsers.get(to_collect, parse_reposts)(page_html)

        # Register any artist seen for the first time.
        seeds = add_seeds(data, seeds)

    # Grow the matrix by one zero row for this artist and by one zero column
    # per seed added since the last call. Column labels are rebuilt from
    # seeds['links'] every time, so the order of the seed list and of the
    # matrix columns must stay in step; deleting a column from a matrix but
    # not from the seed list would scramble the data.
    extra_columns = seeds.shape[0] - df.shape[1]
    grown = np.pad(df.values, ((0, 1), (0, extra_columns)), mode='constant')
    row_labels = df.index.append(pd.Index([i]))
    df = pd.DataFrame(grown, index=row_labels, columns=seeds['links'])

    # Accounts under 250 followers keep their all-zero row; these are
    # filtered out later.
    if worth_scraping:
        row_key = str(i)
        # One increment per interaction found on the page.
        for target in data[:, 1]:
            df.loc[row_key, str(target)] += 1

    return (df, seeds)

In [372]:
#DO NOT RUN THIS BLOCK UNLESS YOU WANT TO LOSE ALL YOUR CURRENT DATA

#This block basically resets progress, and initializes the dataframes to 
#all zeros with the columns and rows labeled by seeds.csv

# The seed list labels both the rows and the columns of every matrix.
seeds = pd.read_csv("seeds.csv")
# NOTE(review): this leaves `num` equal to the number of seeds, while the
# tracking file below is reset to 0 — confirm `num` is reloaded from the
# file before the main loop is run.
num = seeds.shape[0]

# One square all-zero matrix per interaction type.
likes_df = pd.DataFrame(np.zeros((num,num)), index=seeds['links'], columns=seeds['links'])
comments_df = pd.DataFrame(np.zeros((num,num)), index=seeds['links'], columns=seeds['links'])
follows_df = pd.DataFrame(np.zeros((num,num)), index=seeds['links'], columns=seeds['links'])
reposts_df = pd.DataFrame(np.zeros((num,num)), index=seeds['links'], columns=seeds['links'])

# Reset the progress counter. The context manager closes the file even if
# the write raises.
with open("tracking_num.txt", "w") as file:
    file.write(str(0))

print(likes_df.shape)
seeds.head()

(3, 3)


Unnamed: 0,followers,links,names
0,8656,kggn,kuru
1,11328,axxturel,axxturel
2,701,cargoboym,cargoboym


In [11]:
#Use this block to reload data after closing and saving progress. Don't run this block starting the program for
#the first time

# Interaction matrices saved by the last session.
likes_df = pd.read_parquet("likes_df.parquet")

comments_df = pd.read_parquet("comments_df.parquet")

follows_df = pd.read_parquet("follows_df.parquet")

reposts_df = pd.read_parquet("reposts_df.parquet")

#remaining_seeds is the sorted list I used to determine scraping order around halfway 
#through. because columns are continuously relabeled with the seeds dataframe, it still
#needed to be kept.

remaining_seeds = pd.read_csv("remaining_seeds.csv")

seeds = pd.read_csv("seeds.csv")

#num is the row number in remaining_seeds that the program is currently at. written to a txt 
#file to track progress between scraping sessions

# The context manager closes the file even if the contents are not a valid integer.
with open("tracking_num.txt", "r") as file:
    num = int(file.read())

In [176]:
#The main block containing the for loop controlling the program

driver = webdriver.Chrome('chromedriver.exe') 

# 10000 is an arbitrary upper bound that was never reached. Also stop at the
# end of remaining_seeds so that iloc[num] cannot run past the last row.
while num < min(10000, len(remaining_seeds)):
    
    #if you would like to retain original depth first type behavior (meaning scrape artists the order
    #in which they're found), this line can be changed to 
    #i = seeds.iloc[num]['links']
    
    i = remaining_seeds.iloc[num]['links']
    
    #get artist name and followers first
    
    url = "https://soundcloud.com/" + i
    driver.get(url)
    html_followers = driver.page_source

    try:
        followers = parse_followers(html_followers)
        name = parse_name(html_followers)
    except (IndexError, ValueError):
        # The link led nowhere (no follower count or username in the page).
        # Record the account with 0 followers so that scrape_data only adds
        # an all-zero row to each matrix, exactly as for any account under
        # the follower threshold, and leave the stored name untouched.
        print("could not read profile for " + str(i) + "; recording 0 followers")
        followers = 0
        name = None

    remaining_seeds.at[num, 'followers'] = followers #also change this to seeds to get depth-first behavior
    
    if name is not None:
        remaining_seeds.at[num, 'names'] = name #ditto ^
    
    likes_df, seeds = scrape_data(likes_df, 'likes', seeds, i, followers)
    comments_df, seeds = scrape_data(comments_df, 'comments', seeds, i, followers)
    follows_df, seeds = scrape_data(follows_df, 'following', seeds, i, followers)
    reposts_df, seeds = scrape_data(reposts_df, 'reposts', seeds, i, followers)

    num += 1


490
117
470
480
470
450
500
258
318
20
480
470
460
29
460
460
470
14
470
470
469
50
62
257
450
490
52
0
57
440
480
94
1
19
480


MemoryError: Unable to allocate 2.04 GiB for an array with shape (2243, 121894) and data type int64

In [192]:
#Run this cell after terminating program to save progress. Try to only stop the program when it's
#still on an artist's likes page to prevent a dimensional mismatch

# Persist the four interaction matrices.
likes_df.to_parquet("likes_df.parquet")
comments_df.to_parquet("comments_df.parquet")
follows_df.to_parquet("follows_df.parquet")
reposts_df.to_parquet("reposts_df.parquet")

# Persist the scraping order and the full seed list.
remaining_seeds.to_csv("remaining_seeds.csv", index=False)

seeds.to_csv("seeds.csv", index=False)

# Persist the current position in remaining_seeds. The context manager
# closes the file even if the write raises.
with open("tracking_num.txt", "w") as file:
    file.write(str(num))

Actual plotting/data cleanup happens in the soundcloud_plotting_clean.ipynb file