In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.manifold import TSNE
import umap.umap_ as umap

In [2]:
# Load the interaction matrix together with its column and row label tables.

# Interaction matrix, converted to CSR right away because later cells slice it by row.
sparse_matrix = scipy.sparse.load_npz("C:\\soundcloud_project\\v3.0\\sparse_matrix.npz").tocsr()

# Column labels.
seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\seeds.csv")

# Row labels. The label file can list accounts the scraper had not reached yet,
# so only the leading labels that actually have a matrix row are kept.
n_scraped_rows = sparse_matrix.shape[0]
remaining_seeds = pd.read_csv("C:\\soundcloud_project\\v3.0\\remaining_seeds.csv").iloc[:n_scraped_rows]


In [3]:
# Keep only accounts whose summed interaction score (row sum) is greater than 40.

sums = sparse_matrix.sum(axis=1)
mask = np.ravel(sums > 40)  # flatten the (n, 1) comparison result into a 1-D boolean mask

print(remaining_seeds.shape)  # label count before filtering
remaining_seeds = remaining_seeds[mask]
sparse_matrix = sparse_matrix[mask, :]
print(remaining_seeds.shape)  # label count after filtering


(9774, 4)
(8795, 4)


In [4]:
# Remove accounts which did not get properly assigned an emerge date or a followers count.
# NOTE(review): the frame printed further down still shows an emerge_date of 0, so 'a' may
# not be the only placeholder value the scraper writes - confirm against the scraper.

mask_2 = remaining_seeds['emerge_date'].eq('a')  # rows carrying the 'a' placeholder date
mask_3 = remaining_seeds['followers'].eq(0)      # rows with a followers count of zero

# A row survives only when neither condition applies.
mask_final = ~mask_2 & ~mask_3

sparse_matrix = sparse_matrix[mask_final, :]
remaining_seeds = remaining_seeds[mask_final]

0
437
8358
(8358, 4)


In [None]:
# Scale every row by the reciprocal of its interaction total, so each row of
# mat_float_normalize holds that account's share of interactions per column.

sums = sparse_matrix.sum(axis=1)

# Element-wise 1 / row_total, kept as an (n, 1) column so it broadcasts across each row.
recips = 1.0 / sums.astype('float')

counts_as_float = sparse_matrix.astype('float')
mat_float_normalize = counts_as_float.multiply(recips)

In [6]:
# Remove accounts which give 40% or more of their interactions to a single account.

maxes = mat_float_normalize.max(axis=1)

maxes = maxes.todense()

# Flatten to a 1-D boolean array *before* using it as an indexer; the comparison on the
# dense result is a 2-D (n, 1) matrix, which is not a well-defined DataFrame row mask.
total_activity_mask = np.ravel(maxes < 0.4)

remaining_seeds = remaining_seeds[total_activity_mask]

mat_float_normalize = mat_float_normalize.tocsr()[total_activity_mask, :]

# The raw count matrix must be filtered with the same mask: the edge-list cell further
# down pairs sparse_matrix rows with remaining_seeds rows purely by position, so leaving
# it unfiltered would attribute edges to the wrong accounts.
sparse_matrix = sparse_matrix[total_activity_mask, :]

# All three row-aligned objects have to stay the same length.
assert mat_float_normalize.shape[0] == remaining_seeds.shape[0] == sparse_matrix.shape[0]

print(mat_float_normalize.shape)
print(remaining_seeds.shape)



(8095, 489554)
(8095, 4)


In [7]:
# Perform the initial dimensionality reduction: project the normalized interaction
# matrix onto 100 components with truncated SVD.

from sklearn.decomposition import TruncatedSVD

# TruncatedSVD uses a randomized solver by default, so without a fixed seed the
# components (and everything derived from them) change from run to run.
svd = TruncatedSVD(n_components=100, random_state=42)

arr_svd = svd.fit_transform(mat_float_normalize)


In [None]:
# Run UMAP on the SVD components.

umap_params = dict(n_neighbors=100, min_dist=0.0001, metric='euclidean', low_memory=True)
reducer = umap.UMAP(**umap_params)

# Due to memory issues this step was actually done in Colab.
reduced = reducer.fit_transform(arr_svd)
reduced.shape

In [213]:
# Cluster the UMAP output with HDBSCAN. None of the visualizations use the result,
# but these settings seemed to cluster the data relatively well.

import hdbscan

hdbscan_params = dict(min_cluster_size=10, min_samples=4, cluster_selection_epsilon=0.09)
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
clusters = clusterer.fit_predict(reduced)

In [12]:

# Re-index the surviving accounts by their profile link and build the full profile URLs.
final_seeds = remaining_seeds.set_index('links')

links = final_seeds.index
urls = ['https://soundcloud.com/' + link for link in links]


In [13]:
# Assign each account a generation index (0-5) from its emerge date.
print(final_seeds)

# Upper (exclusive) date bound of generations 0..4; anything not before the last
# bound falls into generation 5.
GENERATION_CUTOFFS = [
    pd.to_datetime(bound)
    for bound in ('01/01/2013', '01/01/2015', '01/01/2017', '01/01/2019', '01/01/2021')
]


def _generation_for(raw_date):
    """Return the generation index (0-5) for one raw emerge_date value.

    The value is parsed once with errors='coerce'. A value that cannot be parsed
    becomes NaT, compares False against every cutoff and therefore ends up in the
    last generation (5).
    NOTE(review): that puts undated accounts into 'Generation VI' - confirm this is wanted.
    """
    emerged = pd.to_datetime(raw_date, errors='coerce')
    for generation, cutoff in enumerate(GENERATION_CUTOFFS):
        if emerged < cutoff:
            return generation
    return len(GENERATION_CUTOFFS)


# One parse per row instead of re-parsing the same cell for every branch.
generations = [_generation_for(raw_date) for raw_date in final_seeds['emerge_date']]

                                     names  followers emerge_date
links                                                            
kggn                                  kuru      10301   1/10/2020
axxturel          @kingaxxturel #lastkings      15762   2/21/2019
paulonrecords                         ⠀⠀⠀⠀       2142   10/2/2015
blackwinterwells          blackwinterwells      11432   2/20/2020
djphat1996                         DJ PHAT      27569   10/5/2018
...                                    ...        ...         ...
typicaledmon                         Edmon        660  09/08/2019
ukaju                                ujaku        597  10/09/2016
bandedupkelo                  BandedUpKelo       1355  08/21/2016
sleeplessxcity            sleepless X city        219  05/09/2017
1poohy                               Poohy        191           0

[8095 rows x 3 columns]


In [None]:
# Obtains the position of each point within its own generation/trace.
# Not necessary when the graph was a single trace, but the search function needs it to
# address a point as (trace number, position inside that trace).

# Compare against an array: on a fresh top-to-bottom run `generations` is still a plain
# list at this point, so `generations == i` would be a single False rather than an
# element-wise mask and no per-trace positions would be computed.
generations_arr = np.asarray(generations)

pos_within_trace = np.arange(len(generations_arr))

for i in range(6):
    mask = generations_arr == i
    # Number the members of generation i 0, 1, 2, ... in their existing order.
    pos_within_trace[mask] = np.arange(mask.sum())

# Plain Python ints, so the list can be embedded in the generated JavaScript as-is.
pos_within_trace = pos_within_trace.tolist()
pos_within_trace[:10]

In [None]:
#Forms the plotly graph and writes it, together with a search box and a
#click-to-open-profile handler, to a standalone HTML file.

import json
import re

from plotly.offline import plot
import pandas as pd
import plotly.graph_objs as go


def _to_js_literal(value):
    """Serialize `value` as a JavaScript literal that is safe inside an inline <script>.

    json.dumps emits valid JS array/string syntax (the repr of a Python list is not
    guaranteed to be), and escaping "</" keeps a scraped display name from closing the
    surrounding <script> element early.
    """
    return json.dumps(value).replace('</', '<\\/')


links = final_seeds.index
urls = np.array(['https://soundcloud.com/' + link for link in links])
generations = np.array(generations)

#make hover labels: bold display name, then followers count and 1-based generation
#NOTE(review): display names are scraped text and are embedded in plotly's pseudo-HTML
#unescaped - confirm that is acceptable.
labels_2 = np.array([
    '<b>' + name + '</b><br><sub>followers: ' + str(followers)
    + '   ||   Generation ' + str(generation + 1) + '</sub>'
    for name, followers, generation
    in zip(final_seeds['names'], final_seeds['followers'], generations)
])

#make searchable text strings: the url slug and the display name, lower-cased and
#separated by '|||'
search_text = [
    (link + '|||' + name).lower()
    for link, name in zip(final_seeds.index, final_seeds['names'])
]

# Build one scatter trace per generation and store the profile URLs in the customdata
# property. The values of this list will be easy to get to in the
# JavaScript callback below

#manually get colors for gradient from gen 1 -> 6
colors = ['#0d0887', '#6c01a7', '#b22d8f', '#e26761', '#fcaa36', '#f1f822']
names = ['Generation I (pre-2013)', 'Generation II (2013-2014)', 'Generation III (2015-2016)', 'Generation IV (2017-2018)', 'Generation V (2019-2020)', 'Generation VI (2021-present)']

#originally this was done in one single trace, but each generation was divided into an individual
#trace to make the legend work
data = []
for i in range(6):
    mask = generations == i
    data.append(
        go.Scatter(
            x=reduced[mask, 0],
            y=reduced[mask, 1],
            hovertext=labels_2[mask],
            hoverinfo='text',
            marker=dict(color=colors[i]),
            mode='markers',
            customdata=urls[mask],
            name=names[i],
        )
    )


# Build layout
layout = go.Layout(
    hovermode='closest',
    # the "here" link previously read <a href'...'> (no "="), which rendered as a dead link
    title="The Shape of SoundCloud<br><sub>Generated by scraping recent SoundCloud activity (April-May 2021) from over 8,000 artists. Read more <a href='https://pswjt1.medium.com/visualizing-the-shape-of-soundcloud-communities-with-web-scraping-and-machine-learning-cc1c5d948f78'>here.</a>",
    xaxis_title ="<sub>Made by <a href='https://twitter.com/pswjt'>@pswjt</a>",

    #there are definitely more scenes you could define, intentionally left the labels a
    #little vague
    annotations = [go.layout.Annotation(x=5.8, y=0.5, text='Digicore', showarrow=False, font={'size':16}),
                   go.layout.Annotation(x=1.7, y=1.0, text='Plugg', showarrow=False, font={'size':16}),
   #                go.layout.Annotation(x=0.4, y=3.6, text='Emo Rap', showarrow=False, font={'size':16}),
                   go.layout.Annotation(x=0.6, y=8.4, text='Wave', showarrow=False, font={'size':16}),
                   go.layout.Annotation(x=10.2, y=4.3, text='Dubstep', showarrow=False, font={'size':16}),
                  ]
)


# Build Figure
fig = go.Figure(
    layout=layout,
)

for trace in data:
    fig.add_trace(trace)

fig.update_layout(
    legend=dict(
        yanchor='bottom',
        y=0.03,
        xanchor='right',
        x=0.99
    )
)

fig.update_xaxes(showgrid=False,zeroline=False,visible=False)
fig.update_yaxes(showgrid=False,zeroline=False,visible=False)


fig.update_layout(template='plotly_white')

#prepares the search strings, generation assignments and in-trace positions for
#injection into the javascript code (plain ints only: numpy scalars are not JSON
#serializable)
links_js = _to_js_literal(search_text)
generations_js = _to_js_literal(generations.tolist())
pos_within_trace_js = _to_js_literal([int(position) for position in pos_within_trace])

# Get HTML representation of plotly.js and this figure
plot_div = plot(fig, output_type='div', include_plotlyjs=True)

# Get id of html div element that looks like
# <div id="301d22ab-bfba-4621-8f5d-dc4fd855bb33" ... >
res = re.search('<div id="([^"]*)"', plot_div)
if res is None:
    raise RuntimeError('could not find the plot <div> id in the HTML generated by plotly')
div_id = res.groups()[0]

# Build JavaScript callback for handling clicks
# and opening the URL in the trace's customdata
js_callback = """
<div id="Search box" style="z-index:100; position:relative; margin-top: -3%;">
<input type="text" id="inputText">
<button onclick="search()" id="Search">Search</button>
<span id="status"></span>
</div>
<script>

var input = document.getElementById('inputText');
input.addEventListener("keyup", function(event) {{
    if (event.keyCode === 13) {{
        event.preventDefault();
        document.getElementById("Search").click();
    }}
}});

var plot_element = document.getElementById("{div_id}");
plot_element.on('plotly_click', function(data){{
    console.log(data);
    var point = data.points[0];
    if (point) {{
        console.log(point.customdata);
        window.open(point.customdata);
    }}
}})

var generations = {generations};
var points = {links};
var pos_within_trace = {pos_within_trace};

var dict = {{}};
points.forEach((point, i) => dict[point] = generations[i]);
console.log(dict);

function search() {{
    document.getElementById("status").innerHTML = '';
    var i = 0;
    var found = [];
    var myDiv = document.getElementsByClassName("js-plotly-plot")[0];
    var text = document.getElementById("inputText").value.toLowerCase();
    for (i = 0; i < points.length; i += 1) {{
        if (points[i].includes(text)) {{
            found.push({{curveNumber: generations[i], pointNumber: pos_within_trace[i]}});
            console.log(found);
        }}
    }}

    if (found.length == 0){{
        document.getElementById("status").innerHTML = ' Artist not found'
    }} else if (found.length > 20) {{
        document.getElementById("status").innerHTML = ' Search something more specific'
    }} else {{
        Plotly.Fx.hover(myDiv, found);
    }}

}}
</script>
""".format(div_id=div_id, links=links_js, pos_within_trace=pos_within_trace_js, generations=generations_js)

# Build HTML string
html_str = """
<html>
<body>
{plot_div}
{js_callback}
</body>
</html>
""".format(plot_div=plot_div, js_callback=js_callback)

#Write out HTML file
with open('C:\\soundcloud_project\\soundcloud_map_v3_final.html', 'w', encoding='utf-8') as f:
    f.write(html_str)

fig.show()

In [None]:
# Boolean Series over the column labels: True where that column's account also
# appears among the row accounts that survived filtering.

row_account_links = remaining_seeds['links'].to_numpy()
isin_mask = seeds['links'].isin(row_account_links)
isin_mask

In [None]:
# Forms the list of weighted edges (row account -> column account, interaction count)
# from the sparse matrix for construction of the network graph.

import warnings

# Only counts strictly above this become edges; there are too many edges otherwise.
MIN_EDGE_WEIGHT = 3

# Work on a canonical CSR copy (duplicates summed, column indices sorted) so each row's
# stored entries can be read directly instead of densifying every full-width row.
counts_csr = sparse_matrix.tocsr(copy=True)
counts_csr.sum_duplicates()

column_links = seeds['links'].to_numpy()
row_links = remaining_seeds['links'].to_numpy()
column_is_row_account = np.asarray(isin_mask, dtype=bool)

if len(column_links) != counts_csr.shape[1]:
    raise ValueError('seeds must contain exactly one label per sparse_matrix column')

# Rows are matched to labels purely by position, so both must have been filtered identically.
if counts_csr.shape[0] != len(row_links):
    warnings.warn(
        'sparse_matrix has %d rows but remaining_seeds has %d; edges may be attributed to '
        'the wrong accounts' % (counts_csr.shape[0], len(row_links))
    )

edge_list = []

for i, i_name in enumerate(row_links):
    row_start, row_stop = counts_csr.indptr[i], counts_csr.indptr[i + 1]
    cols = counts_csr.indices[row_start:row_stop]
    values = counts_csr.data[row_start:row_stop]

    # Keep targets that are themselves row accounts and whose count clears the threshold.
    keep = (values > MIN_EDGE_WEIGHT) & column_is_row_account[cols]

    edge_list.extend(
        (i_name, column_links[col], value)
        for col, value in zip(cols[keep], values[keep])
    )

In [175]:
# Creates the network graph in the networkx library and exports it to gephi (GEXF).
# An OpenOrd layout was used to create the network graph layout and the result was
# exported to sigma.js format.

import networkx as nx

# Per-account attribute lookups, keyed by profile link.
rem_seeds = remaining_seeds.set_index('links')
name_dict = rem_seeds['names'].to_dict()
follow_dict = rem_seeds['followers'].to_dict()
emerge_dict = rem_seeds['emerge_date'].to_dict()
# url_dict was never defined anywhere in the notebook, so the 'url' attribute call
# raised NameError on a fresh kernel; build it the same way as the plot URLs.
url_dict = {link: 'https://soundcloud.com/' + link for link in rem_seeds.index}

G = nx.DiGraph()

# Add every account as a node first so accounts without any edge are still exported.
G.add_nodes_from(remaining_seeds['links'])
G.add_weighted_edges_from(edge_list)

nx.set_node_attributes(G, name_dict, 'label')
nx.set_node_attributes(G, follow_dict, 'followers')
nx.set_node_attributes(G, emerge_dict, 'emerge date')
nx.set_node_attributes(G, url_dict, 'url')

nx.write_gexf(G, path='C:\\soundcloud_project\\v3.0\\gexf_sc.gexf')