Search using Top Charts and HipHopCharts, created connected graph of artists

artist can collaborate with other artists, so we can consider Ingoing and Outgoing links for page rank

Create visual circles describing 

Graph of words for different genres: https://towardsdatascience.com/getting-started-with-graph-analysis-in-python-with-pandas-and-networkx-5e2d2f82f18e

https://networkx.github.io/
https://plot.ly/python/3d-network-graph/

Max distance from one node to another
Avg distance from one node to another
Pagerank


artist = domain
song = document
collaborators = outgoing links / incoming links
lyrics = docuemnt text

In [298]:
from pymongo import MongoClient

# DB CONFIG
client = MongoClient('mongo', 27017)
db = client.music_db

In [299]:
db.collection_names()


collection_names is deprecated. Use list_collection_names instead.



['pop-songs',
 'latin-songs',
 'top_songs',
 'country-songs',
 'rock-songs',
 'songs',
 'jazz-songs',
 'all-songs',
 'christian-songs',
 'rap-song']

In [300]:
songs = list(db['songs'].find())
len(songs)

7354

In [301]:
# Filter Collaboration Songs 
SEPARATORS = ['&', 'Featuring', 'With', ',', 'Feat.']

def is_collaboration(artist):
    return any([word in artist for word in SEPARATORS])

collaborations = songs #[song for song in songs if is_collaboration(song['artist'])]
len([song for song in songs if is_collaboration(song['artist'])])

2919

In [302]:
import re
from typing import List
import re

def clean(artists: List) -> List:
    artists = [artist.strip() for artist in artists]
    alphanum = re.compile(r'[^\s\w]+')
    artists = [alphanum.sub('', artist).title() for artist in artists]
    return artists

def separate(artist: str) -> List:
    return re.split('|'.join(SEPARATORS), artist)

def add_collaborators_to_songs(collaborators, songs):
    for collabs, song in zip(collaborators, songs):
        song['collaborators'] = collabs
    return songs

collaborators = [song['artist'] for song in collaborations]
collaborators = map(separate, collaborators)
collaborators = map(clean, collaborators)
collaborators = list(collaborators)        
collaborations = add_collaborators_to_songs(collaborators, collaborations)
collaborators[:2]

[['Drake'], ['Cardi B', 'Bad Bunny', 'J Balvin']]

In [331]:
from collections import defaultdict, namedtuple
from pandas import DataFrame

def song_dataframe(songs):
    rows = []
    for song in songs:
        row = [song['title'], song['artist']]
        rows.append(row)
    return DataFrame(rows, columns=['title', 'artist'])

def songs_df_to_artist_df(songs_df):
    primary_dict = defaultdict(list)
    featured_dict = defaultdict(list)
    
    for index, row in songs_df.iterrows():
        primary, *collabs = row['artist']
        primary_dict[primary].append(row['title'])
        for artist in collabs:
            featured_dict[artist].append(row['title'])
    
    songs_df = DataFrame(list(primary_dict.items()))
    _df = DataFrame(list(featured_dict.items()))
    songs_df = songs_df.merge(_df, on=0)
    songs_df.columns = ['Artist', 'Primary', "Collaboration"]
    return songs_df
        
songs_df = song_dataframe(songs)
songs_df['artist'] = songs_df['artist'].apply(separate).apply(clean)
artist_df = songs_df_to_artist_df(songs_df)

artist_df['Primary'] = artist_df['Primary'].apply(len)
artist_df['Collaboration'] = artist_df['Collaboration'].apply(len)
artist_df['Total Songs'] = artist_df['Primary'] + artist_df['Collaboration']
artist_df = artist_df.sort_values('Total Songs', ascending=False).reset_index(drop=True)
artist_df[:50]

Unnamed: 0,Artist,Primary,Collaboration,Total Songs
0,Drake,137,67,204
1,Lil Wayne,38,116,154
2,Jayz,67,48,115
3,Future,62,47,109
4,Kanye West,59,42,101
5,Chris Brown,49,52,101
6,Nicki Minaj,42,52,94
7,Rick Ross,31,60,91
8,R Kelly,64,22,86
9,Ti,46,33,79


In [304]:
collaborations[0]['peakPos']

5

In [315]:
import networkx as nx
G = nx.DiGraph()

In [316]:
from functools import reduce
from operator import add

def unique_artists(songs):
    artists = [song['collaborators'] for song in songs]   # List of lists
    artists = reduce(add, collaborators)
    artists = [artist.strip() for artist in artists]
    return set(artists)

nodes = unique_artists(collaborations)
G.add_nodes_from(nodes)

In [317]:
from typing import List, Tuple

def add_edges(graph, songs) -> List[Tuple]:
    """ Returns tuples, which creates an edge between collaborators """
    edges = []
    for song in songs:
        collabs = song['collaborators']
        primary = collabs[0]
        for collab in collabs:
            
            weight = 1 / song['peakPos'] / len(collabs)
            naive_weight = 1 / len(collabs)
            
            if graph.has_edge(primary, collab):
                graph.edges[primary, collab]['weight'] += weight
                graph.edges[primary, collab]['naive_weight'] += naive_weight
                
            else:
                graph.add_edge(primary, collab, weight=weight)
                graph.add_edge(primary, collab, naive_weight=naive_weight)
    
add_edges(G, collaborations)

## Draw Graph


In [254]:
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.plotly as py

options = {}

def assign_positions(graph: nx.Graph):
    positions = nx.spring_layout(G)
    nx.set_node_attributes(G, positions, 'pos')
    return G
    
assign_positions(G)

<networkx.classes.digraph.DiGraph at 0x7f66001f77b8>

In [255]:
def plot_graph(G):

    pos = nx.get_node_attributes(G, 'pos')

    dmin = 1
    ncenter = 'Drake'
    for n in pos:
        x, y= pos[n]
        d=(x-0.5)**2+(y-0.5)**2
        if d<dmin:
            ncenter=n
            dmin=d

    p = nx.single_source_shortest_path_length(G,ncenter)

    # 

    edge_trace = go.Scatter(
        x=[],
        y=[],
        line=dict(width=0.5,color='#888'),
        hoverinfo='none',
        mode='lines')

    for edge in G.edges():
        x0, y0 = G.node[edge[0]]['pos']
        x1, y1 = G.node[edge[1]]['pos']
        edge_trace['x'] += tuple([x0, x1, None])
        edge_trace['y'] += tuple([y0, y1, None])

    node_trace = go.Scatter(
        x=[],
        y=[],
        text=[],
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=[],
            size=10,
            colorbar=dict(
                thickness=15,
                title='Node Connections',
                xanchor='left',
                titleside='right'
            ),
            line=dict(width=2)))

    for node in G.nodes():
        x, y = G.node[node]['pos']
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])

    adj_length = len(list(G.adjacency()))

    from IPython.display import display, clear_output

    i = 0
    for node, adjacencies in enumerate(G.adjacency()):
        node_trace['marker']['color']+=tuple([len(adjacencies[1])])
        name = adjacencies[0]
        node_info = f"""
                        {name} | 
                        # of outgoing connections: {len(adjacencies[1])} | 
                        # of incoming connections: {G.in_degree(name)}
                        """
        node_trace['text'] += tuple([node_info])
        i +=1
        if i % 10 == 0:
            clear_output(wait=True)
            print(f'{i}/{adj_length}', end=" ")

    fig = go.Figure(data=[edge_trace, node_trace],
                 layout=go.Layout(
                    title='<br>Artists Connections',
                    titlefont=dict(size=16),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    annotations=[ dict(
                        text="Python code: <a href='https://plot.ly/ipython-notebooks/network-graphs/'> https://plot.ly/ipython-notebooks/network-graphs/</a>",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002 ) ],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    return py.iplot(fig, filename='networkx')
    


In [36]:
plot_graph(G)

3100/3102 


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!



In [256]:
# Sort by outgoing connections
# Sort by incoming connections

In [318]:
def sorted_nodes(g: nx.Graph, out_degree=False):
    """in_degree is False"""
    sort_type = G.out_degree if out_degree else G.in_degree
    sorted_n = sorted(G.nodes, key=sort_type, reverse=True)
    return [(node, sort_type(node)) for node in sorted_n]

in_degrees = sorted_nodes(G)
out_degrees = sorted_nodes(G, out_degree=True)

In [319]:
from pprint import pprint
from pandas import DataFrame 
import pandas as pd
from operator import itemgetter

df_in = DataFrame(in_degrees, columns=['Artist', 'In_Degree'])
df_out = DataFrame(out_degrees, columns=['Artist', 'Out_Degree'])
df = pd.merge(df_in, df_out, on=['Artist'])
df['Total_Degree'] = df['In_Degree'] + df['Out_Degree']
df = df.sort_values('Total_Degree', ascending=False)
df = pd.merge(df, artist_df, on='Artist')
df[:50]

Unnamed: 0,Artist,In_Degree,Out_Degree,Total_Degree,Primary,Collaboration,Total Songs
0,Lil Wayne,68,30,98,38,116,154
1,Drake,41,37,78,137,67,204
2,Chris Brown,38,33,71,49,52,101
3,Kanye West,36,32,68,59,42,101
4,Rick Ross,39,26,65,31,60,91
5,Nicki Minaj,43,19,62,42,52,94
6,Jayz,32,28,60,67,48,115
7,Ludacris,37,21,58,29,40,69
8,Ti,31,22,53,46,33,79
9,Dj Khaled,5,46,51,32,4,36


In [292]:
def subgraph(G, n_keep=3):
    to_keep = []
    for i, row in df.iterrows():
        if row['In_Degree'] >= n_keep or row['Out_Degree'] >= n_keep:
            to_keep.append(row['Artist'])  
            
    return G.subgraph(to_keep)

sub_g = subgraph(G, n_keep=15)
assign_positions(sub_g)
len(sub_g)

56

In [263]:
plot_graph(sub_g)

50/54 


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc.

In [264]:
# Apply algorithms from https://networkx.github.io/documentation/stable/reference/algorithms/index.html
# Make weight the best ranking of the song
# Distribute page rank equally among artists
import networkx.algorithms as nxa

#nxa.average_shortest_path_length(H)
sc = list(nxa.strongly_connected_components(G))
nodes = max(sc, key=len)
SC = G.subgraph(nodes)
len(SC)

331

In [43]:
plot_graph(SC)

330/331 


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!



In [83]:
nxa.average_shortest_path_length(SC.to_undirected())

2.8595088520845233

In [265]:
import networkx.algorithms as nxa
from functools import reduce
from operator import or_

weak_components = list(nxa.weakly_connected_components(G))
weak_component = max(weak_components, key=len)
WK = G.subgraph(weak_component)
len(WK)

1603

In [267]:
plot_graph(WK)

1600/1603 


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc.

In [266]:
nxa.average_shortest_path_length(WK.to_undirected())

4.647910479959938

In [324]:
from collections import Counter

def page_rank(G, weight=None):
    pagerank = nxa.pagerank(G, weight=weight)
    pagerank = Counter(pagerank).most_common()
    return pagerank

G = G.to_undirected()
pagerank = page_rank(G)
_1 = DataFrame(pagerank, columns=["Artist", "Page_Rank"])

pagerank = page_rank(G, weight='naive_weight')
_2 = DataFrame(pagerank, columns=["Artist", "Page_Rank_Naive_Weighted"])

pagerank = page_rank(G, weight='weight')
_3 = DataFrame(pagerank, columns=["Artist", "Page_Rank_Weighted"])

newdf = df.merge(_1, on='Artist')
newdf = newdf.merge(_2, on='Artist')
newdf = newdf.merge(_3, on='Artist')

_df = newdf.sort_values('Page_Rank', ascending=False).reset_index()[:50]
_df = _df.rename(columns={'index': 'old index (Collaboration)'})
_df

Unnamed: 0,old index (Collaboration),Artist,In_Degree,Out_Degree,Total_Degree,Primary,Collaboration,Total Songs,Page_Rank,Page_Rank_Naive_Weighted,Page_Rank_Weighted
0,0,Lil Wayne,68,30,98,38,116,154,0.004385,0.003969,0.003576
1,1,Drake,41,37,78,137,67,204,0.003478,0.005794,0.009204
2,2,Chris Brown,38,33,71,49,52,101,0.00335,0.003593,0.003535
3,6,Jayz,32,28,60,67,48,115,0.003284,0.004771,0.004771
4,3,Kanye West,36,32,68,59,42,101,0.00294,0.003212,0.003836
5,4,Rick Ross,39,26,65,31,60,91,0.002824,0.002324,0.001781
6,7,Ludacris,37,21,58,29,40,69,0.002792,0.002235,0.001958
7,11,Snoop Dogg,27,22,49,26,31,57,0.002744,0.0026,0.002406
8,5,Nicki Minaj,43,19,62,42,52,94,0.002547,0.002465,0.002925
9,8,Ti,31,22,53,46,33,79,0.002396,0.002716,0.002486


In [325]:
_df = newdf.sort_values('Page_Rank_Naive_Weighted', ascending=False).reset_index()
_df = _df.rename(columns={'index': 'old index (Collaboration)'})
_df[:50]

Unnamed: 0,old index (Collaboration),Artist,In_Degree,Out_Degree,Total_Degree,Primary,Collaboration,Total Songs,Page_Rank,Page_Rank_Naive_Weighted,Page_Rank_Weighted
0,1,Drake,41,37,78,137,67,204,0.003478,0.005794,0.009204
1,6,Jayz,32,28,60,67,48,115,0.003284,0.004771,0.004771
2,0,Lil Wayne,68,30,98,38,116,154,0.004385,0.003969,0.003576
3,2,Chris Brown,38,33,71,49,52,101,0.00335,0.003593,0.003535
4,3,Kanye West,36,32,68,59,42,101,0.00294,0.003212,0.003836
5,24,R Kelly,20,16,36,64,22,86,0.001881,0.002976,0.003949
6,10,Future,32,18,50,62,47,109,0.002117,0.002891,0.002948
7,19,Eminem,14,25,39,48,17,65,0.001961,0.002878,0.004203
8,21,Kendrick Lamar,20,17,37,44,20,64,0.002099,0.002727,0.003795
9,8,Ti,31,22,53,46,33,79,0.002396,0.002716,0.002486


In [333]:
_df2 = _df.sort_values('Page_Rank_Weighted', ascending=False)
_df2 = _df2.reset_index()
_df2 = _df2.rename(columns={'index': 'old index (naive_weighted)'})
_df2[:100]

Unnamed: 0,old index (naive_weighted),old index (Collaboration),Artist,In_Degree,Out_Degree,Total_Degree,Primary,Collaboration,Total Songs,Page_Rank,Page_Rank_Naive_Weighted,Page_Rank_Weighted
0,0,1,Drake,41,37,78,137,67,204,0.003478,0.005794,0.009204
1,1,6,Jayz,32,28,60,67,48,115,0.003284,0.004771,0.004771
2,7,19,Eminem,14,25,39,48,17,65,0.001961,0.002878,0.004203
3,5,24,R Kelly,20,16,36,64,22,86,0.001881,0.002976,0.003949
4,4,3,Kanye West,36,32,68,59,42,101,0.002940,0.003212,0.003836
5,8,21,Kendrick Lamar,20,17,37,44,20,64,0.002099,0.002727,0.003795
6,2,0,Lil Wayne,68,30,98,38,116,154,0.004385,0.003969,0.003576
7,3,2,Chris Brown,38,33,71,49,52,101,0.003350,0.003593,0.003535
8,40,9,Dj Khaled,5,46,51,32,4,36,0.002270,0.001472,0.003404
9,6,10,Future,32,18,50,62,47,109,0.002117,0.002891,0.002948


In [None]:
G