# Networks Summative Notebook

## Set Up

In [None]:
# Mount Google Drive
# NOTE(review): this cell only works inside Google Colab. The %cd target is a hard-coded
# Drive folder; the relative "data/..." paths used in later cells are resolved against it.
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/OII/Social_Networks/Summative/Data Analysis

In [None]:
# Import Necessary Packages
import numpy as np
import networkx as nx
import pandas as pd
import csv
import ast
import scipy
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def csv_to_list(filename, delimiter='\t'):
    '''
    Read a delimited text file into a list of rows.

    Parameters:
      - filename: path of the file to read
      - delimiter: field separator handed to csv.reader (tab by default)

    Returns: a list with one list of strings per row. No row is treated
    specially, so a header line (if the file has one) is the first element.
    '''
    # newline='' lets the csv reader do its own line-ending handling, so line
    # breaks embedded in quoted fields are returned exactly as stored in the file
    with open(filename, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile, delimiter=delimiter))

## Artist Networks

In [None]:
# Load the artist table: the first row holds the column names, the rest are artist records
artists = csv_to_list("data/spotify_artists.csv")
cols = artists.pop(0)

# .copy() makes the three-column selection an independent frame, so the column
# assignment below does not write into a slice of the full table
artist_df = pd.DataFrame(artists, columns=cols)[["name", "popularity", "genres"]].copy()

# REMOVE DUPLICATES AND ARTISTS WITH NO GENRES (FOR SIMPLICITY AND LESS RISK OF REPEATED NODES)
# genres arrives as text; literal_eval turns it into a Python object whose length is checked
# here (later cells index it with [0], so it is presumably a list of genre names)
artist_df['genres'] = artist_df['genres'].apply(ast.literal_eval)
artist_df = artist_df[artist_df['genres'].apply(len) != 0]
# One row per artist name (first occurrence wins). Done after the empty-genre filter so a
# name is only lost if none of its rows has a genre. Without this step, repeated names
# would multiply rows in the name-based merges further down.
artist_df = artist_df.drop_duplicates(subset='name', keep='first')

# 2 additional artist dataframes for merging
artist_df_1 = artist_df.rename(columns={'name': 'artist_1'})
artist_df_2 = artist_df.rename(columns={'name': 'artist_2'})

display(artist_df)

In [None]:
# 2017 global artist collaboration table: header row first, one record per remaining row
data_17 = csv_to_list("data/global_2017.csv")
cols, data_17 = data_17[0], data_17[1:]

df_17 = pd.DataFrame(data=data_17, columns=cols)
#display(df_17)

In [None]:
# merge artists with collaboration to get total number of artists who collaborated on a charting song in 2017 and genres for each artist
# Both merges are inner joins, so a pair is kept only if both artists appear in the artist table.
merged_df1 = pd.merge(df_17, artist_df_1, on='artist_1', how='inner')

merged_df2 = pd.merge(merged_df1, artist_df_2, on='artist_2', how='inner')
# the second merge suffixes the clashing columns: _x came in with artist_1, _y with artist_2
merged_df2 = merged_df2.rename(columns={'popularity_x': 'popularity_1', 'popularity_y': 'popularity_2', 'genres_x': 'genres_1', 'genres_y': 'genres_2'})
# .copy() so the column assignments below act on an independent frame, not on a slice
merged_df2 = merged_df2[["artist_1", "artist_2", "count", "genres_1", "genres_2", "popularity_1", "popularity_2", "song_ids"]].copy()
merged_df2['song_ids'] = merged_df2['song_ids'].apply(ast.literal_eval)
merged_df2['count'] = merged_df2['count'].apply(int)

# one row per (artist pair, song) so the weekly charts can be joined on a single song id
merged_df2 = merged_df2.explode("song_ids")
merged_df2 = merged_df2.rename(columns={'song_ids': 'song_id'})

# generate set of artists on which to define the nodes
a1 = list(merged_df2["artist_1"])
a2 = list(merged_df2["artist_2"])
# sorted: iteration order of a set of strings changes between Python sessions, which would
# silently change node indices (and matrix row order) from one run to the next
artist_nodes = sorted(set(a1 + a2))

display(merged_df2)

In [None]:
# artist -> main genre lookup, where the main genre is the first entry of the artist's genre list
art_df_17 = pd.DataFrame(artist_nodes, columns = ["name"])

artist_2_genres_df = (
    art_df_17
    .merge(artist_df, on='name', how='inner')
    .loc[:, ["name", "genres"]]
    .assign(genres=lambda frame: frame["genres"].map(lambda genre_list: genre_list[0]))
)

# built from (name, genre) pairs in row order, so a repeated name keeps its last row's genre
artist_2_genres_dict = dict(artist_2_genres_df[['name', 'genres']].itertuples(index=False, name=None))
display(artist_2_genres_df)

In [None]:
def generate_dfs(path, merge_df):
  '''
  Build one dataframe per csv file in a folder.

  For each .csv file in the folder path:
    - read it into a dataframe (first row is used as the column names)
    - inner-merge it with merge_df on song_id, so only rows whose song_id occurs in both remain
      (these rows supply the nodes and edges of that file's graph)
    - keep the artist, count, genre, song name and popularity columns

  Files are processed in natural filename order (digit runs compare as numbers, so
  "week_2" comes before "week_10"). os.listdir returns names in arbitrary order, and the
  rest of the notebook indexes the returned list by position, so the order must be stable.

  Parameters:
    - path: folder containing the csv files
    - merge_df: dataframe with a song_id column plus the columns selected below

  Returns: list of dataframes, one per csv file, in natural filename order
  '''
  def natural_key(name):
    # re.split with a capturing group alternates text / digit runs; odd positions are the digit runs
    return [int(part) if idx % 2 else part for idx, part in enumerate(re.split(r'(\d+)', name))]

  dfs = []

  for file in sorted(os.listdir(path), key=natural_key):
    if not file.endswith('.csv'):
      continue
    file_path = os.path.join(path, file)
    data = csv_to_list(file_path)
    cols = data.pop(0)
    data_df = pd.DataFrame(data, columns=cols)
    df = pd.merge(data_df, merge_df, on='song_id', how='inner')
    df = df[["artist_1", "artist_2", "count", "genres_1", "genres_2", "song_name", "popularity_1", "popularity_2"]]
    dfs.append(df)

  return dfs



# One dataframe per csv file found in data/2017/, each already merged with the collaboration table
dfs = generate_dfs("data/2017/", merged_df2)

In [None]:
def generate_graphs(dfs_list, artist_list):
  '''
  Turn each dataframe of collaborations into a weighted undirected graph.

  Every graph starts with all of artist_list as nodes, then gets one edge per
  artist_1/artist_2 pair found in the dataframe, weighted by that row's count
  (the first row seen for a pair sets the weight; later rows for it are ignored).

  Parameters:
    - dfs_list: dataframes with artist_1, artist_2 and count columns
    - artist_list: artist names used as the node set of every graph

  Returns: (adjacency matrices, laplacian matrices, {position in artist_list: artist name}),
  where both matrix lists hold one numpy array per dataframe
  '''
  artist_id_map = dict(enumerate(artist_list))
  adj_matrices, lap_matrices = [], []

  for frame in dfs_list:
    # nodes first, so artists without an edge in this dataframe are still present
    graph = nx.Graph()
    graph.add_nodes_from(artist_list)

    for first, second, n_collabs in zip(frame['artist_1'], frame['artist_2'], frame['count']):
      if graph.has_edge(first, second):
        continue
      graph.add_edge(first, second, weight = n_collabs)

    # dense copies of the weighted adjacency and laplacian matrices
    adj_matrices.append(nx.adjacency_matrix(graph, weight = 'weight').toarray())
    lap_matrices.append(nx.laplacian_matrix(graph, weight = 'weight').toarray())

  return adj_matrices, lap_matrices, artist_id_map


# adjs / laps: one matrix per dataframe in dfs; node_2_artist: node index -> artist name
adjs, laps, node_2_artist = generate_graphs(dfs, artist_nodes)

## Graph Visualization and Analysis (Weeks 1, 25, 50)

For simplicity, I only visualise the largest connected component of each graph.

In [None]:
# Inspect the first dataframe in dfs
display(dfs[0])

In [None]:
# Unique main genres, sorted so that every run gives the same genre the same colour
# (iteration order of a set of strings is not stable between Python sessions)
genres_list = list(artist_2_genres_dict.values())
unique_genres_list = sorted(set(genres_list))

# 45 hand-picked hex colours
genre_palette = [
    "#4e79a7", "#f28e2b", "#e15759", "#76b7b2", "#59a14f",
    "#edc949", "#af7aa1", "#ff9da7", "#9c755f", "#bab0ab",
    "#8c8c8c", "#e377c2", "#7f7f7f", "#c49c94", "#bcbd22",
    "#9067C6", "#82DDF0", "#D6D84F", "#DB2955", "#B98389",
    "#54494B", "#E0FBFC", "#dbdb8d", "#EE6C4D", "#B0CA87",
    "#2E0014", "#700548", "#7272AB", "#B0D7FF", "#17becf",
    "#aec7e8", "#ffbb78", "#98df8a", "#ff9896", "#c5b0d5",
    "#F55D3E", "#f7b6d2", "#c7c7c7", "#8D86C9", "#9edae5",
    "#9467bd", "#d62728", "#2ca02c", "#1f77b4", "#ff7f0e"
]

# Cycle through the palette: with more genres than colours some colours repeat,
# instead of the lookup failing with an IndexError
genre_2_color = {
    genre: genre_palette[i % len(genre_palette)]
    for i, genre in enumerate(unique_genres_list)
}

# Splitting genre to color dictionary into two lists
genres = list(genre_2_color.keys())
colors = list(genre_2_color.values())

# Create figure and axis with two subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Genre Color Map')

# First half of the genres goes in the left panel, the rest in the right one;
# the split follows the actual number of genres rather than a fixed 22 + 22
split = (len(genres) + 1) // 2
for ax, start, stop in ((axes[0], 0, split), (axes[1], split, len(genres))):
    n_bars = stop - start
    ax.barh(range(n_bars), np.ones(n_bars), color=colors[start:stop])
    ax.set_yticks(range(n_bars))
    ax.set_yticklabels(genres[start:stop], fontsize=8)
    ax.set_xticks([])

# Adjust layout
plt.tight_layout()

# Show plot
plt.show()

In [None]:
def draw_lcc_color_by_genre(adj_matrix, node_artist, artist_genre, genre_color, labels = True, seed = 42):
  '''
  Draw the largest connected component (LCC) of a graph with nodes coloured by genre.

  Parameters:
    - adj_matrix: adjacency matrix of the graph as a numpy array
    - node_artist: dict mapping node index -> artist name
    - artist_genre: dict mapping artist name -> genre
    - genre_color: dict mapping genre -> colour
    - labels: whether to draw node labels (the nodes are the integer indices of adj_matrix)
    - seed: seed for the spring layout, so the same graph is drawn the same way on every run;
            pass None for a fresh random layout

  Returns: the subgraph of the largest connected component.
  Raises: ValueError if the graph has no nodes (there is no component to pick),
          KeyError if a node, artist or genre is missing from the lookup dicts.
  '''
  G = nx.from_numpy_array(adj_matrix)
  G_lcc = max(nx.connected_components(G), key=len)
  G_lcc_graph = G.subgraph(G_lcc)

  # node index -> artist -> genre -> colour, in the subgraph's node order
  node_colors = [genre_color[artist_genre[node_artist[node]]] for node in G_lcc_graph.nodes()]

  pos = nx.spring_layout(G_lcc_graph, seed=seed)
  nx.draw(G_lcc_graph, pos, node_size=50, node_color=node_colors, with_labels=labels,font_size=10)

  return  G_lcc_graph


#### Week 1

In [None]:
# Graph LCC Visualisation (compare number of nodes and edges in LCC across graphs for each week)
# adjs[0] is the first adjacency matrix returned by generate_graphs; G1 holds the LCC subgraph that was drawn
G1 = draw_lcc_color_by_genre(adjs[0], node_2_artist, artist_2_genres_dict, genre_2_color)

In [None]:
# Degree by artist and genre (largest connected component)

# Colour of each artist = colour of that artist's main genre (None when the genre has no colour)
artist_2_color = {
  artist: genre_2_color.get(main_genre)
  for artist, main_genre in artist_2_genres_dict.items()
}

# Artist to node degree dict
def artist_deg_map(G, node_artist_dict):
  '''
  Map each artist to the degree of its node in G.

  Parameters:
    - G: graph exposing nodes() and a degree lookup indexed by node
    - node_artist_dict: dict mapping node -> artist name

  Returns: dict {artist name: degree}; a node missing from node_artist_dict is stored under the key None
  '''
  return {node_artist_dict.get(node): G.degree[node] for node in G.nodes()}

# artist -> degree within the week 1 largest connected component
nd_1 = artist_deg_map(G1, node_2_artist)

artists1 = list(nd_1.keys())
degrees1 = list(nd_1.values())
average_value1 = np.mean(degrees1)
print(average_value1)

# one bar per artist, coloured by the artist's genre, with the mean degree as a dashed line
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(range(len(nd_1)), degrees1, tick_label=artists1, color=[artist_2_color[label] for label in artists1])
ax.axhline(y=average_value1, color='black', linestyle='--', label='Average Degree')
ax.set_xlabel('Artists', fontsize=12)
ax.set_ylabel('Node Degrees', fontsize=12)
ax.set_title('Degree by Artist and Genre - Week 1', fontsize=14)
ax.tick_params(axis='x', labelrotation=90)
# without a legend the 'Average Degree' label on the dashed line is never shown
ax.legend()
plt.show()




#### Week 25

In [None]:
# Graph LCC Visualisation
# adjs[24] is the 25th adjacency matrix in the list (zero-based index)
G25 = draw_lcc_color_by_genre(adjs[24], node_2_artist, artist_2_genres_dict, genre_2_color)

In [None]:
# Degree Distribution by artist and genre (largest connected component)
nd_25 = artist_deg_map(G25, node_2_artist)

artists25 = list(nd_25.keys())
degrees25 = list(nd_25.values())
average_value25 = np.mean(degrees25)
print(average_value25)

# one bar per artist, coloured by the artist's genre, with the mean degree as a dashed line
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(range(len(nd_25)), degrees25, tick_label=artists25, color=[artist_2_color[label] for label in artists25])
ax.axhline(y=average_value25, color='black', linestyle='--', label='Average Degree')
ax.set_xlabel('Artists', fontsize=12)
ax.set_ylabel('Node Degrees', fontsize=12)
ax.set_title('Degree by Artist and Genre - Week 25', fontsize=14)
ax.tick_params(axis='x', labelrotation=90)
# without a legend the 'Average Degree' label on the dashed line is never shown
ax.legend()
plt.show()

#### Week 50

In [None]:
# Graph LCC Visualisation
# adjs[49] is the 50th adjacency matrix in the list (zero-based index)
G50 = draw_lcc_color_by_genre(adjs[49], node_2_artist, artist_2_genres_dict, genre_2_color)

In [None]:
# Degree Distribution by artist and genre (largest connected component)
nd_50 = artist_deg_map(G50, node_2_artist)

artists50 = list(nd_50.keys())
degrees50 = list(nd_50.values())
average_value50 = np.mean(degrees50)
print(average_value50)

# one bar per artist, coloured by the artist's genre, with the mean degree as a dashed line
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(range(len(nd_50)), degrees50, tick_label=artists50, color=[artist_2_color[label] for label in artists50])
ax.axhline(y=average_value50, color='black', linestyle='--', label='Average Degree')
ax.set_xlabel('Artists', fontsize=12)
ax.set_ylabel('Node Degrees', fontsize=12)
ax.set_title('Degree by Artist and Genre - Week 50', fontsize=14)
ax.tick_params(axis='x', labelrotation=90)
# without a legend the 'Average Degree' label on the dashed line is never shown
ax.legend()
plt.show()

## Distance Metrics

### Structural Metric: Jaccard Distance

In [None]:
# pairwise distances between all graphs, returned as an array and drawn as a heatmap
def calculate_plot_dists(matrix_list, distance_metric, plt_title):
  '''
  Compute distance_metric for every ordered pair of matrices and plot the result.

  Parameters:
    - matrix_list: list of matrices (one per graph)
    - distance_metric: function taking two matrices and returning a number
    - plt_title: title of the heatmap

  Returns: square numpy array where entry [i, j] is distance_metric(matrix_list[i], matrix_list[j])
  '''
  num_matrices = len(matrix_list)
  distances = np.zeros((num_matrices, num_matrices))
  # every ordered pair is evaluated, including i == j and both (i, j) and (j, i)
  for row, col in np.ndindex(num_matrices, num_matrices):
    distances[row, col] = distance_metric(matrix_list[row], matrix_list[col])

  # plot results in a heatmap
  sns.heatmap(distances,cmap="YlGnBu")

  # label every 5th index; + 0.5 puts the tick in the middle of its heatmap cell
  tick_interval = 5
  ticks_labels = np.arange(0, num_matrices, tick_interval)
  ticks_position = ticks_labels + 0.5
  for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(ticks=ticks_position, labels=ticks_labels)

  for set_axis_label in (plt.xlabel, plt.ylabel):
    set_axis_label('Week')
  plt.title(plt_title)
  plt.show()

  return distances

In [None]:
# define distance calculator function
def jaccard_dist(adj_1, adj_2):
  '''
  Calculates Jaccard distance between two graphs.

  The distance is sum(|adj_1 - adj_2|) / sum(max(adj_1, adj_2)), taken over all matrix entries.

  Parameters: adj_1, adj_2 --> the adjacency matrices
  for each graph as numpy arrays (should have same dimensions)

  Returns: the distance as a number; 0.0 when both matrices are entirely zero.
  Raises: AssertionError if the two shapes differ.
  '''
  assert adj_1.shape == adj_2.shape
  num = np.sum(np.absolute(adj_1 - adj_2))
  denom = np.sum(np.maximum(adj_1, adj_2))

  # two graphs without any edges are identical: return 0 rather than the NaN that 0/0 gives
  if denom == 0 and num == 0:
    return 0.0

  dist = num/denom
  return dist

def weighted_jaccard_dist(adj_1, adj_2):
  '''
  Calculates Jaccard distance between two weighted graphs.

  The distance is 1 - sum(min(adj_1, adj_2)) / sum(max(adj_1, adj_2)), taken over all matrix entries.

  Parameters: adj_1, adj_2 --> the adjacency matrices
  for each graph as numpy arrays (should have same dimensions)

  Returns: the distance as a number; 0.0 when both matrices are entirely zero.
  Raises: AssertionError if the two shapes differ.
  '''
  assert adj_1.shape == adj_2.shape
  num = np.sum(np.minimum(adj_1, adj_2))
  denom = np.sum(np.maximum(adj_1, adj_2))

  # two graphs without any edges are identical: return 0 rather than the NaN that 0/0 gives
  if denom == 0 and num == 0:
    return 0.0

  dist = 1 - num/denom
  return dist

In [None]:
# Pairwise weighted Jaccard distances between all adjacency matrices (also drawn as a heatmap)
jac_dists = calculate_plot_dists(adjs, weighted_jaccard_dist, 'Jaccard Distances Between Adjacency Matrices')

In [None]:
# Raw distance matrix behind the heatmap
print(jac_dists)

### Spectral/Mesoscale Metric: Polynomial Dissimilarity

In [None]:
# define distance calculator function
def poly_dist(adj_1, adj_2, k = 2, alpha = 1):
    '''
    Calculates polynomial dissimilarity between two weighted graphs.
    See specifications in Donnat and Holmes (2018).

    For each matrix A with eigendecomposition Q diag(lambda) Q^T, the polynomial is
    P(A) = Q diag(w) Q^T with w = sum_{j=1..k} lambda^j / (n-1)^(alpha * (j-1)).
    The result is the Frobenius norm of P(adj_1) - P(adj_2), divided by n^k.

    Parameters:
      - adj_1, adj_2: adjacency matrices for the two graphs being compared
                      (square numpy arrays of equal shape; must be symmetric, because the
                      symmetric eigen-solver is used)
      - k: the degree of the polynomial output
      - alpha: a tuning parameter that allows for the different weighting of higher or lower terms in the polynomial.
              for simplicity, alpha = 1

    Returns: the dissimilarity as a float.
    Raises: AssertionError if the two shapes differ.
    '''
    assert adj_1.shape == adj_2.shape
    n = adj_1.shape[0]

    def spectral_polynomial(adj):
        # eigh is the solver for symmetric matrices: it returns real eigenvalues and
        # orthonormal eigenvectors, which is what makes Q diag(w) Q^T a valid
        # reconstruction (a general solver does not guarantee orthogonal eigenvectors
        # when eigenvalues repeat)
        e_vals, e_vecs = np.linalg.eigh(adj)
        weights = np.zeros_like(e_vals)
        for power in range(1, k + 1):
            weights += e_vals**power / ((n-1) ** (alpha * (power-1)))
        # scaling the columns of Q by w and multiplying by Q^T equals Q diag(w) Q^T
        return (e_vecs * weights) @ e_vecs.T

    pol_diff = spectral_polynomial(adj_1) - spectral_polynomial(adj_2)

    # distance is calculated by taking the  Frobenius norm of the difference in polynomials
    # see here for more info: https://inst.eecs.berkeley.edu/~ee127/sp21/livebook/l_mats_norms.html
    dist = (np.linalg.norm(pol_diff, ord = 'fro'))/(n**k)

    return dist



In [None]:
# Spot check: polynomial dissimilarity between the matrices at positions 0 and 23 of adjs
poly_dist(adjs[0], adjs[23])

In [None]:
# Pairwise polynomial dissimilarities between all adjacency matrices (also drawn as a heatmap)
pol_dists = calculate_plot_dists(adjs, poly_dist, 'Polynomial Dissimilarities Between Adjacency Matrices')
# NOTE: Could a small range of distances for the polynomial measure suggest that most of the changes over
# time are happening in the periphery of the graph, as opposed to in dense areas? (See Donnat and Holmes, pp. 24-25)

### Mesoscale Metric: Quantifying Interactions with Connectivity-Based Distances

In [None]:
def con_dist(adj_1, adj_2, p = 2):
  '''
  Calculates connectivity-based distance using eigenvector centrality.
  Eigenvector centrality as a measure of influence (see: Bloch, Jackson and Tebaldi, 2023).
  Distance formula adapted from equations 4.1 and 4.2 in Donnat and Holmes (2018).

  The distance is (sum_i |c2_i - c1_i|^p)^(1/p), where c1 and c2 hold the eigenvector
  centrality of each node in the two graphs.

  Parameters:
      - adj_1, adj_2: adjacency matrices for the two graphs being compared
      - p: tuning parameter (p >= 1); changes the extent to which the dissimilarity measure is sensitive to changes in centrality.
           p = 2 by default, as shown in the example from Donnat and Holmes

  Returns: the distance as a float.
  Raises: AssertionError if the two shapes differ.
  '''
  assert adj_1.shape == adj_2.shape

  # convert adjacency matrices to graphs
  G1 = nx.from_numpy_array(adj_1)
  G2 = nx.from_numpy_array(adj_2)

  # calculate eigenvector centrality for each node and store as a numpy array
  # (values are read in the order the centrality dict returns them for each graph)
  cent1 = nx.eigenvector_centrality_numpy(G1, weight = "weight")
  cent_1_np = np.array(list(cent1.values()))

  cent2 = nx.eigenvector_centrality_numpy(G2, weight = "weight")
  cent_2_np = np.array(list(cent2.values()))

  # calculate the centrality based distance
  # absolute value first: without it an odd p leaves negative terms, which can cancel
  # and make the p-th root of the sum NaN
  abs_diff = np.abs(np.subtract(cent_2_np, cent_1_np))
  dist = np.sum(abs_diff**p)**(1/p)

  return dist

In [None]:
# Pairwise connectivity-based distances between all adjacency matrices (also drawn as a heatmap)
con_dists = calculate_plot_dists(adjs, con_dist, 'Connectivity-Based Distances Between Adjacency Matrices')
# seems less stable over time than the other metrics --> perhaps this suggests that artists fluctuate in terms of influence on the network
# makes sense as songs continuously move on and off the top 200 charts; reflects collaboration with neighbors that have varying
# levels of connectivity at a given time