-
Notifications
You must be signed in to change notification settings - Fork 0
/
communityGraph.py
164 lines (148 loc) · 6.1 KB
/
communityGraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import csv
import networkx
import matplotlib.pyplot as plt
import matplotlib
import collections
import community
# Set font sizes of plots.
# 'normal' is not a font family name matplotlib can resolve; it only causes
# "findfont: Font family ['normal'] not found" warnings before matplotlib
# falls back to its default font. Request the generic sans-serif family
# explicitly instead.
font = {'family': 'sans-serif', 'size': 5}
matplotlib.rc('font', **font)
# Parse through spreadsheet data.
# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single field.
with open('whosampled30k.csv', mode='r', encoding="cp850", newline='') as csvFile:
    # Keep the file handle and the reader under separate names.
    dataSamples = csv.reader(csvFile)
    # The first row is the column header; read it out so that it does not
    # end up among the data rows.
    header = next(dataSamples)
    # Materialise the remaining rows while the file is still open.
    rowData = list(dataSamples)
# Collect the distinct artist names found in column 0 (sampling artist) and
# column 4 (sampled artist) so there is a 1:1 mapping between IDs and artists.
uniqueSamplers = list({entry[0] for entry in rowData})
uniqueSamplees = list({entry[4] for entry in rowData})
# print('Unique Samplers: ' + str(len(uniqueSamplers)))
# print('Unique Samplees: ' + str(len(uniqueSamplees)))

# Count the number of songs in each genre (one genre entry per row).
uniqueGenreCountsSamplers = [entry[2] for entry in rowData]
uniqueGenreCountsSamplees = [entry[6] for entry in rowData]
# NOTE: `counter` is rebound below, so only the samplee tally survives.
counter = collections.Counter(uniqueGenreCountsSamplers)
# print('Unique Sampler Genres: ' + str(counter))
counter = collections.Counter(uniqueGenreCountsSamplees)
# print('Unique Samplee Genres: ' + str(counter))

# Lists of (id, artist name) tuples, one per unique artist.
idSampler = list(enumerate(uniqueSamplers))
idSamplee = list(enumerate(uniqueSamplees))

# Dictionaries (hash maps) from artist name to that artist's id.
keysSampler = {artistName: artistId for artistId, artistName in enumerate(uniqueSamplers)}
keysSamplee = {artistName: artistId for artistId, artistName in enumerate(uniqueSamplees)}
# `links` is a list of 5-tuples, one per spreadsheet row:
#   (samplingArtist, sampledArtist, elemSampled, samplingGenre, sampledGenre)
# taken from columns 0, 4, 9, 2 and 6 respectively. Artists are stored by
# name; they are not mapped to their numeric IDs here.
# TODO: an earlier (disabled) variant mapped artists to IDs and only created
# links from the sampler side; consider building ID-based tuples from the start.
links = [
    (entry[0], entry[4], entry[9], entry[2], entry[6])
    for entry in rowData
]
# print(links)
# print(len(links))
# Build two graphs over the same artists:
#   g   - undirected graph of sampling relations
#   diG - directed graph whose edges point from the sampling artist to the
#         sampled artist, so in/out degree can be read per artist
# diG used to be created as a copy of g while g was still empty and was never
# touched again, so it stayed empty. Both graphs are now filled from `links`.
g = networkx.Graph()
diG = networkx.DiGraph(dataset="5000")

# Add nodes with genre attribute. All nodes are added before any edge so the
# node order and the "last genre seen wins" behaviour match a plain pass over
# the links.
for samplerName, sampleeName, _, samplerGenre, sampleeGenre in links:
    for graph in (g, diG):
        graph.add_node(samplerName, genre=samplerGenre)  # Sampling node
        graph.add_node(sampleeName, genre=sampleeGenre)  # Sampled node

# Put nodes in dictionary organized by {genre: [artists]}
byGenreNodes = collections.defaultdict(list)
for artist, genre in networkx.get_node_attributes(g, 'genre').items():
    byGenreNodes[genre].append(artist)
# print(byGenreNodes)

# Create the edges, tagging each with the sampled audio element.
# NOTE: neither Graph nor DiGraph keeps parallel edges, so repeated samples
# between the same pair of artists collapse into one edge (the last
# audioElem wins) and degrees count distinct artists rather than songs.
for samplerName, sampleeName, elemSampled, _, _ in links:
    g.add_edge(samplerName, sampleeName, audioElem=elemSampled)
    diG.add_edge(samplerName, sampleeName, audioElem=elemSampled)
# TODO: Remove zeros from lists
# Artist -> in-degree in diG (number of times the artist was sampled).
most_sampled = {artist: diG.in_degree(artist) for artist in diG.nodes}
# Artist -> out-degree in diG (number of times the artist used a sample).
most_samples = {artist: diG.out_degree(artist) for artist in diG.nodes}

# Print number of times sampled
# print(sorted(most_sampled.items(), key=lambda sample: sample[1]))
# Print number of times sampling something
# print(sorted(most_samples.items(), key=lambda sample: sample[1]))
# Print associated genre
# print(networkx.get_node_attributes(diG, 'genre'))
# Split the edges of g into samples within one genre and samples across genres.
intraGenre = []
interGenre = []
for endpointA, endpointB in g.edges():
    # Compares the complete node attribute dicts of both endpoints; equal
    # dicts are treated as "same genre".
    sameGenre = g.nodes()[endpointA] == g.nodes()[endpointB]
    destination = intraGenre if sameGenre else interGenre
    destination.append((endpointA, endpointB))
# print(intraGenre)
# print(interGenre)
# Compute communities using Louvain method
partition = community.best_partition(g)

# Gather all communities and members of each community (group them by value):
# unsortLouv maps each partition value to the list of keys that carry it.
unsortLouv = collections.defaultdict(list)
for member, communityId in partition.items():
    unsortLouv[communityId].append(member)

# Community ids ordered so the largest communities come first.
sortLouvIndex = sorted(unsortLouv, key=lambda k: len(unsortLouv[k]), reverse=True)

# Keep the single largest community, but only if it has more than 2 members.
# A slice is used instead of indexing position 0 so that an empty partition
# yields an empty topLouvs rather than raising IndexError.
topLouvs = []
for communityId in sortLouvIndex[:1]:
    if len(unsortLouv[communityId]) <= 2:
        # Communities are sorted by size, so nothing later can qualify.
        break
    topLouvs.append(unsortLouv[communityId])
# What is the genre makeup of each community by percentage
for louvComm in topLouvs:
    # genre -> artists of that genre inside this community
    louvCommGenres = collections.defaultdict(list)
    for artist in louvComm:
        # Use g.nodes[...]: the older g.node[...] accessor was removed from
        # networkx and raises AttributeError on current releases.
        louvCommGenres[g.nodes[artist]['genre']].append(artist)

    # genre -> share of the community's members, in percent
    louvCommPercents = collections.defaultdict(float)
    for genre, members in louvCommGenres.items():
        louvCommPercents[genre] = len(members) / len(louvComm) * 100

    # (genre, percent) pairs in ascending order of percentage
    sortedLouvCommPercents = sorted(louvCommPercents.items(), key=lambda k: k[1])

    # Plot percentages: one bar per genre, labelled with its value.
    fig, ax = plt.subplots()
    ax.bar(*zip(*sortedLouvCommPercents))
    for i, v in enumerate(sortedLouvCommPercents):
        ax.text(i - .3, v[1], '{0:.2f}%'.format(v[1]))
    # Rotate the genre labels once the bars (and their ticks) exist.
    plt.xticks(rotation=32)
    ax.set_title('Genres by Percentage in 1st Largest Louvain Community; Community Size = ' + str(len(louvComm)))
    ax.set_ylabel('Percentage')
    ax.set_xlabel('Genre')
    # Extra bottom margin for the rotated x tick labels.
    fig.subplots_adjust(bottom=0.15)
    # Hide the y tick labels; the values are written on the bars instead.
    ax.yaxis.set_ticklabels([])
# --- Disabled: alternative community-forming algorithm using k-components ---
# Arranging largest communities first in kComponents
# sortedkComp = []
# for kComp in networkx.k_components(g)[1]:
#     # Only include k-components greater than 2
#     if len(kComp) > 2:
#         sortedkComp.append(kComp)
# sortedkComp = sorted(sortedkComp, key=len, reverse=True)
# print(sortedkComp[-10:])

# TODO: We have communities now; how do we know who they're centered around?
# Take centrality measurement?
# Maybe take the top 10 communities only and study those in-depth?

# --- Disabled: drawing of the whole graph, coloured by genre ---
# pos = networkx.spring_layout(g)  # compute graph layout
# # Create edge labels
# edge_labels = networkx.get_edge_attributes(g, 'audioElem')
# networkx.draw_networkx_edge_labels(g, pos, edge_labels, font_size=8)
# # Create color map by genre by mapping respective genres to ints
# color_map_genre = [hash(genre) for genre in networkx.get_node_attributes(g, 'genre').values()]
# plt.axis('off')
# networkx.draw_networkx_nodes(g, pos, node_size=100, cmap=plt.cm.RdYlBu,
#                              node_color=color_map_genre)
# networkx.draw_networkx_edges(g, pos, alpha=0.3)
# networkx.draw_networkx_labels(g, pos, font_size=8)
# plt.show(g)

# Write the current matplotlib figure to 'louvain1st.png' at 199 dpi.
plt.savefig('louvain1st.png', dpi=199)