Notebook for preparing the edge sheet (edge_sheet.csv), the genre id file (ng_ids.json) and the network-graph JSON from network-graph_extended.csv

In [1]:
import csv
import json
import pandas as pd
from itertools import chain, combinations
from collections import Counter
import functools
import operator
from tqdm.notebook import tqdm

In [2]:
# Raise pandas' display limits so long genre strings and large frames are
# rendered without truncation.
display_limits = {
    "display.max_rows": 500,
    'display.max_colwidth': 500,
    "display.max_columns": 200,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [3]:
# Input album table and the two derived sheets all live in the same directory;
# only the file names differ.
notebooks_dir = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/notebooks"
conv_file = notebooks_dir + "/network-graph_extended.csv"
edge_file = notebooks_dir + "/edge_sheet.csv"
node_file = notebooks_dir + "/node_sheet.csv"

In [4]:
# Load the album table; the cells below read its 'standard_name' (location)
# and 'genre' (comma-separated genre tags) columns.
album_data = pd.read_csv(conv_file)

# 1. Produce edge_sheet.csv from network-graph_extended.csv

In [5]:
# Tally, per location, how often each pair of genres is tagged on the same
# album. The result has one row per (genre1, genre2, location) with its count.
grouped = album_data.groupby('standard_name')
edge_columns = ['genre1', 'genre2', 'count', 'location']

# One edge frame per location, concatenated once after the loop.
# (DataFrame.append no longer exists in pandas >= 2.0, and re-copying the
# accumulated frame on every iteration is quadratic in the number of rows.)
edge_frames = []

for place, group in grouped:
    edge_tally = Counter()
    # Albums without a genre string have no pairs to contribute.
    for item in group['genre'].dropna():
        # Sorting first puts every pair in alphabetical order, so "a,b" and
        # "b,a" are tallied as the same edge.
        genres_in_list = sorted(item.split(","))
        edge_tally.update(combinations(genres_in_list, 2))

    # A location whose albums all carry a single genre yields no edges;
    # there is nothing to tabulate for it.
    if not edge_tally:
        continue

    # Counter keeps first-seen order, so rows appear in the order the pairs
    # were first encountered within this location.
    genre_edges = pd.DataFrame(
        [
            (genre1, genre2, pair_count, place)
            for (genre1, genre2), pair_count in edge_tally.items()
        ],
        columns=edge_columns,
    )
    edge_frames.append(genre_edges)

if edge_frames:
    # The index restarts at 0 for every location; it is dropped when the
    # sheet is written with index=False.
    Collected = pd.concat(edge_frames)
else:
    Collected = pd.DataFrame(columns=edge_columns)

Collected

Unnamed: 0,genre1,genre2,count,location
0,ambient,atmospheric,18,"Adelaide SA, Australia"
1,ambient,electronica,29,"Adelaide SA, Australia"
2,atmospheric,electronica,10,"Adelaide SA, Australia"
3,ebm,electronic,28,"Adelaide SA, Australia"
4,ebm,electronica,23,"Adelaide SA, Australia"
...,...,...,...,...
1155,space,stoner,1,"İstanbul, Turkey"
1156,space,stoner rock,1,"İstanbul, Turkey"
1157,ambient,industrial,1,"İstanbul, Turkey"
1158,ambient,techno,1,"İstanbul, Turkey"


In [6]:
# Write the edge sheet without the DataFrame index, so the CSV holds only the
# genre1, genre2, count and location columns.
Collected.to_csv(edge_file, index=False)

In [8]:
# Inspection only: GroupBy.head() shows the first rows (5 by default) of each
# location group; nothing below depends on this cell.
grouped.head()

Unnamed: 0,standard_name,genre
0,"Adelaide SA, Australia","atmospheric,electronica,ambient"
1,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
2,"Adelaide SA, Australia","electronica,industrial,ebm,electronic"
3,"Adelaide SA, Australia","electronica,ebm,industrial,electronic"
4,"Adelaide SA, Australia","industrial,electronic,electronica,ebm"
...,...,...
525676,"İstanbul, Turkey","bass,ambient,experimental"
525677,"İstanbul, Turkey",electronic
525678,"İstanbul, Turkey",electronic
525679,"İstanbul, Turkey","electroacoustic,electronic,experimental,improvisation,ambient"


In [7]:
# Build the genre id list: every genre gets an integer id, assigned in
# descending order of how many edge rows mention it (id 0 = most mentioned).
df1 = pd.DataFrame(Collected['genre1'].value_counts())
df2 = pd.DataFrame(Collected['genre2'].value_counts())
# Align the two tallies on genre name; a genre that never appears in one of
# the columns is NaN there, which sum() below skips.
dfc = pd.concat([df1, df2], axis=1, sort=False)
dfc["total"] = dfc.sum(axis=1)
dfc = dfc.sort_values(['total'], ascending=False)
# Keep the genre under an explicit column name before the index is replaced
# by the positional 0..n-1 index that serves as the id.
dfc['name'] = dfc.index
dfc = dfc.reset_index()

alias_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/data/ng_ids.json"

# One {'i': id, 'g': genre} record per genre; genre_lookup searches this list.
# The genre is read by column name rather than by position, and alias_file is
# not opened here: it is only written by the following cell and may not exist
# yet on a first run.
alias_json = [{'i': i, 'g': genre} for i, genre in dfc['name'].items()]

In [8]:
# Persist the genre ids as a compact JSON object mapping id -> genre name.
alias_obj = {}  # not used by any visible cell; kept so the name stays defined
alias_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/data/ng_ids.json"

# dfc has a positional 0..n-1 index, so this maps id -> genre name.
protodict = dfc['name'].to_dict()
# The file is written without being read first, so the cell also works when
# ng_ids.json does not exist yet. json.dump stores the integer ids as string
# keys; the tight separators drop optional whitespace.
with open(alias_file, "w") as jsonFile:
    json.dump(protodict, jsonFile, separators=(',', ':'))

In [9]:
# Read the edge sheet back from disk to check what was written.
edge_reader = pd.read_csv(edge_file)
# edge_reader = edge_reader.sort_values(['location', "genre1"], ascending = (True, False))
edge_reader

Unnamed: 0,genre1,genre2,count,location
0,ambient,atmospheric,18,"Adelaide SA, Australia"
1,ambient,electronica,29,"Adelaide SA, Australia"
2,atmospheric,electronica,10,"Adelaide SA, Australia"
3,ebm,electronic,28,"Adelaide SA, Australia"
4,ebm,electronica,23,"Adelaide SA, Australia"
...,...,...,...,...
382717,space,stoner,1,"İstanbul, Turkey"
382718,space,stoner rock,1,"İstanbul, Turkey"
382719,ambient,industrial,1,"İstanbul, Turkey"
382720,ambient,techno,1,"İstanbul, Turkey"


# 3. Produce the JSON file from the edge and node sheets

In [11]:
# Network-graph JSON: a later cell loads this file, adds the node and link
# data to it, and writes it back to the same path.
json_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/data/network_graph.json"

In [12]:
# Rank the genres within each city by how many of that city's edge rows
# mention them (rank 0 = most mentioned). order_lookup reads this table.

# Group the edge sheet by city and tally occurrences in "genre1" and "genre2".
lr = pd.read_csv(edge_file)

grouped2 = lr.groupby('location')
tally_columns = ['index', 'rank', 'total', 'location', 'genre1', 'genre2']

# Per-city frames are collected and concatenated once; concatenating onto the
# running result inside the loop re-copies every earlier row each iteration.
# Local names are distinct from df1/df2/dfc so the genre-id cells can still be
# re-run after this one.
city_frames = []
for place, group in grouped2:
    # Name the tallies explicitly so the columns are 'genre1'/'genre2'
    # regardless of how the installed pandas names value_counts() results.
    as_genre1 = group['genre1'].value_counts().rename('genre1').to_frame()
    as_genre2 = group['genre2'].value_counts().rename('genre2').to_frame()
    city_counts = pd.concat([as_genre1, as_genre2], axis=1, sort=False)
    # sum() skips the NaN left where a genre appears in only one column.
    city_counts["total"] = city_counts.sum(axis=1)
    city_counts = city_counts.sort_values(['total'], ascending=False)
    city_counts['location'] = place
    # Move the genre name into a column called 'index'; the new positional
    # index is the genre's rank within this city.
    city_counts = city_counts.rename_axis('index').reset_index()
    city_counts['rank'] = city_counts.index
    city_frames.append(city_counts)

if city_frames:
    city_genre_tally = pd.concat(city_frames, axis=0, sort=False)[tally_columns]
else:
    city_genre_tally = pd.DataFrame(columns=tally_columns)

In [None]:
def order_lookup(genre1, genre2, city, place_obj=None):
    """Return the ids of two genres, ordered by their rank within ``city``.

    The genre with the smaller rank in ``city_genre_tally`` comes first. If
    either genre has no rank row for the city, or the ranks are equal, the
    given order is kept.

    Parameters
    ----------
    genre1, genre2 : str
        Genre names as they appear in the 'index' column of
        ``city_genre_tally``.
    city : str
        Value to match against the 'location' column of ``city_genre_tally``.
    place_obj : optional
        Forwarded unchanged to ``genre_lookup`` as its first argument.

    Returns
    -------
    tuple
        ``(g1, g2)``, the ``genre_lookup`` results for the two genres in
        ranked order.

    Relies on the module-level ``city_genre_tally`` frame and the
    ``genre_lookup`` function being defined when called.
    """
    in_city = city_genre_tally['location'] == city
    rank1 = city_genre_tally.loc[in_city & (city_genre_tally['index'] == genre1), "rank"]
    rank2 = city_genre_tally.loc[in_city & (city_genre_tally['index'] == genre2), "rank"]

    # Compare scalar ranks: the truth value of an array comparison is
    # ambiguous when a selection is empty or holds more than one row.
    if len(rank1) and len(rank2) and rank1.iloc[0] > rank2.iloc[0]:
        gen1, gen2 = genre2, genre1
    else:
        gen1, gen2 = genre1, genre2

    g1 = genre_lookup(place_obj, gen1)
    g2 = genre_lookup(place_obj, gen2)
    return (g1, g2)

In [13]:
# Start from the JSON already on disk: the loops below add to the place
# objects loaded here rather than rebuilding them from scratch.
# NOTE(review): because of that, re-running this cell against its own output
# looks like it would append the same nodes and links a second time — confirm
# the file is reset between runs.
with open(json_file, "r") as jsonFile:
    json_output = json.load(jsonFile)
node_reader = pd.read_csv(node_file)
# order rows so most frequent genres have the smallest numbers
# (locations ascending, counts descending within each location)
node_reader = node_reader.sort_values(['location', "count"], ascending = (True, False))


def genre_lookup(place_obj, genre):
    """Return the integer id that ``alias_json`` assigns to ``genre``.

    Parameters
    ----------
    place_obj
        Unused; accepted so existing call sites keep working.
    genre : str
        Genre name to look up (compared against each record's 'g' value).

    Returns
    -------
    The 'i' value of the first ``alias_json`` record whose 'g' equals
    ``genre``.

    Raises
    ------
    IndexError
        If no record matches.
    """
    # Stop at the first match instead of collecting every match first.
    for node in alias_json:
        if node['g'] == genre:
            return node["i"]
    raise IndexError("genre {!r} has no entry in alias_json".format(genre))

# Attach every node-sheet row to its city's place object, creating the place
# object the first time a city is seen.
for row in node_reader.itertuples():
    known_places = [entry for entry in json_output if entry['ct'] == row.location]
    if known_places:
        # Reuse the first object already registered for this city.
        place_obj = known_places[0]
    else:
        # 'n' collects node entries, 'l' link entries, 'w' the largest link
        # count seen so far; 'ct' is the city and 'cor' its [lng, lat].
        place_obj = {
            'n': [],
            'l': [],
            'w': 1,
            'ct': row.location,
            'cor': [row.lng, row.lat],
        }
        json_output.append(place_obj)
    genre_id = genre_lookup(place_obj, row.genre)
    # Node entry: genre id, absolute count, relative share rounded to 3 places.
    place_obj["n"].append({"g": genre_id, "c": row.count, "r": round(row.relative, 3)})

# Add every edge-sheet row to its city's place object as a link, grouped by
# source genre: place_obj['l'] holds {'s': source id, 'ts': [{'t': target id,
# 'c': count}, ...]} entries, and place_obj['w'] tracks the largest count.
link_reader = pd.read_csv(edge_file)

# Index the place objects by city once instead of scanning json_output for
# every edge row. setdefault keeps the first object registered for a city.
places_by_name = {}
for obj in json_output:
    places_by_name.setdefault(obj['ct'], obj)

# Per city: source genre id -> its entry in place_obj['l'], built lazily and
# seeded from links the place object already holds.
sources_by_place = {}

for row in link_reader.itertuples():
    place_obj = places_by_name.get(row.location)
    if place_obj is None:
        # Fail with the offending row instead of an opaque index error.
        raise IndexError(
            "no place object for location {!r}; edge row: {}".format(row.location, row)
        )

    # order_lookup(row.genre1, row.genre2, row.location) is the alternative
    # that orders the pair by per-city rank; the sheet order is used here.
    g1 = genre_lookup(place_obj, row.genre1)
    g2 = genre_lookup(place_obj, row.genre2)

    source_index = sources_by_place.get(row.location)
    if source_index is None:
        source_index = {}
        for link in place_obj['l']:
            source_index.setdefault(link['s'], link)
        sources_by_place[row.location] = source_index

    gm = source_index.get(g1)
    if gm is None:
        gm = {'s': g1, 'ts': []}
        place_obj['l'].append(gm)
        source_index[g1] = gm
    gm['ts'].append({'t': g2, "c": row.count})

    if row.count > place_obj["w"]:
        place_obj["w"] = row.count

# Write the assembled structure back over json_file; the tight separators
# drop all optional whitespace from the output.
with open(json_file, "w") as jsonFile:
    json.dump(json_output, jsonFile, separators=(',', ':'))