Notebook for preparing edge sheet from network-graph.csv

In [1]:
import csv
import json
import pandas as pd
from itertools import chain, combinations
from collections import Counter
import functools
import operator
from tqdm.notebook import tqdm

In [2]:
# set pandas display options
pd.set_option("display.max_rows", 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option("display.max_columns", 200)

In [3]:
conv_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/notebooks/network-graph.csv"
edge_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/notebooks/edge_sheet.csv"
node_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/notebooks/lat-lng-test.csv"

In [4]:
album_data = pd.read_csv(conv_file)

# 1. Produce edge_sheet.csv from network-graph.csv

In [5]:
# global tallying, edges
grouped = album_data.groupby('standard_name')
Collected = pd.DataFrame(columns =['genre1', 'genre2','count','location'])

for place, group in grouped:
    edge_list = []
    genre_list = group['genre']
    for item in genre_list:
        genres_in_list = sorted(list(item.split(",")))
        # need to figure out which genres appear most between both columns
        genre_combs = combinations(genres_in_list, 2)
        # return list of tuples
        for combination in genre_combs:
            edge_list.append([combination])
    edge_tally = Counter(chain(*edge_list))

    edge_tallies_dict = dict(edge_tally)

    edgeprep_dataframe = pd.DataFrame.from_dict(edge_tallies_dict, orient='index')
    edgeprep_dataframe.rename(columns = {0:'count'}, inplace = True) 
    edgeprep_dataframe['genre_pair'] = edgeprep_dataframe.index
    edgeprep_dataframe['row'] = edgeprep_dataframe.reset_index().index
    edgeprep_dataframe = edgeprep_dataframe.set_index(['row'])

    genre_edges = pd.DataFrame(list(edgeprep_dataframe['genre_pair']), columns =['genre1', 'genre2']) 

    count = edgeprep_dataframe['count']
    genre_edges = genre_edges.join(count)
    genre_edges['location'] = place
    Collected = Collected.append(genre_edges)

In [6]:
Collected.to_csv(edge_file, index=False)

In [7]:
Collected.head()

Unnamed: 0,genre1,genre2,count,location
0,ebm,electronic,28,"Adelaide SA, Australia"
1,ebm,electronica,23,"Adelaide SA, Australia"
2,ebm,industrial,27,"Adelaide SA, Australia"
3,electronic,electronica,63,"Adelaide SA, Australia"
4,electronic,industrial,27,"Adelaide SA, Australia"


In [36]:
with open(json_file, "r") as jsonFile:
    json_output = json.load(jsonFile)
node_reader = pd.read_csv(node_file)
node_reader = node_reader.sort_values(['location', "count"], ascending = (True, False))
node_reader

Unnamed: 0,location,genre,lat,lng,count,prop,relative
55,"Adelaide SA, Australia",electronic,-34.928499,138.600746,445,0.072217,0.852948
3,"Adelaide SA, Australia",alternative,-34.928499,138.600746,245,0.039760,1.726768
138,"Adelaide SA, Australia",rap/hip hop,-34.928499,138.600746,196,0.031808,1.089582
142,"Adelaide SA, Australia",rock,-34.928499,138.600746,175,0.028400,1.166589
109,"Adelaide SA, Australia",metal,-34.928499,138.600746,168,0.027264,1.738526
...,...,...,...,...,...,...,...
26037,"İstanbul, Turkey",singer songwriter,41.008238,28.978359,4,0.001190,0.227413
26040,"İstanbul, Turkey",space,41.008238,28.978359,4,0.001190,0.931773
26041,"İstanbul, Turkey",stoner,41.008238,28.978359,4,0.001190,1.074476
26045,"İstanbul, Turkey",synthesizer,41.008238,28.978359,4,0.001190,1.142845


In [39]:
edge_reader = pd.read_csv(edge_file)
# edge_reader = edge_reader.sort_values(['location', "genre1"], ascending = (True, False))
edge_reader

Unnamed: 0,genre1,genre2,count,location
0,ebm,electronic,28,"Adelaide SA, Australia"
1,ebm,electronica,23,"Adelaide SA, Australia"
2,ebm,industrial,27,"Adelaide SA, Australia"
3,electronic,electronica,63,"Adelaide SA, Australia"
4,electronic,industrial,27,"Adelaide SA, Australia"
...,...,...,...,...
372240,space,stoner,1,"İstanbul, Turkey"
372241,space,stoner rock,1,"İstanbul, Turkey"
372242,ambient,industrial,1,"İstanbul, Turkey"
372243,ambient,techno,1,"İstanbul, Turkey"


# 3. produce json file from the edge and node sheet

In [8]:
json_file = "/Users/geistling/Documents/1_Projects/bandcamp/bandcamp_viz/data/network_graph.json"

In [9]:
with open(json_file, "rw") as jsonFile:
    jsonFile.seek(0)
    json_output = json.load(jsonFile)
    node_reader = pd.read_csv(node_file)
    # order rows so most frequent genres have the smallest numbers
    node_reader = node_reader.sort_values(['location', "count"], ascending = (True, False))

    node_id = 1
    for row in node_reader.itertuples():
        place_obj = {'n':[], 'l':[]}
        place_matches = [obj for obj in json_output if obj['ct'] == row.location]
        if not place_matches:
            place_obj['ct'] = row.location
            place_obj['cor'] = [row.lng, row.lat]
            json_output.append(place_obj)
            node_id = 1
        elif place_matches:
            place_obj = place_matches[0]
        place_obj["n"].append({"g":row.genre,"c":row.count,"i":node_id,"r":round(row.relative, 3)})
        node_id += 1

    def genre_lookup(place_obj, genre):
        node_match = [node for node in place_obj['n'] if node['g'] == genre]
        return node_match[0]["i"]


    link_reader = pd.read_csv(edge_file)
    for row in link_reader.itertuples():
    #     place_matches = [json_output[json_output['ct'].str.contains(row.location)]]
        place_matches = [obj for obj in json_output if obj['ct'] == row.location]
        if not place_matches:
            print(row)
        place_obj = place_matches[0]
        g1 = genre_lookup(place_obj, row.genre1)
        g2 = genre_lookup(place_obj, row.genre2)
        place_obj["l"].append({"s":g1,"t":g2,"c":row.count})

    json.dump(json_output, jsonFile, separators=(',', ':'))


NameError: name 'json_output' is not defined

In [12]:
# use this to choose the genre to head the group. data structure like so:
# s:[{t:TARGET, c:COUNT}, {t:TARGET, c:COUNT}]
grouped = Collected.groupby('location')
grouped.count()
for place, group in grouped:
    df1 = pd.DataFrame(group['genre1'].value_counts())
    df2 = pd.DataFrame(group['genre2'].value_counts())
    dfc = pd.concat([df1, df2], axis=1, sort=False)
    dfc["total"] = dfc.sum(axis=1)
    dfc = dfc.sort_values(['total'], ascending = (False))
# dfc