Notebook 2.
- Removes informative non-genres from the genre column and places them in separate columns.
- Generates separate node and edge csvs from the album data.

## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import csv
import pandas as pd
from itertools import chain, combinations
from collections import Counter
import functools
import operator
from tqdm.notebook import tqdm

In [3]:
# set pandas display options
pd.set_option("display.max_rows", 3500)
pd.set_option('display.max_colwidth', 500)
pd.set_option("display.max_columns", 200)

In [22]:
# start with from_country,to_country,count
conv_file = "/Users/geistling/Documents/1_Projects/bandcamp/network_graph/2_network_graph_cleaned.csv"
# conv_file = "/Users/geistling/Documents/1_Projects/bandcamp/network_graph/network-graph.csv"

In [23]:
book = pd.read_csv(conv_file)

In [24]:
reader = book.copy()
# reader['genre'] = reader['genre'].str.replace(", ", ",", regex=False)

reader.head()

Unnamed: 0,standard_name,genre
0,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
1,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
2,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
3,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
4,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"


# Recategorizing
Place potentially useful/interesting non-genres into different columns:
- Instruments
- Place and region words
- topical issues
- genre modifiers (words associated with different genres that mean nothing outside the context of that genre)

In [7]:
def remove_from_genre_field(item):
    reader['genre'] = reader['genre'].str.replace("(^|,){}(?=,|$)".format(item), "", regex=True)

In [8]:
instruments = [
    'accordion',
    'acoustic guitar',
    'banjo',
    'cello',
    'clarinet',
    'didgeridoo',
    'drum machine',
    'drums',
    'female fronted',
    'female vocals',
    'fiddle',
    'flute',
    'guitar',
    'guitars',
    'harmonica',
    'synths',
    'harp',
    'horns',
    'mandolin',
    'modular synthesizer',
    'modular synth',
    'modular',
    'moog',
    'organ',
    'percussion',
    'piano',
    'sax',
    'saxophone',
    'singer',
    'sitar',
    'slide guitar',
    'solo guitar',
    'string quartet',
    'strings',
    'synthesizer',
    'trumpet',
    'trombone',
    'ukulele',
    'viola',
    'violin',
    'voice',
    'vocals'
#     'reverb',
#     'feedback',
#     'djent'
]

reader['instruments'] = None

# designed query to only take the genre if it's a full word
for instrument in instruments:
    reader.loc[reader['genre'].str.contains(instrument), ['instruments']] = reader['instruments'].astype(str) + ',' + instrument 
    remove_from_genre_field(instrument)

In [9]:
places = [
    'argentina',
    'athens',
    'austin tx',
    'australia',
    'bay area',
    'barcelona',
    'boston',
    'brasil',
    'berlin',
    'brazil',
    'brooklyn',
    'california',
    'canada',
    'chicago',
    'dc',
    'detroit',
    'east coast',
    'france',
    'germany',
    'greece',
    'india',
    'ireland',
    'italy',
    'japan',
    'london',
    'los angeles',
    'melbourne',
    'memphis',
    'minneapolis',
    'montréal',
    'mexico',
    'new orleans',
    'new york',
    'new york city',
    'new zealand',
    'new zealand music', 
    'norway',
    'nyc',
    'oakland',
    'paris',
    'philadelphia',
    'philly',
    'portland or',
    'portland',
    'russia',
    'richmond va',
    'russian',
    'st louis',
    'san francisco',
    'scotland',
    'seattle',
    'spain',
    'ibiza',
    'jamaica',
    'sydney',
    'texas',
    'tokyo',
    'toronto',
    'uk',
    'united states',
    'united kingdom',
    'usa',
    'washington dc'
]

regionalist = [
    'australian',
    'african',
    'middle eastern',
    'southern',
    'appalachian',
    'canadian',
    'canadiana',
    'caribbean',
    'spanish',
    'arabic',
    'japanese',
    'indian',
    'midwest',
    'greek',
    'irish',
    'nordic',
    'balkan',
    'italian',
    'brazilian',
    'brazilian music',
    'british',
    'scottish',
    'hebrew',
    'romanian',
    'icelandic',
    'turkish',
    'french',
    'international',
    'asian'
]

reader['place_conns'] = None

for place in places:
    reader.loc[reader['genre'].str.contains(place), ['place_conns']] = reader['place_conns'].astype(str) + ',' + place 
    remove_from_genre_field(place)

for region in regionalist:
    reader.loc[reader['genre'].str.contains(region), ['place_conns']] = reader['place_conns'].astype(str) + ',' + region 
    remove_from_genre_field(region)


In [10]:
genremods = [
    'alt',
    'alternative',
    'crossover',
    'contemporary',
    'dark',
    'deep',
    'old school',
    'old skool',
    'positive',
    'power',
    'post',
    'progressive',
    'soulful',
    'uplifting'
]

reader['modifier'] = None

for mod in genremods:
    reader.loc[reader['genre'].str.contains(mod), ['modifier']] = reader['modifier'].astype(str) + ',' + mod 
    remove_from_genre_field(mod)


zeitgeist = [
    'vegan',
    'black lives matter',
    'protest',
    'covid 19',
    '2020',
    'anti fascist',
    'quarantine',
    'corona virus']

reader['zeitgeist'] = None

for geist in zeitgeist:
    reader.loc[reader['genre'].str.contains(geist), ['zeitgeist']] = reader['zeitgeist'].astype(str) + ',' + geist 
    remove_from_genre_field(geist)

In [11]:
moods = [
    'atmospheric',
    'beautiful',
    'calm',
    'cool',
    'cute',
    'depressive',
    'depression',
    'dirty',
    'dreamy',
    'emotional',
    'energy',
    'fast',
#     'full on',
    'fun',
    'funny',
    'happy',
    'hard',
#     'intelligent',
#     'loud',
#     'love',
    'mellow',
    'meditative',
    'moody',
    'peaceful',
    'quiet',
    'raw',
    'relaxing',
    'romantic',
    'sad',
    'scary',
    'soft',
    'spooky',
    'sex',
    'sexy'
]


reader['moods'] = None

for mood in moods:
    reader.loc[reader['genre'].str.contains(mood), ['moods']] = reader['moods'].astype(str) + ',' + mood 
    remove_from_genre_field(mood)

In [12]:
reader['instruments'] = reader['instruments'].str.replace("nan,", "", regex=False)
reader['zeitgeist'] = reader['zeitgeist'].str.replace("nan,", "", regex=False)
reader['modifier'] = reader['modifier'].str.replace("nan,", "", regex=False)
reader['moods'] = reader['moods'].str.replace("nan,", "", regex=False)
reader['place_conns'] = reader['place_conns'].str.replace("nan,", "", regex=False)
reader['genre'] = reader['genre'].str.replace("^,", "", regex=True)

## Delete entries made empty by relocation of non-genres

In [13]:
# place = reader.loc[reader['genre'] == '' ]
# place.count()

reader = reader.loc[reader['genre'] != '' ].copy()
reader.count()

genre            314085
standard_name    314085
lat              314085
lng              314085
instruments       22322
place_conns       35817
modifier          89559
zeitgeist           929
moods             39889
dtype: int64

In [14]:
# genrecheck = reader.loc[reader['zeitgeist'].notnull() ].copy()
genrecheck = reader.groupby(['standard_name'])
reader.head()

Unnamed: 0,genre,standard_name,lat,lng,instruments,place_conns,modifier,zeitgeist,moods
0,"electronic,industrial,ebm,electronica","Adelaide SA, Australia",-34.928499,138.600746,,,,,
1,"electronic,industrial,ebm,electronica","Adelaide SA, Australia",-34.928499,138.600746,,,,,
2,"electronic,industrial,ebm,electronica","Adelaide SA, Australia",-34.928499,138.600746,,,,,
3,"electronic,industrial,ebm,electronica","Adelaide SA, Australia",-34.928499,138.600746,,,,,
4,"electronic,industrial,ebm,electronica","Adelaide SA, Australia",-34.928499,138.600746,,,,,


# Save sorted df to csv

In [15]:
reader.to_csv('3_graph_sorted.csv', index=False)

# Create link and node csvs from cleaned set

In [31]:
# global tallying, nodes

grouped = reader.groupby('standard_name')
Collected_nodes = pd.DataFrame(columns =['genre','count','location'])

for place, group in grouped:
    genre_list = reader.genre
    genre_set = list(genre.split(",") for genre in genre_list)
    new_gen_list = functools.reduce(operator.iconcat, genre_set, [])
    genre_tallies = Counter(new_gen_list)
    genre_tallies_dict = dict(genre_tallies)
    genrenodes = pd.DataFrame(list(genre_tallies_dict.items()),columns = ['genrename','count'])
    genrenodes['locations'] = place
    genrenodes = genrenodes.sort_values(['count'])

    Collected_nodes.append(genrenodes)
    

In [26]:
Collected_nodes.to_csv("FINAL_genre_nodes.csv")
Collected_nodes

genrename    1812
count        1812
dtype: int64

In [27]:
# global tallying, edges
grouped = reader.groupby('standard_name')
Collected = pd.DataFrame(columns =['genre1', 'genre2','count','location'])

for place, group in grouped:
    edge_list = []
    genre_list = group['genre']
    for item in genre_list:
        genres_in_list = sorted(list(item.split(",")))
        genre_combs = combinations(genres_in_list, 2)
        # return list of tuples
        for combination in genre_combs:
            edge_list.append([combination])
    edge_tally = Counter(chain(*edge_list))

    edge_tallies_dict = dict(edge_tally)

    edgeprep_dataframe = pd.DataFrame.from_dict(edge_tallies_dict, orient='index')
    edgeprep_dataframe.rename(columns = {0:'count'}, inplace = True) 
    edgeprep_dataframe['genre_pair'] = edgeprep_dataframe.index
    edgeprep_dataframe['row'] = edgeprep_dataframe.reset_index().index
    edgeprep_dataframe = edgeprep_dataframe.set_index(['row'])

    genre_edges = pd.DataFrame(list(edgeprep_dataframe['genre_pair']), columns =['genre1', 'genre2']) 

    count = edgeprep_dataframe['count']
    genre_edges = genre_edges.join(count)
    genre_edges['location'] = place
    Collected = Collected.append(genre_edges)

In [28]:
Collected.head()
Collected.to_csv("FINAL_genre_edges.csv", index=False)

In [117]:
# edge_tally = Counter(chain(*edge_list))
# edge_tallies_dict = dict(edge_tally)

# edgeprep_dataframe = pd.DataFrame.from_dict(edge_tallies_dict, orient='index')
# edgeprep_dataframe

Unnamed: 0,0
"(ebm, electronic)",3520
"(ebm, electronica)",164
"(ebm, industrial)",2556
"(electronic, electronica)",11219
"(electronic, industrial)",8166
...,...
"(glitch hop, minimal)",1
"(progressive house, tech)",11
"(compilation, tech)",3
"(neo soul, post rock)",1


In [118]:
# edgeprep_dataframe.rename(columns = {0:'count'}, inplace = True) 
# edgeprep_dataframe['genre_pair'] = edgeprep_dataframe.index
# edgeprep_dataframe['row'] = edgeprep_dataframe.reset_index().index
# edgeprep_dataframe = edgeprep_dataframe.set_index(['row'])

# genre_edges = pd.DataFrame(list(edgeprep_dataframe['genre_pair']), columns =['genre1', 'genre2']) 

# count = edgeprep_dataframe['count']
# genre_edges = genre_edges.join(count)

Unnamed: 0,genre1,genre2,count
0,ebm,electronic,3520
1,ebm,electronica,164
2,ebm,industrial,2556
3,electronic,electronica,11219
4,electronic,industrial,8166
...,...,...,...
130827,glitch hop,minimal,1
130828,progressive house,tech,11
130829,compilation,tech,3
130830,neo soul,post rock,1


In [120]:
genre_edges = genre_edges.sort_values('count')
genre_edges.to_csv("FINAL_genre_edges.csv", index=False)

genre_edges.count()

genre1    130832
genre2    130832
count     130832
dtype: int64

## Notes

In [29]:
# genres to examine:
# death doom	death doom metal
# beats for sale
# berlin school electronic music, berlin school 

styles = [
# 'jazzy',
# 'organic',
    'slam',
'vibe',
'vibes'
]

genre_list = ['metal', 'doom', 'techno', 'black metal', 'death metal','hip hop','world']

activities = [
    'sleep',
    'yoga music',
    'study music',
    'meditation music',
]

relabel = [
    ["bluesy", 'blues']
    twee + pop
    ['singer'+'songwriter', 'singer songwriter']
    ['sega'+'genesis', 'sega genesis']
]

medium = [
    'mix',
    'mixtape',
    'remix',
    'reworks',
    reissues
    'film soundtrack'
]

groups = [
    transgender
    lgbtq
    pagan
    trans
    afro brazilian
    afro
    afro latin
    gay
    feminist
    
]

Genre observations:
- modifiers:
    - atmospheric (dark, deep, ambient)
    - cross-genre-ish (death, doom, synth)
    - musical (melodic, minimal)
- not actually genres (remove):
    - free, free download
    - download, download chill out music
    - creative commons
    - collaboration, various, various artists
    - demo, demos
    - youtube
    - diy, diy record label
    - medium
        - vinyl
        
- not actually genres (some other quality)
    - technical qualities:
        - female vocalist
    - purpose or common use:
        - yoga, meditation
    - zeitgeist: 
        - black lives matter
        - covid 19
        - anti fascist