# Extraktion der Daten

Im Folgenden extrahieren wir alle verfügbaren Informationen aus den Daten und erstellen daraus einen gerichteten Graphen mit allen Musikern als Knoten und deren Relationen als Kanten, um die sozialen Netzwerke später darstellen zu können.


In [4]:
'''
Loading the data
'''

import json
import os

path = "data/Personen/"

# Merge the contents of every file in the person directory into one mapping;
# the 'links' entry of each record is discarded before merging.
data = {}
for filename in os.listdir(path):
    with open(path + filename, encoding='utf-8') as handle:
        persons = json.load(handle)
    for record in persons.values():
        record.pop('links', None)
    data.update(persons)


Für jeden Knoten werden folgende Informationen gespeichert: Name, Geschlecht, Konfession, musikalische bzw. andere Berufe, Wirkungs-, Geburts- und Sterbeort, Geburts- und Sterbejahr sowie die Musikepoche, zu der der Musiker (vermutlich) gehört.

In [5]:
'''
Creating a directed graph from the data
'''

import networkx as nx
from re import *

# Directed graph that is filled with persons as nodes and their relations as edges.
g = nx.DiGraph()      

# All loaded persons are used; this is an alias of `data`, not a copy.
eligible_persons = data
#print(len(eligible_persons))


# Western music eras, each mapped to its range of years
# (start year inclusive, end year exclusive).
music_era = dict(
    Medieval=range(500, 1400),
    Renaissance=range(1400, 1600),
    Baroque=range(1600, 1750),
    Classical=range(1750, 1820),
    Romantic=range(1820, 1910),
    Modern=range(1910, 1975),
    Contemporary=range(1975, 2018),
)


def _extract_year(text):
    """Return the first 3- or 4-digit number in *text* as an int, or None if there is none."""
    match = search(r'[0-9]{3,4}', text)
    return int(match.group(0)) if match else None


def _era_of(year):
    """Return the name of the era in ``music_era`` whose range contains *year*, or None."""
    for era, years in music_era.items():
        if year in years:
            return era
    return None


def get_node_attributes(person):
    """Build the node attribute dict for one person of the global ``data``.

    The record ``data[person]`` is read through the keys 'defaultPreName',
    'defaultSurName', 'gender', 'konfession', 'musicalJobs', 'otherJobs',
    'places', 'mainPlace' and 'dates' (indices 0/2 for birth/death date,
    4/5 for birth/death place).

    Args:
        person: key of the person in ``data``.

    Returns:
        A dict with the keys 'name' (omitted when neither a first name nor a
        surname is present), 'gender', 'religion', 'musicJobs', 'otherJobs',
        'workPlace', 'mainPlace', 'birthPlace', 'deathPlace', 'birthYear',
        'deathYear' and 'era'. Missing text values become 'unknown' (jobs:
        'none'); a missing or unparseable birth year becomes 0 and a missing
        or unparseable death year becomes 3000.

    Raises:
        KeyError: if *person* or one of the expected record keys is missing.
    """
    attr = {}
    info = data[person]
    dates = info["dates"]

    # name: join whichever of first name / surname is present
    name_parts = [part for part in (info["defaultPreName"], info["defaultSurName"]) if part]
    if name_parts:
        attr['name'] = ' '.join(name_parts)

    # gender
    attr['gender'] = info["gender"][0] if info["gender"] else 'unknown'

    # religion
    attr['religion'] = info['konfession'][0] if info['konfession'] else 'unknown'

    # music jobs (empty/null entries are filtered out)
    clean_music_jobs = [job for job in info['musicalJobs'] if job]
    attr['musicJobs'] = ','.join(clean_music_jobs) if clean_music_jobs else 'none'

    # other jobs (empty/null entries are filtered out)
    clean_other_jobs = [job for job in info['otherJobs'] if job]
    attr['otherJobs'] = ','.join(clean_other_jobs) if clean_other_jobs else 'none'

    # workplaces - Wirkungsorte
    attr['workPlace'] = ','.join(info['places']) if info['places'] else 'unknown'

    # main workplace - Hauptwirkungsort
    attr['mainPlace'] = info['mainPlace'] if info['mainPlace'] else 'unknown'

    # place of birth
    attr["birthPlace"] = dates[4] if dates[4] else 'unknown'

    # place of death
    attr["deathPlace"] = dates[5] if dates[5] else 'unknown'

    # year of birth; a date without a recognisable year is treated like a
    # missing date instead of raising AttributeError
    birth_year = _extract_year(dates[0]) if dates[0] else None
    attr["birthYear"] = 0 if birth_year is None else birth_year

    # year of death
    death_year = _extract_year(dates[2]) if dates[2] else None
    attr["deathYear"] = 3000 if death_year is None else death_year
    if dates[2] and death_year is None:
        # Bad data sample (per the original author's note): the death-date
        # slot holds text without a year, which is taken as the birth place.
        attr["birthPlace"] = dates[2]

    # era: chosen by the year of death; if that matches no era (e.g. it is
    # unknown), the year at age 30 is used instead
    era = _era_of(attr['deathYear'])
    if era is None:
        era = _era_of(attr['birthYear'] + 30)
    attr['era'] = 'unknown' if era is None else era

    return attr


# All people: every person becomes a node carrying its attributes; every
# relation whose target is itself a loaded person becomes an edge labelled
# with the relation type.
for person, record in eligible_persons.items():
    g.add_node(person, **get_node_attributes(person))

    for rel_type, entries in record['relation'].items():
        for entry in entries:
            target = entry['target']
            if target in eligible_persons:
                g.add_edge(person, target, relation=rel_type)


Der endgültige gerichtete Graph, der für die spätere Darstellung verwendet wird, wurde am 31.08.2018 erstellt.

In [6]:
'''
Exporting the graph 
'''

# Write the graph `g` in GEXF format to 'graph_full.gexf' in the working directory.
nx.write_gexf(g, 'graph_full.gexf')

# Deskriptive Statistik
Im Folgenden wird eine deskriptive Statistik der Daten erstellt.

In [8]:
'''
Musical Jobs exposing
'''
import json
from re import *

filename = 'data/synberufswd.json'

# Job vocabulary; its entries are read below through their 'F35', 'F35W' and
# 'C' keys. The file is opened with an explicit encoding (JSON text is UTF-8,
# independent of the platform default) and closed again after loading.
with open(filename, encoding='utf-8') as swd_handle:
    swdfile = json.load(swd_handle)


# Per-job counters keyed by the 'F35' and 'F35W' names of the vocabulary
# (presumably the masculine and feminine job titles -- confirm against the data).
mjdata_m = {value['F35']: 0 for value in swdfile}
mjdata_w = {value['F35W']: 0 for value in swdfile}

# Job names that are not a key of mjdata_m; they are reconciled further below.
problemjobs = []
for person in eligible_persons:
    attribs = get_node_attributes(person)
    
    if attribs['musicJobs'] != 'none':
        if attribs['gender'] == 'weiblich':
            # NOTE(review): unlike the branch below, a job name that is not a
            # key of mjdata_w raises KeyError here instead of being collected.
            for job in attribs['musicJobs'].split(','):
                mjdata_w[job] += 1
                
        else:
            for job in attribs['musicJobs'].split(','):
                try:
                    mjdata_m[job] += 1
                except KeyError:
                    problemjobs.append(job)
                    
# Fold the 'F35W' counts into the 'F35' counts by position.
# NOTE(review): the i-th key of mjdata_m is paired with the i-th value of
# mjdata_w; this is only correct if both dicts have the same length and order,
# i.e. no duplicate 'F35'/'F35W' names in swdfile -- confirm.
w = list(mjdata_w.values())
for i, key in enumerate(mjdata_m.keys()):
    mjdata_m[key] += w[i]

# Workaround for bad data samples and spelling errors: count how often each
# unmatched job name occurred.
a = {job: 0 for job in set(problemjobs)}
for i in problemjobs:
    a[i] += 1

# Credit each unmatched name to the first known job whose lower-cased name is
# found in it, or in whose lower-cased name it is found.
for key1 in mjdata_m.keys():
    for key2 in a.keys():
        # NOTE(review): the names are passed to search() as regex patterns, so
        # regex metacharacters in a job name are interpreted, not matched literally.
        if search(key1.lower(), key2.lower()) is not None or search(key2.lower(), key1.lower()) is not None:
            mjdata_m[key1] += a[key2]
            # zeroed so the same occurrences are not credited to a later job again
            a[key2] = 0
        
# Names that are still unmatched become entries of their own, except those
# containing 'Choreo', 'intrumenten' or 'Eletro' (presumably misspelled
# variants; they are handled by the manual corrections below).
for key2 in a.keys():
    if 'Choreo' not in key2 and 'intrumenten' not in key2 and 'Eletro' not in key2 and a[key2] > 0:
        mjdata_m[key2] = a[key2]

# Manual, hard-coded corrections for the names excluded above
# (presumably specific to this data set -- verify when the data changes).
mjdata_m['Choreograf'] = 2
mjdata_m['Holzblasinstrumentenbauer'] += 1
mjdata_m['Elektrophonspieler'] += 1
#print(mjdata_m)

'''
Jobs Summaries
'''

# The number of "NN." groups (two digits followed by a dot) in an entry's 'C'
# code decides its level: at most one group = top level, two = second level,
# three = third level. Raw string so that `\.` is a regex escape, not an
# invalid string escape.
regex = r'[0-9]{2}\.'
topjob_data = {value['F35']:0 for value in swdfile if len(findall(regex, value['C'])) <= 1}
topjob = {value['F35']:value['C'] for value in swdfile if len(findall(regex, value['C'])) <= 1}
secjob = {value['F35']:value['C'] for value in swdfile if len(findall(regex, value['C'])) == 2}
thijob = {value['F35']:value['C'] for value in swdfile if len(findall(regex, value['C'])) == 3}

# Add each job's count to every top-level job it equals or whose code is found
# in the job's own second-/third-level code.
for job, num in mjdata_m.items():
    # Collect the job's lower-level codes with a membership test. Indexing
    # secjob[job] directly raised KeyError for every job that is not second
    # level, which skipped the thijob check and silently dropped third-level jobs.
    sub_codes = [level[job] for level in (secjob, thijob) if job in level]
    for j, top_code in topjob.items():
        # NOTE(review): the top-level code is used as a regex pattern, so a
        # '.' in it matches any character -- confirm this is acceptable.
        if job == j or any(search(top_code, code) is not None for code in sub_codes):
            topjob_data[j] += num
#print(topjob_data)

In [9]:
'''
workPlace exposing
'''
# Distinct workplace names over all persons ('workPlace' is a comma-separated string).
places = {
    place
    for person in eligible_persons
    for place in get_node_attributes(person)['workPlace'].split(',')
}

# Number of persons listing each workplace.
workplaces = dict.fromkeys(places, 0)

for person in eligible_persons:
    for place in get_node_attributes(person)['workPlace'].split(','):
        workplaces[place] += 1
#print(workplaces)

{'': 2, 'Edling': 2, 'Carpentras': 1, 'Zuckmantel': 1, 'Werder': 1, 'Hildburghausen': 20, 'Ursheim/Polsingen': 1, 'Teplitz': 49, 'Rottenbach': 1, 'Osimo': 3, 'St. Veit': 1, 'Memmingen': 48, 'Varazdin': 1, 'Beerbach': 1, 'Lyon': 190, 'Sandy/Oregon': 1, 'Eggstätt': 2, 'Fahrnbach/Inn': 1, 'Gardasee': 1, 'Solln': 1, 'Spiekeroog': 2, 'Haarbach': 1, 'Arkel': 1, 'Waldershof': 1, 'Jackson/Mississippi': 1, 'Nevada': 1, 'Achen': 1, 'Freiburg/Breisgau': 371, 'Hagensdorf': 1, 'Mitau': 7, 'Grünsink': 1, 'Mühlheim': 2, 'Siam': 1, 'Lille': 14, 'Korsika': 1, 'Lukka/Niederlausitz': 1, 'Wöhrd': 2, 'Pennsylvania': 5, 'Hilpoltstein': 12, 'Algund': 3, 'Neustadt/Aisch': 15, 'Uruguay': 4, 'Hirblingen': 1, 'Kiens': 1, 'Schwartau': 1, 'Leichtenburg': 1, 'Allmannshofen': 2, 'Reghin': 2, 'Kilchberg': 1, 'Sri Lanka': 3, 'Spachbrücken': 1, 'Unterjoch': 1, 'Weipoltshausen/Schweinfurt': 1, 'Sardinien': 4, 'Ohlau': 1, 'Žatec': 1, 'Thailand': 2, 'Winterthur': 43, 'Morgenbach': 1, 'Evanston/Illinois': 4, 'Hirschberg/Sc

In [11]:
'''
Datas raised
'''
def attribute(name_attr):
    """Count how often each value of the node attribute *name_attr* occurs.

    The attribute is read via ``get_node_attributes`` for every person in
    ``eligible_persons``; the result maps each distinct value to the number
    of persons carrying it.
    """
    values = [get_node_attributes(person)[name_attr] for person in eligible_persons]

    counts = dict.fromkeys(set(values), 0)
    for value in values:
        counts[value] += 1

    return counts

def reduction(raw_file):
    """Group the keys of *raw_file* by their value.

    Keys sharing the same value are joined with ', ' in iteration order; the
    result maps each joined label to that shared value, ordered by ascending
    value. A label is only extended with a separator once it is non-empty.
    """
    grouped = dict.fromkeys(sorted(set(raw_file.values())), '')
    for name, count in raw_file.items():
        label = grouped[count]
        grouped[count] = f'{label}, {name}' if label else name
    return {label: count for count, label in grouped.items()}

'''
musicJobs, genders, religions, mainPlace, birthPlace, eras, workPlace
'''
# Frequency tables for the export. attribute() counts the values of one node
# attribute over all persons; reduction() merges all keys that share the same
# count into one comma-separated label.
mjdata = reduction(mjdata_m)  
genderdata = attribute('gender') 
relidata = attribute('religion')
mpdata = reduction(attribute('mainPlace'))
bpdata = reduction(attribute('birthPlace'))
eradata = attribute('era')
# workplaces was already counted per place name, so only the grouping is applied
wpdata = reduction(workplaces)

In [13]:
'''
Exporting all datas
'''
from csv import *
def export_statistic(data, categorie):
    """Write a two-column statistic to the file ``<categorie>.csv``.

    Args:
        data: mapping of category label to count; one row is written per item.
        categorie: header of the first column; also used as the file name
            (without the '.csv' suffix).

    The header row is ``[categorie, 'Anzahl']``. The file is opened with
    ``newline=''`` as the csv module requires (otherwise line endings are
    doubled on platforms that translate newlines) and with an explicit UTF-8
    encoding so that non-ASCII labels do not depend on the platform default.
    """
    with open(categorie + '.csv', 'w', newline='', encoding='utf-8') as file:
        statisticwriter = writer(file, delimiter=',')
        statisticwriter.writerow([categorie, 'Anzahl'])
        statisticwriter.writerows(data.items())

# Write one CSV file per statistic; the second argument is both the header of
# the first column and the file name (without '.csv').
export_statistic(topjob_data, 'Übergeordnete musikalische Berufe') 
export_statistic(mjdata, 'Musikalische Berufe')
export_statistic(genderdata, 'Geschlecht')
export_statistic(relidata, 'Konfessionen')
export_statistic(mpdata, 'Hauptwirkungsorte')
export_statistic(bpdata, 'Geburtsort')
export_statistic(eradata, 'Musikepochen')
export_statistic(wpdata, 'Wirkungsorte')