# Extraktion der Daten

Im Folgenden extrahieren wir alle verfügbaren Informationen aus den Daten und erstellen daraus einen gerichteten Graphen mit allen Musikern als Knoten und deren Relationen als Kanten, um später die sozialen Netzwerke darstellen zu können.


In [4]:
'''
Loading the data
'''

import json
import os

path = "data/Personen/"

# Merge the contents of every file in the directory into one dictionary.
data = {}
for filename in os.listdir(path):
    with open(path + filename, encoding='utf-8') as handle:
        persons = json.load(handle)
    # Discard the 'links' entry of every record, if it has one.
    for record in persons.values():
        record.pop('links', None)
    data.update(persons)


Für jeden Knoten werden folgende Informationen gespeichert: *Name*, *Geschlecht*, *Konfession*, *musikalische bzw. andere Berufe*, *Wirkungs-, Hauptwirkungs-, Geburts- und Sterbeort*, *Geburts- und Sterbejahr* sowie die *Musikepoche*, zu der der Musiker (vermutlich) gehört.

In [5]:
'''
Creating a directed graph from the data
'''

import networkx as nx
from re import *

# Directed graph that receives the persons as nodes and relations as edges.
g = nx.DiGraph()

# Alias of the loaded data: the set of persons that are put into the graph.
eligible_persons = data

# western music eras, given as (name, first year, first year after the era)
# NOTE(review): the last range stops before 2018, so a death year of 2018
# or later matches no era here - confirm that this is intended.
_ERA_BOUNDS = (
    ('Medieval', 500, 1400),
    ('Renaissance', 1400, 1600),
    ('Baroque', 1600, 1750),
    ('Classical', 1750, 1820),
    ('Romantic', 1820, 1910),
    ('Modern', 1910, 1975),
    ('Contemporary', 1975, 2018),
)
music_era = {name: range(first, stop) for name, first, stop in _ERA_BOUNDS}


def _first_year(text):
    """Return the first run of 3-4 digits in *text* as an int, or None.

    None is returned both for an empty value and for a non-empty value
    that contains no such run of digits.
    """
    if not text:
        return None
    match = search('[0-9]{3,4}', text)
    return int(match.group(0)) if match else None


def get_node_attributes(person):
    """Build the node attribute dictionary for one person.

    The record is read from the module-level ``data`` dictionary and the
    era boundaries from the module-level ``music_era`` mapping.

    Args:
        person: key of the person in ``data``.

    Returns:
        dict with the keys ``gender``, ``religion``, ``musicJobs``,
        ``otherJobs``, ``workPlace``, ``mainPlace``, ``birthPlace``,
        ``deathPlace``, ``birthYear``, ``deathYear`` and ``era``, plus
        ``name`` when at least one name part is present.  Missing values
        are represented by 'unknown' / 'none', a missing birth year by 0
        and a missing death year by 3000.

    Raises:
        KeyError: if *person* or one of the expected fields is missing.
    """
    attr = {}
    info = data[person]
    dates = info["dates"]

    # name - the key is left out entirely when both parts are empty
    prename = info["defaultPreName"]
    surname = info["defaultSurName"]
    if prename and surname:
        attr['name'] = ' '.join([prename, surname])
    elif prename:
        attr['name'] = prename
    elif surname:
        attr['name'] = surname

    # gender
    attr['gender'] = info["gender"][0] if info["gender"] else 'unknown'

    # religion
    attr['religion'] = info['konfession'][0] if info['konfession'] else 'unknown'

    # music jobs
    clean_music_jobs = [job for job in info['musicalJobs'] if job]   # filters out nulls
    attr['musicJobs'] = ','.join(clean_music_jobs) if clean_music_jobs else 'none'

    # other jobs
    clean_other_jobs = [job for job in info['otherJobs'] if job]   # filters out nulls
    attr['otherJobs'] = ','.join(clean_other_jobs) if clean_other_jobs else 'none'

    # workplaces
    attr['workPlace'] = ','.join(info['places']) if info['places'] else 'unknown'

    # main workplace
    attr['mainPlace'] = info['mainPlace'] if info['mainPlace'] else 'unknown'

    # place of birth
    attr["birthPlace"] = dates[4] if dates[4] else 'unknown'

    # place of death
    attr["deathPlace"] = dates[5] if dates[5] else 'unknown'

    # year of birth - a date without any 3-4 digit number no longer raises
    # AttributeError; it is treated like a missing date, as for the death year
    birth_year = _first_year(dates[0])
    attr["birthYear"] = birth_year if birth_year is not None else 0

    # year of death
    death_year = _first_year(dates[2])
    if death_year is not None:
        attr["deathYear"] = death_year
    else:
        attr["deathYear"] = 3000
        if dates[2]:
            # BAD DATA SAMPLE: the death-date field holds text without a
            # year; the original code takes that text as the birth place.
            attr["birthPlace"] = dates[2]

    # era - chosen by the year of death; if that matches no era, the year
    # in which the person was 30 years old is used instead
    for era, years in music_era.items():
        if attr['deathYear'] in years:
            attr['era'] = era

    if 'era' not in attr:
        for era, years in music_era.items():
            if (attr['birthYear'] + 30) in years:
                attr['era'] = era

    if 'era' not in attr:
        attr['era'] = 'unknown'

    return attr


# all people: one node per person, one edge per relation to a known person
for person, record in eligible_persons.items():
    g.add_node(person, **get_node_attributes(person))

    for rel, entries in record['relation'].items():
        for other_person in entries:    # there's usually only 1
            target = other_person['target']
            # relations pointing to persons outside the data set are skipped
            if target in eligible_persons:
                g.add_edge(person, target, relation=rel)


Der endgültige gerichtete Graph, der für die spätere Darstellung verwendet wird, wurde am 31.08.2018 erstellt.

In [6]:
'''
Exporting the graph 
'''

# Write the complete graph to a GEXF file (path relative to the working directory).
nx.write_gexf(g, 'graph_full.gexf')

# Deskriptive Statistik
Im Folgenden wird eine deskriptive Statistik der Daten erstellt.

Für die statistische Beschreibung der (musikalischen und nicht-musikalischen) **Berufe** gibt es zwei Funktionen: ***exposing*** *(filename, ind_attr, attribute)* und ***summary*** *(file, data, ind_attr, regex = '[0-9]{2}\.')*. 
- ***exposing*** *(filename, ind_attr, attribute)* zählt alle Berufe, die im Datensatz vorkommen.
- ***summary*** *(file, data, ind_attr, regex = '[0-9]{2}\.')* fasst die Anzahlen nach übergeordneten Berufen zusammen. 

*Anmerkung: Aus verschiedenen Gründen, etwa wegen Schreibfehlern oder fehlerhafter Einsortierung im Datensatz, wurden einige Einträge manuell nachbearbeitet (siehe den Code zwischen den beiden Funktionen).*

In [30]:
'''
Jobs exposing
'''
import json
from re import *

'''
filename : Name of file in data
ind_attr : 'F35', 'F36'
attribute : musicJobs ; otherJobs
'''
def exposing(filename, ind_attr, attribute):
    """Count how often every job of a vocabulary file occurs among all persons.

    The persons come from the module-level ``eligible_persons`` and their
    attributes from ``get_node_attributes``.

    Args:
        filename: path of the JSON vocabulary file; a list of records that
            contain the male job title under ``ind_attr`` and the female
            title under ``ind_attr + 'W'``.
        ind_attr: record key of the male job title ('F35' or 'F36').
        attribute: node attribute that holds the comma-joined jobs
            ('musicJobs' or 'otherJobs').

    Returns:
        tuple ``(file, data_m, extra)``: the parsed vocabulary, the counts
        keyed by male job title, and the counts of job names that are not
        in the vocabulary (set to 0 once merged into ``data_m``).
    """
    # Close the file deterministically and read it as UTF-8, the same
    # encoding that is used for the person files.
    with open(filename, encoding='utf-8') as handle:
        file = json.load(handle)

    female_attr = ind_attr + 'W'
    data_m = {value[ind_attr]: 0 for value in file}
    # Female title -> male title under which it is counted.  Merging through
    # this mapping instead of by dict position stays correct when a title
    # occurs in more than one vocabulary record.
    male_form = {value[female_attr]: value[ind_attr] for value in file}

    problemjobs = []
    for person in eligible_persons:
        attribs = get_node_attributes(person)
        if attribs[attribute] == 'none':
            continue

        is_female = attribs['gender'] == 'weiblich'
        for job in attribs[attribute].split(','):
            if is_female and job in male_form:
                data_m[male_form[job]] += 1
            elif not is_female and job in data_m:
                data_m[job] += 1
            else:
                # unknown titles of either gender are collected, not fatal
                problemjobs.append(job)

    # a relatively good (not really bad) solution raised due to BAD SAMPLE DATA:
    # count the unknown titles (in first-seen order) ...
    extra = {}
    for job in problemjobs:
        extra[job] = extra.get(job, 0) + 1

    # ... and credit each of them to the first vocabulary title that contains
    # it or is contained in it, ignoring case.  Plain substring tests are used
    # so that characters such as '.' or '(' in a title are taken literally.
    for key1 in data_m:
        known = key1.lower()
        for key2 in extra:
            unknown = key2.lower()
            if known in unknown or unknown in known:
                data_m[key1] += extra[key2]
                extra[key2] = 0

    return file, data_m, extra

# Manual corrections of the final counts for job names that are misspelled in the data (Schreibfehler)
'''
Musical Jobs
'''
swdfile, mjdata_m, a = exposing('data/synberufswd.json', 'F35', 'musicJobs')

# Take over every leftover job that still has a positive count, except the
# three misspelled name families that are corrected individually below.
for key2 in a:
    is_misspelled = 'Choreo' in key2 or 'intrumenten' in key2 or 'Eletro' in key2
    if a[key2] > 0 and not is_misspelled:
        mjdata_m[key2] = a[key2]

mjdata_m['Choreograf'] = 2
# Add the counts of the misspelled names to the correctly spelled entries.
for correct, misspelled in (
    ('Holzblasinstrumentenbauer', 'Holzblasintrumentenbauer'),
    ('Elektrophonspieler', 'Eletrophonspieler'),
):
    mjdata_m[correct] += a[misspelled]

'''
Non-Musical Jobs
'''
nmufile, nmjdata_m, b = exposing('data/synberufnmu.json', 'F36', 'otherJobs')

# Take over every leftover job that still has a positive count, except the
# name families that are corrected individually below.
# NOTE(review): the 'pieler' fragment excludes every leftover name that
# contains it, not only 'Schaupieler' - confirm that this is intended.
for key2 in b:
    is_excluded = 'pieler' in key2 or 'Schrifs' in key2 or 'Adeliger' in key2
    if b[key2] > 0 and not is_excluded:
        nmjdata_m[key2] = b[key2]

# Add the counts of the variant spellings to the vocabulary entries.
for correct, variant in (
    ('Schauspieler', 'Schaupieler'),
    ('Schriftsteller', 'Schrifsteller'),
    ('Adliger', 'Adeliger'),
):
    nmjdata_m[correct] += b[variant]


'''
Jobs Summaries
'''
'''
data: mjdata_m, nmjdata_m
'''
def summary(file, data, ind_attr, regex=r'[0-9]{2}\.'):
    """Aggregate per-job counts into counts per top-level job.

    The level of a vocabulary record is the number of matches of *regex*
    in its ``'C'`` value: at most one match is a top-level job, two a
    second-level job and three a third-level job.  Records with more
    matches are not used.

    Args:
        file: list of vocabulary records, each with the keys *ind_attr*
            (job title) and ``'C'`` (classification code).
        data: mapping of job title to count (``mjdata_m`` / ``nmjdata_m``).
        ind_attr: record key of the job title ('F35' or 'F36').
        regex: pattern whose number of matches in ``'C'`` gives the level.

    Returns:
        dict mapping every top-level job title to the summed count of the
        job itself and of the second- and third-level jobs assigned to it.
    """
    topjob, secjob, thijob = {}, {}, {}
    for value in file:
        level = len(findall(regex, value['C']))
        if level <= 1:
            topjob[value[ind_attr]] = value['C']
        elif level == 2:
            secjob[value[ind_attr]] = value['C']
        elif level == 3:
            thijob[value[ind_attr]] = value['C']
    topjob_data = {job: 0 for job in topjob}

    for job, num in data.items():
        # Some jobs are not categorised in the vocabulary at all; those have
        # no code and only count when they are a top-level job themselves.
        # The two lookups are independent of each other, so a job that is
        # missing from ``secjob`` is still found in ``thijob``.
        codes = [code for code in (secjob.get(job), thijob.get(job))
                 if code is not None]
        for j, top_code in topjob.items():
            # NOTE(review): the top-level code is used as a regular expression
            # and may match anywhere in the sub-level code - confirm whether a
            # literal prefix comparison is what is actually wanted here.
            if job == j or any(search(top_code, code) is not None for code in codes):
                topjob_data[j] += num

    return topjob_data

# Sum the per-job counts per top-level job, separately for the musical
# and the non-musical vocabulary.
mjtopjob = summary(swdfile, mjdata_m, 'F35')
nmjtopjob = summary(nmufile, nmjdata_m, 'F36')


Unten steht die statistische Beschreibung der **Wirkungsorte**.

In [39]:
'''
workPlace exposing
'''
# Count in a single pass how often every workplace is listed.  The keys of
# ``workplaces`` appear in first-seen order, so the result is the same on
# every run instead of depending on the iteration order of a set.
workplaces = {}
for person in eligible_persons:
    attribs = get_node_attributes(person)

    for place in attribs['workPlace'].split(','):
        workplaces[place] = workplaces.get(place, 0) + 1

# The set of all distinct workplaces is still provided under its old name.
places = set(workplaces)

Die restlichen Attribute des Datensatzes werden durch zwei Funktionen ausgewertet: ***attribute*** *(name_attr)* und ***reduction*** *(raw_file)*.
- ***attribute*** *(name_attr)* zählt die Werte von Attributen, die pro Person höchstens einen Wert haben.
- ***reduction*** *(raw_file)* fasst die Werte zusammen, die gleich häufig vorkommen. 

In [38]:
'''
Datas raised
'''
'''
raw_file: noch nicht einsortierte Datei
'''
def attribute(name_attr):
    """Count how often each value of one node attribute occurs.

    The attribute is read for every person in the module-level
    ``eligible_persons`` via ``get_node_attributes``.

    Args:
        name_attr: key of the node attribute, e.g. 'gender' or 'era'.

    Returns:
        dict mapping every observed attribute value to its frequency.
    """
    observed = [get_node_attributes(person)[name_attr]
                for person in eligible_persons]

    # start every distinct value at zero, then tally the occurrences
    tally = dict.fromkeys(set(observed), 0)
    for value in observed:
        tally[value] += 1

    return tally

def reduction(raw_file):
    """Group the keys of a count dictionary by their count.

    Args:
        raw_file: dict mapping a name (string) to its count.

    Returns:
        dict whose keys are the names sharing one count, joined by ', '
        in the iteration order of *raw_file*, and whose values are that
        count; the entries are ordered by ascending count.
    """
    # one (initially empty) label per distinct count, in ascending order
    labels = dict.fromkeys(sorted(set(raw_file.values())), '')

    for name, count in raw_file.items():
        current = labels[count]
        separator = '' if current == '' else ', '
        labels[count] = current + separator + name

    # swap the roles: joined names become the keys, the count the value
    return {label: count for count, label in labels.items()}

'''
musicJobs, nonmusicJobs, genders, religions, mainPlace, birthPlace, deathPlace, eras, workPlace
'''
# Job counts, with jobs that share the same count grouped together.
mjdata = reduction(mjdata_m)
nmjdata = reduction(nmjdata_m)
# Attributes exported as plain value -> frequency tables.
genderdata = attribute('gender') 
relidata = attribute('religion')
# Place attributes: frequency tables, then grouped by equal frequency.
mpdata = reduction(attribute('mainPlace'))
bpdata = reduction(attribute('birthPlace'))
dpdata = reduction(attribute('deathPlace'))
eradata = attribute('era')
# Workplace counts (several per person possible), grouped by equal count.
wpdata = reduction(workplaces)

Jedes Attribut wird in einer eigenen CSV-Datei gespeichert.

In [36]:
'''
Exporting all datas
'''
from csv import *
def export_statistic(data, categorie):
    """Write one statistic to the file ``<categorie>.csv``.

    The first row is the header ``[categorie, 'Anzahl']``; every further
    row is one ``key, value`` pair of *data*.

    Args:
        data: dict mapping a category value to its count.
        categorie: column title of the first column and, with '.csv'
            appended, the name of the file that is (over)written.
    """
    # newline='' leaves the line endings to the csv writer (otherwise rows
    # are separated by blank lines on Windows); the explicit UTF-8 encoding
    # keeps umlauts intact regardless of the platform's default encoding.
    with open(categorie + '.csv', 'w', newline='', encoding='utf-8') as file:
        statisticwriter = writer(file, delimiter=',')
        statisticwriter.writerow([categorie, 'Anzahl'])
        statisticwriter.writerows(data.items())

# Every statistic is written to a CSV file named after its category.
exports = [
    (mjtopjob, 'Übergeordnete musikalische Berufe'),
    (nmjtopjob, 'Übergeordnete nicht-musikalische Berufe'),
    (mjdata, 'Musikalische Berufe'),
    (nmjdata, 'Nicht-Musikalische Berufe'),
    (genderdata, 'Geschlecht'),
    (relidata, 'Konfessionen'),
    (mpdata, 'Hauptwirkungsorte'),
    (bpdata, 'Geburtsort'),
    (dpdata, 'Sterbeort'),
    (eradata, 'Musikepochen'),
    (wpdata, 'Wirkungsorte'),
]
for statistic, category in exports:
    export_statistic(statistic, category)