In [8]:
# Widen the notebook page to the full browser width by injecting a CSS override.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [9]:
import pandas as pd
from datetime import datetime

In [168]:
#OperaGlass composer list
# Reduce the OperaGlass works table to its distinct (composer, composer_info)
# pairs and write them out without the index column.
df = (
    pd.read_csv('../../data/processed/works/opera_glass.csv', index_col=0)
    .loc[:, ['composer', 'composer_info']]
    .drop_duplicates()
)
df.to_csv('../../data/processed/composers/opera_glass.csv', index=False)

In [179]:
#UKTW composer list
# Take the 'composer' column of the UK Theatre Web listings, discard missing
# and repeated names, sort what is left and write it out without the index.
df = (
    pd.read_csv('../../data/processed/listings/uk_theatreweb.csv', index_col=0)['composer']
    .dropna()
    .drop_duplicates()
    .sort_values()
)
df.to_csv('../../data/processed/composers/uk_theatreweb.csv', index=False)

In [12]:
#Wikipedia composer list
from SPARQLWrapper import SPARQLWrapper, JSON

import rdflib
from rdflib.graph import Graph
from rdflib.namespace import Namespace
from rdflib import plugin

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Sentinel stored in place of a birth/death date that is missing from the
# result row or that cannot be parsed.
UNKNOWN_DATE = datetime(2200, 1, 1).date()


def _binding_value(binding, key, default=''):
    """Return the 'value' of an optional query variable, or `default` when it is absent from the row."""
    return binding.get(key, {}).get('value', default)


def _binding_date(binding, key):
    """Return the optional timestamp variable `key` as a `date`, or UNKNOWN_DATE.

    UNKNOWN_DATE is returned both when the variable is absent from the row and
    when its text is not a 'YYYY-MM-DD' date that `strptime` can read.
    """
    raw = _binding_value(binding, key, default=None)
    if raw is None:
        return UNKNOWN_DATE
    try:
        # Timestamps are expected to look like '1903-01-06T00:00:00Z'; only the
        # part before the 'T' (the calendar date) is kept.
        return datetime.strptime(raw.partition('T')[0], '%Y-%m-%d').date()
    except ValueError:
        return UNKNOWN_DATE


# SELECT query: one row per (composer, composition) pair, restricted by the
# wdt:P106 / wdt:P136 / wdt:P31 patterns below. Dates of birth/death and the
# country of the birthplace are OPTIONAL, so those variables may be missing
# from individual result rows.
sparql.setQuery("""
SELECT * WHERE {
  {
    SELECT ?composer ?composerLabel ?composition ?compositionLabel ?dob ?dod ?pobLabel ?cobLabel
    WHERE {
      ?composer wdt:P106 wd:Q36834.
      ?composer wdt:P136 wd:Q1344.
      OPTIONAL { ?composer wdt:P569 ?dob. }
      OPTIONAL { ?composer wdt:P19 ?pob.
                 ?pob wdt:P17 ?cob .
                 FILTER NOT EXISTS {?cob wdt:P31 wd:Q3024240}     
               }      
      OPTIONAL { ?composer wdt:P570 ?dod. }
      ?composition wdt:P86 ?composer.
      ?composition wdt:P31 wd:Q1344
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,de,it,fr,nl,es,ru,hu,no,cs,ca,sv,bg,als,ja,pt,az,et,uk". }
    }
  }
#FILTER contains(lcase(?composerLabel),"abramsky")
}
ORDER BY ?composerLabel ?compositionLabel
""")

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# One list per result row, in the same order as `headers` below. The composer
# and composition variables are mandatory in the query, so they are indexed
# directly; a missing one raises KeyError instead of being silently defaulted.
composer_works = [
    [
        binding['composerLabel']['value'],
        binding['compositionLabel']['value'],
        _binding_date(binding, 'dob'),
        _binding_date(binding, 'dod'),
        _binding_value(binding, 'cobLabel'),
        binding['composer']['value'],
        binding['composition']['value'],
    ]
    for binding in results["results"]["bindings"]
]

headers = ['composer', 'work', 'date_of_birth', 'date_of_death', 'country', 'composer_url', 'work_url']
df = pd.DataFrame(composer_works, columns=headers)

In [13]:
#Remove works to reduce size of dataframe
df = df[['composer', 'date_of_birth', 'date_of_death', 'country', 'composer_url']].drop_duplicates()

#A bit of cleanup
# .copy() makes the filtered frame independent, so the .loc assignments below
# write to it directly and do not raise SettingWithCopyWarning.
df = df[df['country']!='Catalan Republic'].copy()
is_blacher = df['composer']=='Boris Blacher'
df.loc[is_blacher, 'country'] = 'Germany'
df.loc[is_blacher, 'date_of_birth'] = datetime(1903, 1, 6).date()
df.loc[df['country']=='México', 'country'] = 'Mexico'

#Fixing missing nationalities of composers
# Country to assign -> composers (matched on the exact 'composer' label).
# Entries are applied in the order listed.
country_overrides = {
    'Azerbaijan': ['Uzeyir Hajibeyov', 'Zulfugar Hajibeyov'],
    'France': ['Adolphe Blaise', 'Charles Gounod', 'Charles-Louis Mion', 'Joseph François Salomon', 'Georges Bizet',
               'Louis de La Coste', 'Prosper-Didier Deshayes'],
    'Italy': ['Francesco Lucio', 'Carlo Franchi'],
    'Russia': ['Alexey Verstovsky', 'Sergei Rachmaninoff', 'Zagir Ismagilov', 'Mukan Tulebaev'],
    'United States of America': ['Ann Millikan', 'David Carlson', 'Mark Bucci', 'William Furst',
                                 'Harry Lawrence Freeman', 'Jan Bach', 'Libby Larsen', 'Lora Aborn',
                                 'Reginald De Koven', 'Mohammed Fairouz'],
    'United Kingdom': ['Arthur Goring Thomas', 'Jonathan Battishill', 'Julian Grant', 'Ernest Ford', 'John Barnett',
                       'Phyllis Tate', 'Christopher Bowers-Broadbent', 'David Bruce'],
    'Ireland': ['Arthur Hervey', 'William Michael Rooke'],
    'Germany': ['E. T. A. Hoffmann', 'Hermann Goetz', 'Otto Nicolai'],
    'Norway': ['Edvard Fliflet Bræin', 'Herman Severin Løvenskiold'],
    'Denmark': ['Erling Brene'],
    'Australia': ['Eric Gross'],
    'Lithuania': ['Vytautas Klova', 'Bronius Kutavičius'],
    'Ukraine': ['Mykola Lysenko', 'Heorhiy Maiboroda'],
    'Georgia': ['Meliton Balanchivadze'],
    'Spain': ['Manuel Fernández Caballero'],
    'Iceland': ['Karólína Eiríksdóttir'],
    'South Korea': ['Junsang Bahk'],
    'Canada': ['John Burge', 'J. E. P. Aldous'],
    'Brazil': ['Damião Barbosa de Araújo'],
}
for country, composers in country_overrides.items():
    df.loc[df['composer'].isin(composers), 'country'] = country

#Unique composers after cleanup
# The fixes above can make previously different rows identical, hence the
# second de-duplication.
df = df.drop_duplicates()
# composer_key is the last path segment of the composer URL. The vectorised
# .str accessor replaces a row-wise apply and also works on an empty frame.
df['composer_key'] = df['composer_url'].str.rsplit('/', n=1).str[-1]
df.to_csv('../../data/processed/composers/wikipedia.csv')

In [14]:
#Saving Composers with works
# Rebuild the full composer/work table from the raw query rows and keep only
# the label and URL columns, one row per distinct combination.
headers = ['composer', 'work', 'date_of_birth', 'date_of_death', 'country', 'composer_url', 'work_url']
df = pd.DataFrame(composer_works, columns=headers)
df = df[['composer', 'work', 'composer_url', 'work_url']].drop_duplicates()
# Each key is the last path segment of the corresponding URL. The vectorised
# .str accessor replaces a row-wise apply and also works on an empty frame.
df['composer_key'] = df['composer_url'].str.rsplit('/', n=1).str[-1]
df['work_key'] = df['work_url'].str.rsplit('/', n=1).str[-1]


df.to_csv('../../data/processed/works/wikipedia.csv')