In [None]:
import os
import json
import pandas as pd
from google.colab import drive
from matplotlib import pyplot as plt

# Mount Google Drive first so the folder below is reachable on a fresh Colab
# runtime, then change the working directory to the Google Drive folder
# containing the final project files
drive.mount('/content/drive')
wd_path = '/content/drive/MyDrive/2024 FALL/INFO 628 - Data Librarianship/final_project'
os.chdir(wd_path)

# Function to pull the earliest date from an array of concert information
def firstConcertDate(concert_array):
    """Return the earliest 'Date' value found in a collection of concerts.

    Args:
        concert_array: Iterable of dict-like concert records, each holding a
            'Date' entry. The values are compared with the ordinary `<`
            ordering, so for ISO-8601 strings such as '1929-02-16T05:00:00Z'
            the smallest value is also the earliest one.

    Returns:
        The smallest 'Date' value, or None when concert_array is empty.

    Raises:
        KeyError: If a concert record has no 'Date' entry.
    """
    # min() avoids building and sorting a full list, and `default` keeps an
    # empty program from raising instead of producing a missing value.
    return min((concert['Date'] for concert in concert_array), default=None)

In [None]:
# Import New York Phil Performance History from JSON file to a dictionary.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open('ny_phil_performance_history_complete.json', 'r', encoding='utf-8') as json_file:
    perf_history = json.load(json_file)

# The program records are stored under the top-level 'programs' key
perf_history_records = perf_history['programs']

In [None]:
# Build a DataFrame from the program records loaded from the JSON file
perf_df = pd.DataFrame(perf_history_records)

# Explode the 'works' column so that every work on a program gets its own row,
# then renumber the rows (the previous row labels are kept as a column)
perf_df_works_exploded = perf_df.explode('works').reset_index()

# Flatten each row's 'works' entry into separate 'works_*' columns, drop the
# redundant fields and replace missing values with empty strings
works_and_programs = perf_df_works_exploded.loc[:, ['programID', 'works']]
works_dict = works_and_programs.to_dict(orient='records')
redundant_columns = ['programID', 'works', 'works_soloists', 'works_conductorName']
works_df = pd.json_normalize(works_dict, sep='_')
works_df = works_df.drop(columns=redundant_columns)
works_df = works_df.fillna('')

# Aggregate all 'movement' information into the 'works_movement' column and
# all 'workTitle' information into the 'works_workTitle' column. For each
# field, the '<field>__' and '<field>_em' columns are joined with a space and
# stored back in '<field>__', which is then appended to '<field>' itself.
# Series.str.cat does the joining column-wise instead of calling ' '.join once
# per row; surrounding whitespace left by empty parts is stripped.
for text_column in ('works_movement', 'works_workTitle'):
    nested_text = (works_df[f'{text_column}__']
                   .str.cat(works_df[f'{text_column}_em'], sep=' ')
                   .str.strip())
    works_df[f'{text_column}__'] = nested_text
    works_df[text_column] = (works_df[text_column]
                             .str.cat(nested_text, sep=' ')
                             .str.strip())

# Give the 'works_*' columns used later singular, lowercase names
singular_column_names = {
    'works_ID': 'work_id',
    'works_composerName': 'work_composer_name',
    'works_workTitle': 'work_title',
    'works_movement': 'work_movement',
}
works_df_cleaned = works_df.rename(columns=singular_column_names)

# Concatenate the exploded 'performance' DataFrame and the cleaned 'works'
# DataFrame side by side, then rename 'programID' to follow the same naming
# convention as the renamed work columns
combined_columns = pd.concat([perf_df_works_exploded, works_df_cleaned], axis=1)
work_history_df = combined_columns.rename(columns={'programID': 'program_id'})

# Add a 'date' column holding, for each row, the earliest date among the
# concerts of the associated program
work_history_df['date'] = work_history_df['concerts'].map(firstConcertDate)

# Keep only the part of 'work_id' that precedes the first '*', which removes
# the movement information from the identifier
work_history_df['work_id'] = work_history_df['work_id'].map(
    lambda raw_work_id: raw_work_id.partition('*')[0]
)

# Remove rows with no 'work_id' and intermission rows (whose 'work_id' is '0'
# once the '*' suffix has been removed); set the index to the combination of
# 'work_id' and 'program_id'; sort by work and then by date; keep only the
# relevant columns, in display order.
rows_with_real_work = ~work_history_df['work_id'].isin(['', '0'])

work_history_df_cleaned = (
    work_history_df[rows_with_real_work]
    .set_index(['work_id', 'program_id'])
    .sort_values(['work_id', 'date'], ascending=[True, True])
    .loc[:, ['date', 'season', 'work_title', 'work_movement', 'work_composer_name']]
    )

work_history_df_cleaned.tail(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,season,work_title,work_movement,work_composer_name
work_id,program_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
996,548,1929-02-16T05:00:00Z,1928-29,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,5357,1955-03-24T05:00:00Z,1954-55,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,3172,1964-02-06T05:00:00Z,1963-64,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,6701,1979-05-17T04:00:00Z,1978-79,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,1520,1982-04-29T04:00:00Z,1981-82,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,5891,1982-05-04T04:00:00Z,1981-82,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...",,"Vivaldi, Antonio"
996,9629,2007-01-14T05:00:00Z,2006-07,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...","Allegro, excerpts","Vivaldi, Antonio"
996,9760,2007-02-18T05:00:00Z,2006-07,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...","Allegro, excerpts","Vivaldi, Antonio"
996,10091,2007-04-15T04:00:00Z,2006-07,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...","Allegro, excerpts","Vivaldi, Antonio"
996,10374,2010-02-28T05:00:00Z,2009-10,"CONCERTO, VIOLIN, F MAJOR, RV 293 (AUTUMN FROM...","Allegro, excerpts","Vivaldi, Antonio"


In [None]:
# Reduce every work to a single row using the first non-null value of each
# column within its 'work_id' group, then order the works by that date. This
# relies on work_history_df_cleaned already being sorted by date within each
# 'work_id', so the first row of a group is its earliest program.
first_performance_df = (
    work_history_df_cleaned
    .groupby(level='work_id')
    .first()
    .sort_values('date')
)

# Count how many works have their first performance in each season
first_per_season = first_performance_df['season'].value_counts()

first_per_season


Unnamed: 0_level_0,count
season,Unnamed: 1_level_1
1945-46,323
2015-16,320
2013-14,263
1900-01,197
2022-23,190
...,...
1869-70,10
1864-65,9
1849-50,7
1877-78,5
