In [1]:
# `display` and `HTML` are part of the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import math
import os
import re
import zipfile
from datetime import datetime

import numpy as np
import pandas as pd

In [3]:
# Zipped raw extracts, listed in the order in which they are loaded.
files = [
    '../../data/raw/nadac/35577-0001-Data.tsv.zip',
    '../../data/raw/nadac/35577-0002-Data.tsv.zip',
]

# Month number (kept as a string) substituted when a month field holds a season name.
months_dict = dict(fall='9', summer='7', spring='3')

# Columns kept, in this order, once a raw file has been cleaned.
column_names = [
    'CASEID', 'SOURCE', 'SEASON',
    'STARTDY', 'STARTMO', 'STARTYR',
    'ENDDY', 'ENDMO', 'ENDYR',
    'COUNTRY', 'STATE', 'CITY',
    'ORGNAME', 'ORGNOTES', 'ORGDIR',
    'TITLE', 'NOTES', 'NPERFORM',
]

# Columns written to the processed listings file.
output_column_names = [
    'SEASON', 'STATE', 'CITY', 'ORGNAME',
    'TITLE', 'NPERFORM', 'STARTDATE', 'ENDDATE',
]

In [57]:
def make_season(month, year):
    '''Build a season label such as '1967-68' from a start month and year.

    A month of 9 or later opens a season that runs into the following year;
    any earlier month belongs to the season that began the previous year.

    month = month number, as an int or a numeric string
    year = year, as an int, a whole-number float or a numeric string
    returns: '<first year>-<last two digits of the second year>'
    raises: ValueError or TypeError when month or year is not numeric
    '''
    # Normalise once so that integer/float years (e.g. a numeric CSV column)
    # no longer fail on string concatenation.
    start_year = int(year)

    if int(month) >= 9:
        first_year, second_year = start_year, start_year + 1
    else:
        first_year, second_year = start_year - 1, start_year

    return str(first_year) + '-' + str(second_year)[-2:]
        
def fix_year(source_doc, year_col):
    '''Return year_col when it converts to an int, otherwise a fallback from source_doc.

    source_doc = name of the source document for the row
    year_col = the year value to validate
    returns: year_col unchanged when int(year_col) succeeds; otherwise the
    first four of the last eight characters of source_doc.
    '''
    try:
        int(year_col)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures trigger the fallback; a bare `except` would
        # also swallow KeyboardInterrupt/SystemExit.
        # NOTE(review): for a source such as '1003.pdf' this yields '1003' -
        # confirm that the source names really carry a year in this position.
        year_col = source_doc[-8:][:4]

    return year_col

def fix_month(month_col):
    '''Turn a month field into something int() accepts.

    Season names listed in months_dict (matched case-insensitively) are
    replaced by their month number; any value that still cannot be converted
    to an int is replaced by '1'.

    month_col = the raw month value (string, number or missing)
    returns: the original value when it is numeric, the mapped month for a
    season name, or '1' as the fallback.
    '''
    # str() lets missing/numeric values (e.g. NaN) go through the lookup
    # instead of failing on `.lower()`.
    key = str(month_col).lower()
    if key in months_dict:
        month_col = months_dict[key]

    try:
        int(month_col)
    except (TypeError, ValueError, OverflowError):
        month_col = '1'

    return month_col

def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    The result has a fresh default index and the same columns as df, also when df is empty.
    '''
    row_accumulator = []

    # Iterate explicitly rather than collecting rows from a callback passed to
    # DataFrame.apply: apply is free to call its function on probe/extra rows,
    # which would add spurious entries to the accumulator.
    for _, row in df.iterrows():
        base_row = row.to_dict()
        # str() means a missing value is split as the text 'nan' (unchanged behaviour).
        for piece in str(row[target_column]).split(separator):
            new_row = dict(base_row)
            new_row[target_column] = piece
            row_accumulator.append(new_row)

    # Passing the columns keeps the schema intact when no rows were produced.
    return pd.DataFrame(row_accumulator, columns=df.columns)
        
def prep_df(df):
    '''Clean one raw listings table and keep only the rows for U.S. territories.

    Steps, in order:
    1. repair the year and month columns with fix_year / fix_month
       (these four columns are overwritten on the frame passed in);
    2. derive SEASON from the start month/year when the column is absent;
    3. split multi-title TITLE entries on '&', '; ' and ';' into one row each;
    4. strip whitespace in object columns, keep rows whose COUNTRY is one of
       the U.S. territory labels, and turn empty strings into NaN;
    5. default missing start/end days to '1' and reduce to column_names;
    6. apply the hard-coded NPERFORM corrections for specific CASEIDs.

    df = raw dataframe as read from the source file
    returns: a new dataframe restricted to column_names
    '''
    #clean up dates as some issues there
    df['STARTYR'] = df.apply(lambda row: fix_year(row['SOURCE'], row['STARTYR']), axis=1)
    df['ENDYR'] = df.apply(lambda row: fix_year(row['SOURCE'], row['ENDYR']), axis=1)
    df['STARTMO'] = df.apply(lambda row: fix_month(row['STARTMO']), axis=1)
    df['ENDMO'] = df.apply(lambda row: fix_month(row['ENDMO']), axis=1)

    if 'SEASON' not in df.columns:
        df['SEASON'] = df.apply(lambda row: make_season(row['STARTMO'], row['STARTYR']), axis=1)

    #split the double-billing into separate rows
    for separator in ('&', '; ', ';'):
        df = splitDataFrameList(df, 'TITLE', separator)

    #clean up string columns and select only USA data
    df_obj = df.select_dtypes(['object'])
    df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
    us_territories = ['U.S.A', 'U.S.A.', 'PUERTO RICO', 'U.S. VIRGIN ISLANDS']
    df = df[df['COUNTRY'].isin(us_territories)]
    df = df.replace('', np.nan, regex=True)

    # Assign the filled columns back: `fillna(..., inplace=True)` on a column
    # selection is not guaranteed to modify the parent frame.
    df['STARTDY'] = df['STARTDY'].fillna('1')
    df['ENDDY'] = df['ENDDY'].fillna('1')

    # Work on an explicit copy so the corrections below do not write into a slice.
    df = df[column_names].copy()

    # Hard-coded NPERFORM corrections. Each one is applied to every matching row
    # and skipped when nothing matches, so a file that lacks a given CASEID (or
    # holds it on several rows after the title split) no longer raises.
    corrections = [
        (df['CASEID'].isin([325]), '1'),
        (df['CASEID'].isin([805]), '22'),
        ((df['SOURCE'] == '1003.pdf') & (df['CASEID'] == 2280), '100'),
    ]
    for mask, value in corrections:
        if mask.any():
            df.loc[mask, 'NPERFORM'] = value

    return df

def make_date(day, month, year):
    '''Build a datetime.date from separate day, month and year values.

    A day of 29 or 30 in month 2 is replaced by 28 before parsing.

    day, month, year = numbers or numeric strings
    returns: the corresponding datetime.date
    raises: ValueError when the values do not form a valid day/month/year
    '''
    # Compare as text so the February rule also applies to integer inputs.
    day, month, year = str(day), str(month), str(year)

    # NOTE(review): this also moves a genuine 29 February in a leap year to the
    # 28th - kept as in the original; confirm whether that is intended.
    if month == '2' and day in ('29', '30'):
        day = '28'

    # Parse once; the previous trial parse inside a bare try/except had no effect.
    return datetime.strptime(day + '/' + month + '/' + year, '%d/%m/%Y').date()

In [58]:
def get_num_performances(row):
    '''Return the number of performances for one listing row.

    When NPERFORM is present and converts to an int, that value is returned.
    Otherwise the count is estimated from the length of the run:
      same start and end date -> 1
      up to 3 days            -> the number of days
      up to 30 days           -> half the days (rounded up), at most 10
      up to 60 days           -> a quarter of the days (rounded up), at most 15
      longer                  -> 15

    row = mapping/Series with NPERFORM, STARTDATE, ENDDATE (and CASEID)
    returns: the performance count as an int
    '''
    if not pd.isna(row['NPERFORM']):
        try:
            return int(row['NPERFORM'])
        except (TypeError, ValueError):
            # Report the offending case, then fall back to the date-based
            # estimate instead of failing on an unassigned result.
            print(row['CASEID'])

    if row['STARTDATE'] == row['ENDDATE']:
        return 1

    # Some rows have an end date before the start date; use the length of the
    # interval so the estimate can never be negative.
    days = abs((row['ENDDATE'] - row['STARTDATE']).days)

    if days <= 3:
        return days
    if days <= 30:
        return min([math.ceil(days / 2.), 10])
    if days <= 60:
        return min([math.ceil(days / 4.), 15])
    return 15

def fix_state(row):
    '''Return the STATE value to use for one listing row.

    Rows whose COUNTRY is 'PUERTO RICO' or 'U.S. VIRGIN ISLANDS' use the
    country as the state. The combined entry 'C.A., A.Z., N.V., O.R.' is
    mapped to 'CALIFORNIA', a missing STATE becomes 'NEW YORK CITY', and any
    other STATE is returned unchanged.

    row = mapping/Series with COUNTRY and STATE
    returns: the state label as a string
    '''
    # A single if/elif chain: with two separate `if` statements the
    # 'PUERTO RICO' result was always overwritten by the later branches.
    if row['COUNTRY'] == 'PUERTO RICO':
        state = 'PUERTO RICO'
    elif row['COUNTRY'] == 'U.S. VIRGIN ISLANDS':
        state = 'U.S. VIRGIN ISLANDS'
    elif row['STATE'] == 'C.A., A.Z., N.V., O.R.':
        state = 'CALIFORNIA'
    elif pd.isna(row['STATE']):
        state = 'NEW YORK CITY'
    else:
        state = row['STATE']
    return state

def fix_title(title):
    '''Remove the markers '(opening) ', '*' and '(ballet)' from a title.

    title = the title text
    returns: the title without those markers, or 'Unknown' when nothing is left.
    '''
    stripped = (
        title
        .replace('(opening) ', "")
        .replace('*', "")
        .replace('(ballet)', "")
    )
    return 'Unknown' if stripped == '' else stripped

In [59]:
# The titles produced with encoding="mac_roman" show 'Ë' and 'Ï' where 'è' and
# 'ì' belong (e.g. 'La BohËme', 'CosÏ fan tutte'), which is what Latin-1 bytes
# look like when decoded as Mac Roman.
# NOTE(review): confirm against the raw files; 'cp1252' may fit better if they
# contain curly quotes or dashes.
raw_encoding = 'latin-1'

# Extract each archive, read the TSV it contains, clean it, and delete the
# extracted copy again. Works for any number of entries in `files`.
prepared_frames = []
for file in files:
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall('../../data/raw/nadac/')

    filename = file.split('/')[-1][:-4]   # archive name without the '.zip' suffix
    extracted_path = '../../data/raw/nadac/' + filename
    try:
        raw_listings = pd.read_csv(extracted_path, sep='\t', encoding=raw_encoding)
    finally:
        os.remove(extracted_path)

    prepared_frames.append(prep_df(raw_listings))

# Remove the '__MACOSX' directory left behind by the extraction.
os.rmdir('../../data/raw/nadac/' + '__MACOSX')

df = pd.concat(prepared_frames)

# Derive the output columns.
df['STARTDATE'] = df.apply(lambda row: make_date(row['STARTDY'], row['STARTMO'], row['STARTYR']), axis=1)
df['ENDDATE'] = df.apply(lambda row: make_date(row['ENDDY'], row['ENDMO'], row['ENDYR']), axis=1)
df['NPERFORM'] = df.apply(get_num_performances, axis=1)
df['STATE'] = df.apply(fix_state, axis=1)
df['TITLE'] = df.apply(lambda row: fix_title(str(row['TITLE'])), axis=1)
df = df[output_column_names]



def correct_performances(row):
    '''Return the performance count for a row, preferring one stated in the title.

    When TITLE contains '<digits> pfs' (e.g. 'La Boheme 18 pfs.'), those
    digits are returned; otherwise the row's NPERFORM is returned.

    row = mapping/Series with TITLE (a string) and NPERFORM
    returns: the count as a string
    '''
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    # One search with a capture group replaces the search/search/replace sequence.
    match = re.search(r'(\d+) pfs', row['TITLE'])
    if match:
        return match.group(1)
    return str(row['NPERFORM'])

def clean_title(title):
    '''Drop the fragments 'incl.', 'Scenes from', 'Eng.' and 'tour' from a title.

    Surrounding whitespace is stripped after each fragment is removed.

    title = the title text
    returns: the cleaned title
    '''
    cleaned = title
    for fragment in ('incl.', 'Scenes from', 'Eng.', 'tour'):
        cleaned = cleaned.replace(fragment, '').strip()
    return cleaned
        
    
# Prefer a performance count stated in the title, then tidy the title text.
df['NPERFORM'] = df.apply(correct_performances, axis=1)
df['TITLE'] = df.apply(lambda listing: clean_title(listing['TITLE']), axis=1)

In [53]:
# len(list(df['TITLE'].unique()))
# Write the processed listings to CSV.
# NOTE(review): the next cell reads 'nadac.csv', not the 'nadac_2.csv' written
# here - confirm which file the later cells are meant to inspect.
df.to_csv('../../data/processed/listings/nadac_2.csv')

In [14]:
import pandas as pd
# Reload processed listings from disk, using the first CSV column as the index.
# NOTE(review): this rebinds `df` and reads 'nadac.csv', whereas the cell above
# writes 'nadac_2.csv' - confirm this is the intended file.
df = pd.read_csv('../../data/processed/listings/nadac.csv', index_col=0)

In [15]:
# Sanity check: list the rows whose performance count is negative.
df.loc[df['NPERFORM'].lt(0)]

Unnamed: 0,SEASON,STATE,CITY,ORGNAME,TITLE,NPERFORM,STARTDATE,ENDDATE
5014,SUMMER 1968,CALIFORNIA,Los Angeles,The Hollywood Bowl Festival,"concerts Tuesdays, Thursdays and Saturdays",-671,1968-07-09,1966-09-07
5015,SUMMER 1968,CALIFORNIA,Los Angeles,The Hollywood Bowl Festival,,-671,1968-07-09,1966-09-07
7777,1967-68,CALIFORNIA,,San Francisco Opera,The Visitation,-20,1967-10-28,1967-10-08
8428,1967-68,CALIFORNIA,,Spring Opera of San Francisco,Rigoletto,-27,1968-04-30,1968-04-03
9235,1967-68,COLORADO,,Colorado Springs Opera Assn.,Hansel and Gretel,-28,1967-10-31,1967-10-03
10791,1965-66,FLORIDA,Deland,Stetson University Opera/Music Theatre Wksp.,Gallantry,-3,1966-03-18,1966-03-15
10792,1965-66,FLORIDA,Deland,Stetson University Opera/Music Theatre Wksp.,Trouble in Tahiti,-3,1966-03-18,1966-03-15
14708,1965-66,INDIANA,Terre Haute,Indiana State University Opera Workshop,Donizetti's Le Convenienze ed inconvenienze,-29,1966-03-31,1966-03-02
14799,1967-68,IOWA,Cedar Rapids,Coe College Opera Workshop,The Telephone,-22,1968-04-26,1968-04-04
14800,1967-68,IOWA,Cedar Rapids,Coe College Opera Workshop,The Medium,-22,1968-04-26,1968-04-04


In [47]:
# df_final[df_final['TITLE'].str.contains('pfs')]
import re
# string = 'La BohËme 10 pfs.'
# matches = re.search('\d+ pfs', string)
# bool(re.search('\d+ pfs', string))
# a = 1
# if re.compile('\d+ pfs', string):
#     a = re.search('\d+ pfs', string).group(0)
# a
    

def correct_performances(row):
    '''Return the performance count for a row, preferring one stated in the title.

    When TITLE contains '<digits> pfs' (e.g. 'La Boheme 18 pfs.'), those
    digits are returned; otherwise the row's NPERFORM is returned.

    row = mapping/Series with TITLE (a string) and NPERFORM
    returns: the count as a string
    '''
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    # One search with a capture group replaces the search/search/replace sequence.
    match = re.search(r'(\d+) pfs', row['TITLE'])
    if match:
        return match.group(1)
    return str(row['NPERFORM'])
    
# Store the corrected counts in a scratch column 'ddd' for comparison with NPERFORM.
# NOTE(review): `df_final` is not created in any cell shown here - confirm where it comes from.
df_final['ddd'] = df_final.apply(correct_performances, axis=1)

In [48]:
# Show the rows whose title still contains 'pfs'.
df_final.loc[df_final['TITLE'].str.contains('pfs')]

Unnamed: 0,SEASON,STATE,CITY,ORGNAME,TITLE,NPERFORM,STARTDATE,ENDDATE,ddd
3428,1968-69,CALIFORNIA,,Western Opera Theatre,La BohËme 18 pfs.,17,1968-01-01,1969-01-01,18
3429,1968-69,CALIFORNIA,,Western Opera Theatre,Gianni Schicchi 7 pfs.,17,1968-01-01,1969-01-01,7
3430,1968-69,CALIFORNIA,,Western Opera Theatre,CosÏ fan tutte 16 pfs.,17,1968-01-01,1969-01-01,16
3431,1968-69,CALIFORNIA,,Western Opera Theatre,The Old Maid and the Thief 13 pfs.,17,1968-01-01,1969-01-01,13
3432,1968-69,CALIFORNIA,,Western Opera Theatre,The Medium 3 pfs.,17,1968-01-01,1969-01-01,3
3805,1984-85,CALIFORNIA,Fresno,Sierra Chamber Opera,75 pfs.,20,1984-01-01,1985-01-01,75
3862,1988-89,CALIFORNIA,Fresno,Sierra Chamber Opera,Rea's The Wizard's Ring 160 pfs.,20,1988-01-01,1989-01-01,160
4963,SUMMER 1984,CALIFORNIA,Los Angeles,Olympic Arts Festival,3 pfs.,4,1984-07-09,1984-07-21,3
4967,SUMMER 1984,CALIFORNIA,Los Angeles,Olympic Arts Festival,4 pfs.,4,1984-07-09,1984-07-21,4
5955,1979-80,CALIFORNIA,San Diego,San Diego Opera,40 pfs. (The Telephone,15,1979-01-01,1980-01-01,40
