In [66]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [67]:
import pandas as pd
import zipfile
import os
import numpy as np
from datetime import datetime
import math

In [68]:
# Zipped tab-separated extracts that are loaded and concatenated, in this order.
files = [
    '../../data/raw/nadac/35577-0001-Data.tsv.zip',
    '../../data/raw/nadac/35577-0002-Data.tsv.zip',
]

# Season words that can appear in a month field, mapped to a month-number string.
months_dict = {
    'fall': '9',
    'summer': '7',
    'spring': '3',
}

# Columns kept (in this order) by the per-file preparation step.
column_names = [
    'CASEID', 'SOURCE', 'SEASON',
    'STARTDY', 'STARTMO', 'STARTYR',
    'ENDDY', 'ENDMO', 'ENDYR',
    'COUNTRY', 'STATE', 'CITY',
    'ORGNAME', 'ORGNOTES', 'ORGDIR',
    'TITLE', 'NOTES', 'NPERFORM',
]

# Columns written to the processed listings file.
output_column_names = [
    'SEASON', 'STATE', 'CITY', 'ORGNAME',
    'TITLE', 'NPERFORM', 'STARTDATE', 'ENDDATE',
]

In [69]:
def make_season(month, year):
    """Return the season label (e.g. ``'1999-00'``) for a start month and year.

    A season is labelled ``<first year>-<last two digits of the following year>``.
    Months 9-12 belong to the season that starts in ``year``; months 1-8 belong
    to the season that started the year before.

    Parameters
    ----------
    month : str or int
        Month number; anything accepted by ``int()``.
    year : str or int
        Calendar year; anything accepted by ``int()``. Integer years are
        accepted as well as strings (string concatenation on an int year
        previously raised ``TypeError``).

    Returns
    -------
    str
        The season label.

    Raises
    ------
    ValueError, TypeError
        If ``month`` or ``year`` cannot be converted with ``int()``.
    """
    calendar_year = int(year)
    # Seasons start in September: earlier months belong to the previous season.
    first_year = calendar_year if int(month) >= 9 else calendar_year - 1
    return str(first_year) + '-' + str(first_year + 1)[-2:]
        
def fix_year(source_doc, year_col):
    """Return ``year_col`` if it is integer-like, otherwise a year taken from ``source_doc``.

    Parameters
    ----------
    source_doc : str
        Name of the source document. Only used when ``year_col`` is unusable.
    year_col : object
        The raw year value.

    Returns
    -------
    object
        ``year_col`` unchanged when ``int(year_col)`` succeeds; otherwise the
        first four characters of the last eight characters of ``source_doc``.
        NOTE(review): this presumably relies on source names ending in a
        four-character year followed by a four-character extension — confirm.
    """
    try:
        int(year_col)
    except (TypeError, ValueError, OverflowError):
        # Only the failures int() can produce are handled; a bare `except:`
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return source_doc[-8:][0:4]
    return year_col

def fix_month(month_col):
    """Normalise a raw month value to something ``int()`` can parse.

    String values are looked up case-insensitively in ``months_dict`` (season
    words mapped to month-number strings). Any value that still cannot be
    converted with ``int()`` is replaced by ``'1'``.

    Parameters
    ----------
    month_col : object
        The raw month value. Non-string values (e.g. NaN, None, ints) are
        accepted; previously they raised ``AttributeError`` on ``.lower()``.

    Returns
    -------
    object
        The mapped month string, the original value if it is integer-like,
        or ``'1'`` as the fallback.
    """
    # Only strings can be season words; skip the lookup for everything else.
    if isinstance(month_col, str):
        month_col = months_dict.get(month_col.lower(), month_col)

    try:
        int(month_col)
    except (TypeError, ValueError, OverflowError):
        # Unusable month: default to January.
        month_col = '1'

    return month_col

def splitDataFrameList(df,target_column,separator):
    '''Split one column on a separator, emitting one output row per piece.

    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe (fresh 0..n-1 index) with each entry for the target
    column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.

    Notes: the target value is passed through str() before splitting, so a
    missing value becomes the text 'nan'; pieces are not stripped here.
    An empty input returns an empty frame that still carries df's columns.
    '''
    row_accumulator = []

    # Iterate explicitly instead of calling df.apply purely for its side
    # effects: older pandas releases documented that apply may invoke the
    # function twice on the first row, which would duplicate that row here.
    for _, row in df.iterrows():
        for piece in str(row[target_column]).split(separator):
            new_row = row.to_dict()
            new_row[target_column] = piece
            row_accumulator.append(new_row)

    # Passing the columns explicitly keeps the schema when there are no rows.
    return pd.DataFrame(row_accumulator, columns=df.columns)
        
def prep_df(df):
    """Clean one raw listings frame and return only the U.S. rows.

    Steps, in order:
      1. repair year and month columns with ``fix_year`` / ``fix_month``;
      2. derive ``SEASON`` with ``make_season`` when the column is absent;
      3. split ``TITLE`` on ``'&'`` so each part gets its own row;
      4. strip whitespace from all object (string) columns;
      5. keep rows whose ``COUNTRY`` is one of the listed U.S. spellings/territories;
      6. turn empty strings into NaN and default missing start/end days to ``'1'``;
      7. restrict to ``column_names``;
      8. apply hard-coded ``NPERFORM`` overrides for specific ``CASEID`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain the columns named in ``column_names``
        (``SEASON`` is optional). The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the columns in ``column_names``.
    """
    # Work on a copy so the column assignments below do not leak into the
    # frame the caller passed in.
    df = df.copy()

    # clean up dates as some issues there
    df['STARTYR'] = df.apply(lambda row: fix_year(row['SOURCE'], row['STARTYR']), axis=1)
    df['ENDYR'] = df.apply(lambda row: fix_year(row['SOURCE'], row['ENDYR']), axis=1)
    df['STARTMO'] = df.apply(lambda row: fix_month(row['STARTMO']), axis=1)
    df['ENDMO'] = df.apply(lambda row: fix_month(row['ENDMO']), axis=1)

    if 'SEASON' not in df.columns:
        df['SEASON'] = df.apply(lambda row: make_season(row['STARTMO'], row['STARTYR']), axis=1)

    # clean up string columns and select only USA data
    us_territories = ['U.S.A', 'U.S.A.', 'PUERTO RICO', 'U.S. VIRGIN ISLANDS']

    # split the double-billing into separate rows
    df = splitDataFrameList(df, 'TITLE', '&')

    df_obj = df.select_dtypes(['object'])
    df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
    # .copy() makes the filtered result an independent frame, so the
    # assignments below act on it directly rather than on a slice.
    df = df[df['COUNTRY'].isin(us_territories)].copy()
    # Exact-match replacement of empty strings with NaN.
    df = df.replace('', np.nan)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df['...']: the chained in-place form updates a temporary object and
    # is not guaranteed to modify df.
    df['STARTDY'] = df['STARTDY'].fillna('1')
    df['ENDDY'] = df['ENDDY'].fillna('1')
    df = df[column_names].copy()

    # Manual NPERFORM overrides for specific records (the reason is not
    # recorded in this file). Boolean-mask assignment is a no-op when the
    # record is absent and also copes with the record having been split
    # into several rows, where int(index.values) used to raise.
    df.loc[df['CASEID'] == 325, 'NPERFORM'] = '1'
    df.loc[df['CASEID'] == 805, 'NPERFORM'] = '22'
    df.loc[(df['SOURCE'] == '1003.pdf') & (df['CASEID'] == 2280), 'NPERFORM'] = '100'

    return df

def make_date(day, month, year):
    """Build a ``datetime.date`` from day, month and year values.

    A day that lies past the end of the given month is clamped to that
    month's last day (e.g. 30 February -> 28 or 29 February, 31 April ->
    30 April). 29 February is kept in leap years.

    Parameters
    ----------
    day, month : str or int
        Anything accepted by ``int()``.
    year : str or int
        Four-digit year; it is validated by ``strptime`` with ``%Y``.

    Returns
    -------
    datetime.date

    Raises
    ------
    ValueError
        If a component is not numeric or the resulting date is invalid.
    """
    import calendar  # local import: only needed for the month-length lookup

    day_num = int(day)
    month_num = int(month)

    # Clamp only for a real month number; an invalid month is left for
    # strptime below to reject.
    if 1 <= month_num <= 12:
        last_day = calendar.monthrange(int(year), month_num)[1]
        day_num = min(day_num, last_day)

    date = str(day_num) + '/' + str(month_num) + '/' + str(year)
    # Single parse; failures propagate as ValueError.
    return datetime.strptime(date, '%d/%m/%Y').date()

In [70]:
def get_num_performances(row):
    """Return the number of performances for a listing row.

    If ``row['NPERFORM']`` is present and integer-like it is returned as an
    int. Otherwise the count is estimated from the run length in days
    (``ENDDATE - STARTDATE``):

    * 0 days or fewer  -> 1
    * 1 to 3 days      -> the number of days
    * 4 to 30 days     -> ceil(days / 2), capped at 10
    * 31 to 60 days    -> ceil(days / 4), capped at 15
    * more than 60     -> 15

    Parameters
    ----------
    row : mapping
        Must provide ``NPERFORM``, ``STARTDATE`` and ``ENDDATE``; ``CASEID``
        is read only to report an unparsable ``NPERFORM``.

    Returns
    -------
    int
    """
    if not pd.isna(row['NPERFORM']):
        try:
            return int(row['NPERFORM'])
        except (TypeError, ValueError):
            # Report the offending record, then fall back to the date-based
            # estimate. Previously this path reached `return NPERFORM` with
            # the variable unassigned and raised UnboundLocalError.
            print(row['CASEID'])

    days = (row['ENDDATE'] - row['STARTDATE']).days
    if days <= 0:
        # Same-day run; an end date before the start date is also treated
        # as a single performance rather than yielding a negative count.
        return 1
    if days <= 3:
        return days
    if days <= 30:
        return min([math.ceil(days / 2.), 10])
    if days <= 60:
        return min([math.ceil(days / 4.), 15])
    return 15

def fix_state(row):
    """Return the state label for a listing row.

    Rules, first match wins:

    * ``COUNTRY`` is ``'PUERTO RICO'`` or ``'U.S. VIRGIN ISLANDS'`` -> that name;
    * ``STATE`` is ``'C.A., A.Z., N.V., O.R.'`` -> ``'CALIFORNIA'``;
    * ``STATE`` is missing -> ``'NEW YORK CITY'``;
    * otherwise -> ``STATE`` unchanged.

    Parameters
    ----------
    row : mapping
        Must provide ``COUNTRY`` and ``STATE``.

    Returns
    -------
    str
    """
    country = row['COUNTRY']
    # Territories are labelled by their country value. The original used a
    # second plain `if` here, so the PUERTO RICO result was always
    # overwritten by the branches that followed.
    if country == 'PUERTO RICO':
        return 'PUERTO RICO'
    if country == 'U.S. VIRGIN ISLANDS':
        return 'U.S. VIRGIN ISLANDS'

    state = row['STATE']
    if state == 'C.A., A.Z., N.V., O.R.':
        return 'CALIFORNIA'
    if pd.isna(state):
        return 'NEW YORK CITY'
    return state

def fix_title(title):
    """Remove annotation fragments from a title; return 'Unknown' if nothing is left.

    The fragments '(opening) ', '*' and '(ballet)' are deleted wherever they
    occur. Surrounding whitespace is not trimmed.
    """
    cleaned = title
    for fragment in ('(opening) ', '*', '(ballet)'):
        cleaned = cleaned.replace(fragment, "")

    # An empty result means the title consisted solely of annotations.
    return 'Unknown' if cleaned == '' else cleaned

In [71]:
# Load every archive listed in `files`, clean each one with prep_df and
# combine the results. Each archive is extracted next to itself, read, and
# the extracted file is removed again.
# NOTE(review): the accented titles produced by this load look mis-decoded
# (e.g. 'La BohËme'), which suggests the files may not really be mac_roman.
# The keys of `corrected_listings` are matched against these titles as they
# are decoded now, so the encoding and those keys must be changed together.
raw_dir = '../../data/raw/nadac/'
prepared_frames = []
for file in files:
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(raw_dir)
    filename = file.split('/')[-1][:-4]  # archive name without the '.zip' suffix
    extracted_path = raw_dir + filename
    try:
        raw_frame = pd.read_csv(extracted_path, sep='\t', encoding="mac_roman")
        prepared_frames.append(prep_df(raw_frame))
    finally:
        # Do not leave the extracted copy behind if reading or cleaning fails.
        os.remove(extracted_path)

# Extraction can leave a '__MACOSX' metadata folder; remove it when present.
macosx_dir = raw_dir + '__MACOSX'
if os.path.isdir(macosx_dir):
    os.rmdir(macosx_dir)

# Collect first and concatenate once, so any number of archives works
# (`df` used to be assigned only while processing the second archive).
df = pd.concat(prepared_frames)

df['STARTDATE'] = df.apply(lambda row: make_date(row['STARTDY'], row['STARTMO'], row['STARTYR']), axis=1)
df['ENDDATE'] = df.apply(lambda row: make_date(row['ENDDY'], row['ENDMO'], row['ENDYR']), axis=1)
df['NPERFORM'] = df.apply(get_num_performances, axis=1)
df['STATE'] = df.apply(fix_state, axis=1)
df['TITLE'] = df['TITLE'].map(lambda title: fix_title(str(title)))
df = df[output_column_names]



In [72]:
# Write the cleaned listings (including the frame's index as the first column)
# to the intermediate CSV that the next cell reads back.
# Disabled ad-hoc check of the number of distinct titles:
# len(list(df['TITLE'].unique()))
df.to_csv('../../data/processed/listings/nadac_2.csv')

In [1]:
# Reload the intermediate listings CSV, using its first column as the index.
# No parse_dates is passed, so STARTDATE/ENDDATE are read back as plain text.
# NOTE(review): pandas is imported again here, presumably so this cell can be
# run on a fresh kernel without re-running the cells above — confirm.
import pandas as pd
df = pd.read_csv('../../data/processed/listings/nadac_2.csv', index_col=0)

In [23]:
# Manual corrections for TITLE values that bundle several works into one
# listing. Each key is the TITLE text exactly as it appears in the listings
# (including its mis-decoded accented characters, which must not be
# "repaired" here or the keys stop matching). Each value is a list of
# [work title, number of performances] pairs that describe the listing.
corrected_listings = {
"La BohËme 18 pfs.; Gianni Schicchi 7 pfs.; CosÏ fan tutte 16 pfs.; The Old Maid and the Thief 13 pfs.; The Medium 3 pfs.":
    [["La BohËme", 18],["Gianni Schicchi", 7],["CosÏ fan tutte", 16],["The Old Maid and the Thief",13],["The Medium",3]],
# Trailing space removed from the work title.
"Rea's The Prince of Patches prem; 75 pfs.; Pergolesi's Il Maestro di Musica": [["Rea's The Prince of Patches", 75]],
"Rea's The Wizard's Ring 160 pfs.; Rea's Falstaff In and Out of Love": [["Rea's The Wizard's Ring", 160]],
"Peter Grimes Harper; Vickers, G. Evans; 3 pfs.; Turandot Gwyneth Jones, Mitchell; Domingo; c: Davis; 4 pfs.; Die Zauberflˆte": [["Peter Grimes", 3],["Turandot", 4]],
"40 pfs. (The Telephone; Secret of Susanna)": [["The Telephone", 40],["Secret of Susanna", 40]],
"La Traviata 27 pfs.; Die Fledermaus 23 pfs.; Rita": [["La Traviata", 27],["Die Fledermaus", 23],["Rita", 23]],
"School tour incl. Mollicone's Starbird prem. rev.vers.; 60 pfs.; The Face on the Barroom Floor abt. 30 pfs; The Telephone":
    [["Mollicone's Starbird", 60],["The Face on the Barroom Floor", 30],["The Telephone", 30]],
"The Boor 8 pfs.; The Perfect Wife 6 pfs.; The Telephone": [["The Boor", 8],["The Perfect Wife", 6],["The Telephone", 6]],
"Starbird prod: Texas Opera Theatre, 4 pfs.; also ballet": [["Starbird", 4]],
"Hansel and Gretel 12 pfs.; Little Red Riding Hood": [["Hansel and Gretel", 12],["Little Red Riding Hood", 12]],
"The Telephone 5 pfs. w.p.; Rita Eng. Mead, 3 pfs. w.p.; The Impresario": [["The Telephone", 5],["Rita", 3],["The Impresario", 3]],
"Rita Eng. Mead 14 pfs.; Gallantry 6 pfs.; The Telephone": [["Rita", 14],["Gallantry", 6],["The Telephone", 6]],
"Kay Bethea's The Little Princess 12 pfs. in schools": [["The Little Princess", 12]],
"Musical Revue abt. 120 pfs.; Debbie King, Sharoin Schuster-Craig": [["Musical Revue", 120]],
"musical comedy 4 pfs.": [["musical comedy", 4]],
"Little Red Riding Hood 4 pfs. w.p.; Sunday Excursion": [["Little Red Riding Hood", 4]],
# Iolanthe corrected from 5 to 6 to match "Iolanthe 6 pfs." in the listing.
"Wienerblut Eng. Tull, 5 pfs.; Il Campanello Eng. Bamberger, 5 pfs.; H.M.S. Pinafore 6 pfs.; Cox and Box 5 pfs.; The Pirates of Penzance 6 pfs.; Princess Ida 5 pfs.; Iolanthe 6 pfs.; Four Musicals 21 pfs.":
    [["Wienerblut", 5],["Il Campanello", 5],["H.M.S. Pinafore", 6],["Cox and Box", 5],["The Pirates of Penzance", 6],["Princess Ida", 5],["Iolanthe", 6],["Four Musicals", 21]],
"Orpheus in the Underworld 6 pfs.; Don Giovanni 6 pfs.; The Impresario": [["Orpheus in the Underworld", 6],["Don Giovanni", 6],["The Impresario", 6]],
"Tosca 15 pfs.; Daughter of the Regiment 8 pfs.; Daughter of the Double Duke of Dingle": [["Tosca", 15],["Daughter of the Regiment", 8]],
"w.p., 50 pfs.; La BohËme": [["La BohËme", 50]],
"Barnes' The Frog Who Became a Prince w.p., 30 pfs.; K. DiChiera's Nanabush": [["The Frog Who Became a Prince", 30],["Nanabush",30]],
"El Capitan 5 pfs.; Hansel and Gretel 2 pfs.; also K. DiChiera/Kirk's Nanabush": [["El Capitan", 5],["Hansel and Gretel", 2],["Nanabush", 2]],
"Of Mice and Men 5 pfs.; Amahl and the Night Visitors 5 pfs.": [["Of Mice and Men", 5],["Amahl and the Night Visitors", 5]],
"The Princess Who Talked Backwards 3 pfs. w.p.; Spooks and Other Such Things": [["The Princess Who Talked Backwards", 3]],
"Pagliacci incl. 4 pfs. in Bermuda; Barber of Seville": [["Pagliacci", 4],["Barber of Seville", 4]],
"La BohËme 3 pfs.; The Student Prince 3 pfs.; Die Zauberflˆte": [["La BohËme", 3],["The Student Prince", 3],["Die Zauberflˆte", 3]],
"Madama Butterfly 2 pfs.; The Merry Widow 1 pf.; Die Zauberflˆte": [["Madama Butterfly", 2],["The Merry Widow", 1],["Die Zauberflˆte", 1]],
"Pagliacci 2 pfs.; The New Moon 2 pfs.; Die Zauberflˆte": [["Pagliacci", 2],["The New Moon", 2],["Die Zauberflˆte", 2]],
"Jack and the Beanstalk 11 pfs.; Ring of the Fettuceines": [["Jack and the Beanstalk", 11],["Ring of the Fettuceines", 11]],
"Pagliacci Eng. Martin, 14 pfs; Barber of Seville Eng. Martin, 21 pfs; Noda's The Canary": [["Pagliacci", 14],["Barber of Seville", 21],["Noda's The Canary", 21]],
"17 pfs.; The Telephone": [["The Telephone", 17]],
"Duets 15 pfs.; Birthday of the Infanta 2 pfs.; 11 concerts": [["Birthday of the Infanta", 2]],
"Hin und zur¸ck Eng. Farquhar; 3 pfs. tba Shields' Shaman prem.": [["Hin und zur¸ck", 3],["Shields' Shaman", 3]],
"Jack and the Beanstalk 10 pfs.; Ring of the Fettuccines": [["Jack and the Beanstalk", 10],["Ring of the Fettuccines", 10]],
"Don Pasquale 5 pfs.; La Traviata 6 pfs.; The Marriage of Figaro 9 pfs.; El Capitan 2 pfs.; La BohËme": [["Don Pasquale", 5],["La Traviata", 6],["The Marriage of Figaro", 9],["El Capitan", 2],["La BohËme", 2]],
"La Traviata 9 pfs.; Madama Butterfly 6 pfs.; Don Pasquale": [["La Traviata", 9],["Madama Butterfly", 6],["Don Pasquale", 6]],
"Four Note Opera 5 pfs.; 4 pfs.; in Darien, CT; Wise Woman": [["Four Note Opera", 9]],
# Antologia Musical corrected from 3 to 2 to match "Antologia Musical 2 pfs" in the listing.
"LiÈo's La Corte de FaraÛn 2 pfs.; Habaf?a, Antologia Musical 2 pfs; La Zarzuela revue; 3 pfs. and tour; Puerto Rico Sings": [["LiÈo's La Corte de FaraÛn", 2],["Antologia Musical", 2],["La Zarzuela revue", 3]],
"La Traviata (6 pfs.), CosÏ fan tutte (4 pfs.), The Barber of Seville (4 pfs.), Die Fledermaus (4 pfs.)": [["La Traviata", 6],["CosÏ fan tutte", 4],["The Barber of Seville", 4],["Die Fledermaus", 4]],
"The Marriage of Figaro 29 pfs. Die Fledermaus 32 pfs., La BohËme 7 pfs.": [["The Marriage of Figaro", 29],["Die Fledermaus", 32],["La BohËme", 7]],
# "Eng. Wilder" follows the same "Eng. <name>" annotation that is stripped
# from the other listings in this table, so "Wilder" is not kept as a work.
# The listing's 85 pfs. are shared evenly across the five works (17 each).
# NOTE(review): confirm the even split is the intended attribution.
"La BohËme Verdi's King For a Day {Un Giorno di regno) Eng. Wilder; Martha; El Capitan; 85 pfs.; also Carmen":
    [["La BohËme", 17],["King For a Day", 17],["Martha", 17],["El Capitan", 17],["Carmen", 17]],
"The Marriage of Figaro 2 pfs.": [["The Marriage of Figaro", 2]],
"Sweet Betsy from Pike; The Telephone; Slow Dusk; Little Red Riding Hood; The Barber of Seville abridg., Eng.; Hansel and Gretel abrid., Eng., 19 pfs.; Schwanda, the Bagpiper abridg., Eng.; Madama Butterfly":
    [["Hansel and Gretel", 19]],
"The Tales of Hoffmann 12 studt. mat., 3 out-of-town pfs.": [["The Tales of Hoffmann", 15]],
"Die Fledermaus 6 pfs.; La BohËme": [["Die Fledermaus", 6],["La BohËme", 6]],
"The Goose Girl 30 pfs.; Chanticleer": [["The Goose Girl", 30]],
"The Impresario 16 pfs. (see also 9/66 Blltn.)": [["The Impresario", 16]],
"Gypsy 12 pfs.; Finian's Rainbow 12 pfs.; The Music Man": [["Gypsy", 12],["Finian's Rainbow", 12],["The Music Man", 12]],
"Down in the Valley 6 pfs. under President's Council of Youth Opportunities": [["Down in the Valley", 6]],
"Barber of Seville 10 pfs; Vienna to Broadway 16 pfs; CosÏ fan tutte 6 pfs; Opera Gala 5 pfs; Introduction to Opera 17 pfs; Madama Butterfly 5 pfs; La BohËme 12 pfs; Carmine's The Duel 20 pfs; Shakespeare in Opera 4 pfs":
    [["Barber of Seville", 10],["Vienna to Broadway", 16],["CosÏ fan tutte", 6],["Opera Gala", 5],["Introduction to Opera", 17],["Madama Butterfly", 5],["La BohËme", 12],["Carmine's The Duel", 20],["Shakespeare in Opera", 4]],
"The Toy Shop; La Cenerentola Eng. Swedberg; The Devil's Tale (Faust) adapt. Swedberg; The Daughter of the Regiment 16 pfs.; CosÏ fan tutte": [["The Daughter of the Regiment", 16]],
"50 pfs.; Hamilton's Raleigh's Dream": [["Raleigh's Dream", 50]],
"Little Red Riding Hood 30 pfs.; The Chocolate Soldier": [["Little Red Riding Hood", 30]]
}


# Expand every listing whose TITLE has a manual correction into one row per
# individual work, copying the listing's season, place, organisation and dates.
headers    = ['SEASON', 'STATE', 'CITY', 'ORGNAME', 'TITLE', 'NPERFORM', 'STARTDATE', 'ENDDATE']

works = []
for key, mappings in corrected_listings.items():
    ck_df = df[df['TITLE']==key]
    for idx,row in ck_df.iterrows():
        for work_title, n_performances in mappings:
            works.append([
                row['SEASON'],
                row['STATE'],
                row['CITY'],
                row['ORGNAME'],
                work_title,
                n_performances,
                row['STARTDATE'],
                row['ENDDATE'],
            ])

df_correct = pd.DataFrame(works, columns=headers)

# Drop the original combined rows before appending their per-work
# replacements; keeping both would list (and count) those productions twice.
superseded = df['TITLE'].isin(list(corrected_listings))
df_final   = pd.concat([df[~superseded], df_correct])
df_final.to_csv('../../data/processed/listings/nadac.csv')