## Setup

In [68]:
import pandas as pd
import plotly.express as px
import sqlalchemy
import numpy as np

In [69]:
df = pd.read_csv(r'Taylor_Train.csv', encoding='unicode_escape')
df.head()

Unnamed: 0,City,Country,Venue,Opening act(s),Attendance (tickets sold / available),Revenue,Tour
0,Evansville,United States,Roberts Municipal Stadium,Gloriana\r\nKellie Pickler,"7,463 / 7,463","$360,617",Fearless_Tour
1,Jonesboro,United States,Convocation Center,Gloriana\r\nKellie Pickler,"7,822 / 7,822","$340,328",Fearless_Tour
2,St. Louis,United States,Scottrade Center,Gloriana\r\nKellie Pickler,"13,764 / 13,764","$650,420",Fearless_Tour
3,Alexandria,United States,Bishop Ireton High School,Gloriana\r\nKellie Pickler,,,Fearless_Tour
4,North Charleston,United States,North Charleston Coliseum,Gloriana\r\nKellie Pickler,"8,751 / 8,751","$398,154",Fearless_Tour


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   city                                445 non-null    object
 1   country                             445 non-null    object
 2   venue                               445 non-null    object
 3   opening_acts                        444 non-null    object
 4   attendance_tickets_sold__available  442 non-null    object
 5   revenue                             442 non-null    object
 6   tour                                445 non-null    object
dtypes: object(7)
memory usage: 24.5+ KB


## Cleanup

Normalizing column names and replacing special characters

In [70]:
replace_chars = ['(', ')', '/']
df.columns = df.columns.str.lower().str.strip().str.replace(' ','_')

for i in replace_chars:
    df.columns = df.columns.str.replace(i,'')

df.columns

Index(['city', 'country', 'venue', 'opening_acts',
       'attendance_tickets_sold__available', 'revenue', 'tour'],
      dtype='object')

In [71]:
df['opening_acts'] = df['opening_acts'].str.replace('\r\n', ', ')

df.head()

Unnamed: 0,city,country,venue,opening_acts,attendance_tickets_sold__available,revenue,tour
0,Evansville,United States,Roberts Municipal Stadium,"Gloriana, Kellie Pickler","7,463 / 7,463","$360,617",Fearless_Tour
1,Jonesboro,United States,Convocation Center,"Gloriana, Kellie Pickler","7,822 / 7,822","$340,328",Fearless_Tour
2,St. Louis,United States,Scottrade Center,"Gloriana, Kellie Pickler","13,764 / 13,764","$650,420",Fearless_Tour
3,Alexandria,United States,Bishop Ireton High School,"Gloriana, Kellie Pickler",,,Fearless_Tour
4,North Charleston,United States,North Charleston Coliseum,"Gloriana, Kellie Pickler","8,751 / 8,751","$398,154",Fearless_Tour


In [72]:
df['attendance_tickets_sold__available'] =  df['attendance_tickets_sold__available'].str.replace('','-1/-1')
df.head()

Unnamed: 0,city,country,venue,opening_acts,attendance_tickets_sold__available,revenue,tour
0,Evansville,United States,Roberts Municipal Stadium,"Gloriana, Kellie Pickler","7,463 / 7,463","$360,617",Fearless_Tour
1,Jonesboro,United States,Convocation Center,"Gloriana, Kellie Pickler","7,822 / 7,822","$340,328",Fearless_Tour
2,St. Louis,United States,Scottrade Center,"Gloriana, Kellie Pickler","13,764 / 13,764","$650,420",Fearless_Tour
3,Alexandria,United States,Bishop Ireton High School,"Gloriana, Kellie Pickler",-1/-1,,Fearless_Tour
4,North Charleston,United States,North Charleston Coliseum,"Gloriana, Kellie Pickler","8,751 / 8,751","$398,154",Fearless_Tour


In [79]:
df['attendance_tickets_sold__available'].fillna('-1/-1')

0          7,463 / 7,463
1          7,822 / 7,822
2        13,764 / 13,764
3                  -1/-1
4          8,751 / 8,751
             ...        
440      72,805 / 72,805
441      43,907 / 43,907
442      35,749 / 35,749
443    100,109 / 100,109
444    100,109 / 100,109
Name: attendance_tickets_sold__available, Length: 445, dtype: object

In [80]:
df['attendance_tickets_sold__available'].value_counts()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   city                                445 non-null    object
 1   country                             445 non-null    object
 2   venue                               445 non-null    object
 3   opening_acts                        444 non-null    object
 4   attendance_tickets_sold__available  442 non-null    object
 5   revenue                             442 non-null    object
 6   tour                                445 non-null    object
dtypes: object(7)
memory usage: 24.5+ KB


In [83]:
df[['attendance', 'venue_seats']] = df['attendance_tickets_sold__available'].str.split('/', expand=True)


In [84]:
df.drop(columns='attendance_tickets_sold__available', inplace=True)

In [None]:
#df['attendance'] = df['attendance'].str.replace(',','').str.strip()
#df['venue_seats'] = df['venue_seats'].str.replace(',','').str.strip()
df['attendance'] = df['attendance'].fillna('-1').astype('int')
df['venue_seats'] = df['venue_seats'].fillna('-1').astype('int')
df['revenue'] = df['revenue'].fillna('-1')
df['opening_acts'] = df['opening_acts'].fillna('None Listed')


In [102]:
df['revenue'] = df['revenue'].str.replace('','-1')
df['opening_acts'] = df['opening_acts'].str.replace('','None Listed')


In [108]:
df['revenue'] = df['revenue'].str.strip('$').str.replace(',','').astype('int')

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          445 non-null    object
 1   country       445 non-null    object
 2   venue         445 non-null    object
 3   opening_acts  445 non-null    object
 4   revenue       445 non-null    int32 
 5   tour          445 non-null    object
 6   attendance    445 non-null    int32 
 7   venue_seats   445 non-null    int32 
dtypes: int32(3), object(5)
memory usage: 22.7+ KB


In [115]:
df['empty_seats'] = df['venue_seats'] - df['attendance']
df.sort_values('empty_seats', ascending=False)

df['avg_ticket_cost'] = df['revenue'] / df['attendance']

df['est_lost_income'] = df['avg_ticket_cost'] * df['empty_seats']

df['max_possible_revenue'] = df['avg_ticket_cost'] * df['venue_seats']

## Export

In [117]:
df.to_csv(r'tay_swift_data_clean.csv')