In [1]:
import pandas as pd
import numpy as np
import openpyxl

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

df = pd.read_csv('final_ticketmaster_dataset.csv')

# Convert the imputed -1 back to nan
df.replace({'-1':np.nan,-1:np.nan}, inplace = True)

## GENERAL

- How many events do we have?
- are event ID's unique?

In [2]:
number_of_unique_event_ids = df['event id'].nunique()
number_of_rows = df.shape[0]
number_of_unique_events = df['event name'].nunique()

if number_of_unique_event_ids == number_of_rows:
    print('The data set has one row per event id')
else:
    print('The data set has', number_of_rows, 'rows but only',  number_of_unique_event_ids, 'unique event ids')

pd.set_option('display.max_rows',120)

The data set has 140 rows but only 139 unique event ids


## DUPLICATION

- Events with the same ID keep on reappearing with different information
- What changes?

In [3]:
# Create a dedupped data set with a row count of the duplicate rows
check_duplicates = df[['event id','info']].groupby(['event id'], as_index = False).count().rename({'info':'duplication count'}, axis = 1)

# Put the duplicate event ids in a list
list_of_duplicate_event_ids = list(check_duplicates[check_duplicates['duplication count']>1]['event id'])

# Add the duplication counter and export the duplicate rows for review
duplicate_rows = df[df['event id'].isin(list_of_duplicate_event_ids)]

duplicate_rows_with_dup_counter = pd.merge(
    duplicate_rows, check_duplicates
    ,how = 'inner'
    ,left_on = 'event id'
    ,right_on = 'event id'
    )   

# duplicate_rows_with_dup_counter.to_excel('~/downloads/check ticketmaster duplication.xlsx')

## EDA of Caregorical Variables

- Are certain venues more prolific?
- Breakout of event types. Tabs and Crosstabs 
- How is ticket limit looking?

In [40]:
freq = df.groupby('venue name').agg({'info':'count'}).rename({'info':'row count'}, axis = 1)
freq['%'] = freq['row count']/df.shape[0]
freq


for type_var in ['segment', 'genre', 'subGenre', 'type', 'subType', 'ticket limit']:

    freq = df.groupby(type_var).agg({'info':'count'}).rename({'info':'row count'}, axis = 1)
    freq['%'] = freq['row count']/df.shape[0]
    freq
    

freq = pd.crosstab(df['segment'], df['genre'])
freq

freq = pd.crosstab(df['segment'], df['subGenre'])
freq

freq = pd.crosstab(df['genre'], df['subGenre'])
freq


Unnamed: 0_level_0,row count,%
venue name,Unnamed: 1_level_1,Unnamed: 2_level_1
Carolina Theatre,4,0.029851
Coastal Credit Union Music Park at Walnut Creek,0,0.0
DPAC - Durham Performing Arts Center,60,0.447761
Duke Energy Center for the Performing Arts,0,0.0
Florence Center,0,0.0
PNC Arena,2,0.014925
Raleigh Improv,0,0.0
Red Hat Amphitheater,0,0.0
Steven Tanger Center for the Performing Arts,20,0.149254


Unnamed: 0_level_0,row count,%
segment,Unnamed: 1_level_1,Unnamed: 2_level_1
Arts & Theatre,67,0.5
Miscellaneous,13,0.097015
Music,4,0.029851
Sports,2,0.014925
Undefined,0,0.0


Unnamed: 0_level_0,row count,%
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Classical,5,0.037313
Comedy,2,0.014925
Country,1,0.007463
Hockey,2,0.014925
Jazz,0,0.0
Opera,0,0.0
Other,0,0.0
R&B,1,0.007463
Rock,2,0.014925
Theatre,60,0.447761


Unnamed: 0_level_0,row count,%
subGenre,Unnamed: 1_level_1,Unnamed: 2_level_1
Bluegrass,1,0.007463
Comedy,2,0.014925
Jazz,0,0.0
Musical,60,0.447761
NHL,2,0.014925
Opera,0,0.0
Other,0,0.0
Pop,2,0.014925
R&B,1,0.007463
Symphonic,5,0.037313


Unnamed: 0_level_0,row count,%
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Event Style,0,0.0
Group,2,0.014925
Merchandise,0,0.0
Undefined,71,0.529851
Upsell,13,0.097015


Unnamed: 0_level_0,row count,%
subType,Unnamed: 1_level_1,Unnamed: 2_level_1
Competition,0,0.0
Gift Certificate,0,0.0
Special Entry,13,0.097015
Team,2,0.014925
Undefined,71,0.529851


Unnamed: 0_level_0,row count,%
ticket limit,Unnamed: 1_level_1,Unnamed: 2_level_1
OFF,0,0.0
There is an overall 6 ticket limit for this event.,1,0.007463
There is an overall 8 ticket limit for this event.,63,0.470149


genre,Classical,Comedy,Country,Hockey,Jazz,Opera,Other,R&B,Rock,Theatre,Undefined
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Arts & Theatre,7,31,0,0,0,1,0,0,0,68,0
Miscellaneous,0,0,0,0,0,0,0,0,0,0,15
Music,0,0,1,0,1,0,2,1,4,0,0
Sports,0,0,0,2,0,0,0,0,0,0,0
Undefined,0,0,0,0,0,0,0,0,0,0,1


subGenre,Bluegrass,Comedy,Jazz,Musical,NHL,Opera,Other,Pop,R&B,Symphonic,Undefined
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Arts & Theatre,0,31,0,68,0,1,0,0,0,7,0
Miscellaneous,0,0,0,0,0,0,0,0,0,0,15
Music,1,0,1,0,0,0,2,4,1,0,0
Sports,0,0,0,0,2,0,0,0,0,0,0
Undefined,0,0,0,0,0,0,0,0,0,0,1


subGenre,Bluegrass,Comedy,Jazz,Musical,NHL,Opera,Other,Pop,R&B,Symphonic,Undefined
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Classical,0,0,0,0,0,0,0,0,0,7,0
Comedy,0,31,0,0,0,0,0,0,0,0,0
Country,1,0,0,0,0,0,0,0,0,0,0
Hockey,0,0,0,0,2,0,0,0,0,0,0
Jazz,0,0,1,0,0,0,0,0,0,0,0
Opera,0,0,0,0,0,1,0,0,0,0,0
Other,0,0,0,0,0,0,2,0,0,0,0
R&B,0,0,0,0,0,0,0,0,1,0,0
Rock,0,0,0,0,0,0,0,4,0,0,0
Theatre,0,0,0,68,0,0,0,0,0,0,0


## EDA of Caregorical Variables

- What is the range of the date variables?
- Are there missing event start dates?
- price ranges (outliers, etc)

In [41]:
date_vars = ['public sales startDateTime', 'public sales endDateTime', 'event start dateTime', 'event initial start dateTime']

for date_var in date_vars: 
    df[date_var] = pd.to_datetime(df[date_var])

print("N should be", df.shape[0]) 
    
df[date_vars].describe(datetime_is_numeric=True)

df[['price min', 'price max']].describe()



N should be 134


Unnamed: 0,public sales startDateTime,public sales endDateTime,event start dateTime,event initial start dateTime
count,74,74,131,63
mean,2020-01-06 08:55:08.108108032+00:00,2021-09-06 08:09:18.648648704+00:00,2021-11-16 15:06:24.274809088+00:00,2021-02-26 17:22:51.428571648+00:00
min,2009-09-12 00:00:00+00:00,2021-03-18 22:00:00+00:00,2021-03-18 22:00:00+00:00,2020-03-17 00:00:00+00:00
25%,2020-01-13 15:00:00+00:00,2021-05-21 23:52:30+00:00,2021-07-24 23:45:00+00:00,2021-01-17 01:00:00+00:00
50%,2020-02-17 17:00:00+00:00,2021-08-18 00:15:00+00:00,2021-12-09 00:30:00+00:00,2021-03-17 23:30:00+00:00
75%,2020-09-12 14:00:00+00:00,2021-12-31 03:52:30+00:00,2022-03-14 23:45:00+00:00,2021-04-24 09:00:00+00:00
max,2021-03-05 15:00:00+00:00,2022-06-19 23:00:00+00:00,2022-08-07 23:00:00+00:00,2021-09-26 23:00:00+00:00


Unnamed: 0,price min,price max
count,99.0,99.0
mean,33.029495,111.926465
std,9.387486,46.950733
min,15.0,15.0
25%,30.0,82.5
50%,35.0,129.5
75%,35.0,139.5
max,75.0,295.0


## Are prices moving? (TBD)