In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, zscore

import gzip
import pickle

%matplotlib inline

In [2]:
# matplotlib 한글 깨짐 해결
from matplotlib import rc

rc('font', family='NanumGothic')
plt.rcParams['axes.unicode_minus'] = False

In [3]:
data = pd.read_csv('../processed_v1.csv')
data.head()

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,general,sac,noble,discount_rate,real_price,discount_cat,all_mem_cnt,blue,green,gold
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,1,0,0,0.0,10000.0,일반,3,0,1,1
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,1,0,0,0.0,180000.0,일반,1,0,0,0
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,1,0,0,20.0,180000.0,예술의 전당,2,1,0,0
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,0,0,0,0.0,0.0,기업 및 관계자,0,0,0,0
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,1,0,0,0.0,75000.0,일반,2,1,0,0


In [4]:
data.shape

(753454, 29)

# 계절, 시간대, 요일, 주말/평일

In [5]:
# Parse the YYYYMMDD performance date into a datetime column.
parsed_play_dates = pd.to_datetime(data['play_date'], format='%Y%m%d')
data['play_date_dt'] = parsed_play_dates
data['play_date_dt'].head(2)

0   2022-02-04
1   2022-03-02
Name: play_date_dt, dtype: datetime64[ns]

In [6]:
# Season of the performance
def get_season(date):
    """Return the season name for a date-like object.

    Parameters
    ----------
    date : any object exposing a ``month`` attribute.

    Returns
    -------
    str
        'spring' for months 3-5, 'summer' for 6-8, 'autumn' for 9-11,
        and 'winter' for every other value.
    """
    month = date.month
    season_bounds = (
        ('spring', 3, 5),
        ('summer', 6, 8),
        ('autumn', 9, 11),
    )
    for season_name, first_month, last_month in season_bounds:
        if first_month <= month <= last_month:
            return season_name
    # Everything outside March-November falls through to winter.
    return 'winter'

# Label every ticket row with the season of its performance date.
season_labels = data['play_date_dt'].map(get_season)
data = data.assign(season=season_labels)

In [7]:
# Performance time slot
def get_performance_time_slot(time):
    """Classify a start time into 'morning', 'day' or 'evening'.

    Parameters
    ----------
    time : str
        Start time whose first two characters are the hour (e.g. '19:30').

    Returns
    -------
    str
        'morning' for hours 6-11, 'day' for hours 12-17, otherwise 'evening'
        (which therefore also covers hours 0-5).
    """
    hour = int(time[:2])
    if hour < 6 or hour >= 18:
        return 'evening'
    return 'morning' if hour < 12 else 'day'

# Label every ticket row with the time slot of its performance start time.
time_slot_labels = data['play_st_time'].map(get_performance_time_slot)
data = data.assign(performance_time_slot=time_slot_labels)

In [8]:
# Day of the week (day name) of each performance
weekday_names = data['play_date_dt'].dt.day_name()
data = data.assign(day_of_week=weekday_names)

In [9]:
def label_day(row):
    """Label a row as 'weekend' or 'weekday'.

    Parameters
    ----------
    row : mapping
        Must provide 'day_of_week'; 'performance_time_slot' is read only
        when the day is 'Friday'.

    Returns
    -------
    str
        'weekend' for Saturday, Sunday and Friday-evening performances,
        'weekday' otherwise.
    """
    day = row['day_of_week']
    # Saturday and Sunday always count as weekend.
    if day in ('Saturday', 'Sunday'):
        return 'weekend'
    # A Friday performance counts as weekend only in the evening slot.
    friday_evening = day == 'Friday' and row['performance_time_slot'] == 'evening'
    return 'weekend' if friday_evening else 'weekday'

# Fill the new column. Same rule as label_day (Saturday/Sunday, or a Friday
# evening performance -> 'weekend'), evaluated with vectorized masks instead
# of a Python-level call per row, which is very slow on a frame this size.
is_weekend_day = data['day_of_week'].isin(['Saturday', 'Sunday'])
is_friday_evening = (data['day_of_week'] == 'Friday') & (data['performance_time_slot'] == 'evening')
data['weekday_or_weekend'] = np.where(is_weekend_day | is_friday_evening, 'weekend', 'weekday')

# 관여도 변수 생성
- 멤버십 여부  
- 유료회원 여부  
- 유료회원 멤버십 보유 개수
- 유료구매 여부  
- 선예매여부 (위 선예매여부 섹션 확인)

In [10]:
# Encode membership as 1/0: 'Y' -> 1, anything else -> 0.
is_member = data['member_yn'] == 'Y'
data['inv1_memyn'] = is_member.astype('int64')
data[['member_yn', 'inv1_memyn']].head()

Unnamed: 0,member_yn,inv1_memyn
0,Y,1
1,Y,1
2,Y,1
3,N,0
4,Y,1


In [11]:
# Count how many paid membership tiers (green, blue, gold) each row holds.
paid_tier_cols = ['green', 'blue', 'gold']
data['inv2_paymem_cnt'] = data[paid_tier_cols].sum(axis=1)
data[paid_tier_cols + ['inv2_paymem_cnt']].head(10)

Unnamed: 0,green,blue,gold,inv2_paymem_cnt
0,1,0,1,2
1,0,0,0,0
2,0,1,0,1
3,0,0,0,0
4,0,1,0,1
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,1,1
9,0,0,0,0


In [13]:
# Encode paid purchase as 1/0: price above zero -> 1, free ticket -> 0.
is_paid = data['price'].gt(0)
data['inv4_buyyn'] = is_paid.astype('int64')
data[['price', 'discount_type', 'inv4_buyyn']].head(10)

Unnamed: 0,price,discount_type,inv4_buyyn
0,10000,일반,1
1,180000,일반,1
2,144000,블루회원 할인20%,1
3,0,초대권,0
4,75000,일반,1
5,24000,K-lang 멤버십 회원(1인2매)20%,1
6,20000,일반,1
7,0,초대권,0
8,42000,골드회원 할인30%,1
9,0,기획사판매,0


In [14]:
# 관여도 점수 계산하기
# data[['inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn', 'pre_ticketing']].head(10)
data[['inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn']].head(10)

Unnamed: 0,inv1_memyn,inv2_paymem_cnt,inv4_buyyn
0,1,2,1
1,1,0,1
2,1,1,1
3,0,0,0
4,1,1,1
5,1,0,1
6,0,0,1
7,0,0,0
8,1,1,1
9,0,0,0


In [15]:
# Involvement score = membership flag + paid-tier count + paid-purchase flag.
involvement_parts = ['inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn']
data['involvement'] = data[involvement_parts].sum(axis=1)
data[involvement_parts + ['involvement']].head(10)

Unnamed: 0,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement
0,1,2,1,4
1,1,0,1,2
2,1,1,1,3
3,0,0,0,0
4,1,1,1,3
5,1,0,1,2
6,0,0,1,1
7,0,0,0,0
8,1,1,1,3
9,0,0,0,0


In [16]:
data.columns

Index(['age', 'gender', 'tran_date', 'tran_time', 'play_date', 'play_st_time',
       'seat', 'price', 'ticket_cancel', 'discount_type', 'pre_open_date',
       'open_date', 'genre', 'running_time', 'intermission', 'member_yn',
       'new_code', 'key', 'pre_ticketing', 'general', 'sac', 'noble',
       'discount_rate', 'real_price', 'discount_cat', 'all_mem_cnt', 'blue',
       'green', 'gold', 'play_date_dt', 'season', 'performance_time_slot',
       'day_of_week', 'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt',
       'inv4_buyyn', 'involvement'],
      dtype='object')

In [17]:
data[['key', 'involvement']]

Unnamed: 0,key,involvement
0,0,4
1,1,2
2,2,3
3,4,0
4,5,3
...,...,...
753449,1096949,3
753450,1096950,2
753451,1096951,0
753452,1096952,0


In [18]:
# invite_list = list(data[data['discount_type']=='초대권']['key'])
# data['revised_inv'] = data.apply(lambda x: 0 if x['key'] in invite_list else x['involvement'], axis=1)

In [19]:
invite_mask = data['discount_type'] == '초대권'
invite_mask

0         False
1         False
2         False
3          True
4         False
          ...  
753449    False
753450    False
753451     True
753452    False
753453    False
Name: discount_type, Length: 753454, dtype: bool

In [20]:
# Rows sold as invitation tickets ('초대권') and the keys they carry.
invite_mask = data['discount_type'].eq('초대권')
invite_keys = data.loc[invite_mask, 'key']

# Any row whose key belongs to an invitation gets an involvement score of 0.
has_invite_key = data['key'].isin(invite_keys)
data['revised_inv'] = np.where(has_invite_key, 0, data['involvement'])

In [21]:
data[['key', 'involvement', 'revised_inv']]

Unnamed: 0,key,involvement,revised_inv
0,0,4,4
1,1,2,2
2,2,3,3
3,4,0,0
4,5,3,3
...,...,...,...
753449,1096949,3,3
753450,1096950,2,2
753451,1096951,0,0
753452,1096952,0,0


In [22]:
data[data['discount_type']=='초대권']['revised_inv'].unique()

array([0])

In [23]:
# Drop the unrevised score; 'revised_inv' replaces it.
data = data.drop(columns=['involvement'])

In [24]:
# The revised score takes over the 'involvement' column name.
data = data.rename(columns={'revised_inv': 'involvement'})

In [25]:
data.shape

(753454, 38)

In [26]:
data.columns

Index(['age', 'gender', 'tran_date', 'tran_time', 'play_date', 'play_st_time',
       'seat', 'price', 'ticket_cancel', 'discount_type', 'pre_open_date',
       'open_date', 'genre', 'running_time', 'intermission', 'member_yn',
       'new_code', 'key', 'pre_ticketing', 'general', 'sac', 'noble',
       'discount_rate', 'real_price', 'discount_cat', 'all_mem_cnt', 'blue',
       'green', 'gold', 'play_date_dt', 'season', 'performance_time_slot',
       'day_of_week', 'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt',
       'inv4_buyyn', 'involvement'],
      dtype='object')

# 좌석 특징 변수

## 층 (floor)

In [27]:
# Extract the floor from a seat string
def extract_floor(text):
    """Return the floor label encoded in a seat description.

    The markers are tested in a fixed order ('3층', '2층', '1층', '합창석')
    and the first one contained in ``text`` wins.

    Returns
    -------
    str
        'third', 'second', 'first', 'choir', or 'unknown' when no marker
        is present.
    """
    floor_markers = (
        ('3층', 'third'),
        ('2층', 'second'),
        ('1층', 'first'),
        ('합창석', 'choir'),
    )
    for marker, label in floor_markers:
        if marker in text:
            return label
    return 'unknown'

In [28]:
# Derive the floor label for every seat and preview the first rows.
data['seat_floor'] = data['seat'].map(extract_floor)
data[['seat', 'seat_floor']].head(3)

Unnamed: 0,seat,seat_floor
0,3층 BOX9 10,third
1,1층 B블록12열 7,first
2,1층 A블록2열 1,first


## 왼쪽, 오른쪽, 중간 기준

In [29]:
def location_lr(x):
    """Classify a seat string as 'mid', 'left' or 'right'.

    Each side has a list of floor/block (or BOX) markers; sides are tested
    in the order mid, left, right and the first side with a marker contained
    in ``x`` is returned.

    Returns
    -------
    str or None
        'mid', 'left' or 'right'; None when no marker matches.
    """
    side_markers = (
        # centre blocks
        ('mid', ('1층 C블록', '2층 C블록', '3층 D블록', '합창석 G블록')),
        # left-hand blocks and boxes
        ('left', ('합창석 H블록', '1층 A블록', '1층 B블록', '2층 A블록', '2층 B블록',
                  '3층 A블록', '3층 B블록', '3층 C블록', '3층 M블록',
                  '2층 BOX1', '2층 BOX2', '2층 BOX3', '3층 BOX7', '3층 BOX8', '3층 BOX9')),
        # right-hand blocks and boxes
        ('right', ('합창석 F블록', '1층 D블록', '1층 E블록', '2층 D블록', '2층 E블록',
                   '3층 E블록', '3층 F블록', '3층 G블록', '3층 N블록',
                   '2층 BOX4', '2층 BOX5', '2층 BOX6', '3층 BOX10', '3층 BOX11', '3층 BOX12')),
    )
    for side, markers in side_markers:
        for marker in markers:
            if marker in x:
                return side
    return None

In [30]:
# Left / mid / right position of every seat.
data['seat_loc'] = data['seat'].map(location_lr)

## 기존 RSABC 기준

In [31]:
def _seat_tokens(block_prefix, seats_by_row):
    """Build '<block_prefix><row>열 <seat>' tokens from a {row: seat numbers} mapping."""
    return tuple(
        f'{block_prefix}{row}열 {seat}'
        for row, seats in seats_by_row.items()
        for seat in seats
    )


def _zone_token_in(seat, tokens):
    """Return True if any token occurs in ``seat``.

    A token ending in a digit names one specific seat number, so a hit is
    rejected when the next character in ``seat`` is another digit; without
    this, '... 1' would also claim seats 10-19.  Tokens ending in a non-digit
    (floor / block / row prefixes) keep plain substring matching.
    """
    for token in tokens:
        pos = seat.find(token)
        if pos < 0:
            continue
        if not token[-1].isdigit():
            return True
        end = pos + len(token)
        if not seat[end:end + 1].isdigit():
            return True
    return False


def _zone_row(seat):
    """Row number of a '<n>층 <X>블록<row>열 <seat>' string (fixed 6-character prefix)."""
    return int(seat[6:-2].strip()[:-1])


# 3rd floor: rows graded 'B' (every other non-BOX 3rd-floor seat is 'C').
_F3_B_ROWS = ('B블록4열', 'C블록4열', 'D블록4열', 'E블록4열', 'F블록4열')

# 2nd floor, grade 'B': individual block-A seats, whole rows 7-8 of blocks
# A and E, and individual block-E seats.
# NOTE(review): the original list repeated '2층 A블록2열 1' three times where
# the neighbouring rows list consecutive seats, so row 2 is taken as seats
# 1-3.  The E-block counterpart (row 2, seats 15-18) has four seats, so seat 4
# may belong here as well - confirm against the seating chart.
_F2_B_TOKENS = (
    _seat_tokens('2층 A블록', {
        1: range(1, 4), 2: range(1, 4), 3: range(1, 5),
        4: range(1, 6), 5: range(1, 7), 6: range(1, 6),
    })
    + ('2층 A블록7열', '2층 A블록8열', '2층 E블록7열', '2층 E블록8열')
    + _seat_tokens('2층 E블록', {
        1: range(15, 18), 2: range(15, 19), 3: range(15, 19),
        4: range(15, 20), 5: range(15, 21), 6: range(15, 20),
    })
)

# 2nd floor, block A, grade 'S'.
_F2_A_S_SEATS = _seat_tokens('2층 A블록', {1: range(8, 18), 2: range(9, 19)})

# 1st floor, block A, grade 'S'.
_F1_A_S_SEATS = _seat_tokens('1층 A블록', {
    3: (8, 9),
    4: (9, 10), 5: (9, 10), 6: (9, 10), 7: (9, 10),
    8: (10, 11), 9: (10, 11), 10: (10, 11), 11: (10, 11),
    12: (11, 12), 13: (11, 12), 14: (11, 12), 15: (11, 12),
    16: (10, 11), 17: (10, 11),
    18: (9, 10),
})

# 1st floor, block A, grade 'B': whole rows 1 and 22 plus individual seats.
_F1_A_B_TOKENS = ('1층 A블록1열', '1층 A블록22열') + _seat_tokens('1층 A블록', {
    2: (1, 2, 3), 3: (1, 2, 3),
    4: (1, 2), 5: (1, 2), 6: (1, 2), 7: (1, 2),
    8: (1, 2, 3), 9: (1, 2, 3), 10: (1, 2, 3), 11: (1, 2, 3),
    12: (1, 2, 3, 4), 13: (1, 2, 3, 4), 14: (1, 2, 3, 4), 15: (1, 2, 3, 4),
    16: (1, 2, 3), 17: (1, 2, 3),
    18: (1, 2), 19: (1, 2, 3), 20: (1, 2), 21: (1, 2, 3),
})

# 1st floor, block E, grade 'B': whole rows 1 and 22 plus individual seats.
# The row-1 token was previously misspelled '1열 E블록1열' and never matched.
_F1_E_B_TOKENS = ('1층 E블록1열', '1층 E블록22열') + _seat_tokens('1층 E블록', {
    2: (7, 8, 9), 3: (7, 8, 9),
    4: (9, 10), 5: (9, 10), 6: (9, 10), 7: (9, 10),
    8: (9, 10, 11), 9: (9, 10, 11), 10: (9, 10, 11), 11: (9, 10, 11),
    12: (9, 10, 11, 12), 13: (9, 10, 11, 12), 14: (9, 10, 11, 12), 15: (9, 10, 11, 12),
    16: (9, 10, 11), 17: (9, 10, 11),
    18: (9, 10), 19: (7, 8, 9), 20: (7, 8), 21: (5, 6, 7),
})

# 1st floor, block E, grade 'S': seats 1-2 of rows 3-18.  A missing comma used
# to fuse '1층 E블록3열 2' and '1층 E블록4열 1' into one token that never matched.
_F1_E_S_SEATS = _seat_tokens('1층 E블록', {row: (1, 2) for row in range(3, 19)})


def general_zones(x):
    """Map a seat string to its price grade under the existing R/S/A/B/C scheme.

    Parameters
    ----------
    x : str
        Seat description such as '1층 B블록12열 7', '3층 BOX9 10' or a string
        containing '합창석'.

    Returns
    -------
    str or None
        'R', 'S', 'A', 'B' or 'C'.  None when the string names no known
        floor, when a 2nd-floor B/C/D-block row is outside 1-8, or when a
        1st-floor seat is in none of blocks A-E.

    Raises
    ------
    ValueError
        If a row or seat number cannot be parsed as an integer.
    """
    # Choir seats are all grade B.
    if '합창석' in x:
        return 'B'

    if '3층' in x:
        if 'BOX' in x or _zone_token_in(x, _F3_B_ROWS):
            return 'B'
        return 'C'

    if '2층' in x:
        if 'BOX' in x:
            return 'A'
        if _zone_token_in(x, ('2층 B블록', '2층 C블록', '2층 D블록')):
            # Centre blocks are graded purely by row.
            row = _zone_row(x)
            if row in range(1, 4):
                return 'R'
            if row in range(4, 6):
                return 'S'
            if row in range(6, 9):
                return 'A'
            return None
        if _zone_token_in(x, _F2_B_TOKENS):
            return 'B'
        if _zone_token_in(x, _F2_A_S_SEATS):
            return 'S'
        if _zone_token_in(x, ('2층 E블록1열', '2층 E블록2열')):
            # Block E rows 1-2 are graded by seat number (the B seats of these
            # rows were already caught by _F2_B_TOKENS above).
            seat_no = int(x[-2:].strip())
            if seat_no in range(11):
                return 'S'
            if seat_no in range(11, 15):
                return 'A'
            return 'B'
        return 'A'

    if '1층' in x:
        if _zone_token_in(x, ('1층 B블록', '1층 C블록', '1층 D블록')):
            # Centre blocks are graded purely by row.
            row = _zone_row(x)
            if row in range(3, 19):
                return 'R'
            if row in range(19, 21):
                return 'S'
            return 'A'
        if '1층 A블록' in x:
            if _zone_token_in(x, _F1_A_S_SEATS):
                return 'S'
            if _zone_token_in(x, _F1_A_B_TOKENS):
                return 'B'
            return 'A'
        if '1층 E블록' in x:
            if _zone_token_in(x, _F1_E_B_TOKENS):
                return 'B'
            if _zone_token_in(x, _F1_E_S_SEATS):
                return 'S'
            return 'A'

    return None

In [32]:
# Price grade (R/S/A/B/C) of every seat.
data['seat_gen'] = data['seat'].map(general_zones)
data[['seat', 'seat_gen']]

Unnamed: 0,seat,seat_gen
0,3층 BOX9 10,B
1,1층 B블록12열 7,R
2,1층 A블록2열 1,B
3,2층 D블록8열 4,A
4,1층 C블록17열 3,R
...,...,...
753449,1층 A블록2열 2,B
753450,3층 A블록6열 4,C
753451,1층 D블록16열 12,R
753452,1층 D블록20열 8,S


In [33]:
data.columns

Index(['age', 'gender', 'tran_date', 'tran_time', 'play_date', 'play_st_time',
       'seat', 'price', 'ticket_cancel', 'discount_type', 'pre_open_date',
       'open_date', 'genre', 'running_time', 'intermission', 'member_yn',
       'new_code', 'key', 'pre_ticketing', 'general', 'sac', 'noble',
       'discount_rate', 'real_price', 'discount_cat', 'all_mem_cnt', 'blue',
       'green', 'gold', 'play_date_dt', 'season', 'performance_time_slot',
       'day_of_week', 'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt',
       'inv4_buyyn', 'involvement', 'seat_floor', 'seat_loc', 'seat_gen'],
      dtype='object')

# 관심도 변수 생성
- 취소율 (cancel_rate)  
- 본인구매율 (paid_rate)  
- 선예매율 (pre_rate)  
- 좌석점유율 (capa_rate)

In [34]:
# Cancellation rate
# Helper that computes the rate for one show
def make_cancel_rate(x):
    """Share of a show's ticket rows that are cancelled.

    Reads the module-level ``data`` frame.

    Parameters
    ----------
    x : value compared against ``data['new_code']``.

    Returns
    -------
    float
        (rows of show ``x`` with ``ticket_cancel == 2``) / (all rows of show
        ``x``).  The denominator includes cancelled rows.

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no row for ``x``.
    """
    is_show = data['new_code'] == x
    is_cancelled = is_show & (data['ticket_cancel'] == 2)
    n_total = int(is_show.sum())
    n_cancelled = int(is_cancelled.sum())
    return n_cancelled / n_total

In [35]:
# Per-show cancellation rate: share of each show's rows with ticket_cancel == 2.
# One grouped pass replaces two full scans of `data` per show; sort=False keeps
# the shows in order of first appearance, matching data['new_code'].unique().
cancel_rate = (
    data['ticket_cancel'].eq(2)
    .groupby(data['new_code'], sort=False)
    .mean()
    .rename('cancel_rate')
    .reset_index()
)
cancel_rate

Unnamed: 0,new_code,cancel_rate
0,435,0.172237
1,449,0.244957
2,31,0.167689
3,100,0.004285
4,99,0.044788
...,...,...
665,363,0.000000
666,276,0.494297
667,336,0.095890
668,418,0.188679


In [36]:
# Attach each show's cancel_rate to every ticket row; merge returns a new
# frame, so `data` itself is left untouched.
new_df = data.merge(cancel_rate, on='new_code', how='left')
new_df

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,day_of_week,weekday_or_weekend,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,Friday,weekend,1,2,1,4,third,left,B,0.172237
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,Wednesday,weekday,1,0,1,2,first,left,R,0.244957
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,Saturday,weekend,1,1,1,3,first,left,B,0.167689
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,Tuesday,weekday,0,0,0,0,second,right,A,0.004285
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,Sunday,weekend,1,1,1,3,first,mid,R,0.044788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753449,60.0,F,20210618,15:09,20210704,15:00,1층 A블록2열 2,90000,2,일반,...,Sunday,weekend,1,1,1,3,first,left,B,0.396887
753450,50.0,M,20230522,17:29,20230613,17:00,3층 A블록6열 4,10000,0,일반,...,Tuesday,weekday,1,0,1,2,third,left,C,0.011291
753451,,,20201009,16:52,20201020,19:30,1층 D블록16열 12,0,0,초대권,...,Tuesday,weekday,0,0,0,0,first,right,R,0.031766
753452,,,20200726,16:55,20200818,19:30,1층 D블록20열 8,0,0,기획사판매,...,Tuesday,weekday,0,0,0,0,first,right,S,0.001638


In [37]:
# 데이터 랜덤 확인
new_df[(new_df['new_code'] == 360)][['new_code', 'cancel_rate']]

Unnamed: 0,new_code,cancel_rate
1036,360,0.020833
1153,360,0.020833
4474,360,0.020833
4856,360,0.020833
6432,360,0.020833
...,...,...
751389,360,0.020833
751808,360,0.020833
751994,360,0.020833
752219,360,0.020833


In [38]:
# 취소율 1.0이상인거나 0.0이하인거 확인
new_df[new_df['cancel_rate'] >= 1.000000000]

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,day_of_week,weekday_or_weekend,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate


In [39]:
# 본인구매율

# Helper that computes the paid-ticket rate for one show
def make_paid_ticket_rate(x):
    """Share of a show's ticket rows with a price above zero.

    Reads the module-level ``data`` frame.

    Parameters
    ----------
    x : value compared against ``data['new_code']``.

    Returns
    -------
    float
        (rows of show ``x`` with ``price > 0``) / (all rows of show ``x``).

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no row for ``x``.
    """
    is_show = data['new_code'] == x
    is_paid = is_show & (data['price'] > 0)
    n_total = int(is_show.sum())
    n_paid = int(is_paid.sum())
    return n_paid / n_total

# Per-show paid-ticket rate: share of each show's rows with price > 0.
# One grouped pass replaces two full scans of `data` per show; sort=False keeps
# the shows in order of first appearance, matching data['new_code'].unique().
paid_rate = (
    data['price'].gt(0)
    .groupby(data['new_code'], sort=False)
    .mean()
    .rename('paid_rate')
    .reset_index()
)
paid_rate

Unnamed: 0,new_code,paid_rate
0,435,0.996144
1,449,0.968300
2,31,0.991820
3,100,0.028388
4,99,0.348534
...,...,...
665,363,0.956522
666,276,0.996198
667,336,0.465753
668,418,1.000000


In [40]:
# Attach each show's paid_rate to every ticket row of new_df.
new_df = new_df.merge(paid_rate, on='new_code', how='left')
new_df.head(3)

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,weekday_or_weekend,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,weekend,1,2,1,4,third,left,B,0.172237,0.996144
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,weekday,1,0,1,2,first,left,R,0.244957,0.9683
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,weekend,1,1,1,3,first,left,B,0.167689,0.99182


In [41]:
# 잘 붙었나 랜덤 확인
new_df[(new_df['new_code'] == 382)][['new_code', 'paid_rate']]

Unnamed: 0,new_code,paid_rate


In [42]:
# 본인구매율 0(전석초대)인 데이터 있는지 확인. 이거 0인 값 없어야 함
new_df[(new_df['paid_rate'] == 0)][['new_code', 'paid_rate']]

Unnamed: 0,new_code,paid_rate


In [43]:
# 선예매율

# Helper that computes the pre-sale rate for one show
def make_pre_ticketing_rate(x):
    """Share of a show's ticket rows flagged as pre-sale.

    Reads the module-level ``data`` frame.

    Parameters
    ----------
    x : value compared against ``data['new_code']``.

    Returns
    -------
    float
        (rows of show ``x`` with ``pre_ticketing == 1``) / (all rows of show
        ``x``).

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no row for ``x``.
    """
    is_show = data['new_code'] == x
    is_presale = is_show & (data['pre_ticketing'] == 1)
    n_total = int(is_show.sum())
    n_presale = int(is_presale.sum())
    return n_presale / n_total

# Per-show pre-sale rate: share of each show's rows with pre_ticketing == 1.
# One grouped pass replaces two full scans of `data` per show; sort=False keeps
# the shows in order of first appearance, matching data['new_code'].unique().
pre_rate = (
    data['pre_ticketing'].eq(1)
    .groupby(data['new_code'], sort=False)
    .mean()
    .rename('pre_rate')
    .reset_index()
)
pre_rate

Unnamed: 0,new_code,pre_rate
0,435,0.0
1,449,0.0
2,31,0.0
3,100,0.0
4,99,0.0
...,...,...
665,363,0.0
666,276,0.0
667,336,0.0
668,418,0.0


In [44]:
pre_rate['pre_rate'].unique()

array([0.00000000e+00, 8.73362445e-03, 1.26121635e-01, 1.28824477e-02,
       1.34831461e-02, 1.90476190e-02, 1.82876143e-02, 1.91458027e-02,
       2.72071072e-02, 8.66926745e-04, 1.24575311e-02, 6.60904931e-03,
       1.50464094e-01, 4.27046263e-03, 4.54545455e-03, 6.09442060e-02,
       5.42740841e-03, 9.82456140e-03, 2.22861251e-02, 1.27856366e-01,
       2.32312566e-02, 3.87847447e-03, 4.44856348e-02, 1.92083818e-02,
       3.28577605e-02, 7.03765690e-01, 4.88400488e-03, 2.13980029e-03,
       3.45581802e-02, 1.93548387e-02, 1.23756219e-01, 1.87861272e-02,
       4.36654367e-02, 8.07754443e-04, 3.46909439e-01, 4.35986159e-02,
       1.32530120e-01, 2.12290503e-02, 7.62987013e-02, 1.10905730e-02,
       1.34874759e-02, 1.24407583e-02, 5.56962025e-02, 4.25287356e-02,
       3.37283500e-02, 3.32778702e-03, 3.16205534e-02, 1.43678161e-03,
       1.88034188e-02, 2.64554164e-01, 6.51041667e-03, 6.36537237e-04,
       5.30631934e-03, 7.06521739e-02, 1.64581962e-02, 7.02247191e-04,
      

In [45]:
# Attach each show's pre_rate to every ticket row of new_df.
new_df = new_df.merge(pre_rate, on='new_code', how='left')
new_df

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,1,2,1,4,third,left,B,0.172237,0.996144,0.000000
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,1,0,1,2,first,left,R,0.244957,0.968300,0.000000
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,1,1,1,3,first,left,B,0.167689,0.991820,0.000000
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,0,0,0,0,second,right,A,0.004285,0.028388,0.000000
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,1,1,1,3,first,mid,R,0.044788,0.348534,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753449,60.0,F,20210618,15:09,20210704,15:00,1층 A블록2열 2,90000,2,일반,...,1,1,1,3,first,left,B,0.396887,0.996109,0.003891
753450,50.0,M,20230522,17:29,20230613,17:00,3층 A블록6열 4,10000,0,일반,...,1,0,1,2,third,left,C,0.011291,0.732941,0.019146
753451,,,20201009,16:52,20201020,19:30,1층 D블록16열 12,0,0,초대권,...,0,0,0,0,first,right,R,0.031766,0.246506,0.000000
753452,,,20200726,16:55,20200818,19:30,1층 D블록20열 8,0,0,기획사판매,...,0,0,0,0,first,right,S,0.001638,0.019656,0.000000


In [46]:
# 선예매 1.0인 공연(전석초대) 확인 ==> 이것도 이제 나오면 안됨
new_df[new_df['pre_rate'] == 1.0]

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv1_memyn,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate


In [47]:
# new_df[new_df['pre_rate'] == 1.0]['price'].unique()

In [48]:
# 좌석점유율

# Performance dates are compared below as 'YYYYMMDD' strings.
data['play_date'] = data['play_date'].astype('str')

# Shows staged while distancing rules applied (2020-02-01 to 2020-10-11 and
# 2020-11-19 to 2021-12-31); their occupancy is later divided by 1252 seats.
play_date = data['play_date']
in_first_period = play_date.between('20200201', '20201011')
in_second_period = play_date.between('20201119', '20211231')
distance_y = data[in_first_period | in_second_period]
distance_y_show = distance_y['new_code'].unique().tolist()

# Shows with at least one choir-section seat; later divided by 2505 seats.
has_choir_seat = data['seat_floor'] == 'choir'
choir_show = data.loc[has_choir_seat, 'new_code'].unique().tolist()

# All remaining shows (no choir seat); later divided by 2231 seats.
is_non_choir = ~data['new_code'].isin(choir_show)
non_choir_show = data.loc[is_non_choir, 'new_code'].unique().tolist()

In [49]:
print(len(distance_y_show))
print(len(choir_show))
print(len(non_choir_show))

195
487
183


In [50]:
# Distanced shows form their own group, so remove them from the other two.
distanced_codes = set(distance_y_show)
choir_show = list(set(choir_show) - distanced_codes)
non_choir_show = list(set(non_choir_show) - distanced_codes)
for show_group in (choir_show, non_choir_show):
    print(len(show_group))

347
128


In [51]:
# 두 값이 일치하는지 확인
print(len(distance_y_show) + len(choir_show) + len(non_choir_show))
print(data['new_code'].nunique())

670
670


In [52]:
# Check that the distanced, choir and non-choir show groups share no new_code.
# The groups must be compared pairwise: intersecting all three at once only
# reports a code present in every group and misses one shared by just two.
choir_set, non_choir_set, distance_set = set(choir_show), set(non_choir_show), set(distance_y_show)
(choir_set & non_choir_set) | (choir_set & distance_set) | (non_choir_set & distance_set)

set()

In [53]:
# Helper that computes the seat-occupancy rate for one show
def make_capa_rate(x):
    """Occupancy of a show: non-cancelled ticket rows divided by seat capacity.

    Reads the module-level ``data``, ``distance_y_show`` and ``choir_show``.

    Parameters
    ----------
    x : value compared against ``data['new_code']``.

    Returns
    -------
    float
        (rows of show ``x`` with ``ticket_cancel == 0``) divided by 1252 when
        ``x`` is in ``distance_y_show``, by 2505 when it is in ``choir_show``,
        and by 2231 otherwise.  The result is not capped and may exceed 1.
    """
    # Only rows with ticket_cancel == 0 count as occupied seats.
    is_active = (data['new_code'] == x) & (data['ticket_cancel'] == 0)
    n_active = int(is_active.sum())

    if x in distance_y_show:
        capacity = 1252
    elif x in choir_show:
        capacity = 2505
    else:
        capacity = 2231
    return n_active / capacity

In [54]:
# Apply the occupancy helper to every distinct show code.
show_codes = data['new_code'].unique().tolist()
capa_rate = pd.DataFrame({'new_code': show_codes})
capa_rate['capa_rate'] = capa_rate['new_code'].map(make_capa_rate)
capa_rate

Unnamed: 0,new_code,capa_rate
0,435,0.257086
1,449,0.522954
2,31,0.649900
3,100,0.742116
4,99,0.525773
...,...,...
665,363,0.018371
666,276,0.106230
667,336,0.052716
668,418,0.068690


In [55]:
# Attach each show's capa_rate to every ticket row of new_df.
new_df = new_df.merge(capa_rate, on='new_code', how='left')
new_df

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,2,1,4,third,left,B,0.172237,0.996144,0.000000,0.257086
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,0,1,2,first,left,R,0.244957,0.968300,0.000000,0.522954
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,1,1,3,first,left,B,0.167689,0.991820,0.000000,0.649900
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,0,0,0,second,right,A,0.004285,0.028388,0.000000,0.742116
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,1,1,3,first,mid,R,0.044788,0.348534,0.000000,0.525773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753449,60.0,F,20210618,15:09,20210704,15:00,1층 A블록2열 2,90000,2,일반,...,1,1,3,first,left,B,0.396887,0.996109,0.003891,0.247604
753450,50.0,M,20230522,17:29,20230613,17:00,3층 A블록6열 4,10000,0,일반,...,0,1,2,third,left,C,0.011291,0.732941,0.019146,0.902734
753451,,,20201009,16:52,20201020,19:30,1층 D블록16열 12,0,0,초대권,...,0,0,0,first,right,R,0.031766,0.246506,0.000000,0.341551
753452,,,20200726,16:55,20200818,19:30,1층 D블록20열 8,0,0,기획사판매,...,0,0,0,first,right,S,0.001638,0.019656,0.000000,0.973642


In [56]:
# 제대로 붙었는지 랜덤으로 공연 확인
new_df[(new_df['new_code'] == 231)][['new_code', 'capa_rate']]

Unnamed: 0,new_code,capa_rate
1014,231,1.349042
2179,231,1.349042
4783,231,1.349042
5598,231,1.349042
5867,231,1.349042
...,...,...
750050,231,1.349042
751132,231,1.349042
751185,231,1.349042
751621,231,1.349042


In [57]:
# 원래 점유율이 1인 애들
new_df[new_df['capa_rate'] == 1]['new_code'].unique().tolist()
print(new_df[new_df['capa_rate'] == 1].shape)

(0, 45)


In [58]:
# 점유율 1넘는 애들 확인
over1 = new_df[new_df['capa_rate'] > 1]['new_code'].unique().tolist()
print(len(over1))
print(new_df[new_df['capa_rate'] > 1].shape)

23
(40042, 45)


In [59]:
# Shows whose occupancy came out above 1 (listed in over1) are set to exactly 1.
is_over_capacity = new_df['new_code'].isin(over1)
new_df['capa_rate'] = new_df['capa_rate'].mask(is_over_capacity, 1)

In [60]:
# 확인
print(new_df[new_df['capa_rate'] > 1].shape)
print(new_df[new_df['capa_rate'] == 1].shape) # 이 값이 원래 점유율 1이었던 애들이랑 점유율 1로 바꾼애들 더한 값이랑 맞으면 ok

(0, 45)
(40042, 45)


In [61]:
new_df

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,2,1,4,third,left,B,0.172237,0.996144,0.000000,0.257086
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,0,1,2,first,left,R,0.244957,0.968300,0.000000,0.522954
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,1,1,3,first,left,B,0.167689,0.991820,0.000000,0.649900
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,0,0,0,second,right,A,0.004285,0.028388,0.000000,0.742116
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,1,1,3,first,mid,R,0.044788,0.348534,0.000000,0.525773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753449,60.0,F,20210618,15:09,20210704,15:00,1층 A블록2열 2,90000,2,일반,...,1,1,3,first,left,B,0.396887,0.996109,0.003891,0.247604
753450,50.0,M,20230522,17:29,20230613,17:00,3층 A블록6열 4,10000,0,일반,...,0,1,2,third,left,C,0.011291,0.732941,0.019146,0.902734
753451,,,20201009,16:52,20201020,19:30,1층 D블록16열 12,0,0,초대권,...,0,0,0,first,right,R,0.031766,0.246506,0.000000,0.341551
753452,,,20200726,16:55,20200818,19:30,1층 D블록20열 8,0,0,기획사판매,...,0,0,0,first,right,S,0.001638,0.019656,0.000000,0.973642


In [62]:
new_df.columns

Index(['age', 'gender', 'tran_date', 'tran_time', 'play_date', 'play_st_time',
       'seat', 'price', 'ticket_cancel', 'discount_type', 'pre_open_date',
       'open_date', 'genre', 'running_time', 'intermission', 'member_yn',
       'new_code', 'key', 'pre_ticketing', 'general', 'sac', 'noble',
       'discount_rate', 'real_price', 'discount_cat', 'all_mem_cnt', 'blue',
       'green', 'gold', 'play_date_dt', 'season', 'performance_time_slot',
       'day_of_week', 'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt',
       'inv4_buyyn', 'involvement', 'seat_floor', 'seat_loc', 'seat_gen',
       'cancel_rate', 'paid_rate', 'pre_rate', 'capa_rate'],
      dtype='object')

In [63]:
new_df.to_csv('../processed_v2.csv', index=False)