In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, zscore
from matplotlib.colors import LinearSegmentedColormap

import gzip
import pickle

%matplotlib inline


# Fix broken Korean text rendering in matplotlib figures
from matplotlib import rc

# Use NanumGothic for all figure text (assumes the font is installed on this machine - verify)
rc('font', family='NanumGothic')
# Disable the Unicode minus glyph so negative tick labels still render with this font
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the preprocessed ticketing data (path is relative to the notebook's working directory).
processed_csv_path = '../processed_v2.csv'
df = pd.read_csv(processed_csv_path)
df.head()

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate
0,50.0,F,20220114,15:12,20220204,20:00,3층 BOX9 10,10000,2,일반,...,2,1,4,third,left,B,0.172237,0.996144,0.0,0.257086
1,50.0,M,20220206,16:15,20220302,19:30,1층 B블록12열 7,180000,0,일반,...,0,1,2,first,left,R,0.244957,0.9683,0.0,0.522954
2,30.0,F,20181124,11:45,20190323,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,1,1,3,first,left,B,0.167689,0.99182,0.0,0.6499
3,,,20190613,09:54,20190723,20:00,2층 D블록8열 4,0,0,초대권,...,0,0,0,second,right,A,0.004285,0.028388,0.0,0.742116
4,,F,20190703,09:08,20190721,17:00,1층 C블록17열 3,75000,0,일반,...,1,1,3,first,mid,R,0.044788,0.348534,0.0,0.525773


        howfastyoubuy 변수 생성

In [4]:
# Parse the YYYYMMDD columns into datetimes:
#   tran_date - customer purchase date
#   play_date - performance date
#   open_date - ticket opening date
for date_col in ('tran_date', 'play_date', 'open_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y%m%d')

# Pre-sale opening date: 0 is replaced by NA before parsing so those rows end up as missing dates
df['pre_open_date'] = pd.to_datetime(
    df['pre_open_date'].replace(0, pd.NA),
    format='%Y%m%d',
)

In [5]:
# Work on an independent copy so later edits to `df` do not touch `final`.
final = df.copy()
# howfastyoubuy: whole days from play_date to tran_date (negative when bought before the performance)
final['howfastyoubuy'] = final['tran_date'].sub(final['play_date']).dt.days

In [6]:
# Keep only rows whose purchase date is earlier than the pre-sale opening date.
# Rows with a missing pre_open_date compare as False and are dropped here as well.
df = df[df['tran_date'] < df['pre_open_date']]
# Exclude invitation tickets ('초대권') and promoter sales ('기획사판매').
# .copy() makes `new` an independent frame: it is modified with .loc afterwards, and
# writing into a slice of `df` raises SettingWithCopyWarning and may not take effect reliably.
new = df[~df['discount_type'].isin(['초대권', '기획사판매'])].copy()
new

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv2_paymem_cnt,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate
315,60.0,F,2019-07-26,11:02,2019-09-10,20:00,1층 C블록4열 2,21000,0,골드회원 할인30%,...,1,1,3,first,mid,R,0.051530,0.486312,0.012882,0.235130
533,50.0,F,2022-07-11,15:03,2022-10-11,19:30,2층 A블록5열 9,30000,0,일반,...,1,1,3,second,left,A,0.300353,1.000000,0.000000,0.158084
1098,60.0,F,2019-12-08,12:21,2020-10-01,20:00,3층 N블록1열 15,10000,2,일반,...,1,1,3,third,right,C,0.949495,1.000000,0.000000,0.007987
1559,60.0,F,2020-06-18,15:33,2020-12-12,17:00,1층 C블록17열 9,81000,2,골드회원 할인10%,...,2,1,4,first,mid,R,0.943299,1.000000,0.000000,0.026358
1691,50.0,M,2019-12-08,11:17,2020-10-01,20:00,1층 E블록1열 5,18000,2,골드회원 할인10%,...,1,1,3,first,right,A,0.949495,1.000000,0.000000,0.007987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751005,50.0,M,2020-05-30,18:58,2020-07-26,17:00,1층 C블록7열 11,42000,0,골드회원 할인30%,...,1,1,3,first,mid,R,0.483209,1.000000,0.000000,0.221246
751167,50.0,M,2019-07-14,14:25,2019-09-01,17:00,1층 D블록10열 3,10000,0,초/중/고등학생 할인(본인만),...,1,1,3,first,right,R,0.100000,1.000000,0.070652,0.330539
751259,50.0,F,2022-10-30,17:43,2022-12-27,20:00,1층 A블록9열 11,127000,0,조기예매할인(1인1매/~11월3일까지)25%,...,1,1,3,first,left,S,0.063908,0.382923,0.000000,0.713373
752354,50.0,F,2018-11-10,23:55,2019-01-11,20:00,1층 A블록3열 4,100000,0,일반,...,1,1,3,first,left,A,0.112031,0.611076,0.000637,0.556886


In [7]:
# Overwrite tran_date depending on pre_ticketing:
#   pre_ticketing == 1 -> pre_open_date
#   pre_ticketing == 0 -> open_date
is_pre_ticketing = new['pre_ticketing'] == 1
is_general_ticketing = new['pre_ticketing'] == 0
new.loc[is_pre_ticketing, 'tran_date'] = new['pre_open_date']
new.loc[is_general_ticketing, 'tran_date'] = new['open_date']

In [8]:
# Recompute howfastyoubuy on the adjusted tran_date:
# whole days from play_date to tran_date (negative when tran_date is before the performance).
day_offset = new['tran_date'].sub(new['play_date']).dt.days
new = new.assign(howfastyoubuy=day_offset)
new

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,inv4_buyyn,involvement,seat_floor,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate,howfastyoubuy
315,60.0,F,2019-08-02,11:02,2019-09-10,20:00,1층 C블록4열 2,21000,0,골드회원 할인30%,...,1,3,first,mid,R,0.051530,0.486312,0.012882,0.235130,-39
533,50.0,F,2022-07-16,15:03,2022-10-11,19:30,2층 A블록5열 9,30000,0,일반,...,1,3,second,left,A,0.300353,1.000000,0.000000,0.158084,-87
1098,60.0,F,2020-08-22,12:21,2020-10-01,20:00,3층 N블록1열 15,10000,2,일반,...,1,3,third,right,C,0.949495,1.000000,0.000000,0.007987,-40
1559,60.0,F,2020-11-08,15:33,2020-12-12,17:00,1층 C블록17열 9,81000,2,골드회원 할인10%,...,1,4,first,mid,R,0.943299,1.000000,0.000000,0.026358,-34
1691,50.0,M,2020-08-22,11:17,2020-10-01,20:00,1층 E블록1열 5,18000,2,골드회원 할인10%,...,1,3,first,right,A,0.949495,1.000000,0.000000,0.007987,-40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751005,50.0,M,2020-07-18,18:58,2020-07-26,17:00,1층 C블록7열 11,42000,0,골드회원 할인30%,...,1,3,first,mid,R,0.483209,1.000000,0.000000,0.221246,-8
751167,50.0,M,2019-07-19,14:25,2019-09-01,17:00,1층 D블록10열 3,10000,0,초/중/고등학생 할인(본인만),...,1,3,first,right,R,0.100000,1.000000,0.070652,0.330539,-44
751259,50.0,F,2022-10-30,17:43,2022-12-27,20:00,1층 A블록9열 11,127000,0,조기예매할인(1인1매/~11월3일까지)25%,...,1,3,first,left,S,0.063908,0.382923,0.000000,0.713373,-58
752354,50.0,F,2018-12-10,23:55,2019-01-11,20:00,1층 A블록3열 4,100000,0,일반,...,1,3,first,left,A,0.112031,0.611076,0.000637,0.556886,-32


In [9]:
# Overwrite `final` in place with the values from `new`, matched on the index.
# DataFrame.update only copies non-NA values from `new`; rows of `final` whose index
# is not in `new` keep the values they already had.
final.update(new)

In [10]:
# Distribution of the per-new_code mean of howfastyoubuy.
mean_by_code = final.groupby('new_code')['howfastyoubuy'].mean()
mean_by_code.describe()

count    670.000000
mean     -30.383681
std       26.607020
min     -245.869115
25%      -35.018870
50%      -22.847764
75%      -15.077132
max       -4.016626
Name: howfastyoubuy, dtype: float64

In [11]:
# Median (the '50%' row of describe) of howfastyoubuy for every new_code.
# Computed with .median() directly instead of running the full describe() and
# discarding the other statistics; the column keeps the name '50%' because
# later cells refer to it by that name.
mapping = (
    final.groupby('new_code')['howfastyoubuy']
    .median()
    .rename('50%')
    .reset_index()
)
mapping

Unnamed: 0,new_code,50%
0,0,-9.0
1,1,-20.0
2,2,-31.0
3,5,-17.0
4,6,-32.0
...,...,...
665,745,-43.0
666,746,-37.0
667,747,-55.0
668,748,-58.0


In [12]:
# Attach each row's new_code median ('50%') to the full data set.
merged_df = pd.merge(final, mapping, how='left', on='new_code')

# Flip the sign so both values are positive and easier to compare.
merged_df['plushowfastyoubuy'] = merged_df['howfastyoubuy'].mul(-1)
merged_df['plus50%'] = merged_df['50%'].mul(-1)

In [13]:
# Keep the rows whose sign-flipped howfastyoubuy is at or below the sign-flipped median of their new_code.
at_or_below_median = merged_df['plushowfastyoubuy'].le(merged_df['plus50%'])
real = merged_df.loc[at_or_below_median]
real

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,seat_loc,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate,howfastyoubuy,50%,plushowfastyoubuy,plus50%
1,50.0,M,2022-02-06,16:15,2022-03-02,19:30,1층 B블록12열 7,180000,0,일반,...,left,R,0.244957,0.968300,0.000000,0.522954,-24,-50.0,24,50.0
2,30.0,F,2018-11-24,11:45,2019-03-23,20:00,1층 A블록2열 1,144000,2,블루회원 할인20%,...,left,B,0.167689,0.991820,0.000000,0.649900,-119,-123.0,119,123.0
3,,,2019-06-13,09:54,2019-07-23,20:00,2층 D블록8열 4,0,0,초대권,...,right,A,0.004285,0.028388,0.000000,0.742116,-40,-40.0,40,40.0
6,,,2022-06-24,16:18,2022-06-29,19:30,3층 BOX12 3,20000,2,일반,...,right,B,0.036993,0.202864,0.000000,0.322156,-5,-5.0,5,5.0
7,,,2020-05-11,09:14,2020-05-15,19:30,2층 E블록3열 16,0,0,초대권,...,right,B,0.034934,0.154710,0.008734,1.000000,-4,-4.0,4,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753447,30.0,F,2023-05-16,17:43,2023-06-03,17:00,3층 D블록6열 6,40000,0,일반,...,mid,C,0.098361,0.742176,0.000000,0.483034,-18,-21.0,18,21.0
753449,60.0,F,2021-06-18,15:09,2021-07-04,15:00,1층 A블록2열 2,90000,2,일반,...,left,B,0.396887,0.996109,0.003891,0.247604,-16,-16.0,16,16.0
753450,50.0,M,2023-05-22,17:29,2023-06-13,17:00,3층 A블록6열 4,10000,0,일반,...,left,C,0.011291,0.732941,0.019146,0.902734,-22,-25.0,22,25.0
753451,,,2020-10-09,16:52,2020-10-20,19:30,1층 D블록16열 12,0,0,초대권,...,right,R,0.031766,0.246506,0.000000,0.341551,-11,-11.0,11,11.0


In [14]:
def _mean_howfastyoubuy(frame, keys):
    """Return the mean of `howfastyoubuy` per combination of `keys` as a flat frame with a `mean` column."""
    return frame.groupby(keys)['howfastyoubuy'].agg(['mean']).reset_index()


# Mean howfastyoubuy per new_code, broken down by seat grade, seat location and floor.
new_gen = _mean_howfastyoubuy(real, ['new_code', 'seat_gen'])
new_loc = _mean_howfastyoubuy(real, ['new_code', 'seat_loc'])
new_floor = _mean_howfastyoubuy(real, ['new_code', 'seat_floor'])

# Genre-level variants (currently disabled):
# gen_gen = real.groupby(['genre', 'seat_gen'])['howfastyoubuy'].agg(['mean']).reset_index()
# gen_loc = real.groupby(['genre', 'seat_loc'])['howfastyoubuy'].agg(['mean']).reset_index()
# gen_floor = real.groupby(['genre', 'seat_floor'])['howfastyoubuy'].agg(['mean']).reset_index()


In [15]:
# Give each 'mean' column a distinct name so the three tables can be merged side by side.
new_gen = new_gen.rename({'mean': 'newseatgen_mean'}, axis='columns')
new_loc = new_loc.rename({'mean': 'newseatloc_mean'}, axis='columns')
new_floor = new_floor.rename({'mean': 'newfloorloc_mean'}, axis='columns')
# Genre-level variants (currently disabled):
# gen_gen = gen_gen.rename(columns={'mean': 'genre_gen_mean'})
# gen_loc = gen_loc.rename(columns={'mean': 'genre_loc_mean'})
# gen_floor =gen_floor.rename(columns={'mean': 'genre_floor_mean'})

In [16]:
# Left-join the three group-mean tables onto the full data, one after another.
matched_df = pd.merge(merged_df, new_gen, how='left', on=['new_code', 'seat_gen'])
md2 = pd.merge(matched_df, new_loc, how='left', on=['new_code', 'seat_loc'])
md3 = pd.merge(md2, new_floor, how='left', on=['new_code', 'seat_floor'])
# Genre-level variants (currently disabled):
# md4 = md3.merge(gen_gen,on=['genre', 'seat_gen'], how='left' )
# md5 = md4.merge(gen_loc,on=['genre', 'seat_loc'], how='left' )
# md6 = md5.merge(gen_floor,on=['genre', 'seat_floor'], how='left' )

        확인

In [18]:
# Narrow view used to eyeball the merged group means next to the raw value.
check_columns = [
    'new_code',
    'genre',
    'seat_floor',
    'howfastyoubuy',
    'newseatgen_mean',
    'newseatloc_mean',
    'newfloorloc_mean',
]
check = md3[check_columns]
check

Unnamed: 0,new_code,genre,seat_floor,howfastyoubuy,newseatgen_mean,newseatloc_mean,newfloorloc_mean
0,435,교향곡,third,-21,-7.866337,-10.000000,-10.752066
1,449,독주,first,-24,-24.009132,-31.433420,-27.927313
2,31,교향곡,first,-119,-70.903101,-71.093434,-69.510129
3,100,교향곡,second,-40,-34.708885,-30.234501,-29.536585
4,99,클래식,first,-18,-5.536585,-5.812500,-5.581522
...,...,...,...,...,...,...,...
753449,325,교향곡,first,-16,-14.660714,-14.306306,-14.878571
753450,732,교향곡,third,-22,-11.803571,-14.924956,-11.508772
753451,250,클래식,first,-11,-9.029520,-7.227437,-8.135524
753452,241,합창,first,-23,-21.533898,-22.644820,-20.560132


In [19]:
# For new_code 0, confirm that every seat_floor maps to exactly one merged floor mean.
zero = check.loc[check['new_code'].eq(0)]
pd.crosstab(index=zero['seat_floor'], columns=zero['newfloorloc_mean'])

newfloorloc_mean,-8.675676,-7.780000,-7.705882
seat_floor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
first,0,0,1209
second,0,281,0
third,47,0,0


In [20]:
# Floor-level means for new_code 0, to compare against the crosstab of the merged values.
new_floor.loc[new_floor['new_code'].eq(0)]

Unnamed: 0,new_code,seat_floor,newfloorloc_mean
0,0,first,-7.705882
1,0,second,-7.78
2,0,third,-8.675676


In [21]:
# Number of rows with a missing pre_open_date.
md3['pre_open_date'].isnull().sum()

254853

In [22]:
# Fill missing pre_open_date values with 0.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# md3['pre_open_date']: in-place methods on a selected column are chained assignment,
# which warns on recent pandas and does not modify md3 under Copy-on-Write.
# NOTE(review): this mixes the integer 0 with datetime values in one column - confirm
# that whatever reads the saved file expects that.
md3['pre_open_date'] = md3['pre_open_date'].fillna(0)
# Sanity check: this should display an empty frame (no missing values left).
md3[md3['pre_open_date'].isnull()]

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,ticket_cancel,discount_type,...,paid_rate,pre_rate,capa_rate,howfastyoubuy,50%,plushowfastyoubuy,plus50%,newseatgen_mean,newseatloc_mean,newfloorloc_mean


In [28]:
# Extra preprocessing step: keep only rows where running_time is strictly greater than intermission.
# (Original note: intermission exceeded running_time for 3 performances, 1477 rows.)
has_valid_runtime = md3['running_time'].gt(md3['intermission'])
md3 = md3.loc[has_valid_runtime]
md3.shape

(751977, 52)

In [30]:
# Columns kept in the exported data set, in output order.
output_columns = [
    'key', 'new_code', 'age', 'gender',
    'tran_date', 'tran_time', 'play_date', 'play_st_time',
    'seat', 'price', 'ticket_cancel', 'discount_type',
    'pre_open_date', 'open_date', 'genre', 'running_time', 'intermission',
    'member_yn', 'pre_ticketing', 'general', 'sac', 'noble', 'green', 'blue', 'gold',
    'discount_rate', 'real_price', 'discount_cat', 'all_mem_cnt',
    'season', 'performance_time_slot', 'day_of_week', 'weekday_or_weekend',
    'inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn', 'involvement',
    'seat_floor', 'seat_loc', 'seat_gen',
    'cancel_rate', 'paid_rate', 'pre_rate', 'capa_rate',
    'howfastyoubuy', '50%',
    'newseatgen_mean', 'newseatloc_mean', 'newfloorloc_mean',
]
md3 = md3[output_columns]

        저장

In [31]:
# Write the final data set to CSV without the index column.
# NOTE(review): hard-coded absolute path - this only works on a machine with this directory layout.
final_data_path = '/home/finda/final/data/final_data.csv'
md3.to_csv(final_data_path, index=False)