In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Work around broken Korean (Hangul) text in matplotlib figures
from matplotlib import rc

# Use the NanumGothic font family for all figure text
# (NOTE(review): presumably chosen for its Hangul glyphs; the font must be installed locally).
rc('font', family='NanumGothic')
# Draw minus signs with an ASCII hyphen rather than the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False

In [3]:
# Load the dataset from the relative path '../final_data.csv' and preview its first rows.
data = pd.read_csv('../final_data.csv')
data.head()

Unnamed: 0,key,new_code,age,gender,tran_date,tran_time,play_date,play_st_time,seat,price,...,seat_gen,cancel_rate,paid_rate,pre_rate,capa_rate,howfastyoubuy,50%,newseatgen_mean,newseatloc_mean,newfloorloc_mean
0,0,435,50.0,F,2022-01-14,15:12,2022-02-04,20:00,3층 BOX9 10,10000,...,B,0.172237,0.996144,0.0,0.257086,-21,-19.0,-7.866337,-10.0,-10.752066
1,1,449,50.0,M,2022-02-06,16:15,2022-03-02,19:30,1층 B블록12열 7,180000,...,R,0.244957,0.9683,0.0,0.522954,-24,-50.0,-24.009132,-31.43342,-27.927313
2,2,31,30.0,F,2018-11-24,11:45,2019-03-23,20:00,1층 A블록2열 1,144000,...,B,0.167689,0.99182,0.0,0.6499,-119,-123.0,-70.903101,-71.093434,-69.510129
3,4,100,,,2019-06-13,09:54,2019-07-23,20:00,2층 D블록8열 4,0,...,A,0.004285,0.028388,0.0,0.742116,-40,-40.0,-34.708885,-30.234501,-29.536585
4,5,99,,F,2019-07-03,09:08,2019-07-21,17:00,1층 C블록17열 3,75000,...,R,0.044788,0.348534,0.0,0.525773,-18,-6.0,-5.536585,-5.8125,-5.581522


In [4]:
# List the column names of the loaded frame.
data.columns

Index(['key', 'new_code', 'age', 'gender', 'tran_date', 'tran_time',
       'play_date', 'play_st_time', 'seat', 'price', 'ticket_cancel',
       'discount_type', 'pre_open_date', 'open_date', 'genre', 'running_time',
       'intermission', 'member_yn', 'pre_ticketing', 'general', 'sac', 'noble',
       'green', 'blue', 'gold', 'discount_rate', 'real_price', 'discount_cat',
       'all_mem_cnt', 'season', 'performance_time_slot', 'day_of_week',
       'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn',
       'involvement', 'seat_floor', 'seat_loc', 'seat_gen', 'cancel_rate',
       'paid_rate', 'pre_rate', 'capa_rate', 'howfastyoubuy', '50%',
       'newseatgen_mean', 'newseatloc_mean', 'newfloorloc_mean'],
      dtype='object')

In [46]:
# Pre-compiled patterns for the numeric parts of a seat label.
_BOX_NUMBER_RE = re.compile(r'BOX\s*(\d+)')
_ROW_SEAT_RE = re.compile(r'(\d+)열\s*(\d+)?')


def _box_number(x):
    """Return the BOX number embedded in a seat label, or None if there is none."""
    found = _BOX_NUMBER_RE.search(x)
    return int(found.group(1)) if found else None


def _row_and_seat(x):
    """Return (row, seat) parsed from '<row>열 <seat>'; a missing part is None."""
    found = _ROW_SEAT_RE.search(x)
    if found is None:
        return None, None
    seat = int(found.group(2)) if found.group(2) is not None else None
    return int(found.group(1)), seat


def grouping(x):
    """Map a seat label to its colour group.

    Parameters
    ----------
    x : str
        Seat label such as '1층 B블록12열 7', '2층 BOX5 4' or '3층 A블록6열 4'.

    Returns
    -------
    str or None
        One of 'red', 'blue', 'green', 'yellow', 'orange', 'purple'.
        None when the label matches no rule, when its row/seat/BOX number
        cannot be parsed, or when ``x`` is not a string (e.g. NaN).
    """
    # Missing seat values (NaN) cannot be classified.
    if not isinstance(x, str):
        return None

    # Choir seats
    if '합창석' in x:
        return 'red'

    if '2층' in x:
        if 'BOX' in x:
            # Compare the parsed BOX number, not raw digits: every label on
            # this branch already contains the '2' of '2층'.
            box = _box_number(x)
            if box in (1, 2, 3):
                return 'blue'
            if box in (4, 5, 6):
                return 'green'
            return None
        if 'A블록' in x or 'E블록' in x:
            return 'yellow'
        if 'B블록' in x or 'D블록' in x:
            return 'green'
        if 'C블록' in x:
            return 'blue'
        return None

    if '3층' in x:
        if any(tag in x for tag in ('BOX', 'A블록', 'G블록', 'M블록', 'N블록')):
            return 'orange'
        if any(tag in x for tag in ('B블록', 'C블록', 'D블록', 'E블록', 'F블록')):
            return 'purple'
        return None

    if '1층' in x:
        if 'A블록' in x:
            return 'yellow'

        if 'B블록' in x:
            row, seat = _row_and_seat(x)
            if row is None:
                return None
            if 1 <= row <= 14:
                if seat is None:
                    return None
                # NOTE(review): range(4) also accepts seat 0 -- confirm intent.
                return 'blue' if seat in range(4) else 'red'
            if 15 <= row <= 22:
                return 'green'
            return None

        if 'C블록' in x:
            row, _ = _row_and_seat(x)
            if row is None:
                return None
            return 'blue' if 15 <= row <= 22 else 'red'

        if 'D블록' in x:
            row, seat = _row_and_seat(x)
            if row is None:
                return None
            if 1 <= row <= 14:
                if seat is None:
                    return None
                # NOTE(review): D uses the same seat 0-3 test as B but falls
                # back to 'green' where B falls back to 'red' -- confirm
                # against the seat map.
                return 'blue' if seat in range(4) else 'green'
            if 15 <= row <= 22:
                return 'green'
            return None

        if 'E블록' in x:
            return 'yellow'

    return None

In [47]:
# Label every row with its seat colour group; seats matching no rule get None.
data['seat_grouping'] = data['seat'].apply(grouping)

In [48]:
# Sanity check: show the seats that received no group (the recorded output is empty).
data.loc[data['seat_grouping'].isna(), ['seat', 'seat_grouping']]

Unnamed: 0,seat,seat_grouping


In [50]:
# Show each seat label next to its assigned group.
data[['seat', 'seat_grouping']]

Unnamed: 0,seat,seat_grouping
0,3층 BOX9 10,orange
1,1층 B블록12열 7,red
2,1층 A블록2열 1,yellow
3,2층 D블록8열 4,green
4,1층 C블록17열 3,blue
...,...,...
751972,1층 A블록2열 2,yellow
751973,3층 A블록6열 4,orange
751974,1층 D블록16열 12,green
751975,1층 D블록20열 8,green


In [61]:
# Spot check: distinct groups assigned to seats whose label contains '1층 C블록3열'.
data.loc[data['seat'].str.contains('1층 C블록3열'), 'seat_grouping'].unique()

array(['red'], dtype=object)

In [65]:
# Spot check: distinct groups assigned to seats whose label contains '1층 D블록16열 8'.
data.loc[data['seat'].str.contains('1층 D블록16열 8'), 'seat_grouping'].unique()

array(['green'], dtype=object)

In [66]:
# Write the frame, including the new seat_grouping column, to CSV without the index column.
data.to_csv('../final_new_grouping.csv', index=False)

In [67]:
# List the column names again; 'seat_grouping' now appears last.
data.columns

Index(['key', 'new_code', 'age', 'gender', 'tran_date', 'tran_time',
       'play_date', 'play_st_time', 'seat', 'price', 'ticket_cancel',
       'discount_type', 'pre_open_date', 'open_date', 'genre', 'running_time',
       'intermission', 'member_yn', 'pre_ticketing', 'general', 'sac', 'noble',
       'green', 'blue', 'gold', 'discount_rate', 'real_price', 'discount_cat',
       'all_mem_cnt', 'season', 'performance_time_slot', 'day_of_week',
       'weekday_or_weekend', 'inv1_memyn', 'inv2_paymem_cnt', 'inv4_buyyn',
       'involvement', 'seat_floor', 'seat_loc', 'seat_gen', 'cancel_rate',
       'paid_rate', 'pre_rate', 'capa_rate', 'howfastyoubuy', '50%',
       'newseatgen_mean', 'newseatloc_mean', 'newfloorloc_mean',
       'seat_grouping'],
      dtype='object')