In [1]:
# Module Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
%matplotlib inline

# Print Controll Setting
import warnings
warnings.simplefilter(action='ignore')
pd.options.display.max_columns = None

# Visualization Setting
sns.set_theme(style='whitegrid', font_scale=1.5)
sns.set_palette('rocket', n_colors=10)
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier

import optuna

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score

from sklearn.model_selection import GroupShuffleSplit

from scipy.optimize import minimize

### 데이터 불러오기

In [10]:
df_concert = pd.read_csv('data/cluster_retouch.csv')

In [11]:
df_concert['seat_floor'] = df_concert['seat_floor'].astype('str')

In [12]:
sum_prices = df_concert.groupby('unique_perform_code')['price'].sum()

codes_with_zero_prices = sum_prices[sum_prices == 0].index

df_concert = df_concert[~df_concert['unique_perform_code'].isin(codes_with_zero_prices)]

In [13]:
df_price_mean = df_concert[df_concert['price'] != 0].groupby('unique_perform_code')['price'].agg({('perform_price_mean','mean')}).reset_index();df_price_mean
df_concert = pd.merge(df_concert, df_price_mean, on='unique_perform_code', how='left')

In [14]:
df_concert = df_concert[['reservation_yn','unique_perform_code','play_holiday','play_weekday','play_month','new_cluster','genre','seat_floor','seat_block',
                         'play_st_time','price','running_time','intermission','seat_row','seat_num','play_year','from_stage','acoustic_value','from_exit','cosine_sim','perform_price_mean']]

* * *

# 좌석별 예매 확률 예측
- **catboost, LGBM, XGBoost** 중 가장 성능 좋은 모델로 사용
- train 데이터, test 데이터, new 데이터로 공연별 분리

### train_test 분리
- 학습 데이터 분리할 때 공연 단위로 train/validation/test 나눌 수 있도록 GroupShuffleSplit 사용)

In [15]:
# train 데이터와 test 데이터(성능 확인용) 분리
train_test_idx = next(GroupShuffleSplit(n_splits=1, train_size=0.99, random_state=42).split(df_concert, df_concert['reservation_yn'], groups=df_concert['unique_perform_code']))

X_train, y_train = df_concert.iloc[train_test_idx[0],1:], df_concert.iloc[train_test_idx[0],0]
X_test, y_test = df_concert.iloc[train_test_idx[1],1:], df_concert.iloc[train_test_idx[1],0]

In [16]:
# 범주형 스케일링
for column in ['genre','seat_floor','seat_block']:
    le = LabelEncoder()
    
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

In [17]:
smote = SMOTE(random_state=0)
X_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

X_train_resampled = pd.DataFrame(X_resampled, columns=X_train.columns)

### 데이터 스케일링

In [18]:
# Oversampled 데이터 분리
X_train_resampled_numeric = X_train_resampled[['play_st_time','price','running_time','intermission','seat_row','seat_num','play_year',
                                         'from_stage','acoustic_value','from_exit','cosine_sim','perform_price_mean']]

X_train_resampled = X_train_resampled[['unique_perform_code','play_holiday','play_weekday','play_month','new_cluster','genre','seat_floor','seat_block']]

# test 데이터 분리
X_test_numeric = X_test[['play_st_time','price','running_time','intermission','seat_row','seat_num','play_year',
                     'from_stage','acoustic_value','from_exit','cosine_sim','perform_price_mean']]

X_test = X_test[['unique_perform_code','play_holiday','play_weekday','play_month','new_cluster','genre','seat_floor','seat_block']]

In [19]:
# 수치형 스케일링
scaler = StandardScaler()

X_train_resampled_numeric_scaled = scaler.fit_transform(X_train_resampled_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

# 원래의 인덱스를 유지하여 스케일링된 데이터프레임 생성
X_train_resampled_numeric = pd.DataFrame(X_train_resampled_numeric_scaled, columns=X_train_resampled_numeric.columns, index=X_train_resampled_numeric.index)
X_test_numeric = pd.DataFrame(X_test_numeric_scaled, columns=X_test_numeric.columns, index=X_test_numeric.index)

# 데이터프레임 합치기 (인덱스 재설정 방지)
X_train_resampled = pd.concat([X_train_resampled, X_train_resampled_numeric], axis=1, ignore_index=False)
X_test = pd.concat([X_test, X_test_numeric], axis=1, ignore_index=False)

### train_val split

In [20]:
# train 데이터와 validation 데이터 분리
train_idx, val_idx = next(GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=42).split(X_train_resampled, y_train_resampled, groups=X_train_resampled['unique_perform_code']))

X_train, y_train = X_train_resampled.iloc[train_idx].iloc[:,1:], y_train_resampled[train_idx]
X_val, y_val = X_train_resampled.iloc[val_idx].iloc[:,1:], y_train_resampled[val_idx]

## 모델 학습 및 파라미터 최적화

### optuna 최적화
- predict_proba로 예매 확률을 예측해야 하므로 평가 지표 ROC-AUC 사용

In [21]:
def lgbm_objective(trial, data=X_train, target=y_train):
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'class_weight': 'balanced',  # 클래스 불균형 해결을 위한 설정
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model = lgb.train(lgbm_params, lgb.Dataset(data, label=target))
    preds_proba = model.predict(X_val)
    return roc_auc_score(y_val, preds_proba)

lgbm_study = optuna.create_study(direction='maximize')
lgbm_study.optimize(lgbm_objective, n_trials=50)
print(f"LGBM Best trial: score {lgbm_study.best_value}, params {lgbm_study.best_params}")

[I 2023-09-27 04:45:12,244] A new study created in memory with name: no-name-925cd337-9d3a-4b10-ab77-e0d28555930a
[I 2023-09-27 04:45:16,829] Trial 0 finished with value: 0.9640692988979008 and parameters: {'lambda_l1': 0.10999047641248189, 'lambda_l2': 2.098044720033773, 'num_leaves': 109, 'feature_fraction': 0.9650908577246595, 'bagging_fraction': 0.8730635777492739, 'bagging_freq': 4, 'min_child_samples': 61}. Best is trial 0 with value: 0.9640692988979008.
[I 2023-09-27 04:45:20,677] Trial 1 finished with value: 0.9596055924986805 and parameters: {'lambda_l1': 8.146595753185291, 'lambda_l2': 0.0017153393986747523, 'num_leaves': 69, 'feature_fraction': 0.8115816803656398, 'bagging_fraction': 0.8160418231518063, 'bagging_freq': 2, 'min_child_samples': 62}. Best is trial 0 with value: 0.9640692988979008.
[I 2023-09-27 04:45:22,986] Trial 2 finished with value: 0.9443341958251346 and parameters: {'lambda_l1': 0.030960494413515414, 'lambda_l2': 0.00022363938051336952, 'num_leaves': 27, 

[I 2023-09-27 04:47:42,671] Trial 23 finished with value: 0.9709483717609995 and parameters: {'lambda_l1': 0.016752588979191504, 'lambda_l2': 2.1669257322766205e-07, 'num_leaves': 196, 'feature_fraction': 0.49463984933472216, 'bagging_fraction': 0.9439692057397939, 'bagging_freq': 7, 'min_child_samples': 26}. Best is trial 9 with value: 0.9720139286978107.
[I 2023-09-27 04:47:50,347] Trial 24 finished with value: 0.971665451932042 and parameters: {'lambda_l1': 1.2147124862903036, 'lambda_l2': 1.3466193956501311e-05, 'num_leaves': 232, 'feature_fraction': 0.43827968143334195, 'bagging_fraction': 0.8801368394727768, 'bagging_freq': 6, 'min_child_samples': 86}. Best is trial 9 with value: 0.9720139286978107.
[I 2023-09-27 04:47:58,509] Trial 25 finished with value: 0.9716557368523774 and parameters: {'lambda_l1': 1.7972551289439014, 'lambda_l2': 6.085579060835341e-06, 'num_leaves': 255, 'feature_fraction': 0.40678483412088656, 'bagging_fraction': 0.87183525024704, 'bagging_freq': 5, 'min_

[I 2023-09-27 04:50:30,284] Trial 47 finished with value: 0.9718123575048322 and parameters: {'lambda_l1': 0.01859801380469218, 'lambda_l2': 0.010499023429478132, 'num_leaves': 254, 'feature_fraction': 0.6969633475543099, 'bagging_fraction': 0.8324477075767394, 'bagging_freq': 1, 'min_child_samples': 99}. Best is trial 44 with value: 0.9728223842521704.
[I 2023-09-27 04:50:36,241] Trial 48 finished with value: 0.9695135273377247 and parameters: {'lambda_l1': 0.051166102232625146, 'lambda_l2': 0.00319694101533669, 'num_leaves': 207, 'feature_fraction': 0.7323619537814765, 'bagging_fraction': 0.7973165401194682, 'bagging_freq': 1, 'min_child_samples': 94}. Best is trial 44 with value: 0.9728223842521704.
[I 2023-09-27 04:50:43,661] Trial 49 finished with value: 0.9718893346715307 and parameters: {'lambda_l1': 0.24328777632979096, 'lambda_l2': 0.016087678588628692, 'num_leaves': 221, 'feature_fraction': 0.6307719528739647, 'bagging_fraction': 0.8448856487633765, 'bagging_freq': 2, 'min_ch

LGBM Best trial: score 0.9728223842521704, params {'lambda_l1': 0.26695418766971, 'lambda_l2': 0.004636011640181229, 'num_leaves': 249, 'feature_fraction': 0.4334686266501061, 'bagging_fraction': 0.8789305762393483, 'bagging_freq': 1, 'min_child_samples': 92}


In [22]:
X_train_all = pd.concat([X_train, X_val], axis=0)
y_train_all = pd.concat([y_train, y_val], axis=0)

In [23]:
best_params = lgbm_study.best_params
best_model_val = lgb.train(best_params, lgb.Dataset(X_train, label=y_train))
best_model = lgb.train(best_params, lgb.Dataset(X_train_all, label=y_train_all))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1999
[LightGBM] [Info] Number of data points in the train set: 907062, number of used features: 19
[LightGBM] [Info] Start training from score 0.510898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2009
[LightGBM] [Info] Number of data points in the train set: 1325876, number of used features: 19
[LightGBM] [Info] Start training from score 0.500000


In [24]:
y_pred = best_model_val.predict(X_val)
y_pred_class = [1 if proba >= 0.5 else 0 for proba in y_pred]

roc_auc = roc_auc_score(y_val, y_pred)
kappa = cohen_kappa_score(y_val, y_pred_class)
f1 = f1_score(y_val, y_pred_class)

print(f"Optimal LGBM Parameters: {best_params}")
print('-------------------------------------------------')
print(f"ROC-AUC for the best model: {roc_auc:.3f}")
print(f"f1-score for the best model: {f1:.3f}")
print(f"Cohen's Kappa for the best model: {kappa:.3f}")

Optimal LGBM Parameters: {'lambda_l1': 0.26695418766971, 'lambda_l2': 0.004636011640181229, 'num_leaves': 249, 'feature_fraction': 0.4334686266501061, 'bagging_fraction': 0.8789305762393483, 'bagging_freq': 1, 'min_child_samples': 92}
-------------------------------------------------
ROC-AUC for the best model: 0.970
f1-score for the best model: 0.901
Cohen's Kappa for the best model: 0.805


In [25]:
predict_prob = best_model.predict(X_test.iloc[:,1:])
predict = [1 if proba >= 0.5 else 0 for proba in predict_prob]

roc_auc = roc_auc_score(y_test, predict_prob)
kappa = cohen_kappa_score(y_test, predict)
f1 = f1_score(y_test, predict)

print(f"Optimal LGBM Parameters: {best_params}")
print('-------------------------------------------------')
print(f"ROC-AUC for the best model: {roc_auc:.3f}")
print(f"f1-score for the best model: {f1:.3f}")
print(f"Cohen's Kappa for the best model: {kappa:.3f}")

Optimal LGBM Parameters: {'lambda_l1': 0.26695418766971, 'lambda_l2': 0.004636011640181229, 'num_leaves': 249, 'feature_fraction': 0.4334686266501061, 'bagging_fraction': 0.8789305762393483, 'bagging_freq': 1, 'min_child_samples': 92}
-------------------------------------------------
ROC-AUC for the best model: 0.763
f1-score for the best model: 0.940
Cohen's Kappa for the best model: 0.107


In [26]:
def lgbm_objective(trial, data=X_train, target=y_train):
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model = lgb.train(lgbm_params, lgb.Dataset(data, label=target))
    preds_proba = model.predict(X_val)
    return roc_auc_score(y_val, preds_proba)

lgbm_study = optuna.create_study(direction='maximize')
lgbm_study.optimize(lgbm_objective, n_trials=100)
print(f"LGBM Best trial: score {lgbm_study.best_value}, params {lgbm_study.best_params}")

[I 2023-09-27 04:50:57,915] A new study created in memory with name: no-name-ed262128-2ce7-4f0d-9013-66855f8d306e
[I 2023-09-27 04:51:03,782] Trial 0 finished with value: 0.9690817314668378 and parameters: {'lambda_l1': 2.4753609047150124e-08, 'lambda_l2': 0.18188247070532412, 'num_leaves': 170, 'feature_fraction': 0.6473748290100486, 'bagging_fraction': 0.6897020667759703, 'bagging_freq': 4, 'min_child_samples': 100}. Best is trial 0 with value: 0.9690817314668378.
[I 2023-09-27 04:51:10,929] Trial 1 finished with value: 0.9704930025953755 and parameters: {'lambda_l1': 0.3070898445691894, 'lambda_l2': 0.17339779857379928, 'num_leaves': 205, 'feature_fraction': 0.5656338754388232, 'bagging_fraction': 0.8659946700001578, 'bagging_freq': 5, 'min_child_samples': 91}. Best is trial 1 with value: 0.9704930025953755.
[I 2023-09-27 04:51:12,932] Trial 2 finished with value: 0.9312016260857686 and parameters: {'lambda_l1': 3.5491080565472807e-06, 'lambda_l2': 0.003913039726175166, 'num_leaves'

KeyboardInterrupt: 

* * *

# 예매 확률 기반 좌석별 가격 최적화
- 기준 가격 기준 +150%, -50%의 가격을 각각 탐색의 상한과 하한으로 지정
- 기준 가격이란? 주최측에서 희망하는 판매 가격, 예컨대 총 비용을 좌석 수로 나눈 수익분기점 금액.
- 1000원 단위의 그리드 서치 형식으로 최적화 수행
- 단, 모델 학습 때와 달리 모든 좌석에 대해 가격을 예측해야 하므로 예매 내역에 없는 좌석은 임의로 내역 생성

In [27]:
opt_test = df_concert[df_concert['unique_perform_code'].isin(X_test['unique_perform_code'].unique())]

In [28]:
opt_test.groupby('unique_perform_code')['play_holiday'].count().reset_index()

Unnamed: 0,unique_perform_code,play_holiday
0,93,922
1,211,59
2,252,542
3,480,1214
4,502,1245
5,676,1209
6,722,1908


In [105]:
optExample_origin = opt_test[opt_test['unique_perform_code'] == 93]
optExample_scaled = X_test[X_test['unique_perform_code']==93].iloc[:,1:]

In [106]:
# 평균 가격에 따른 상한, 하한 가격 설정
price_mean = optExample_origin['perform_price_mean'].mean()
lower_bound = 0.5 * price_mean
upper_bound = 5.0 * price_mean

In [107]:
price_mean*5.0

72330.31674208147

In [108]:
results = []

In [109]:
# 각 좌석별로 그리드 서치 수행
for idx, row in optExample_scaled.iterrows():
    best_revenue = 0
    best_price_for_row = price_mean
    best_seat_rate_for_row = 0
    
    # 상한, 하한 가격 범위 내에서 1000원 단위로 그리드 서치
    for price in np.arange(lower_bound, upper_bound + 1, 1000):
        row['price'] = price
        # 현재 가격에서의 예매 확률 계산
        seat_rate = best_model.predict([row])
        # 기대 매출 계산
        expected_revenue = seat_rate * price
        
        # 기대 매출이 이전보다 높을 경우 정보 갱신
        if expected_revenue > best_revenue:
            best_revenue = expected_revenue
            best_price_for_row = price
            best_seat_rate_for_row = seat_rate
    
    # 좌석 정보 문자열로 합치기
    seat_str = f"{row['seat_floor']}층, {row['seat_block']}블록, {row['seat_row']}열, {row['seat_num']}번"
    
    # 각 좌석별 최적의 정보를 딕셔너리 형태로 저장
    result = {
        'genre': row['genre'],
        'cluster': row['new_cluster'],
        'seat': seat_str,
        'price': best_price_for_row,
        'best_expected_revenue': best_revenue,
        'best_seat_rate': best_seat_rate_for_row
    }
    
    results.append(result)

optimizedPrice = pd.DataFrame(results)

## 군집별 평균 기대 매출 추정

In [110]:
optimizedPrice = optimizedPrice.groupby(['genre', 'cluster']).agg({
    'price': 'mean',
    'best_expected_revenue': 'mean',
    'best_seat_rate': 'mean'
}).reset_index()

In [111]:
optimizedPrice

Unnamed: 0,genre,cluster,price,best_expected_revenue,best_seat_rate
0,0.0,0.0,72233.031674,[56860.17402415031],[0.7871769010140138]
1,0.0,1.0,72233.031674,[60423.686133272895],[0.8365104541894508]
2,0.0,2.0,72233.031674,[49909.86475462138],[0.6909562508705057]
3,0.0,3.0,72233.031674,[53446.2371044369],[0.7399140789977475]
4,0.0,4.0,72233.031674,[50342.6705778702],[0.696948050096113]
5,0.0,5.0,72233.031674,[59331.91171163766],[0.8213958397764846]
6,0.0,6.0,72233.031674,[65280.27676992166],[0.9037454922888261]


### 군집별 기대 매출 시각화 (파이그래프)

In [61]:
1.648150e+06

1648150.0

In [60]:
1.646512e+06

1646512.0

### 기존 총 매출과의 비교

In [None]:
price_mean * sum(y_test[['unique_perform_code'] == ]['reservation'] == 1)/2489