In [16]:
# 코드 작성일 : 24.8.17 
import os

if 'original_dir' not in globals() :
    original_dir = os.getcwd()
    original_dir = os.path.dirname(os.path.dirname((os.path.dirname(original_dir))))
    os.chdir(original_dir)
from setup.default import *


In [17]:
df = pd.read_csv(original_dir+'/data/preproc/main/feature_lab_v1/menu_w_soldout_w_soldout_ratio.csv')

In [18]:
from sklearn.preprocessing import StandardScaler
# 결측치 처리
df.loc[:,'course_kcal'] = df['course_kcal'].fillna(df['course_kcal'].mean().astype(int))
df.loc[:,'course_protein'] = df['course_protein'].fillna(df['course_protein'].mean().astype(int))
df.loc[:, 'course_na'] = df['course_na'].apply(lambda x: int(x) if pd.notna(x) else np.nan)
df.loc[:,'course_na'] = df['course_na'].fillna(df['course_na'].mean().astype(int))
df.loc[:,'soldout_ratio'] = df['soldout_ratio'].fillna(df['soldout_ratio'].mean().astype(int))
df.loc[:,'soldout_ratio_c'] = df['soldout_ratio_c'].fillna(df['soldout_ratio_c'].mean().astype(int))

# 루트 화
df['sim_menu'] = np.sqrt(df['sim_menu'])
df['sim_menu_c'] = np.sqrt(df['sim_menu_c'])

# # StandardScaler를 사용하여 표준 정규 분포로 변환
columns_to_scale = ['course_kcal', 'course_protein', 'course_na']
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


# 특징과 타겟 변수 설정
df['day'] = pd.to_datetime(df['day'])

test_data = df[(df['day'] >= '2024-04-01')]
train_data = df[(df['day'] < '2024-04-01')]

# 피처와 타겟 분리
X_train = train_data.drop(columns=['post_no','day','meal_time','is_soldout', 'course', 'soldout','soldout_time','first_menu_unpreproc',\
                                   'first_menu','menu_no','only_menu_2','only_menu_2_soldout','course_no_c','only_menu_2_c','only_menu_2_soldout_c'])
y_train = train_data['is_soldout'].astype(int)

# y_train이 1인 샘플의 50%를 무작위로 샘플링하여 추가 (1.5배로 증강)
X_train_1 = X_train[y_train == 1]
y_train_1 = y_train[y_train == 1]

X_train_1_resampled, y_train_1_resampled = resample(X_train_1, y_train_1,
                                                     replace=True,
                                                     n_samples=int(len(X_train_1) * 0.8),
                                                     random_state=None)

# 기존 X_train, y_train과 결합
X_train_resampled = pd.concat([X_train, X_train_1_resampled])
y_train_resampled = pd.concat([y_train, y_train_1_resampled])

X_test = test_data.drop(columns=['post_no','day','meal_time','is_soldout', 'course', 'soldout','soldout_time','first_menu_unpreproc',\
                                   'first_menu','menu_no','only_menu_2','only_menu_2_soldout','course_no_c','only_menu_2_c','only_menu_2_soldout_c'])
y_test = test_data['is_soldout'].astype(int)

# 범주형 변수 인코딩 및 스케일링
numeric_features = ['course_kcal', 'course_protein', 'course_na','sim_menu','sim_menu_c','soldout_ratio','soldout_ratio_c']
categorical_features = ['day_of_week', 'course_no']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [19]:
# 로지스틱 회귀 모델 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
# 모델 훈련
model.fit(X_train_resampled, y_train_resampled)

# 예측
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.53125
              precision    recall  f1-score   support

           0       0.62      0.68      0.65        41
           1       0.32      0.26      0.29        23

    accuracy                           0.53        64
   macro avg       0.47      0.47      0.47        64
weighted avg       0.51      0.53      0.52        64



## RandomForest

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier  # 다른 모델을 사용하려면 해당 클래스를 임포트

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)
# 로지스틱 회귀 대신 랜덤 포레스트를 사용한 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 모델 훈련 및 평가
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


Accuracy: 0.640625
              precision    recall  f1-score   support

           0       0.70      0.76      0.73        41
           1       0.50      0.43      0.47        23

    accuracy                           0.64        64
   macro avg       0.60      0.60      0.60        64
weighted avg       0.63      0.64      0.63        64



In [21]:
# XGBoost 모델 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# 모델 훈련
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.59375
              precision    recall  f1-score   support

           0       0.67      0.73      0.70        41
           1       0.42      0.35      0.38        23

    accuracy                           0.59        64
   macro avg       0.54      0.54      0.54        64
weighted avg       0.58      0.59      0.58        64



Parameters: { "use_label_encoder" } are not used.



## LightGBM

In [22]:
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# LightGBM을 사용한 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(n_estimators=100, random_state=42))
])

# 모델 훈련 및 평가
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(f'# of soldout : ', y_pred.sum())


[LightGBM] [Info] Number of positive: 181, number of negative: 207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 388, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222
Accuracy: 0.515625
              precision    recall  f1-score   support

           0       0.62      0.63      0.63        41
           1       0.32      0.30      0.31        23

    accuracy                           0.52        64
   macro avg       0.47      0.47      0.47        64
weighted avg       0.51      0.52      0.51        64

# of soldout :  22


## 그래디언트 부스팅 머신 (Gradient Boosting Machine)

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# 그래디언트 부스팅을 사용한 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))
])

# 모델 훈련 및 평가
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(f'# of soldout : ', y_pred.sum())


Accuracy: 0.59375
              precision    recall  f1-score   support

           0       0.69      0.66      0.68        41
           1       0.44      0.48      0.46        23

    accuracy                           0.59        64
   macro avg       0.57      0.57      0.57        64
weighted avg       0.60      0.59      0.60        64

# of soldout :  25


In [24]:
from catboost import CatBoostClassifier

# CatBoost를 사용한 파이프라인
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(iterations=100, random_state=42, verbose=0))
])

# 모델 훈련 및 평가
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

# 결과 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(f'# of soldout : ', y_pred.sum())


Accuracy: 0.640625
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        41
           1       0.50      0.39      0.44        23

    accuracy                           0.64        64
   macro avg       0.60      0.59      0.59        64
weighted avg       0.63      0.64      0.63        64

# of soldout :  18


## Xgboost

In [25]:
# from xgboost import XGBClassifier

# # 예시 데이터 프레임 생성
# df = pd.read_csv(original_dir+'/data/preproc/main/feature_lab_v1/menu_w_soldout_w_soldout_ratio.csv')

# # 결측치 처리
# df.loc[:,'course_kcal'] = df['course_kcal'].fillna(df['course_kcal'].mean().astype(int))
# df.loc[:,'course_protein'] = df['course_protein'].fillna(df['course_protein'].mean().astype(int))
# df.loc[:, 'course_na'] = df['course_na'].apply(lambda x: int(x) if pd.notna(x) else np.nan)
# df.loc[:,'course_na'] = df['course_na'].fillna(df['course_na'].mean().astype(int))
# # df.loc[:,'soldout_ratio'] = df['soldout_ratio'].fillna(df['soldout_ratio'].mean().astype(int))
# # df.loc[:,'soldout_ratio_c'] = df['soldout_ratio_c'].fillna(df['soldout_ratio_c'].mean().astype(int))

# # 루트 화
# df['sim_menu'] = np.sqrt(df['sim_menu'])
# df['sim_menu_c'] = np.sqrt(df['sim_menu_c'])

# # # StandardScaler를 사용하여 표준 정규 분포로 변환
# columns_to_scale = ['course_kcal', 'course_protein', 'course_na']
# scaler = StandardScaler()
# df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


# # 특징과 타겟 변수 설정
# df['day'] = pd.to_datetime(df['day'])

# test_data = df[(df['day'] >= '2024-04-01')]
# train_data = df[(df['day'] < '2024-04-01')]

# # 피처와 타겟 분리
# X_train = train_data.drop(columns=['post_no','day','meal_time','is_soldout', 'course', 'soldout','soldout_time','first_menu_unpreproc',\
#                                    'first_menu','menu_no','only_menu_2','only_menu_2_soldout','course_no_c','only_menu_2_c','only_menu_2_soldout_c'])
# y_train = train_data['is_soldout'].astype(int)

# # y_train이 1인 샘플의 50%를 무작위로 샘플링하여 추가 (1.5배로 증강)
# X_train_1 = X_train[y_train == 1]
# y_train_1 = y_train[y_train == 1]

# X_train_1_resampled, y_train_1_resampled = resample(X_train_1, y_train_1,
#                                                      replace=True,
#                                                      n_samples=int(len(X_train_1) * 0.8),
#                                                      random_state=None)

# # 기존 X_train, y_train과 결합
# X_train_resampled = pd.concat([X_train, X_train_1_resampled])
# y_train_resampled = pd.concat([y_train, y_train_1_resampled])

# X_test = test_data.drop(columns=['post_no','day','meal_time','is_soldout', 'course', 'soldout','soldout_time','first_menu_unpreproc',\
#                                    'first_menu','menu_no','only_menu_2','only_menu_2_soldout','course_no_c','only_menu_2_c','only_menu_2_soldout_c'])
# y_test = test_data['is_soldout'].astype(int)

# # 범주형 변수 인코딩 및 스케일링
# numeric_features = ['course_kcal', 'course_protein', 'course_na','sim_menu','sim_menu_c','soldout_ratio','soldout_ratio_c']
# categorical_features = ['day_of_week', 'course_no']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_features),
#         ('cat', OneHotEncoder(), categorical_features)
#     ]
# )

# # XGBoost 모델 파이프라인
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
# ])

# # 모델 훈련
# model.fit(X_train, y_train)

# # 예측
# y_pred = model.predict(X_test)

# # 결과 평가
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy}')
# print(classification_report(y_test, y_pred))
