In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
import os

# Remember the directory the notebook was launched from. Guarding on
# `globals()` keeps the first value when this cell is re-run, so the chdir
# below always targets the same directory instead of climbing two more
# levels on every execution.
if 'original_dir' not in globals():
    original_dir = os.getcwd()

# Move two levels up from the launch directory
# (presumably the project root -- TODO confirm).
os.chdir(os.path.dirname(os.path.dirname(original_dir)))

# Run the shared setup script in the notebook namespace.
# NOTE(review): later cells use `pd` and `np` without importing them, so this
# script presumably provides them -- confirm. `exec` runs whatever is in that
# file; keep it limited to trusted, version-controlled code.
with open('setup/default.py') as setup_file:
    exec(setup_file.read())

In [11]:
# Load the preprocessed menu table; the path is built from the current working directory.
menu_csv_path = os.getcwd()+'/data/preproc/main/menu_w_soldout.csv'
menu_w_sold_out = pd.read_csv(menu_csv_path)

# menu_no = number of rows sharing the same (day, meal_time) pair.
rows_per_meal = menu_w_sold_out.groupby(['day', 'meal_time'])['day'].transform('size')
menu_w_sold_out['menu_no'] = rows_per_meal

# Keep dinner rows dated 2023-06-01 or later whose (day, meal_time) group
# contains more than one row.
is_dinner = menu_w_sold_out.meal_time == 'dinner'
has_multiple_menus = menu_w_sold_out.menu_no > 1
on_or_after_start = menu_w_sold_out.day >= '2023-06-01'
df = menu_w_sold_out[is_dinner & has_multiple_menus & on_or_after_start]


In [14]:
# Rebuild the working frame from `menu_w_sold_out`: dinner rows dated
# 2023-06-01 or later that have more than one row for their (day, meal_time).
dinner_rows = menu_w_sold_out.meal_time == 'dinner'
multi_menu_rows = menu_w_sold_out.menu_no > 1
recent_rows = menu_w_sold_out.day >= '2023-06-01'
df = menu_w_sold_out[dinner_rows & multi_menu_rows & recent_rows]

In [15]:
# Handle missing values.
# `df` comes from boolean-mask filtering of `menu_w_sold_out`; take an explicit
# copy so the assignments below write to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
df = df.copy()

# Calories and protein: fill gaps with the column mean truncated to an integer.
df.loc[:,'course_kcal'] = df['course_kcal'].fillna(df['course_kcal'].mean().astype(int))
df.loc[:,'course_protein'] = df['course_protein'].fillna(df['course_protein'].mean().astype(int))
# Sodium: convert present values with int() first (missing values stay NaN),
# then fill the gaps with the truncated mean of the converted column.
df.loc[:, 'course_na'] = df['course_na'].apply(lambda x: int(x) if pd.notna(x) else np.nan)
df.loc[:,'course_na'] = df['course_na'].fillna(df['course_na'].mean().astype(int))

# Feature matrix and target: drop identifier, date and sold-out columns from X.
X = df.drop(columns=['post_no','day','meal_time','is_soldout', 'course', 'soldout','soldout_time'])
y = df['is_soldout'].astype(int)

# Scaling for numeric columns, one-hot encoding for categorical columns.
numeric_features = ['course_kcal', 'course_protein', 'course_na']
categorical_features = ['day_of_week', 'course_no']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # The train/test split later in the notebook is by date, so test rows
        # can contain categories that never occur in the training window.
        # Encode those as all-zero columns instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [16]:
# Logistic-regression pipeline: the column preprocessor feeds the classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
]
model = Pipeline(steps=pipeline_steps)

In [17]:
# Parse `day` into datetimes so the range comparisons below are chronological.
# pandas has no `to_daytime`; the parser is `pd.to_datetime`.
# `assign` returns a new frame, so nothing is written into a filtered slice.
df = df.assign(day=pd.to_datetime(df['day']))

train_start = pd.Timestamp('2024-04-01')
train_end = pd.Timestamp('2024-05-31')

# Train set: April and May 2024 (both bounds inclusive).
train_data = df[(df['day'] >= train_start) & (df['day'] <= train_end)]

# Test set: every row outside April-May 2024.
test_data = df[(df['day'] < train_start) | (df['day'] > train_end)]

# Split features and target.
# No visible cell creates a `y` column; the target is `is_soldout`, cast to
# int exactly as when `y` is built earlier in the notebook. The dropped
# columns match the ones removed when building `X`, so `soldout` and
# `soldout_time` (which describe the outcome) are not kept as features.
target_column = 'is_soldout'
non_feature_columns = ['post_no', 'day', 'meal_time', 'is_soldout',
                       'course', 'soldout', 'soldout_time']

X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data[target_column].astype(int)

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data[target_column].astype(int)

In [18]:
# Train the model: fit the pipeline on the training split.
model.fit(X_train, y_train)

In [19]:

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate: overall accuracy first, then the per-class report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(report)

Accuracy: 0.6040268456375839
              precision    recall  f1-score   support

           0       0.59      0.62      0.60        73
           1       0.62      0.59      0.60        76

    accuracy                           0.60       149
   macro avg       0.60      0.60      0.60       149
weighted avg       0.60      0.60      0.60       149

