## 데이터 불러오기, 전처리

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from pycaret.classification import *
from pycaret.regression import *

import warnings

# 경고 메시지 숨기기
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 37) 
pd.set_option('display.max_rows', 1500) 

from matplotlib import font_manager
from matplotlib import rc
import joblib

from lightgbm import LGBMRegressor

# matplotlib의 기본 폰트 설정
plt.rcParams['font.family'] = 'DejaVu Sans'

In [2]:
df = pd.read_csv('250102_ACC_date_1001_1231_ver1.csv') ; df

Unnamed: 0,ID,ADWORD_ID,ADWORD_TYPE,CAMPAIGN_ID,ADVERTISER,DESCRIPTION,DATE,TIME,SPEND,DB_COUNT,CLICKS,IMPRESSIONS,REVENUE,SALES,ACT_DAYS
0,120214830521400033-1,120214830521400033,2,120214830521390033,68.0,인천부평점_오스템49%,2024-12-20,16:30,1132,0,0,47,-1132.0,0.0,1
1,120214830521400033-1,120214830521400033,2,120214830521390033,68.0,인천부평점_오스템49%,2024-12-20,16:45,2512,0,0,87,-2512.0,0.0,1
2,120214830521400033-1,120214830521400033,2,120214830521390033,68.0,인천부평점_오스템49%,2024-12-20,17:00,7183,0,1,208,-7183.0,0.0,1
3,120214830521400033-1,120214830521400033,2,120214830521390033,68.0,인천부평점_오스템49%,2024-12-20,17:15,7183,0,1,208,-7183.0,0.0,1
4,120214830521400033-1,120214830521400033,2,120214830521390033,68.0,인천부평점_오스템49%,2024-12-20,17:30,8481,0,1,256,-8481.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9628521,725177893914-1,725177893914,3,22020193868,142.0,부산점_메가젠33만원,2024-12-16,13:30,1560,0,1,237,-1560.0,0.0,5
9628522,725177893914-1,725177893914,3,22020193868,142.0,부산점_메가젠33만원,2024-12-16,13:45,1560,0,1,237,-1560.0,0.0,5
9628523,725177893914-1,725177893914,3,22020193868,142.0,부산점_메가젠33만원,2024-12-16,14:00,1560,0,1,237,-1560.0,0.0,5
9628524,725177893914-1,725177893914,3,22020193868,142.0,부산점_메가젠33만원,2024-12-16,14:15,1560,0,1,237,-1560.0,0.0,5


### AD 단위 누적합 +타임미닛 - 예산/단가 

In [3]:
df = df.sort_values(by=['ADWORD_ID', 'DATE', 'TIME']).reset_index(drop=True)

# DATE 열을 datetime 형식으로 변환
df['DATE'] = pd.to_datetime(df['DATE'])

# df = df.sort_values(by=['ADWORD_ID', 'DATE'])

# # 각 ADWORD_ID 그룹에 대해 시작일부터 현재 날짜까지의 일수를 계산합니다.
# df['ACT_DAYS'] = df.groupby('ADWORD_ID')['DATE'].transform(lambda x: (x - x.min()).dt.days + 1)


# TIME 열을 문자열로 변환 후 시간 형식으로 처리
df['TIME'] = df['TIME'].astype(str).str.slice(0, 5)
df['TIME'] = pd.to_datetime(df['TIME'], format='%H:%M').dt.time

# TIME을 분 단위로 변환
df['TIME_MINUTES'] = df['TIME'].apply(lambda x: x.hour * 60 + x.minute)

df['YEAR'] = df['DATE'].dt.year
df['MONTH'] = df['DATE'].dt.month
df['DAY'] = df['DATE'].dt.day
df['WEEK'] = df['DATE'].dt.dayofweek

In [4]:
df['ADVERTISER'] = df['ADVERTISER'].astype('category')

In [5]:
df['ADWORD_ID'] = df['ADWORD_ID'].astype('category')

### 노출 모델

In [7]:
# IMPRESSIONS 예측
shift_intervals = [1440, 2880, 4320, 5760, 7200, 8640, 10080]

# 각 시프트 간격에 대해 반복
for shift_interval in shift_intervals:
    
    # 시프트 값을 계산 (15분 단위로 나누기)
    shift_value = shift_interval // 15
    

    
    # 시프트 변환 적용
    df['IMPRESSIONS_H_LATER'] = df.groupby(['ADWORD_ID'])['IMPRESSIONS'].transform(lambda x: x.shift(-shift_value)).fillna(0)
    
    def fill_last_four_with_fifth_from_last(group):
        if len(group) >= shift_value:
            fill_value = group.iloc[-1]['IMPRESSIONS']  # 마지막 로우 값
            group.iloc[(-shift_value):, group.columns.get_loc('IMPRESSIONS_H_LATER')] = fill_value
        return group

    # 마지막 값을 채우는 함수 적용
    df = df.groupby(['ADWORD_ID']).apply(fill_last_four_with_fifth_from_last).reset_index(drop=True)

    

    # 피처와 타깃 정의
    X = df[['ADWORD_ID', 'ACT_DAYS', 'TIME_MINUTES', 'MONTH', 'WEEK',
           'ADVERTISER', 
           'CLICKS', 'SPEND', 'DB_COUNT', 'REVENUE','SALES']]
    y = df['IMPRESSIONS_H_LATER']

    # 데이터 분할
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 모델 초기화 및 학습
    lg = LGBMRegressor(random_state=42)
    lg.fit(X_tr, y_tr)

    # 모델 저장
    model_filename = f'250102_IMP_{shift_interval}M_VER1.pkl'
    joblib.dump(lg, model_filename)
    

    print(f"모델 {model_filename} 저장되었습니다.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.108614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 227559.723432
모델 250102_IMP_1440M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 232664.878402
모델 250102_IMP_2880M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.112317 s

### 클릭 모델

In [8]:
# CLICKS 예측
shift_intervals = [1440, 2880, 4320, 5760, 7200, 8640, 10080]

# 각 시프트 간격에 대해 반복
for shift_interval in shift_intervals:
    
    # 시프트 값을 계산 (15분 단위로 나누기)
    shift_value = shift_interval // 15
    

    
    # 시프트 변환 적용
    df['CLICKS_H_LATER'] = df.groupby(['ADWORD_ID'])['CLICKS'].transform(lambda x: x.shift(-shift_value)).fillna(0)
    
    def fill_last_four_with_fifth_from_last(group):
        if len(group) >= shift_value:
            fill_value = group.iloc[-1]['CLICKS']  # 마지막 로우 값
            group.iloc[(-shift_value):, group.columns.get_loc('CLICKS_H_LATER')] = fill_value
        return group

    # 마지막 값을 채우는 함수 적용
    df = df.groupby(['ADWORD_ID']).apply(fill_last_four_with_fifth_from_last).reset_index(drop=True)

    

    # 피처와 타깃 정의
    X = df[['ADWORD_ID', 'ACT_DAYS', 'TIME_MINUTES', 'MONTH', 'WEEK',
           'ADVERTISER', 
           'IMPRESSIONS', 'SPEND', 'DB_COUNT', 'REVENUE','SALES']]
    y = df['CLICKS_H_LATER']

    # 데이터 분할
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 모델 초기화 및 학습
    lg = LGBMRegressor(random_state=42)
    lg.fit(X_tr, y_tr)

    # 모델 저장
    model_filename = f'250102_CLK_{shift_interval}M_VER1.pkl'
    joblib.dump(lg, model_filename)


    print(f"모델 {model_filename} 저장되었습니다.")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 1620.906134
모델 250102_CLK_1440M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 1651.974849
모델 250102_CLK_2880M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102034 secon

### 지출 모델

In [9]:
# SPEND 예측
shift_intervals = [1440, 2880, 4320, 5760, 7200, 8640, 10080]

# 각 시프트 간격에 대해 반복
for shift_interval in shift_intervals:
    
    # 시프트 값을 계산 (15분 단위로 나누기)
    shift_value = shift_interval // 15
    

    
    # 시프트 변환 적용
    df['SPEND_H_LATER'] = df.groupby(['ADWORD_ID'])['SPEND'].transform(lambda x: x.shift(-shift_value)).fillna(0)
    
    def fill_last_four_with_fifth_from_last(group):
        if len(group) >= shift_value:
            fill_value = group.iloc[-1]['SPEND']  # 마지막 로우 값
            group.iloc[(-shift_value):, group.columns.get_loc('SPEND_H_LATER')] = fill_value
        return group

    # 마지막 값을 채우는 함수 적용
    df = df.groupby(['ADWORD_ID']).apply(fill_last_four_with_fifth_from_last).reset_index(drop=True)

    

    # 피처와 타깃 정의
    X = df[['ADWORD_ID', 'ACT_DAYS', 'TIME_MINUTES', 'MONTH', 'WEEK',
           'ADVERTISER', 
           'IMPRESSIONS', 'CLICKS', 'DB_COUNT', 'REVENUE','SALES']]
    y = df['SPEND_H_LATER']

    # 데이터 분할
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 모델 초기화 및 학습
    lg = LGBMRegressor(random_state=42)
    lg.fit(X_tr, y_tr)

    # 모델 저장
    model_filename = f'250102_SPD_{shift_interval}M_VER1.pkl'
    joblib.dump(lg, model_filename)

    print(f"모델 {model_filename} 저장되었습니다.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 1059145.621291
모델 250102_SPD_1440M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10914
[LightGBM] [Info] Number of data points in the train set: 7702820, number of used features: 11
[LightGBM] [Info] Start training from score 1092203.934327
모델 250102_SPD_2880M_VER1.pkl 저장되었습니다.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107998