In [None]:
import os
import gc
import ctypes
import joblib
from itertools import product
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
from sklearn.datasets import dump_svmlight_file
import xgboost as xgb
from xgboost import XGBRanker
from lightgbm import LGBMRanker, log_evaluation, early_stopping
import random
from itertools import product

In [None]:
# Read the three pre-processed CSV exports into df1, df2 and df3, in that order.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "광고목록_전처리.csv",
        "광고적립_전처리.csv",
        "진짜_시간별_1년치_전처리_0910.csv",
    )
)

In [None]:
# Keep only the columns used by the rest of the notebook.
df1 = df1.loc[:, ['ads_idx', 'ads_category', 'ads_type', 'ads_code']]
df2 = df2.loc[:, ['ads_idx', 'adv_cost', 'earn_cost', 'mda_idx']]
df3 = df3.loc[
    :,
    ['ads_idx', 'mda_idx', 'rpt_time_date', 'rpt_time_time', 'rpt_time_clk', 'rpt_time_turn'],
]

In [None]:
# Parse the report date; values that cannot be parsed become NaT.
df3['rpt_time_date'] = df3['rpt_time_date'].pipe(pd.to_datetime, errors='coerce')

In [None]:
# Inner-join df1 with df2 on the ad id.
df = df1.merge(df2, on="ads_idx", how="inner")

In [None]:
# Left-join df3 on the (ad, media) key pair.
df = df.merge(df3, on=["ads_idx", "mda_idx"], how="left")

In [None]:
# The merged frame is all that is needed from here on; drop the inputs and
# ask the garbage collector to reclaim their memory.
del df1
del df2
del df3
import gc
gc.collect()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
def reduce_memory_usage(df, verbose=True, allow_float16=False):
    """Shrink a DataFrame's memory footprint by downcasting its columns.

    The frame is modified in place and the same object is returned.

    Rules applied per column:
      * signed / unsigned NumPy integers -> smallest integer type of the same
        signedness whose inclusive range holds the column's min and max;
      * NumPy floats -> float32 when the value range fits (float16 only when
        ``allow_float16`` is true, because float16 keeps roughly 3 significant
        decimal digits and silently rounds most real-world values);
      * object columns -> ``category`` when fewer than half the values are unique;
      * everything else (bool, datetime64, timedelta64, extension dtypes such as
        ``category`` or nullable integers) is left untouched.

    Columns are never widened: a candidate type is only considered when it is
    strictly smaller than the current one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact (mutated in place).
    verbose : bool, default True
        Print the before/after memory usage in MB.
    allow_float16 : bool, default False
        Opt in to the lossy float16 downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes have no NumPy range to test against (and an
        # unordered categorical raises on .min()), so leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            num_total = len(df[col])
            # Guard the ratio against an empty column (division by zero).
            if num_total and df[col].nunique() / num_total < 0.5:
                df[col] = df[col].astype('category')
            continue

        if np.issubdtype(col_type, np.integer):
            is_unsigned = np.issubdtype(col_type, np.unsignedinteger)
            candidates = unsigned_types if is_unsigned else signed_types
            limits = np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates = float_types
            limits = np.finfo
        else:
            # bool, datetime64, timedelta64, complex: nothing sensible to downcast to.
            continue

        if df[col].empty:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered small -> large; stop once they are no
            # smaller than the current dtype so a column is never widened.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # An all-NaN column fails both comparisons and keeps its dtype.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Memory usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB '
              f'({reduction:.1f}% reduction)')
    return df

In [None]:
# margin = advertiser cost minus earned cost; margin_rate is that margin as a
# percentage of adv_cost, rounded to 2 decimals. A zero adv_cost yields +/-inf,
# which is mapped to NaN.
df["margin"] = df["adv_cost"].sub(df["earn_cost"])
margin_pct = df["margin"].div(df["adv_cost"]).mul(100).round(2)
df["margin_rate"] = margin_pct
df["margin_rate"] = df["margin_rate"].replace([np.inf, -np.inf], np.nan)

In [None]:
# Total adv_cost / earn_cost per (media, ad category) pair, plus the margin
# and margin rate (percent, 2 decimals) derived from those totals.
margin_by_media_category = (
    df.groupby(["mda_idx", "ads_category"])[["adv_cost", "earn_cost"]]
      .sum()
      .reset_index()
)

margin_by_media_category["margin"] = margin_by_media_category["adv_cost"].sub(
    margin_by_media_category["earn_cost"]
)

margin_by_media_category["margin_rate"] = (
    margin_by_media_category["margin"]
    .div(margin_by_media_category["adv_cost"])
    .mul(100)
    .round(2)
)

In [None]:
# Hour of day: non-numeric values are coerced to NaN and then filled with 0.
hour_of_day = pd.to_numeric(df['rpt_time_time'], errors="coerce").fillna(0).astype('int16')
df['hour'] = hour_of_day

# Four day-parts: 0 = hours 0-5, 1 = 6-11, 2 = 12-17, 3 = 18-23.
day_part = pd.cut(df['hour'], bins=[-1, 5, 11, 17, 23], labels=[0, 1, 2, 3])
df['time_period_encoded'] = day_part.astype('int8')

# CVR = conversions per click in percent; NaN when there are no clicks or the
# conversion count is missing.
has_clicks = (df['rpt_time_clk'] > 0) & df['rpt_time_turn'].notna()
cvr_pct = df['rpt_time_turn'] / df['rpt_time_clk'] * 100
df['CVR'] = np.where(has_clicks, cvr_pct, np.nan).astype('float32')

# Calendar features: Monday = 0 ... Sunday = 6; weekend means day_of_week >= 5.
df['rpt_time_date'] = df['rpt_time_date'].pipe(pd.to_datetime, errors="coerce")
df['day_of_week'] = df['rpt_time_date'].dt.weekday.astype('int8')
df['is_weekend'] = df['day_of_week'].ge(5).astype('int8')

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Parse the report date again; values that cannot be parsed become NaT.
df['rpt_time_date'] = df['rpt_time_date'].pipe(pd.to_datetime, errors='coerce')

In [None]:
# Remove the raw date / hour columns when they are present.
for raw_time_col in ('rpt_time_date', 'rpt_time_time'):
    if raw_time_col in df.columns:
        del df[raw_time_col]

In [None]:
# Downcast every int64 / float64 column to the smallest type pandas will pick.
for int_col in df.select_dtypes(include=["int64"]).columns:
    df[int_col] = pd.to_numeric(df[int_col], downcast="integer")
for float_col in df.select_dtypes(include=["float64"]).columns:
    df[float_col] = pd.to_numeric(df[float_col], downcast="float")

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Keep only rows with a positive rpt_time_turn.
df = df.loc[df['rpt_time_turn'].gt(0)]

In [None]:
# Count the NaN and infinite values left in margin_rate.
margin_rate_col = df['margin_rate']
print("NaN 개수:", margin_rate_col.isna().sum())
print("Inf 개수:", np.isinf(margin_rate_col).sum())

In [None]:
# Downcast every int64 / float64 column to the smallest type pandas will pick.
for int_col in df.select_dtypes(include=["int64"]).columns:
    df[int_col] = pd.to_numeric(df[int_col], downcast="integer")
for float_col in df.select_dtypes(include=["float64"]).columns:
    df[float_col] = pd.to_numeric(df[float_col], downcast="float")

In [None]:
# Show the dtype of every column (this cell follows the numeric downcast loop).
df.dtypes

In [None]:
# df1, df2 and df3 were already deleted right after the merges, so deleting
# them again would raise NameError on a top-to-bottom run. Only the
# garbage-collection pass is kept.
import gc
gc.collect()

In [None]:
# Columns that every exported dataset must contain.
req_cols=["mda_idx","CVR","adv_cost","earn_cost","margin","margin_rate","ads_code","ads_type","ads_category"]

# Optional columns are carried along only when they exist in df.
# time_period_encoded is the name the feature-engineering cell writes, and
# target_score is read by the ranker-training cells.
# NOTE(review): no visible cell creates target_score - confirm where it is computed.
for optional_col in ("time_period", "time_period_encoded", "target_score"):
    if optional_col in df.columns:
        req_cols.append(optional_col)
df=df[req_cols].copy()

# Downcast whatever is still stored as 64-bit.
for c in df.select_dtypes(include=["float64"]).columns:
    df[c]=pd.to_numeric(df[c],downcast="float")
for c in df.select_dtypes(include=["int64"]).columns:
    df[c]=pd.to_numeric(df[c],downcast="integer")

# Missing or non-numeric CVR / margin_rate values count as 0.
df["CVR"]=pd.to_numeric(df["CVR"],errors="coerce").fillna(0.0)
df["margin_rate"]=pd.to_numeric(df["margin_rate"],errors="coerce").fillna(0.0)

# Per-media statistics: row count and mean CVR.
mda_stats=(df.groupby("mda_idx")
             .agg(n_rows=("mda_idx","size"),mean_cvr=("CVR","mean"))
             .reset_index())

# A media is "high performance" / "big" when it reaches the 70th percentile
# of mean CVR / row count across all media.
cvr_thresh=mda_stats["mean_cvr"].quantile(0.7)
rows_thresh=mda_stats["n_rows"].quantile(0.7)

def assign_group(row):
    """Map one mda_stats row to its segment label (A/B/C/D) using the two thresholds."""
    if row["mean_cvr"]>=cvr_thresh and row["n_rows"]>=rows_thresh:
        return "A_high_perf_big"
    elif row["mean_cvr"]>=cvr_thresh:
        return "B_high_perf_small"
    elif row["n_rows"]>=rows_thresh:
        return "C_low_perf_big"
    else:
        return "D_low_perf_small"

mda_stats["group"]=mda_stats.apply(assign_group,axis=1)
df=df.merge(mda_stats[["mda_idx","group"]],on="mda_idx",how="left")

# Integer-encode the categorical ad attributes (values are stringified first).
le_ads=LabelEncoder().fit(df["ads_code"].astype(str))
le_type=LabelEncoder().fit(df["ads_type"].astype(str))
le_cat=LabelEncoder().fit(df["ads_category"].astype(str))
le_time=None
if "time_period" in df.columns:
    le_time=LabelEncoder().fit(df["time_period"].astype(str))

df["ads_code_encoded"]=le_ads.transform(df["ads_code"].astype(str))
df["ads_type_encoded"]=le_type.transform(df["ads_type"].astype(str))
df["ads_category_encoded"]=le_cat.transform(df["ads_category"].astype(str))
if le_time is not None:
    df["time_period_encoded"]=le_time.transform(df["time_period"].astype(str))

# Write one parquet file per segment. groupby yields only non-empty groups,
# in order of first appearance, and splits the frame in a single pass.
for g, gdf in df.groupby("group", sort=False):
    fname=f"dataset_{g}.parquet"
    gdf.to_parquet(fname,index=False)
    print(f"{g}: {fname} 저장 완료 (rows={len(gdf)})")

gc.collect()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
from xgboost.callback import EarlyStopping
import joblib

# Train the ranker for segment A (high performance, big media).
# The frame is named df_A so that the `del df_A` cell below and the other
# segment cells (df_B / df_C / df_D) use one naming convention.
df_A = pd.read_parquet("dataset_A_high_perf_big.parquet")

feat_cols = ["ads_code_encoded","ads_type_encoded","ads_category_encoded",
             "adv_cost","earn_cost","margin","margin_rate"]
X = df_A[feat_cols]

# Relevance label = tercile of target_score (0 = lowest bucket).
# target_score must already be present in the parquet file; this cell does not create it.
df_A["label"] = pd.qcut(df_A["target_score"], q=3, labels=False, duplicates="drop")
y = df_A["label"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost reads `group` as run lengths over consecutive rows, so all rows of
# one mda_idx must be contiguous and ordered like the group-size array.
# train_test_split shuffles rows, so re-order both splits by mda_idx first;
# groupby(sort=True) then returns sizes in the same ascending order.
train_order = df_A.loc[X_train.index, "mda_idx"].sort_values(kind="stable").index
test_order = df_A.loc[X_test.index, "mda_idx"].sort_values(kind="stable").index
X_train, y_train = X_train.loc[train_order], y_train.loc[train_order]
X_test, y_test = X_test.loc[test_order], y_test.loc[test_order]

group_train = df_A.loc[train_order].groupby("mda_idx", sort=True).size().to_numpy()
group_test  = df_A.loc[test_order].groupby("mda_idx", sort=True).size().to_numpy()

ranker_A = XGBRanker(
    objective="rank:ndcg",
    eval_metric="ndcg@10",
    tree_method="hist",
    n_estimators=100,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)

ranker_A.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    verbose=True)

joblib.dump(ranker_A, "xgb_ranker_A.pkl")

In [None]:
# Release the segment-A training frame. The frame may be bound as either
# df_a or df_A, so drop whichever name exists instead of raising NameError.
for _frame_name in ("df_a", "df_A"):
    globals().pop(_frame_name, None)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
from xgboost.callback import EarlyStopping
import joblib

# Train the ranker for segment B (high performance, small media).
df_B = pd.read_parquet("dataset_B_high_perf_small.parquet")

feat_cols = ["ads_code_encoded","ads_type_encoded","ads_category_encoded",
             "adv_cost","earn_cost","margin","margin_rate"]
X = df_B[feat_cols]

# Relevance label = tercile of target_score (0 = lowest bucket).
# target_score must already be present in the parquet file; this cell does not create it.
df_B["label"] = pd.qcut(df_B["target_score"], q=3, labels=False, duplicates="drop")
y = df_B["label"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost reads `group` as run lengths over consecutive rows, so all rows of
# one mda_idx must be contiguous and ordered like the group-size array.
# train_test_split shuffles rows, so re-order both splits by mda_idx first;
# groupby(sort=True) then returns sizes in the same ascending order.
train_order = df_B.loc[X_train.index, "mda_idx"].sort_values(kind="stable").index
test_order = df_B.loc[X_test.index, "mda_idx"].sort_values(kind="stable").index
X_train, y_train = X_train.loc[train_order], y_train.loc[train_order]
X_test, y_test = X_test.loc[test_order], y_test.loc[test_order]

group_train = df_B.loc[train_order].groupby("mda_idx", sort=True).size().to_numpy()
group_test  = df_B.loc[test_order].groupby("mda_idx", sort=True).size().to_numpy()

ranker_B = XGBRanker(
    objective="rank:ndcg",
    eval_metric="ndcg@10",
    tree_method="hist",
    n_estimators=300,
    max_depth=4,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)

ranker_B.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    verbose=True)

joblib.dump(ranker_B, "xgb_ranker_B.pkl")

In [None]:
# Unbind the segment-B frame loaded by the preceding training cell.
del df_B

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
import joblib

# Train the ranker for segment C (low performance, big media).
df_C = pd.read_parquet("dataset_C_low_perf_big.parquet")

feat_cols = ["ads_code_encoded","ads_type_encoded","ads_category_encoded",
             "adv_cost","earn_cost","margin","margin_rate"]
X = df_C[feat_cols]

# Relevance label = octile of target_score (q=8 here, unlike the q=3 used for
# the other segments); 0 = lowest bucket.
# target_score must already be present in the parquet file; this cell does not create it.
df_C["label"] = pd.qcut(df_C["target_score"], q=8, labels=False, duplicates="drop").astype(int)
y = df_C["label"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost reads `group` as run lengths over consecutive rows, so all rows of
# one mda_idx must be contiguous and ordered like the group-size array.
# train_test_split shuffles rows, so re-order both splits by mda_idx first;
# groupby(sort=True) then returns sizes in the same ascending order.
train_order = df_C.loc[X_train.index, "mda_idx"].sort_values(kind="stable").index
test_order = df_C.loc[X_test.index, "mda_idx"].sort_values(kind="stable").index
X_train, y_train = X_train.loc[train_order], y_train.loc[train_order]
X_test, y_test = X_test.loc[test_order], y_test.loc[test_order]

group_train = df_C.loc[train_order].groupby("mda_idx", sort=True).size().to_numpy()
group_test  = df_C.loc[test_order].groupby("mda_idx", sort=True).size().to_numpy()

ranker_C = XGBRanker(
    objective="rank:ndcg",
    eval_metric="ndcg@10",
    tree_method="hist",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=10,
    reg_lambda=1.0,
    reg_alpha=0.1,
    random_state=42)

ranker_C.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    verbose=True)

joblib.dump(ranker_C, "xgb_ranker_C.pkl")

In [None]:
# Unbind the segment-C frame loaded by the preceding training cell.
del df_C

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
import joblib

# Train the ranker for segment D (low performance, small media).
df_D = pd.read_parquet("dataset_D_low_perf_small.parquet")

feat_cols = ["ads_code_encoded","ads_type_encoded","ads_category_encoded",
             "adv_cost","earn_cost","margin","margin_rate"]
X = df_D[feat_cols]

# Relevance label = tercile of target_score (0 = lowest bucket).
# target_score must already be present in the parquet file; this cell does not create it.
df_D["label"] = pd.qcut(df_D["target_score"], q=3, labels=False, duplicates="drop")
y = df_D["label"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost reads `group` as run lengths over consecutive rows, so all rows of
# one mda_idx must be contiguous and ordered like the group-size array.
# train_test_split shuffles rows, so re-order both splits by mda_idx first;
# groupby(sort=True) then returns sizes in the same ascending order.
train_order = df_D.loc[X_train.index, "mda_idx"].sort_values(kind="stable").index
test_order = df_D.loc[X_test.index, "mda_idx"].sort_values(kind="stable").index
X_train, y_train = X_train.loc[train_order], y_train.loc[train_order]
X_test, y_test = X_test.loc[test_order], y_test.loc[test_order]

group_train = df_D.loc[train_order].groupby("mda_idx", sort=True).size().to_numpy()
group_test  = df_D.loc[test_order].groupby("mda_idx", sort=True).size().to_numpy()

ranker_D = XGBRanker(
    objective="rank:ndcg",
    eval_metric="ndcg@10",
    tree_method="hist",
    n_estimators=200,
    max_depth=7,
    learning_rate=0.04,
    subsample=0.9,  # the trailing comma was missing here, which made the cell a SyntaxError
    colsample_bytree=0.9,
    min_child_weight=5,
    reg_alpha=0.2,
    reg_lambda=8.0,
    gamma=0.5,
    random_state=42)

ranker_D.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    verbose=True)

joblib.dump(ranker_D, "xgb_ranker_D.pkl")

In [None]:
import pandas as pd
import joblib
import shap
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

# Feature-importance report for segment A: reload the saved ranker and its
# dataset, then show gain importance and SHAP attributions.
ranker_A = joblib.load("xgb_ranker_A.pkl")
df_A = pd.read_parquet("dataset_A_high_perf_big.parquet")

feat_cols = [
    "ads_code_encoded", "ads_type_encoded", "ads_category_encoded",
    "adv_cost", "earn_cost", "margin", "margin_rate",
]
X_A = df_A.loc[:, feat_cols]

# SHAP is computed on every row; subsample X_A first if that is too slow.

# Gain-based importance straight from the booster.
booster = ranker_A.get_booster()
importance_gain = booster.get_score(importance_type="gain")
print("Gain 기반 중요도:", importance_gain)

xgb.plot_importance(ranker_A, importance_type="gain")
plt.show()

# SHAP values for the full feature matrix.
explainer = shap.TreeExplainer(ranker_A)
shap_values = explainer.shap_values(X_A)

shap.summary_plot(shap_values, X_A)
for dep_feature in ("margin_rate", "adv_cost"):
    shap.dependence_plot(dep_feature, shap_values, X_A)

# Rank features by mean absolute SHAP value.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
shap_importance = (
    pd.DataFrame({"feature": X_A.columns, "mean_abs_shap": mean_abs_shap})
      .sort_values("mean_abs_shap", ascending=False)
)
print("\n평균 절대 SHAP 값 기반 중요도:")
print(shap_importance)

print("\n=== 인사이트 요약 ===")
top_feat = shap_importance["feature"].iloc[0]
print(f"모델이 가장 민감하게 반응하는 변수는 '{top_feat}' 입니다.")

for feat, score in zip(shap_importance["feature"], shap_importance["mean_abs_shap"]):
    print(f"- {feat}: 평균 SHAP 영향력 {score:.4f}")

In [None]:
import pandas as pd
import joblib
import shap
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

# Feature-importance report for segment B: reload the saved ranker and its
# dataset, then show gain importance and SHAP attributions.
ranker_B = joblib.load("xgb_ranker_B.pkl")
df_B = pd.read_parquet("dataset_B_high_perf_small.parquet")

feat_cols = [
    "ads_code_encoded", "ads_type_encoded", "ads_category_encoded",
    "adv_cost", "earn_cost", "margin", "margin_rate",
]
X_B = df_B.loc[:, feat_cols]

# SHAP is computed on every row; subsample X_B first if that is too slow.

# Gain-based importance straight from the booster.
booster = ranker_B.get_booster()
importance_gain = booster.get_score(importance_type="gain")
print("Gain 기반 중요도:", importance_gain)

xgb.plot_importance(ranker_B, importance_type="gain")
plt.show()

# SHAP values for the full feature matrix.
explainer = shap.TreeExplainer(ranker_B)
shap_values = explainer.shap_values(X_B)

shap.summary_plot(shap_values, X_B)
for dep_feature in ("margin_rate", "adv_cost"):
    shap.dependence_plot(dep_feature, shap_values, X_B)

# Rank features by mean absolute SHAP value.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
shap_importance = (
    pd.DataFrame({"feature": X_B.columns, "mean_abs_shap": mean_abs_shap})
      .sort_values("mean_abs_shap", ascending=False)
)
print("\n평균 절대 SHAP 값 기반 중요도:")
print(shap_importance)

print("\n=== 인사이트 요약 ===")
top_feat = shap_importance["feature"].iloc[0]
print(f"모델이 가장 민감하게 반응하는 변수는 '{top_feat}' 입니다.")

for feat, score in zip(shap_importance["feature"], shap_importance["mean_abs_shap"]):
    print(f"- {feat}: 평균 SHAP 영향력 {score:.4f}")

In [None]:
import pandas as pd
import joblib
import shap
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

# Feature-importance report for segment C: reload the saved ranker and its
# dataset, then show gain importance and SHAP attributions.
ranker_C = joblib.load("xgb_ranker_C.pkl")
df_C = pd.read_parquet("dataset_C_low_perf_big.parquet")

feat_cols = [
    "ads_code_encoded", "ads_type_encoded", "ads_category_encoded",
    "adv_cost", "earn_cost", "margin", "margin_rate",
]
X_C = df_C.loc[:, feat_cols]

# SHAP is computed on every row; subsample X_C first if that is too slow.

# Gain-based importance straight from the booster.
booster = ranker_C.get_booster()
importance_gain = booster.get_score(importance_type="gain")
print("Gain 기반 중요도:", importance_gain)

xgb.plot_importance(ranker_C, importance_type="gain")
plt.show()

# SHAP values for the full feature matrix.
explainer = shap.TreeExplainer(ranker_C)
shap_values = explainer.shap_values(X_C)

shap.summary_plot(shap_values, X_C)
for dep_feature in ("margin_rate", "adv_cost"):
    shap.dependence_plot(dep_feature, shap_values, X_C)

# Rank features by mean absolute SHAP value.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
shap_importance = (
    pd.DataFrame({"feature": X_C.columns, "mean_abs_shap": mean_abs_shap})
      .sort_values("mean_abs_shap", ascending=False)
)
print("\n평균 절대 SHAP 값 기반 중요도:")
print(shap_importance)

print("\n=== 인사이트 요약 ===")
top_feat = shap_importance["feature"].iloc[0]
print(f"모델이 가장 민감하게 반응하는 변수는 '{top_feat}' 입니다.")

for feat, score in zip(shap_importance["feature"], shap_importance["mean_abs_shap"]):
    print(f"- {feat}: 평균 SHAP 영향력 {score:.4f}")

In [None]:
import pandas as pd
import joblib
import shap
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

# Feature-importance report for segment D: reload the saved ranker and its
# dataset, then show gain importance and SHAP attributions.
ranker_D = joblib.load("xgb_ranker_D.pkl")
df_D = pd.read_parquet("dataset_D_low_perf_small.parquet")

feat_cols = [
    "ads_code_encoded", "ads_type_encoded", "ads_category_encoded",
    "adv_cost", "earn_cost", "margin", "margin_rate",
]
X_D = df_D.loc[:, feat_cols]

# SHAP is computed on every row; subsample X_D first if that is too slow.

# Gain-based importance straight from the booster.
booster = ranker_D.get_booster()
importance_gain = booster.get_score(importance_type="gain")
print("Gain 기반 중요도:", importance_gain)

xgb.plot_importance(ranker_D, importance_type="gain")
plt.show()

# SHAP values for the full feature matrix.
explainer = shap.TreeExplainer(ranker_D)
shap_values = explainer.shap_values(X_D)

shap.summary_plot(shap_values, X_D)
for dep_feature in ("margin_rate", "adv_cost"):
    shap.dependence_plot(dep_feature, shap_values, X_D)

# Rank features by mean absolute SHAP value.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
shap_importance = (
    pd.DataFrame({"feature": X_D.columns, "mean_abs_shap": mean_abs_shap})
      .sort_values("mean_abs_shap", ascending=False)
)
print("\n평균 절대 SHAP 값 기반 중요도:")
print(shap_importance)

print("\n=== 인사이트 요약 ===")
top_feat = shap_importance["feature"].iloc[0]
print(f"모델이 가장 민감하게 반응하는 변수는 '{top_feat}' 입니다.")

for feat, score in zip(shap_importance["feature"], shap_importance["mean_abs_shap"]):
    print(f"- {feat}: 평균 SHAP 영향력 {score:.4f}")

In [None]:
import pandas as pd
# Reload the four segment datasets and tag each with a one-letter group code.
df_A = pd.read_parquet("dataset_A_high_perf_big.parquet")
df_A["group"] = "A"
df_B = pd.read_parquet("dataset_B_high_perf_small.parquet")
df_B["group"] = "B"
df_C = pd.read_parquet("dataset_C_low_perf_big.parquet")
df_C["group"] = "C"
df_D = pd.read_parquet("dataset_D_low_perf_small.parquet")
df_D["group"] = "D"

df = pd.concat([df_A, df_B, df_C, df_D], ignore_index=True)

# Ratio metrics; a zero adv_cost is replaced by 1 in the denominator.
safe_adv_cost = df["adv_cost"].replace(0, 1)
df["ROAS"] = df["earn_cost"] / safe_adv_cost
df["profitability"] = (df["earn_cost"] - df["adv_cost"]) / safe_adv_cost

# Fall back to target_score when no model_score column exists.
if "target_score" in df.columns and "model_score" not in df.columns:
    df["model_score"] = df["target_score"]

# Aggregate per (group, media, category, type) combination.
combo_keys = ["group", "mda_idx", "ads_category", "ads_type"]
rank_keys = ["mean_score", "mean_roas", "mean_margin_rate", "total_earn"]

combo_summary = (
    df.groupby(combo_keys)
      .agg(
          mean_margin_rate=("margin_rate", "mean"),
          mean_roas=("ROAS", "mean"),
          total_earn=("earn_cost", "sum"),
          total_adv=("adv_cost", "sum"),
          mean_score=("model_score", "mean"),
      )
      .reset_index()
)

# Best combination per (group, media): first row after ranking every key descending.
ranked_combos = combo_summary.sort_values(by=rank_keys, ascending=[False] * len(rank_keys))
best_combos = (
    ranked_combos.groupby(["group", "mda_idx"])
                 .head(1)
                 .reset_index(drop=True)
)

best_combos_sorted = best_combos.sort_values(
    by=["group"] + rank_keys,
    ascending=[True] + [False] * len(rank_keys),
)

pd.set_option("display.max_rows", None)

print("\n=== 그룹별 mda_idx 최고의 조합 (카테고리 + 타입) ===")
report_cols = ["group", "mda_idx", "ads_category", "ads_type"] + rank_keys
print(best_combos_sorted[report_cols])

In [None]:
import pandas as pd
import joblib
import xgboost as xgb

def recommend_media(model_path, df_input, top_n=5):
    """Score candidate ads with a saved ranker and return the top ones per media.

    Parameters
    ----------
    model_path : str
        Path to a joblib-pickled ranker exposing ``get_booster()``.
    df_input : pandas.DataFrame
        Candidate rows. Must contain ``mda_idx``, ``ads_code_encoded``,
        ``ads_type_encoded``, ``ads_category_encoded``, ``adv_cost`` and
        ``earn_cost``. The frame is not modified.
    top_n : int, default 5
        Maximum number of rows returned per ``mda_idx``.

    Returns
    -------
    pandas.DataFrame
        Columns ``mda_idx``, ``ads_code_encoded`` and ``pred_score``, ordered
        by ``mda_idx`` ascending and ``pred_score`` descending.
    """
    # SECURITY: joblib.load unpickles arbitrary objects - only load trusted model files.
    ranker = joblib.load(model_path)

    # Work on a copy ordered by mda_idx: the caller's frame stays untouched
    # and each media's rows are contiguous, matching the group-size array below.
    scored = df_input.sort_values("mda_idx", kind="stable").copy()

    # Derive margin features the same way as the training data: margin_rate
    # is a percentage rounded to 2 decimals, and non-finite results become 0.
    scored["margin"] = scored["adv_cost"] - scored["earn_cost"]
    margin_rate = (scored["margin"] / scored["adv_cost"] * 100).round(2)
    scored["margin_rate"] = (
        margin_rate.replace([float("inf"), float("-inf")], float("nan")).fillna(0.0)
    )

    if scored.empty:
        # Nothing to score; return an empty result with the expected columns.
        scored["pred_score"] = pd.Series(dtype="float32")
        return scored[["mda_idx", "ads_code_encoded", "pred_score"]]

    feat_cols = [
        "ads_code_encoded","ads_type_encoded","ads_category_encoded",
        "adv_cost","earn_cost","margin","margin_rate"]
    X = scored[feat_cols]

    # Run lengths per media, in the same ascending mda_idx order as the rows.
    groups = scored.groupby("mda_idx", sort=True).size().to_numpy()

    dtest = xgb.DMatrix(X)
    dtest.set_group(groups)

    scored["pred_score"] = ranker.get_booster().predict(dtest)

    recommendations = (
        scored.sort_values(["mda_idx","pred_score"], ascending=[True,False])
              .groupby("mda_idx")
              .head(top_n))
    return recommendations[["mda_idx","ads_code_encoded","pred_score"]]

In [None]:
# df_new = pd.DataFrame({
#     "mda_idx": [1,1,1,2,2],
#     "ads_code_encoded": [101,102,103,201,202],
#     "ads_type_encoded": [1,2,1,3,2],
#     "ads_category_encoded": [10,11,12,20,21],
#     "adv_cost": [120.0,150.0,90.0,200.0,180.0],
#     "earn_cost": [70.0,80.0,50.0,120.0,100.0]})

# recommendations = recommend_media("xgb_ranker_A.pkl", df_new, top_n=2)

# print(recommendations)