In [None]:
import sys
# Record the Python interpreter version used for this run (printed as the cell's output).
print(sys.version)

In [None]:
!pip install catboost==1.2.8
!pip install optuna==4.3.0

In [None]:
import pandas as pd
import numpy as np
import catboost
import sklearn
import optuna

import pprint

# Map each library's display name to the version string of the imported module,
# then pretty-print the mapping so the run's environment is visible in the output.
versions = {
    label: module.__version__
    for label, module in (
        ("pandas", pd),
        ("numpy", np),
        ("catboost", catboost),
        ("sklearn", sklearn),
        ("optuna", optuna),
    )
}

pprint.pprint(versions)

In [None]:
# Write the currently installed package versions to requirements.txt.
pip freeze > requirements.txt
# The file-preparation steps use a v2-8 TPU runtime.

import pandas as pd
from google.colab import drive
import os

# Mount Google Drive at /content/drive so the data files under MyDrive are reachable.
drive.mount('/content/drive')

In [None]:
# Drive directory that holds the monthly parquet inputs and receives the merged CSV outputs.
drive_folder = '/content/drive/MyDrive/base_file/'

# Month suffixes; combined with the "2018" prefix to build each monthly file name.
months = ["07", "08", "09", "10", "11", "12"]

# Category names exactly as they appear in the input/output file names (runtime strings, do not translate).
categories = ["회원정보", "신용정보", "승인매출정보", "청구입금정보", "잔액정보", "채널정보", "마케팅정보", "성과정보"]
# Dataset splits; each split/category pair is merged into one CSV.
data_types = ["train", "test"]

def merge_monthly_data(data_type, category):
    """Concatenate the monthly parquet files of one split/category into a single CSV.

    For every entry of the module-level `months`, reads
    `{drive_folder}2018{month}_{data_type}_{category}.parquet`. Missing files are
    reported and skipped (best effort), so the output may cover fewer months.
    The concatenated rows are written to `{drive_folder}{data_type}_{category}.csv`.

    Args:
        data_type: Split name used in the file names (e.g. "train" or "test").
        category: Category name used in the file names.

    Returns:
        None. Progress and results are reported via print().
    """
    monthly_frames = []

    for month in months:
        parquet_path = f"{drive_folder}2018{month}_{data_type}_{category}.parquet"
        try:
            monthly_frames.append(pd.read_parquet(parquet_path, engine="pyarrow"))
        except FileNotFoundError:
            print(f"⚠️ 파일 없음: {parquet_path}")
            continue
        print(f"✅ {parquet_path} 변환 완료")

    # Nothing could be read for this split/category: report it and write no file.
    if not monthly_frames:
        print(f"❌ {data_type}_{category} 데이터 없음")
        return

    combined = pd.concat(monthly_frames, ignore_index=True)
    csv_path = f"{drive_folder}{data_type}_{category}.csv"
    combined.to_csv(csv_path, index=False)
    print(f"✅ {csv_path} 저장 완료 (Shape: {combined.shape})")

# Merge every (split, category) combination; all categories of one split are
# processed before moving on to the next split.
for data_type, category in [(split, name) for split in data_types for name in categories]:
    merge_monthly_data(data_type, category)

In [None]:
# Build the un-cleaned training table: left-join every category CSV onto the first
# one on (ID, 기준년월) and save the result as base_train.csv.
base_path = "/content/drive/MyDrive/base_file/"

file_names = [
    "train_회원정보.csv",
    "train_신용정보.csv",
    "train_승인매출정보.csv",
    "train_청구입금정보.csv",
    "train_잔액정보.csv",
    "train_채널정보.csv",
    "train_마케팅정보.csv",
    "train_성과정보.csv"
]

merge_keys = ["ID", "기준년월"]
total_files = len(file_names)

# The first file is the left-most table of the join chain.
df = pd.read_csv(base_path + file_names[0])

for position in range(1, total_files):
    file = file_names[position]
    idx = position + 1  # 1-based file number shown in the progress message
    print(f"\n🔹 병합 중: {file} ({idx}/{total_files})")
    temp_df = pd.read_csv(base_path + file)

    # Left join: every row of the accumulated table is kept.
    df = pd.merge(df, temp_df, how="left", on=merge_keys)
    print(f"✅ 병합 후 크기: {df.shape}")

output_file = base_path + "base_train.csv"
df.to_csv(output_file, index=False)

row_count, column_count = df.shape
print(f"\n✅ 최종 데이터 저장 완료: {output_file}")
print(f"🧾 최종 데이터 크기: {row_count}행, {column_count}열")

In [None]:
# Build the un-cleaned test table: left-join every category CSV onto the first
# one on (ID, 기준년월) and save the result as base_test.csv.
base_path = "/content/drive/MyDrive/base_file/"

file_names = [
    "test_회원정보.csv",
    "test_신용정보.csv",
    "test_승인매출정보.csv",
    "test_청구입금정보.csv",
    "test_잔액정보.csv",
    "test_채널정보.csv",
    "test_마케팅정보.csv",
    "test_성과정보.csv"
]

merge_keys = ["ID", "기준년월"]
total_files = len(file_names)

# The first file is the left-most table of the join chain.
df = pd.read_csv(base_path + file_names[0])

for position in range(1, total_files):
    file = file_names[position]
    idx = position + 1  # 1-based file number shown in the progress message
    print(f"\n🔹 병합 중: {file} ({idx}/{total_files})")
    temp_df = pd.read_csv(base_path + file)

    # Left join: every row of the accumulated table is kept.
    df = pd.merge(df, temp_df, how="left", on=merge_keys)
    print(f"✅ 병합 후 크기: {df.shape}")

output_file = base_path + "base_test.csv"
df.to_csv(output_file, index=False)

row_count, column_count = df.shape
print(f"\n✅ 최종 데이터 저장 완료: {output_file}")
print(f"🧾 최종 데이터 크기: {row_count}행, {column_count}열")

In [None]:
# Build the cleaned training table: merge the category CSVs one by one on (ID, 기준년월)
# and, after each merge, drop single-valued columns and columns that duplicate an earlier
# column. The result is saved as base_clean_train.csv.
base_path = "/content/drive/MyDrive/base_file/"

file_names = [
    "train_회원정보.csv",
    "train_신용정보.csv",
    "train_승인매출정보.csv",
    "train_청구입금정보.csv",
    "train_잔액정보.csv",
    "train_채널정보.csv",
    "train_마케팅정보.csv",
    "train_성과정보.csv"
]

df = pd.read_csv(base_path + file_names[0])

# Shape of the first file alone; only used for the summary printed at the end.
original_shape = df.shape

for idx, file in enumerate(file_names[1:], start=2):
    print(f"\n🔹 병합 진행 중: {file} (파일 {idx} / {len(file_names)})")

    temp_df = pd.read_csv(base_path + file)
    df = df.merge(temp_df, how="left", on=["ID", "기준년월"])
    print(f"✅ 병합 후 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")

    # Columns with exactly one distinct value carry no information and are dropped.
    # NOTE(review): Series.nunique() ignores NaN by default, so a column holding one value
    # plus missing entries is also treated as constant — confirm that is intended.
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        print(f"📌 제거된 모든 값이 동일한 칼럼: {constant_cols}")
        df = df.drop(columns=constant_cols)
    else:
        print("📌 모든 값이 동일한 칼럼 없음")

    # Group columns whose contents are identical (Series.equals). The first column of each
    # group is the representative (dict key); later identical columns are appended to it.
    # The for-else adds a new group only when no existing representative matched.
    col_groups = {}
    for col in df.columns:
        for key in col_groups:
            if df[col].equals(df[key]):
                col_groups[key].append(col)
                break
        else:
            col_groups[col] = [col]

    # Everything after the representative in each group is a duplicate to drop.
    duplicate_cols = [col for group in col_groups.values() for col in group[1:]]
    if duplicate_cols:
        print(f"📌 제거된 중복 칼럼: {duplicate_cols}")
        df = df.drop(columns=duplicate_cols)
    else:
        print("📌 중복 칼럼 없음")

    # If more than one column name contains 'ID', remove every column whose name contains
    # 'ID' and re-append the plain 'ID' column at the end.
    # NOTE(review): the removal is case-insensitive (case=False) while the trigger is
    # case-sensitive, and it would also drop unrelated columns that merely contain "id"
    # in their name — verify this against the actual column names.
    if 'ID' in df.columns and df.columns.str.contains('ID').sum() > 1:
        df = df.loc[:, ~df.columns.str.contains('ID', case=False)].join(df[['ID']])

    # Same treatment for columns whose name contains '기준년월'.
    if '기준년월' in df.columns and df.columns.str.contains('기준년월').sum() > 1:
        df = df.loc[:, ~df.columns.str.contains('기준년월', case=False)].join(df[['기준년월']])

    print(f"🔹 {file} 처리 완료. 현재 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")

new_shape = df.shape

output_file = base_path + "base_clean_train.csv"
df.to_csv(output_file, index=False)

print(f"\n✅ 원래 데이터 크기: {original_shape[0]}행, {original_shape[1]}열")
print(f"✅ 병합 후 최종 데이터 크기: {new_shape[0]}행, {new_shape[1]}열")

print(f"\n✅ 최종 데이터 저장 완료: {output_file}")

In [None]:
# Build the cleaned test table: merge the test category CSVs on (ID, 기준년월), keep only
# the columns that survived cleaning in base_clean_train.csv, save the result as
# base_clean_test.csv and report any column differences between train and test.
base_path = "/content/drive/MyDrive/base_file/"

test_file_names = [
    "test_회원정보.csv",
    "test_신용정보.csv",
    "test_승인매출정보.csv",
    "test_청구입금정보.csv",
    "test_잔액정보.csv",
    "test_채널정보.csv",
    "test_마케팅정보.csv",
    "test_성과정보.csv"
]

merge_keys = ["ID", "기준년월"]
total_files = len(test_file_names)

test_df = pd.read_csv(base_path + test_file_names[0])

# Shape of the first file alone; only used for the summary printed below.
test_original_shape = test_df.shape
for position in range(1, total_files):
    file = test_file_names[position]
    idx = position + 1  # 1-based file number shown in the progress message
    print(f"\n🔹 병합 진행 중: {file} (파일 {idx} / {total_files})")
    temp_df = pd.read_csv(base_path + file)
    test_df = pd.merge(test_df, temp_df, how="left", on=merge_keys)
    merged_rows, merged_cols = test_df.shape
    print(f"✅ 병합 후 데이터 크기: {merged_rows}행, {merged_cols}열")

# Only the header of the cleaned train file is needed, so read a single row.
train_df = pd.read_csv(base_path + "base_clean_train.csv", nrows=1)
train_columns = train_df.columns
# Keep the test columns (in test order) that also exist in the cleaned train table.
test_columns_to_keep = test_df.columns[test_df.columns.isin(train_columns)].tolist()

test_df = test_df[test_columns_to_keep]
test_final_shape = test_df.shape

test_output_file = base_path + "base_clean_test.csv"
test_df.to_csv(test_output_file, index=False)

print(f"\n✅ 원래 test 데이터 크기: {test_original_shape[0]}행, {test_original_shape[1]}열")
print(f"✅ 병합 후 최종 test 데이터 크기: {test_final_shape[0]}행, {test_final_shape[1]}열")
print(f"\n✅ 최종 test 데이터 저장 완료: {test_output_file}")

# Compare the two column sets; they are equal exactly when neither side has extras.
train_col_set = set(train_columns)
test_col_set = set(test_df.columns)
train_only_cols = train_col_set - test_col_set
test_only_cols = test_col_set - train_col_set

if not train_only_cols and not test_only_cols:
    print("\n✅ train과 test의 컬럼이 완전히 일치합니다!")
else:
    print(f"\n⚠️ train과 test의 컬럼이 다릅니다!")
    print(f"🔹 train에만 있는 컬럼 ({len(train_only_cols)}개): {train_only_cols}")
    print(f"🔹 test에만 있는 컬럼 ({len(test_only_cols)}개): {test_only_cols}")

In [None]:
# BASE model에는 A100 GPU을 사용.

!pip install catboost==1.2.8
!pip install optuna==4.3.0

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import optuna
import json

In [None]:
# Mount Google Drive at /content/drive so the data files under MyDrive are reachable.
drive.mount('/content/drive')

In [None]:
# Load the cleaned tables produced by the preparation cells.
train = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_train.csv')
test = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_test.csv')

# IDs that have at least one row labelled Segment A or B.
is_ab_row = train['Segment'].isin(['A', 'B'])
ab_ids = train.loc[is_ab_row, 'ID'].unique()

# Remove every row of those IDs (not just the A/B rows) from the base training set.
keep_row = ~train['ID'].isin(ab_ids)
train = train.loc[keep_row].copy()

In [None]:
# Train the base CatBoost model with 10-fold stratified CV on the segments present in
# `train`, average the fold probabilities on the test set, aggregate them per ID and
# write the submission file base_catboost_kfold.csv.

# Encode the labels into a separate Series instead of overwriting train['Segment'].
# This keeps the cell re-runnable: label_encoder.classes_ always holds the original
# segment names, which are needed to decode the predictions below.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train['Segment']),
    index=train.index,
    name='Segment',
)

X = train.drop(columns=['Segment', 'ID'])
X_test = test.drop(columns=['ID'])

# Object-typed columns are passed to CatBoost as categorical features; they are cast to
# str in both frames so missing values become the literal string "nan".
cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# Fixed hyper-parameters (presumably the result of an earlier Optuna search — not shown here).
best_params = {
    "bootstrap_type": "Bayesian",
    "learning_rate": 0.2997682904093563,
    "l2_leaf_reg": 9.214022161348987,
    "random_strength": 7.342192789415524,
    "bagging_temperature": 0.11417356499443036,
    "border_count": 251,
    "iterations": 1500,
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1",
    "task_type": "GPU",
    "verbose": 100,
    "random_seed": 42,
    "depth": 8,
    "class_weights": [2, 1, 1]
}

n_classes = len(np.unique(y))
# class_weights is a positional list (one weight per encoded class); fail early with a
# clear message if the data does not contain the number of segments it was written for.
if len(best_params["class_weights"]) != n_classes:
    raise ValueError(
        f"class_weights has {len(best_params['class_weights'])} entries but the training data "
        f"contains {n_classes} segments: {list(label_encoder.classes_)}"
    )

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
all_test_probs = np.zeros((X_test.shape[0], n_classes))

# Only the training part of each split is used: no eval_set / early stopping here, the
# folds serve purely as an ensemble of 10 models trained on different 90% subsets.
for fold, (train_idx, _) in enumerate(kf.split(X, y)):
    print(f"🚀 Fold {fold+1} training...")

    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]

    model = CatBoostClassifier(**best_params)
    model.fit(X_train_fold, y_train_fold, cat_features=cat_features)

    fold_probs = model.predict_proba(X_test)
    all_test_probs += fold_probs

# Average over folds, then average the rows of each ID and pick the most probable class.
avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=range(n_classes))
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs.drop(columns='ID').values.argmax(axis=1)

# Decode class indices with the encoder's own classes instead of a hand-written table,
# so the mapping can never drift from the labels actually present in the data.
segment_mapping = dict(enumerate(label_encoder.classes_))
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

submission = pd.DataFrame({'ID': mean_probs['ID'], 'Segment': mean_probs['Segment']})
submission.to_csv('/content/drive/MyDrive/base_file/base_catboost_kfold.csv', index=False)

print("✅ CatBoost + 10-Fold CV 예측 완료 및 저장 🎯")

In [None]:
# The VIP models use an L4 GPU runtime to keep the results reproducible.
# The file-generation part below uses a v2-8 TPU runtime because of RAM shortage;
# the point at which to switch to the L4 GPU is marked separately further down.
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive so the data files under MyDrive are reachable.
drive.mount('/content/drive')

In [None]:
# Load the cleaned tables and isolate the rows labelled Segment A.
train = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_train.csv')
test = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_test.csv')

is_segment_a = train['Segment'].eq('A')
train_A = train.loc[is_segment_a]

# Every column except the identifier and the target is a candidate "fixed" column.
cols_to_check = [name for name in train.columns if name != 'ID' and name != 'Segment']

def is_fixed_column(df, col):
    """Return True when column `col` of `df` holds exactly one distinct value.

    Distinct values are counted with `Series.nunique()`.
    NOTE(review): nunique() skips NaN by default, so a column with one value plus
    missing entries also counts as fixed — confirm this is the intended definition.
    """
    distinct_values = df[col].nunique()
    return distinct_values == 1

# For every candidate column that is single-valued within the Segment-A rows, remember
# the value found in the first Segment-A row.
fixed_columns_A = {}
for col in cols_to_check:
    if is_fixed_column(train_A, col):
        fixed_columns_A[col] = train_A[col].iloc[0]

# Independent copy consumed by the filtering cell that follows.
max_column_values = dict(fixed_columns_A)

fixed_cols = [*max_column_values]

print(f"📦 고정된 칼럼 {len(fixed_cols)}개 제거할 예정입니다.")

In [None]:
# Keep only the IDs for which exactly EXPECTED_ROWS_PER_ID rows match every fixed value in
# `max_column_values`, drop the fixed columns, and save the train/test subsets as
# train_vips_A.csv / test_vips_A.csv.
#
# The matching rows are found with a single boolean mask per table. This avoids copying
# the full DataFrame and re-materialising a filtered copy for every fixed column, which
# matters on a RAM-limited runtime, while selecting exactly the same rows.

# Number of matching rows an ID must have (presumably one per monthly snapshot — confirm).
EXPECTED_ROWS_PER_ID = 6

matching_id_lists = {}
for split_name, frame in (("train", train), ("test", test)):
    matches_all = pd.Series(True, index=frame.index)
    for col, value in max_column_values.items():
        matches_all &= frame[col].eq(value)

    # Only the ID column of the matching rows is needed to count rows per ID.
    matched_ids = frame.loc[matches_all, 'ID']
    rows_per_id = matched_ids.map(matched_ids.value_counts())
    matching_id_lists[split_name] = matched_ids[rows_per_id == EXPECTED_ROWS_PER_ID].unique()

matching_ids_train_list = matching_id_lists["train"]
matching_ids_test_list = matching_id_lists["test"]

# All rows of the selected IDs are kept; the fixed columns are constant for them and removed.
train_filtered = train[train['ID'].isin(matching_ids_train_list)].drop(columns=fixed_cols)
test_filtered = test[test['ID'].isin(matching_ids_test_list)].drop(columns=fixed_cols)

print(f"🚀 최종 train 데이터 shape: {train_filtered.shape}")
print(f"🚀 최종 test 데이터 shape: {test_filtered.shape}")

train_filtered.to_csv('/content/drive/MyDrive/base_file/train_vips_A.csv', index=False)
test_filtered.to_csv('/content/drive/MyDrive/base_file/test_vips_A.csv', index=False)

In [None]:
# Load the cleaned tables and isolate the rows labelled Segment B.
train = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_train.csv')
test = pd.read_csv('/content/drive/MyDrive/base_file/base_clean_test.csv')

is_segment_b = train['Segment'].eq('B')
train_B = train.loc[is_segment_b]

# Every column except the identifier and the target is a candidate "fixed" column.
cols_to_check = [name for name in train.columns if name != 'ID' and name != 'Segment']

def is_fixed_column(df, col):
    """Return True when column `col` of `df` holds exactly one distinct value.

    Distinct values are counted with `Series.nunique()`.
    NOTE(review): nunique() skips NaN by default, so a column with one value plus
    missing entries also counts as fixed — confirm this is the intended definition.
    """
    distinct_values = df[col].nunique()
    return distinct_values == 1

# For every candidate column that is single-valued within the Segment-B rows, remember
# the value found in the first Segment-B row.
fixed_columns_B = {}
for col in cols_to_check:
    if is_fixed_column(train_B, col):
        fixed_columns_B[col] = train_B[col].iloc[0]

# Independent copy consumed by the filtering cell that follows.
max_column_values = dict(fixed_columns_B)

fixed_cols = [*max_column_values]

print(f"📦 고정된 칼럼 {len(fixed_cols)}개 제거할 예정입니다.")

In [None]:
# Keep only the IDs for which exactly EXPECTED_ROWS_PER_ID rows match every fixed value in
# `max_column_values`, drop the fixed columns, and save the train/test subsets as
# train_vips_B.csv / test_vips_B.csv.
#
# The matching rows are found with a single boolean mask per table. This avoids copying
# the full DataFrame and re-materialising a filtered copy for every fixed column, which
# matters on a RAM-limited runtime, while selecting exactly the same rows.

# Number of matching rows an ID must have (presumably one per monthly snapshot — confirm).
EXPECTED_ROWS_PER_ID = 6

matching_id_lists = {}
for split_name, frame in (("train", train), ("test", test)):
    matches_all = pd.Series(True, index=frame.index)
    for col, value in max_column_values.items():
        matches_all &= frame[col].eq(value)

    # Only the ID column of the matching rows is needed to count rows per ID.
    matched_ids = frame.loc[matches_all, 'ID']
    rows_per_id = matched_ids.map(matched_ids.value_counts())
    matching_id_lists[split_name] = matched_ids[rows_per_id == EXPECTED_ROWS_PER_ID].unique()

matching_ids_train_list = matching_id_lists["train"]
matching_ids_test_list = matching_id_lists["test"]

# All rows of the selected IDs are kept; the fixed columns are constant for them and removed.
train_filtered = train[train['ID'].isin(matching_ids_train_list)].drop(columns=fixed_cols)
test_filtered = test[test['ID'].isin(matching_ids_test_list)].drop(columns=fixed_cols)

print(f"🚀 최종 train 데이터 shape: {train_filtered.shape}")
print(f"🚀 최종 test 데이터 shape: {test_filtered.shape}")

train_filtered.to_csv('/content/drive/MyDrive/base_file/train_vips_B.csv', index=False)
test_filtered.to_csv('/content/drive/MyDrive/base_file/test_vips_B.csv', index=False)

In [None]:
# 해당 코드부터 v2-8 TPU 대신 L4 GPU 사용.
!pip install catboost==1.2.8

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier

In [None]:
# Mount Google Drive at /content/drive so the data files under MyDrive are reachable.
drive.mount('/content/drive')

In [None]:
# Train the segment-A specialist: 10-fold stratified CatBoost on the VIP-A subset, average
# the fold probabilities on the test subset, aggregate per ID, and collect the test IDs
# whose most probable segment is 'A' into `a_ids` (consumed by the final override cell).
train = pd.read_csv('/content/drive/MyDrive/base_file/train_vips_A.csv')
test = pd.read_csv('/content/drive/MyDrive/base_file/test_vips_A.csv')

# Encode the labels into a separate Series instead of overwriting train['Segment'], so
# label_encoder.classes_ always holds the original segment names used for decoding.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train['Segment']),
    index=train.index,
    name='Segment',
)

X = train.drop(columns=['Segment', 'ID'])
X_test = test.drop(columns=['ID'])

# Object-typed columns are passed to CatBoost as categorical features; they are cast to
# str in both frames so missing values become the literal string "nan".
cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

params = {
    'iterations': 2000,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'verbose': 100,
    'random_seed': 42,
    'task_type': 'GPU',
    # Positional weights, one per encoded class (order of label_encoder.classes_).
    'class_weights': [20, 50, 2, 1, 1],
}

# Derive the class count from the data rather than hard-coding 5, and fail early with a
# clear message if the subset does not contain the segments the weights were written for.
n_classes = len(label_encoder.classes_)
if len(params['class_weights']) != n_classes:
    raise ValueError(
        f"class_weights has {len(params['class_weights'])} entries but the training data "
        f"contains {n_classes} segments: {list(label_encoder.classes_)}"
    )
class_columns = list(range(n_classes))

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print(f"\n🚀 단일 Model Run 시작")

all_test_probs = np.zeros((X_test.shape[0], n_classes))

# NOTE(review): the split is row-wise, so rows of the same ID can fall into both the
# training and the validation part of a fold; the validation loss used for early stopping
# may therefore be optimistic — consider a group-aware split if that matters.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"📂 Fold {fold + 1}")

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_valid_fold, y_valid_fold),
        cat_features=cat_features,
        early_stopping_rounds=100,
        use_best_model=True
    )

    test_probs = model.predict_proba(X_test)
    all_test_probs += test_probs

# Average over folds, then average the rows of each ID and pick the most probable class.
avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=class_columns)
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs[class_columns].idxmax(axis=1)

# Decode class indices with the encoder's own classes instead of a hand-written table.
segment_mapping = dict(enumerate(label_encoder.classes_))
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

a_ids = mean_probs.loc[mean_probs['Segment'] == 'A', 'ID'].tolist()

print(f"\n✅ A로 분류된 ID 수 = {len(a_ids)}개")
print(f"🔎 A ID: {a_ids[:50]}")

In [None]:
# Train the segment-B specialist: 5-fold stratified CatBoost on the VIP-B subset, average
# the fold probabilities on the test subset, aggregate per ID, and collect the test IDs
# whose most probable segment is 'B' into `b_ids` (consumed by the final override cell).
train = pd.read_csv('/content/drive/MyDrive/base_file/train_vips_B.csv')
test = pd.read_csv('/content/drive/MyDrive/base_file/test_vips_B.csv')

# Encode the labels into a separate Series instead of overwriting train['Segment'], so
# label_encoder.classes_ always holds the original segment names used for decoding.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train['Segment']),
    index=train.index,
    name='Segment',
)

X = train.drop(columns=['Segment', 'ID'])
X_test = test.drop(columns=['ID'])

# Object-typed columns are passed to CatBoost as categorical features; they are cast to
# str in both frames so missing values become the literal string "nan".
cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 8,
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'verbose': 100,
    'random_seed': 42,
    'task_type': 'GPU',
    # Positional weights, one per encoded class (order of label_encoder.classes_).
    'class_weights': [10, 10, 1, 1, 1],
}

# Derive the class count from the data rather than hard-coding 5, and fail early with a
# clear message if the subset does not contain the segments the weights were written for.
n_classes = len(label_encoder.classes_)
if len(params['class_weights']) != n_classes:
    raise ValueError(
        f"class_weights has {len(params['class_weights'])} entries but the training data "
        f"contains {n_classes} segments: {list(label_encoder.classes_)}"
    )
class_columns = list(range(n_classes))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"\n🚀 단일 Model Run 시작")

all_test_probs = np.zeros((X_test.shape[0], n_classes))

# NOTE(review): the split is row-wise, so rows of the same ID can fall into both the
# training and the validation part of a fold; the validation loss used for early stopping
# may therefore be optimistic — consider a group-aware split if that matters.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"📂 Fold {fold + 1}")

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_valid_fold, y_valid_fold),
        cat_features=cat_features,
        early_stopping_rounds=100,
        use_best_model=True
    )

    test_probs = model.predict_proba(X_test)
    all_test_probs += test_probs

# Average over folds, then average the rows of each ID and pick the most probable class.
avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=class_columns)
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs[class_columns].idxmax(axis=1)

# Decode class indices with the encoder's own classes instead of a hand-written table.
segment_mapping = dict(enumerate(label_encoder.classes_))
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

b_ids = mean_probs.loc[mean_probs['Segment'] == 'B', 'ID'].tolist()

print(f"\n✅ B로 분류된 ID 수 = {len(b_ids)}개")
print(f"🔎 B ID: {b_ids[:5]}")

In [None]:
# Overlay the specialist predictions on the base submission and save the final file.
# `a_ids` and `b_ids` come from the two specialist-model cells run earlier in this session.
base_df = pd.read_csv('/content/drive/MyDrive/base_file/base_catboost_kfold.csv')

# B is applied after A, so an ID present in both lists ends up labelled 'B'.
for vip_label, vip_ids in (('A', a_ids), ('B', b_ids)):
    is_vip = base_df['ID'].isin(vip_ids)
    base_df.loc[is_vip, 'Segment'] = vip_label

base_df.to_csv('/content/drive/MyDrive/base_file/final_catboost.csv', index=False)

# The counts below are the sizes of the ID lists, not the number of rows actually changed.
a_count, b_count = len(a_ids), len(b_ids)
print(f"✅ Segment가 'A'로 수정된 {a_count}개 ID 반영 완료")
print(f"✅ Segment가 'B'로 수정된 {b_count}개 ID 반영 완료")
print("🎯 최종 결과 저장 완료: final_catboost.csv")