In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# 데이터분석 4종 세트
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

base_path = "/content/drive/MyDrive/Colab Notebooks/platform_subscribe/"

train = pd.read_csv(base_path + "train.csv")
test = pd.read_csv(base_path + "test.csv")
submission = pd.read_csv(base_path + "sample_submission.csv")

In [3]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [4]:
pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.4.0


In [5]:
# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 평가 지표
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

evulation_metric = f1_score

### data_scaling + optuna+second_best = ['average_time_per_learning_session', 'recent_learning_achievement'] + data_scaling + catboost

In [6]:
second_best = ['average_time_per_learning_session', 'recent_learning_achievement']
third_best = ['average_time_per_learning_session',
   'recent_learning_achievement',
   'average_login_time',
   'subscription_duration',
   'recent_login_time',
   'monthly_active_learning_days',
   'total_completed_courses',
   'abandoned_learning_sessions',
   'payment_pattern',
   'customer_inquiry_history']

In [7]:
X=train[second_best]
y=train.target
X

Unnamed: 0,average_time_per_learning_session,recent_learning_achievement
0,8.427187,68.360455
1,72.646087,97.567322
2,21.774492,94.358763
3,42.659066,70.153228
4,30.744287,81.917908
...,...,...
9995,84.053558,64.966803
9996,45.464833,82.750244
9997,127.302411,81.567839
9998,5.297234,89.885656


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.25, random_state=42)

In [9]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(X_train)
X_train_scaled = std.transform(X_train)
X_val_scaled = std.transform(X_val)

In [10]:
# random sampler
sampler = TPESampler(seed=10)

# define function
def objective(trial):

    cbrm_param = {
        'iterations':trial.suggest_int("iterations", 4000, 25000),
        'od_wait':trial.suggest_int('od_wait', 500, 2300),
        'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
        'subsample': trial.suggest_uniform('subsample',0,1),
        'random_strength': trial.suggest_uniform('random_strength',10,50),
        'depth': trial.suggest_int('depth',1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'colsample_bylevel':trial.suggest_float('colsample_bylevel', 0.4, 1.0),
    }

    # Generate model
    model_cbclf = CatBoostClassifier(**cbrm_param)
    model_cbclf = model_cbclf.fit(X_train_scaled, y_train, eval_set=[(X_val, y_val)],
                           verbose=0, early_stopping_rounds=25)
    pred_val_cbclf=model_cbclf.predict(X_val_scaled)
    evulation_metric_val_cb = evulation_metric(y_val, pred_val_cbclf)
    return evulation_metric_val_cb

In [11]:
optuna_cbclf = optuna.create_study(direction="maximize", sampler=sampler)
optuna_cbclf.optimize(objective, n_trials=50)

[I 2023-12-10 09:09:12,384] A new study created in memory with name: no-name-89377d29-cdfe-4497-b1f2-5363b578dad4
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'subsample': trial.suggest_uniform('subsample',0,1),
  'random_strength': trial.suggest_uniform('random_strength',10,50),
  'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
[I 2023-12-10 09:09:12,646] Trial 0 finished with value: 0.7650509416486571 and parameters: {'iterations': 20198, 'od_wait': 537, 'learning_rate': 0.6373117525770127, 'reg_lambda': 74.88039076582236, 'subsample': 0.4985070123025904, 'random_strength': 18.991865821233908, 'depth': 3, 'min_data_in_leaf': 23, 'leaf_estimation_iterations': 3, 'bagging_temperature': 0.022561047334047252, 'colsample_bylevel': 0.8112158910206784}. Best is trial 0 with value: 0.7650509416486571.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),


In [12]:
cbrm_trial = optuna_cbclf.best_trial
cbrm_trial_params = cbrm_trial.params

cbrm_trial_params

{'iterations': 11840,
 'od_wait': 1714,
 'learning_rate': 0.44741484267876613,
 'reg_lambda': 43.40140499318944,
 'subsample': 0.6177669784693172,
 'random_strength': 30.525529702175636,
 'depth': 10,
 'min_data_in_leaf': 19,
 'leaf_estimation_iterations': 13,
 'bagging_temperature': 1.2206428444236328,
 'colsample_bylevel': 0.945189328485201}

In [13]:
best_model = CatBoostClassifier(**cbrm_trial_params, verbose=False)
best_model.fit(X_train_scaled, y_train)
pred_val_cbclf=best_model.predict(X_val_scaled)

evulation_metric_val_cb = evulation_metric(y_val, pred_val_cbclf)
evulation_metric_val_cb

0.631452581032413

### test 확인

In [14]:
X_test_scaled = std.transform(X_test)
pred_test_cb=best_model.predict(X_test_scaled)

evulation_metric_test_cb=evulation_metric(y_test, pred_test_cb)
print(evulation_metric_test_cb)

0.6338363780778395


### submission

In [15]:
X_submission=test[second_best]
X_submission_scaled = std.transform(X_submission)
print(X_submission_scaled)
preds=best_model.predict(X_submission_scaled)
preds

[[-0.87958133  0.71317603]
 [-0.56081826  0.55890233]
 [-0.97664269 -0.13439635]
 ...
 [-0.6378283   1.44183736]
 [ 1.8374985   0.72769309]
 [-0.0327581  -0.2759617 ]]


array([1, 0, 1, ..., 1, 1, 1])

In [16]:
submission['target']=preds
submission

Unnamed: 0,user_id,target
0,0001d6e9,1
1,0002c77d,0
2,0002df5b,1
3,000b6068,0
4,00184a0c,0
...,...,...
9995,ffe2eba5,1
9996,ffe710f1,0
9997,ffeccdef,1
9998,fff3fcea,1


In [17]:
unique, counts = np.unique(preds, return_counts = True)
cnt_dict = dict(zip(unique, counts))
cnt_dict

{0: 3702, 1: 6298}

In [18]:
submission.to_csv(base_path+"submission_1210_5.csv", index=False)