# ✏️ Basic Setup

## 🔎 Importing Libraries

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.ensemble import VotingClassifier
import numpy as np

## 🔎 Functions

In [None]:
def custom_eval_lgbm(y_true, y_pred):
  """Custom LightGBM eval metric: one-vs-rest ROC AUC for a multiclass model.

  Parameters
  ----------
  y_true : array-like of shape (n_samples,)
      True class labels of the evaluation set.
  y_pred : numpy array
      Predicted class scores. Either a flat array of length
      n_samples * n_classes laid out class by class (all rows of the first
      class, then all rows of the second, ...), or an array that is already
      two-dimensional with shape (n_samples, n_classes).

  Returns
  -------
  tuple
      ('roc_auc', score, True). The trailing True is the flag LightGBM reads
      as "higher is better".
  """
  y_pred = np.asarray(y_pred)
  if y_pred.ndim == 1:
    # Derive the class count from the data instead of hard-coding it, then
    # turn the class-major flat layout into (n_samples, n_classes).
    n_classes = y_pred.size // len(y_true)
    preds = y_pred.reshape(n_classes, -1).T
  else:
    # Already (n_samples, n_classes); reshaping would scramble the scores.
    preds = y_pred
  label = LabelBinarizer().fit_transform(y_true)
  return 'roc_auc', roc_auc_score(label, preds), True

def custom_eval_xgb(pred, dtrain):
  """Custom XGBoost eval metric: negated ROC AUC on one-hot encoded labels.

  Parameters
  ----------
  pred : array-like
      Predicted scores passed in by XGBoost.
  dtrain : object exposing ``get_label()``
      Evaluation data; its labels are one-hot encoded before scoring.

  Returns
  -------
  tuple
      ('roc_auc', -auc). The AUC is negated so that a better model yields a
      smaller value; the un-negated AUC is printed on every call.
  """
  labels = dtrain.get_label()
  onehot = LabelBinarizer().fit_transform(labels)
  # Score once and reuse the value for both the log line and the return.
  auc = roc_auc_score(onehot, pred)
  print(auc)
  return 'roc_auc', -auc

def multiclass_roc_auc_score(y_test, y_pred_proba, average=None):
  """Print and return the one-vs-rest ROC AUC of a multiclass prediction.

  Parameters
  ----------
  y_test : array-like of shape (n_samples,)
      True class labels; they are one-hot encoded internally.
  y_pred_proba : array-like of shape (n_samples, n_classes)
      Predicted scores/probabilities per class (not hard class labels).
  average : str or None, default None
      Forwarded to ``roc_auc_score``. With None the result holds one AUC per
      class; with an averaging mode such as 'macro' it is a single number.

  Returns
  -------
  numpy array or float
      Whatever ``roc_auc_score`` returned for the chosen ``average``.
  """
  y_onehot = LabelBinarizer().fit_transform(y_test)
  roc_auc = roc_auc_score(y_onehot, y_pred_proba, average=average)
  # Only a per-class result can be indexed; an averaged result is a scalar,
  # so the per-class lines are skipped in that case.
  if np.ndim(roc_auc) > 0:
    for i, class_auc in enumerate(roc_auc):
      print('\033[36m'+'가맹점 %d_rocauc =\t'%i + str(round(class_auc, 4))+'\033[0m')
  print('\033[33m' + 'Mean_rocauc =\t\t' + str(round(np.mean(roc_auc), 4)) + '\033[0m')
  return roc_auc

def LIFT20(original_df, y_test, y_pred):
  """Print and return the top-20% lift for every class.

  For each class the rows are ranked by that class's predicted score and the
  best-scoring 20% are kept. The lift is the share of those rows whose true
  label is the class, divided by the share of the class in ``original_df``.

  Parameters
  ----------
  original_df : pandas.DataFrame
      Reference data with an integer class column 'MRC_ID_DI'; it supplies
      the base rate of each class.
  y_test : array-like of shape (n_samples,)
      True class labels, in the same row order as ``y_pred``.
  y_pred : array-like of shape (n_samples, n_classes)
      Predicted score per class; column c belongs to class c.

  Returns
  -------
  list of float
      One lift value per class, in class order.

  Raises
  ------
  ZeroDivisionError
      If the top-20% slice is empty (fewer than 5 rows) or a class does not
      occur in ``original_df``.
  """
  scores = np.asarray(y_pred)
  # One score column per class, however many classes the model produced.
  score_cols = ['가맹점%d' % c for c in range(scores.shape[1])]
  predict_df = pd.DataFrame(data=scores, columns=score_cols)
  # Strip any pandas index so labels pair with scores by position, not label.
  predict_df['y_test'] = np.asarray(y_test)

  total_data = len(original_df)
  top_n = int(len(predict_df) * 0.2)

  LIFT_value = []
  for c, col in enumerate(score_cols):
    top = predict_df.sort_values(by=[col], ascending=False)[:top_n]
    # int() keeps plain-Python division so a zero divisor still raises.
    base_rate = int((original_df['MRC_ID_DI'] == c).sum()) / total_data
    hit_rate = int((top['y_test'] == c).sum()) / top_n
    LIFT_value.append(hit_rate / base_rate)

  for i, lift in enumerate(LIFT_value):
    print('\033[36m' + '가맹점 %d_LIFT =\t\t'%i + str(round(lift, 4)) + '\033[0m')
  print('\033[33m' + 'Mean_LIFT =\t\t' + str(round(np.mean(LIFT_value), 4)) + '\033[0m')

  return LIFT_value

## 🔎 Data import

In [None]:
# IPython magic: change the working directory so the relative paths below resolve.
%cd /content/drive/My Drive/samsung_card/preprocess  

# All three CSVs live in ../data/ and are indexed by the 'cst_id_di' column.
path = '../data/'
# NOTE(review): despite the name, this file is loaded into df, which is later split into train and test.
test_file = 'df_merged.csv'
df = pd.read_csv(path+test_file, index_col='cst_id_di')
# presumably quiz.csv holds the quiz answers and cst_feat_feb_quiz.csv the matching features — confirm
quiz_result = pd.read_csv(path+'quiz.csv', index_col='cst_id_di')
quiz = pd.read_csv(path+'cst_feat_feb_quiz.csv', index_col='cst_id_di')

/content/drive/My Drive/samsung_card/preprocess


In [None]:
# Column 0 of df is the target; every remaining column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

# ✏️ Voting model

## 🔎 LightGBM model

In [None]:
# LightGBM member of the ensemble. Values are written as plain literals:
# the tuned 8.661 depth is truncated to 8 and num_leaves is 2**8.
lgbm = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=8,
    num_leaves=256,
    min_child_weight=19.55,
    colsample_bytree=0.4,
    subsample=1,
    class_weight=None,
    device='gpu',
    n_jobs=-1,
    random_state=1234,
    silent=True,
)

In [None]:
# Train with early stopping driven by the custom ROC AUC metric.
# NOTE(review): the held-out split doubles as the early-stopping set, so
# later scores on X_test are not from fully unseen data.
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=custom_eval_lgbm,
    early_stopping_rounds=100,
)

In [None]:
## 🔎 XGBoost model

In [None]:
# XGBoost member of the ensemble. Values are written as plain literals:
# the tuned 8.047 depth and 7.761 delta step are truncated to integers.
xgb = XGBClassifier(
    n_estimators=1500,
    learning_rate=0.01984,
    max_depth=8,
    max_delta_step=7,
    min_child_weight=29.08,
    gamma=1.489,
    subsample=0.961,
    colsample_bytree=0.6343,
    tree_method='gpu_hist',
    random_state=1234,
    silent=True,
)


In [None]:
# Train with early stopping on the custom metric, which returns the negated
# AUC and prints the raw AUC each round.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=custom_eval_xgb,
    early_stopping_rounds=100,
)

## 🔎 Combining both as a voting model

In [None]:
# Soft voting over the two boosted models.
voting_clf = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgb', xgb)],
    voting='soft',
)

In [None]:
# Fit the voting ensemble on the training split, then report score() on the held-out split.
# NOTE(review): presumably this refits lgbm/xgb without the eval_set / early stopping used in the cells above — confirm.
voting_clf.fit(X_train,y_train)
voting_clf.score(X_test, y_test)

0.7029753341988001

In [None]:
import pickle

# Save Model
with open('./voting_model.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

# Load Model
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were written by this notebook or another trusted source.
with open('./voting_model.pkl', 'rb') as f:
    voting_clf = pickle.load(f)

In [None]:
# Per-class probabilities from the ensemble for the held-out split.
y_pred = voting_clf.predict_proba(X_test)

In [None]:
# Called with the default average=None, so result holds one ROC AUC per class; the helper also prints them.
result = multiclass_roc_auc_score(y_test, y_pred)

[36m가맹점 0_rocauc =	0.8958[0m
[36m가맹점 1_rocauc =	0.8567[0m
[36m가맹점 2_rocauc =	0.971[0m
[36m가맹점 3_rocauc =	0.851[0m
[36m가맹점 4_rocauc =	0.8891[0m
[36m가맹점 5_rocauc =	0.957[0m
[36m가맹점 6_rocauc =	0.8196[0m
[36m가맹점 7_rocauc =	0.8634[0m
[36m가맹점 8_rocauc =	0.8393[0m
[36m가맹점 9_rocauc =	0.9169[0m
[36m가맹점 10_rocauc =	0.9672[0m
[33mMean_rocauc =		0.8934[0m


In [None]:
# Class base rates come from the full df, while the ranked predictions cover only the held-out split.
LIFT20(df, y_test, y_pred)

[36m가맹점 0_LIFT =		1.4983[0m
[36m가맹점 1_LIFT =		3.6707[0m
[36m가맹점 2_LIFT =		4.8898[0m
[36m가맹점 3_LIFT =		3.4206[0m
[36m가맹점 4_LIFT =		4.0414[0m
[36m가맹점 5_LIFT =		4.88[0m
[36m가맹점 6_LIFT =		2.8386[0m
[36m가맹점 7_LIFT =		3.1602[0m
[36m가맹점 8_LIFT =		3.228[0m
[36m가맹점 9_LIFT =		4.4647[0m
[36m가맹점 10_LIFT =		4.8985[0m
[33mMean_LIFT =		3.7264[0m


[1.4982894736842105,
 3.670735959659628,
 4.889808823529412,
 3.4206268666622424,
 4.041438464582509,
 4.88004623847755,
 2.8385795228188218,
 3.1601767383405703,
 3.2280073651307877,
 4.464685164845058,
 4.898506607593003]

In [None]:
# Display the value returned by multiclass_roc_auc_score.
result

array([0.89583107, 0.8567159 , 0.97101125, 0.85099725, 0.88908454,
       0.95698426, 0.81964277, 0.86343153, 0.83932609, 0.91688671,
       0.96717935])

In [None]:
# Keep the AUC values under a second name.
# NOTE(review): this assignment prints nothing, so the output recorded under this cell is presumably left over from an earlier run — confirm.
roc_auc = result

[36m가맹점 0_rocauc =	0.8958[0m
[36m가맹점 1_rocauc =	0.8567[0m
[36m가맹점 2_rocauc =	0.971[0m
[36m가맹점 3_rocauc =	0.851[0m
[36m가맹점 4_rocauc =	0.8891[0m
[36m가맹점 5_rocauc =	0.957[0m
[36m가맹점 6_rocauc =	0.8196[0m
[36m가맹점 7_rocauc =	0.8634[0m
[36m가맹점 8_rocauc =	0.8393[0m
[36m가맹점 9_rocauc =	0.9169[0m
[36m가맹점 10_rocauc =	0.9672[0m
[33mMean_rocauc =		0.8934[0m


## 🔎 Performance check

In [None]:
# Every row and column of quiz as a plain array.
X_result = quiz.values

In [None]:
# Per-class probabilities from the ensemble for every row of quiz.
y_result = voting_clf.predict_proba(X_result)

In [None]:
# Display the predicted probability matrix.
y_result

array([[9.99761909e-01, 5.58773601e-05, 3.43722156e-07, ...,
        8.06855512e-05, 2.71270659e-06, 1.11011342e-06],
       [9.96414294e-01, 6.71695081e-04, 2.43092603e-05, ...,
        1.14012273e-03, 2.64460675e-05, 2.95271947e-05],
       [9.99854210e-01, 1.79508399e-05, 5.58822220e-07, ...,
        3.59084316e-05, 1.81676954e-05, 1.84353516e-06],
       ...,
       [8.49496773e-01, 2.47431105e-02, 1.86888308e-05, ...,
        8.54091800e-02, 8.75489101e-05, 7.16387508e-05],
       [9.95760918e-01, 7.78282545e-04, 1.68699653e-05, ...,
        1.02528018e-03, 2.22402098e-04, 1.37652359e-05],
       [9.93423686e-01, 1.25895764e-03, 1.44473173e-05, ...,
        1.83211409e-03, 2.21798810e-04, 1.95928903e-04]])

In [None]:
# Wrap the probabilities in a DataFrame that shares quiz's index; no column names are given,
# so the columns get pandas' default integer labels.
pred_df = pd.DataFrame(data = y_result, index = quiz.index)
pred_df

In [None]:
from tqdm import tqdm
# For each row i of quiz_result: select from pred_df the row(s) labelled quiz_result.index[i]
# and the column labelled by the value in quiz_result's first column, then keep the first element.
# presumably that first column is the true class id, making each entry the probability given to the true class — confirm
# NOTE(review): one .loc lookup per row; the recorded run took about 3.5 minutes for 300000 rows.
score = []
for i in tqdm(range(len(quiz_result))):
  score.append(pred_df.loc[quiz_result.index[i],quiz_result.iloc[i:i+1, 0]].iloc[0].iloc[0])

100%|██████████| 300000/300000 [03:31<00:00, 1420.71it/s]


In [None]:
# Attach the collected values to quiz_result as a new 'Score' column.
quiz_result['Score'] = score

In [None]:
# Display quiz_result, now including the 'Score' column.
quiz_result