# 영업 성공 여부 분류 경진대회

### [Timeline]
#### 1. EDA
#### 2. 데이터 전처리
#### 3. 파생변수 생성
#### 4. 인코딩
#### 5. 모델 학습
#### 6. Parameter Tuning, OOF
#### 7. 앙상블
#### 8. 제출

## 1. EDA

- 필수 라이브러리

In [1]:
!pip install xgboost
!pip install category_encoders
!pip install optuna
!pip install imblearn

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m

In [1]:
# 기본 라이브러리
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import statsmodels.api as sm
import re
from sklearn import set_config
from tqdm import tqdm
%matplotlib inline

# 시각화 및 폰트
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic') # 폰트 지정
plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
%config InlineBackend.figure_format='retina' # 그래프 글씨 뚜렷

# 전처리 관련 라이브러리
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SequentialFeatureSelector,SelectPercentile
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# 훈련 및 평가 관련 라이브러리
from sklearn.model_selection import train_test_split, ShuffleSplit,cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score, precision_score, recall_score)

# 모델 라이브러리
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
import lightgbm
from lightgbm import LGBMRegressor, LGBMClassifier, plot_importance
# from catboost import CatBoostRegressor
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# from catboost import CatBoostClassifier

# 파라미터 튜닝 관련 라이브러리
import optuna
from sklearn.model_selection import cross_validate

- 데이터 셋 읽어오기

In [14]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 158049: invalid start byte

In [4]:
df_test.describe()

Unnamed: 0,id,bant_submit,com_reg_ver_win_rate,customer_idx,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,lead_desc_length,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,lead_owner
count,5271.0,5271.0,1788.0,5271.0,1275.0,593.0,53.0,646.0,5271.0,5271.0,5271.0,2373.0,1906.0,5271.0
mean,10686.39594,0.60572,0.102336,24664.497629,21.272941,1.0,1.0,1.0,81.816164,0.146841,0.081389,0.001093,0.058388,304.458736
std,6141.012698,0.365631,0.15179,14615.896802,75.187401,0.0,0.0,0.0,135.063265,0.353981,0.273457,0.001189,0.035738,241.502331
min,1.0,0.0,0.003788,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2e-06,0.011583,0.0
25%,5644.0,0.25,0.032787,11173.5,0.0,1.0,1.0,1.0,3.0,0.0,0.0,6e-05,0.04863,97.0
50%,10067.0,0.5,0.053892,24151.0,4.0,1.0,1.0,1.0,26.0,0.0,0.0,0.000572,0.053571,231.0
75%,16019.5,1.0,0.075,37680.0,19.0,1.0,1.0,1.0,104.0,0.0,0.0,0.001183,0.064566,428.0
max,21340.0,1.0,1.0,47466.0,2219.0,1.0,1.0,1.0,1143.0,1.0,1.0,0.003079,0.285714,1108.0


## 2. 데이터 전처리

In [5]:
columns = df_train.columns ; columns

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [6]:
df_all = pd.concat([df_train[columns], df_test[columns]])

 ### customer_country.1 제거

In [7]:
df_all.drop(["customer_country.1"], axis=1, inplace=True)

In [8]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64570 entries, 0 to 5270
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              64570 non-null  float64
 1   customer_country         63588 non-null  object 
 2   business_unit            64570 non-null  object 
 3   com_reg_ver_win_rate     16356 non-null  float64
 4   customer_idx             64570 non-null  int64  
 5   customer_type            19152 non-null  object 
 6   enterprise               64570 non-null  object 
 7   historical_existing_cnt  15031 non-null  float64
 8   id_strategic_ver         4037 non-null   float64
 9   it_strategic_ver         1174 non-null   float64
 10  idit_strategic_ver       5211 non-null   float64
 11  customer_job             44398 non-null  object 
 12  lead_desc_length         64570 non-null  int64  
 13  inquiry_type             62337 non-null  object 
 14  product_category       

### 1) bant_submit

결측값 없음

### 2) customer_country
- 고객 국적 & 영업 성공 여부 count와 비율

In [9]:
# 결측값 처리
df_all['customer_country'].fillna('other', inplace=True)

In [10]:
# 알파벳 소문자로 전체 변경
df_all['customer_country'] = df_all['customer_country'].apply(lambda x: x.lower())

In [11]:
# 전처리 : 국가 값으로 값 변경
df_all['customer_country'] = df_all['customer_country'].str.split('/').str[-1].str.strip()

In [12]:
df_all['customer_country'].nunique()

550

In [13]:
keyword = df_all[df_all['customer_country'].str.contains('other|@')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'other')

keyword = df_all[df_all['customer_country'].str.contains('united states|us|las vegas|minneapolis|usa|nevada|virginia|hudson|rockefeller|lawrenceville|tucson|manhattan|virgin|boston|angeles|englewood|diego|houston|dexter|watertown|vestavia|patroon|itbprovout')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'united states')

keyword = df_all[df_all['customer_country'].str.contains('turkey|türkiye')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'turkey')

keyword = df_all[df_all['customer_country'].str.contains('u.a.e|uae')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'uae')

keyword = df_all[df_all['customer_country'].str.contains('congo')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'congo')

keyword = df_all[df_all['customer_country'].str.contains('italy')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'italy')

keyword = df_all[df_all['customer_country'].str.contains('antigua')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'antigua')

keyword = df_all[df_all['customer_country'].str.contains('mississauga|canada')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'canada')

keyword = df_all[df_all['customer_country'].str.contains('colombia')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'colombia')

keyword = df_all[df_all['customer_country'].str.contains('vietnam|ha noi')]['customer_country']
df_all['customer_country'] = df_all['customer_country'].replace(list(keyword),'vietnam')


In [14]:
df_all['customer_country'].nunique()

371

In [15]:
counts = df_all['customer_country'].value_counts()
result = counts[counts <= 2]
idx_lst = result.index.tolist()

for val in idx_lst:
    idx = df_all['customer_country'][df_all['customer_country'] == val].index
    df_all['customer_country'].loc[idx] = 'other'

In [16]:
df_all['customer_country'].replace({'':'other'}, inplace=True)

In [17]:
df_all['customer_country'].nunique()

146

In [18]:
df_all['customer_country'].unique()

array(['philippines', 'india', 'nigeria', 'saudi arabia', 'singapore',
       'brazil', 'uae', 'south africa', 'united states', 'colombia',
       'mexico', 'ghana', 'egypt', 'congo', 'ethiopia', 'other', 'kenya',
       'indonesia', 'oman', 'pakistan', 'united kingdom', 'guatemala',
       'panama', 'canada', 'bangladesh', 'papua new guinea',
       'united republic of tanzania', 'qatar', 'afghanistan', 'chile',
       'mozambique', 'turkey', 'el salvador', 'togo', 'jordan', 'iraq',
       'israel', 'sri lanka', 'south korea', 'portugal', 'uruguay',
       'peru', 'germany', 'romania', 'norway', 'jamaica', 'hungary',
       'poland', 'czech', 'spain', 'argentina', 'ecuador', 'senegal',
       'hong kong', 'malaysia', 'japan', 'kuwait', 'ireland', 'albania',
       'greece', 'algeria', 'nicaragua', 'slovenia', 'italy',
       'netherlands', 'dominican republic', 'france', 'uganda', 'iran',
       'paraguay', 'bolivia', 'namibia', 'tunisia', 'puerto rico',
       'anguilla', 'croatia', 

### 3) business_unit
- MQL 상품 대응 사업부 & 영업 성공 여부

결측값 없음

### 5) customer_idx
- 고객 회사명(numeric)

결측값 없음

### 6) customer_type
- 고객 유형

In [19]:
# 전처리 : 중복명 및 통합
df_all['customer_type'].replace({'End Customer':'End-Customer', 'Specifier / Influencer': 'Specifier/Influencer',
                             'Home Owner':'Homeowner', 'Etc.':'Others','Other':'Others','End-user':'End-Customer',
                             'Commercial end-user':'End-Customer','Software / Solution Provider':'Software/Solution Provider',
                             'Dealer/Distributor	':'Distributor'},
                            inplace=True)

In [20]:
# 결측값 처리
df_all['customer_type'].fillna('Others', inplace=True)

### 7) enterprise
- Global 기업인지, Small/Medium 규모의 기업인지  
대기업 / 중소기업

결측값 없음

### 9) id_strategic_ver
- (도메인 지식) 특정 사업부(Business Unit), 특정 사업 영역(Vertical Level1)에 대해 가중치를 부여  
단일값(1.0)과 결측치만 존재

In [21]:
# 결측값 처리
df_all['id_strategic_ver'].fillna(0, inplace=True)

### 10) it_strategic_ver
- (도메인 지식) 특정 사업부(Business Unit), 특정 사업 영역(Vertical Level1)에 대해 가중치를 부여
단일값(1.0)과 결측치만 존재

In [22]:
# 결측값 처리
df_all['it_strategic_ver'].fillna(0, inplace=True)

### 11) idit_strategic_ver
- Id_strategic_ver이나 it_strategic_ver 값 중 하나라도 1의 값을 가지면 1 값으로 표현   
단일값(1.0)과 결측치만 존재

In [23]:
# 결측값 처리
df_all['idit_strategic_ver'].fillna(0, inplace=True)

### 12) customer_job
- 고객의 직업군

### preprocess

In [24]:
sum(df_all['customer_job'].isnull())

20172

In [25]:
df_all['customer_job'].fillna('other', inplace=True)

In [26]:
keyword = df_all[df_all['customer_job'].str.contains('purchas')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'purchasing')

keyword = df_all[df_all['customer_job'].str.contains('media|communi')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'media and communication')

keyword = df_all[df_all['customer_job'].str.contains('engine|executive')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'engineering')

keyword = df_all[df_all['customer_job'].str.contains('consult|strategy')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'consulting')

keyword = df_all[df_all['customer_job'].str.contains('program and project management|program|project')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'program and project management')

keyword = df_all[df_all['customer_job'].str.contains('other|others|drop|need|%')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'other')

keyword = df_all[df_all['customer_job'].str.contains('sales|sale')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'sales')

keyword = df_all[df_all['customer_job'].str.contains('operations|ops|employee|managing partner|managing contractor|managing director')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'operations')

keyword = df_all[df_all['customer_job'].str.contains('business development|business|procurement|sourcing')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'business development')

keyword = df_all[df_all['customer_job'].str.contains('information technology|it|application|network|system|tech|software')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'information technology')

keyword = df_all[df_all['customer_job'].str.contains('account')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'accounting')

keyword = df_all[df_all['customer_job'].str.contains('educat|academy|teach')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'education')

keyword = df_all[df_all['customer_job'].str.contains('health|clinic|medical|surgery|pathologist|cirugía|radiología|imag|cirugano|tierarzt|radiology|doctor')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'healthcare services')

keyword = df_all[df_all['customer_job'].str.contains('human resources|hr|human')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'human resources')

keyword = df_all[df_all['customer_job'].str.contains('support|service|help|advis|resell|supervisor|facilit')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'support')

keyword = df_all[df_all['customer_job'].str.contains('finance|pénzügy')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'finance')

keyword = df_all[df_all['customer_job'].str.contains('marketing')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'marketing')

keyword = df_all[df_all['customer_job'].str.contains('art|design|color|gallery|meseum')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'arts and design')

keyword = df_all[df_all['customer_job'].str.contains('research')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'research')

keyword = df_all[df_all['customer_job'].str.contains('product management|product')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'product management')

keyword = df_all[df_all['customer_job'].str.contains('architect')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'architect')

keyword = df_all[df_all['customer_job'].str.contains('install|integrator')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'installer')

keyword = df_all[df_all['customer_job'].str.contains('curation')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'curation')

keyword = df_all[df_all['customer_job'].str.contains('owner|ceo|founder|president|boss')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'owner')

keyword = df_all[df_all['customer_job'].str.contains('energy')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'energy')

keyword = df_all[df_all['customer_job'].str.contains('advert')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'advertising')

keyword = df_all[df_all['customer_job'].str.contains('contractor|build|construction')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'contractor')

keyword = df_all[df_all['customer_job'].str.contains('hotel')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'hotel')

keyword = df_all[df_all['customer_job'].str.contains('distribu|supplie|retail')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'distribution')

keyword = df_all[df_all['customer_job'].str.contains('film production|photo|film')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'film production')

keyword = df_all[df_all['customer_job'].str.contains('buyer|purcha')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'buyer')

keyword = df_all[df_all['customer_job'].str.contains('manager|general|management|manage|vp|gm|admin|pm|manger|manag')]['customer_job']
df_all['customer_job'] = df_all['customer_job'].replace(list(keyword),'manage')

In [27]:
counts = df_all['customer_job'].value_counts()
result = counts[counts <= 12]
print(result)

vertrieb                    12
medien_und_kommunikation    11
finanzen                    10
decision maker               9
otro                         8
                            ..
appliance specialist         1
av estimator                 1
pricing                      1
ranger 2                     1
k12 school                   1
Name: customer_job, Length: 161, dtype: int64


In [28]:
idx_lst = result.index.tolist()

for val in idx_lst:
    idx = df_all['customer_job'][df_all['customer_job'] == val].index
    df_all['customer_job'].loc[idx] = 'other'

### 13) lead_desc_length
- 고객이 작성한 Lead Description 텍스트 총 길이  

결측값 없음

### 14) inquiry_type
- 고객의 문의 유형

In [29]:
sum(df_all['inquiry_type'].isnull())

2233

In [30]:
# 결측값 처리
df_all['inquiry_type'].fillna('Other', inplace=True)

In [31]:
# 전처리 : 중복 통합,오탈자 정리
df_all['inquiry_type'].replace({'Quotation or purchase consultation':'Quotation or Purchase Consultation',
                            'quotation_or_purchase_consultation':'Quotation or Purchase Consultation',
                            'Quotation or Purchase consultation':'Quotation or Purchase Consultation',
                            'Purchase or Quotation':'Quotation or Purchase Consultation',
                            'Others':'Other','other_':'Other','other':'Other','ETC.':'Other','Etc.':'Other','others':'Other',
                            'Usage or Technical Consultation':'Technical Consultation	',
                            'Usage or Technical Consultation':'Technical Consultation	',
                            'usage or technical consultation':'Technical Consultation	',
                            },
                            inplace=True)

### 15) product_category
- 요청 제품 카테고리

In [32]:
df_all['product_category'] = df_all['product_category'].fillna('etc.')

In [33]:
keyword = df_all[df_all['product_category'].str.contains('otros|outros|ฯลฯ|آخر|Etc|Other|Others|other|others|error')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'etc.')

# ฯลฯ etc آخر 최신 otros 기타(스페인)

In [34]:
# 전처리 함수 정의
def preprocess_category(text):
    # 만약 ','가 존재하면 첫 번째 ',' 앞의 문장을 추출, 아니면 그대로 반환
    if ',' in text:
        return text.split(',')[0].strip()
    else:
        return text.strip()

# 열에 대해 전처리 함수 적용
df_all['product_category'] = df_all['product_category'].apply(preprocess_category)

In [35]:
# 중복 카테고리 통일하기
keyword = df_all[df_all['product_category'].str.contains('accessories')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'accessories')

keyword = df_all[df_all['product_category'].str.contains('chiller|مبرد (تشيلر)')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'chiller')

keyword = df_all[df_all['product_category'].str.contains('air conditionor|ar condicionado residencial|aire acondicionado residencial|เครื่องปรับอากาศเผื่อที่อยู่อาศัย|تكييف وتبريد')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'air conditionor')

keyword = df_all[df_all['product_category'].str.contains('cloud device')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'cloud device')

keyword = df_all[df_all['product_category'].str.contains('control')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'control')

keyword = df_all[df_all['product_category'].str.contains('ess')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'ess')

keyword = df_all[df_all['product_category'].str.contains('حلول التدفئة|heat|heating|aquecimento|calefacción')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'heating')

keyword = df_all[df_all['product_category'].str.contains('高亮度顯示屏|high brightness|high brightness signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'high brightness signage')

keyword = df_all[df_all['product_category'].str.contains('醫院電視|hospital tv')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'hospital tv')

keyword = df_all[df_all['product_category'].str.contains('酒店電視|hotel tv')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'hotel tv')

keyword = df_all[df_all['product_category'].str.contains('pol|ctv|htv|tv ')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'tv')

keyword = df_all[df_all['product_category'].str.contains('idb|interactive digital board')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'interactive digital board')

keyword = df_all[df_all['product_category'].str.contains('互動式顯示屏|interactive signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'interactive signage')

keyword = df_all[df_all['product_category'].str.contains('laptop|notebook')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'laptop')

keyword = df_all[df_all['product_category'].str.contains('顯示屏|led signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'led signage')

keyword = df_all[df_all['product_category'].str.contains('medical display|medical displays')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'medical display')

keyword = df_all[df_all['product_category'].str.contains('monitor')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'monitor')

keyword = df_all[df_all['product_category'].str.contains('multi-split')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'multi-split')

keyword = df_all[df_all['product_category'].str.contains('oled|oled 顯示屏|oled signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'oled signage')
# 'oled 顯示屏'의 sub_category 'OLED 透明觸控顯示屏'-> Transparent OLED Touch Signage로 변경

keyword = df_all[df_all['product_category'].str.contains('one-quick series|one quick|quick')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'one-quick series')

keyword = df_all[df_all['product_category'].str.contains('projector')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'projector')

keyword = df_all[df_all['product_category'].str.contains('signage care solution')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'signage care solution')

keyword = df_all[df_all['product_category'].str.contains('single split|single-split')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'single-split')

keyword = df_all[df_all['product_category'].str.contains('軟體|software solution')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'software solution')

keyword = df_all[df_all['product_category'].str.contains('特別顯示屏|special signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'special signage')

keyword = df_all[df_all['product_category'].str.contains('標準顯示屏|standard signage')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'standard signage')

keyword = df_all[df_all['product_category'].str.contains('air|ventilation')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'ventilation')

keyword = df_all[df_all['product_category'].str.contains('wall')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'video wall signage')

keyword = df_all[df_all['product_category'].str.contains('vrf|ahu|نظام التدفق المتغير')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'vrf')

keyword = df_all[df_all['product_category'].str.contains('webos')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'webos')

keyword = df_all[df_all['product_category'].str.contains('allin|aio')]['product_category']
df_all['product_category'] = df_all['product_category'].replace(list(keyword),'aio')

In [36]:
df_all['product_category'].value_counts(ascending=True)

parts                            1
hospitality                      1
אחר                              1
pro centric hotel                1
43us660h (na)                    1
                             ...  
led signage                   3667
multi-split                   4225
interactive digital board     6259
vrf                           6500
etc.                         23559
Name: product_category, Length: 155, dtype: int64

### 16) product_subcategory
- 요청 제품 하위 카테고리

In [37]:
df_all['product_subcategory'] = df_all['product_subcategory'].fillna('etc.')

### 17) product_modelname
- 요청 제품 모델명

In [38]:
# 전처리 : 결측값 Anything 으로 대체
df_all['product_modelname'] = df_all['product_modelname'].fillna('Anything')
# 전처리 : 값에 국가 존재하면 값에서 제거 후 대체
df_all['product_modelname'] = df_all['product_modelname'].str.replace(r'\(NA\)|\(MEA\)|\(EU\)|\(EU/CIS\)|\(ASIA\)|\(INDIA\)|\(CIS\)|\(Colombia\)|\(SCA\)|\(EU Only\)|\(Brazil Only\)|\(LATAM\)|\(Japan\)', '', regex=True).str.strip()
# 전처리 : 모델명과 모델 코드 모두 존재하면 모델 코드로 대체
df_all['product_modelname'] = df_all['product_modelname'].replace({'UltraWide Ergo(34WN780)':'34WN780','UltraFine Ergo(32UN880)':'32UN880',
                                     'DualUp(28MQ780)':'28MQ780','Ergo Dual(27QP88D)':'27QP88D',
                                     '65EP5G OLED Pro':'65EP5G'})
# 전처리 : 같은 것을 지칭하는 경우
df_all['product_modelname'] = df_all['product_modelname'].replace({'B, 32HL512D':'32HL512D','Diagnostic Monitors':'Diagnostic Monitor','SuperSign CMS':'LG SuperSign CMS'})
# 전처리 : 모델명이 아닌 문장인 것들은 'other'로 대체
df_all['product_modelname'] = df_all['product_modelname'].replace(['Total Care Thru One-stop Service', 'Architect , We are Meeting for Enqiry Generation ( This is not a Inquiry)', 'Total Care Thru One', 'Due to budget they have hold the requiement', 'Required After 3 Months', 'Want Split AC', 'Only Installation Need', 'Passed on to Fixxy distribution', 'full', 'This is being dealt with by LG Germany.', 'SuperSign Media Editor', 'SuperSign WB', 'ALL Surgical', 'Surgical', 'diagnostic', 'LGESL Export team is follow up the lead', 'Video', 'Inquiry forwarded to Shaker', 'AI/Machine Learning | Antennas, Transmitters and Towers | Audience Measurement | Cameras and Lenses', 'One:Quick', 'Solution'], 'Other')

# 전처리 : '-' 다음의 띄어쓰기를 없애고 반환
def preprocess_modelname(modelname):
    return modelname.replace('- ', '')
df_all['product_modelname'] = df_all['product_modelname'].apply(preprocess_modelname)

### 18) customer_country.1
- 담당 자사 법인명 기반의 지역 정보(대륙)

Drop

### 19) customer_position
- 고객의 회사 직책

### preprocess

In [39]:
df_all['customer_position'].fillna('other', inplace=True)

In [40]:
keyword = df_all[df_all['customer_position'].str.contains('none|other|others|not|exhibition|no|customer|unpaid')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'other')

keyword = df_all[df_all['customer_position'].str.contains('manager|mana')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'manager')

keyword = df_all[df_all['customer_position'].str.contains('ceo|maker|founder|chief|president|boss|vp|vice|chairman|owner|lider')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'ceo/founder')

keyword = df_all[df_all['customer_position'].str.contains('director')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'director')

keyword = df_all[df_all['customer_position'].str.contains('associate')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'associate/analyst')

keyword = df_all[df_all['customer_position'].str.contains('partner')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'partner')

keyword = df_all[df_all['customer_position'].str.contains('entry|intern')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'entry level')

keyword = df_all[df_all['customer_position'].str.contains('hospital|medical|tierarzt|surgery|pathologist|főorvos|radiology')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'hospital')

keyword = df_all[df_all['customer_position'].str.contains('executive')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'c-level executive')

keyword = df_all[df_all['customer_position'].str.contains('consult')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'consultant')

keyword = df_all[df_all['customer_position'].str.contains('decision influencer|decision-influencer')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'decision influencer')

keyword = df_all[df_all['customer_position'].str.contains('edu|teach|prof|academ|college|lecturer|faculty|coach|exam|physics|dean')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'education')

keyword = df_all[df_all['customer_position'].str.contains('research')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'research')

keyword = df_all[df_all['customer_position'].str.contains('end')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'end-user')

keyword = df_all[df_all['customer_position'].str.contains('operation|gerente|genel')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'operation')

keyword = df_all[df_all['customer_position'].str.contains('sales|develop')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'sales')

keyword = df_all[df_all['customer_position'].str.contains('técnico|software|system|tech')]['customer_position']
df_all['customer_position'] = df_all['customer_position'].replace(list(keyword),'software')

In [41]:
counts = df_all['customer_position'].value_counts()
result = counts[counts == 1]
print(result)

idx_lst = result.index.tolist()

for val in idx_lst:
    idx = df_all['customer_position'][df_all['customer_position'] == val].index
    df_all['customer_position'].loc[idx] = 'other'

distributor                                                1
this is a consume display requirement for home purpose.    1
proprietário(a)                                            1
mindenes                                                   1
pgt chemistry                                              1
Name: customer_position, dtype: int64


### 20) response_corporate
- 담당 자사 법인명

결측값 없음

### 21) expected_timeline
- 고객의 요청한 처리 일정  
NLP 처리 필요

In [42]:
df_all['expected_timeline'].fillna('neutral', inplace=True)

In [43]:
# 결측값 처리
df_all['expected_timeline'] = df_all['expected_timeline'].fillna('neutral')
# 기간값 수치형 변환
df_all['expected_timeline'] = df_all['expected_timeline'].replace({'3 months':3,'3 months ~ 6 months':4.5,'3_months_~_6_months':4.5,
                                                            '45 days':1.5,'6 months ~ 9 months':7.5,'6_months_~_9_months':7.5,
                                                            '9 months - 1 year':10.5,'9 months ~ 1 year':10.5,'9_months_-_1_year':10.5,
                                                            'more than a year':12,'more then 3 months':3,'more_than_a_year':12,
                                                            'less than 3 months':1.5,'less than 5 months':2.5,'less than 6 months':3,
                                                            'less then 6 months':3,'less_than_3_months':1.5 })
# 문자형일 경우 0으로 변환
df_all['expected_timeline'] = df_all['expected_timeline'].apply(lambda x:0 if isinstance(x,str) else x)

In [44]:
df_all['expected_timeline'].value_counts()

0.0     33999
1.5     19061
4.5      5461
12.0     3299
10.5     1366
7.5      1269
3.0       113
2.5         2
Name: expected_timeline, dtype: int64

### 22) ver_cus
- 특정 Vertical Level 1(사업영역) 이면서 Customer_type(고객 유형)이 소비자(End-user)인 경우에 대한 가중치

In [45]:
df_all['ver_cus'].fillna(0, inplace=True)

### 23) ver_pro
- 특정 Vertical Level 1(사업영역) 이면서 특정 Product Category(제품 유형)인 경우에 대한 가중치

In [46]:
df_all['ver_pro'].fillna(0, inplace=True)

### 26) business_area
- 고객의 사업 영역

In [47]:
df_all['business_area'].fillna('Others', inplace=True)

### 27)  business_subarea
- 고객의 세부 사업 영역

In [48]:
df_all['business_subarea'].fillna('Others', inplace=True)

### 28)  lead_owner
- 영업 담당자 이름

결측값 없음

In [49]:
df_all

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,philippines,AS,0.066667,32160,End-Customer,Enterprise,,0.0,0.0,...,LGEPH,1.5,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.00,philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,0.0,0.0,...,LGEPH,1.5,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.00,india,AS,0.088889,1755,End-Customer,Enterprise,144.0,0.0,0.0,...,LGEIL,1.5,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.00,india,AS,0.088889,4919,End-Customer,Enterprise,,0.0,0.0,...,LGEIL,1.5,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.00,india,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,0.0,0.0,...,LGEIL,1.5,0,0,0.003079,0.026846,corporate / office,Others,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.50,brazil,AS,,40292,Others,Enterprise,10.0,0.0,0.0,...,LGESP,0.0,0,0,,,Others,Others,97,False
5267,0.25,united states,IT,,47466,Others,Enterprise,0.0,0.0,0.0,...,LGEUS,0.0,0,0,,,Others,Others,438,False
5268,0.75,brazil,AS,,46227,Specifier/ Influencer,Enterprise,,0.0,0.0,...,LGESP,1.5,0,0,,,Others,Others,97,True
5269,0.00,germany,IT,,45667,End-Customer,SMB,,0.0,0.0,...,LGEDG,0.0,0,0,,,Others,Others,429,False


In [50]:
df_train = df_all.iloc[: len(df_train)]
df_test = df_all.iloc[len(df_train) :]

### train / test 나눠서 결측값 처리

In [51]:
# 4) com_reg_ver_win_rate
business_unit_lst =  df_all['business_unit'].unique().tolist()
country_lst = df_all['customer_country'].unique().tolist()

for unit in business_unit_lst:
    for country in country_lst:
        # 특정 사업부이면서 특정 국가인 경우의 com_reg_ver_win_rate의 평균값 계산
        mean_rate = df_train[(df_train['business_unit'] == unit) & (df_train['customer_country'] == country)]['com_reg_ver_win_rate'].mean()

        # 평균값이 nan인 경우
        if pd.isna(mean_rate) == True:
            df_train.loc[(df_train['business_unit'] == unit) & (df_train['customer_country'] == country) & df_train['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = df_train['com_reg_ver_win_rate'].median()
            df_test.loc[(df_test['business_unit'] == unit) & (df_test['customer_country'] == country) & df_test['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = df_train['com_reg_ver_win_rate'].median()
        else:
            # 평균값으로 결측값 대체
            df_train.loc[(df_train['business_unit'] == unit) & (df_train['customer_country'] == country) & df_train['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = mean_rate
            df_test.loc[(df_test['business_unit'] == unit) & (df_test['customer_country'] == country) & df_test['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = mean_rate



# business_unit_lst =  df_test['business_unit'].unique().tolist()
# country_lst = df_test['customer_country'].unique().tolist()

# for unit in business_unit_lst:
#     for country in country_lst:
#         # 특정 사업부이면서 특정 국가인 경우의 com_reg_ver_win_rate의 평균값 계산
#         mean_rate = df_test[(df_test['business_unit'] == unit) & (df_test['customer_country'] == country)]['com_reg_ver_win_rate'].mean()

#         # 평균값이 nan인 경우
#         if pd.isna(mean_rate) == True:
#             df_test.loc[(df_test['business_unit'] == unit) & (df_test['customer_country'] == country) & df_test['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = df_test['com_reg_ver_win_rate'].median()
#         else:
#             # 평균값으로 결측값 대체
#             df_test.loc[(df_test['business_unit'] == unit) & (df_test['customer_country'] == country) & df_test['com_reg_ver_win_rate'].isna(), 'com_reg_ver_win_rate'] = mean_rate

In [52]:
# 8) historical_existing_cnt
med = df_train['historical_existing_cnt'].median()
df_train['historical_existing_cnt'] = df_train['historical_existing_cnt'].fillna(med)
df_test['historical_existing_cnt'] = df_test['historical_existing_cnt'].fillna(med)

# 24) ver_win_rate_x
med = df_train['ver_win_rate_x'].median()
df_train['ver_win_rate_x'] = df_train['ver_win_rate_x'].fillna(med)
df_test['ver_win_rate_x'] = df_test['ver_win_rate_x'].fillna(med)

# 25) ver_win_ratio_per_bu
med = df_train['ver_win_ratio_per_bu'].median()
df_train['ver_win_ratio_per_bu'] = df_train['ver_win_ratio_per_bu'].fillna(med)
df_test['ver_win_ratio_per_bu'] = df_test['ver_win_ratio_per_bu'].fillna(med)

### train / test 나눠서 스케일링

In [53]:
# 연속형 수치형 칼럼(인덱스를 의미하는 정수형 포함X)
numeric_col = ['bant_submit','com_reg_ver_win_rate','historical_existing_cnt','lead_desc_length',
               'expected_timeline','ver_win_rate_x','ver_win_ratio_per_bu']

# StandardScaler 객체 생성 및 훈련 데이터에 적용
scaler = StandardScaler()
df_train[numeric_col] = scaler.fit_transform(df_train[numeric_col])
df_test[numeric_col] = scaler.transform(df_test[numeric_col]) # fit_transform -> transform

In [54]:
columns = df_train.columns ; columns

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_position', 'response_corporate',
       'expected_timeline', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea',
       'lead_owner', 'is_converted'],
      dtype='object')

In [55]:
df = pd.concat([df_train[columns], df_test[columns]])

## 3. 파생변수 생성

In [56]:
# 영업 전환율 기준으로 상(40%~), 중(10%~40%), 하(~10%)로 담당 자사 구분
high_cor = [1.0, 2.0, 1027.0, 4.0, 5.0, 7.0, 10.0, 12.0, 14.0, 1043.0, 532.0, 1046.0, 542.0, 34.0, 551.0, 43.0, 557.0, 560.0, 566.0, 570.0, 571.0, 572.0, 579.0, 585.0, 84.0, 597.0, 86.0, 599.0, 603.0, 608.0, 611.0, 612.0, 104.0, 621.0, 113.0, 114.0, 629.0, 119.0, 632.0, 636.0, 637.0, 128.0, 129.0, 137.0, 145.0, 146.0, 147.0, 149.0, 152.0, 156.0, 668.0, 158.0, 163.0, 165.0, 166.0, 172.0, 687.0, 183.0, 705.0, 227.0, 228.0, 751.0, 786.0, 278.0, 281.0, 291.0, 297.0, 303.0, 312.0, 315.0, 831.0, 832.0, 833.0, 837.0, 839.0, 327.0, 330.0, 331.0, 332.0, 335.0, 849.0, 339.0, 345.0, 858.0, 351.0, 353.0, 866.0, 355.0, 867.0, 368.0, 369.0, 371.0, 372.0, 373.0, 375.0, 377.0, 890.0, 379.0, 901.0, 396.0, 406.0, 926.0, 931.0, 937.0, 428.0, 941.0, 943.0, 437.0, 438.0, 439.0, 956.0, 448.0, 960.0, 453.0, 455.0, 457.0, 971.0, 460.0, 462.0, 975.0, 976.0, 977.0, 466.0, 464.0, 469.0, 983.0, 474.0, 476.0, 989.0, 991.0, 480.0, 479.0, 483.0, 489.0, 1003.0, 492.0, 501.0, 503.0, 504.0, 507.0, 510.0]
medium_cor = [514.0, 3.0, 1026.0, 518.0, 6.0, 8.0, 9.0, 520.0, 11.0, 13.0, 525.0, 1040.0, 17.0, 19.0, 534.0, 1047.0, 24.0, 25.0, 28.0, 31.0, 32.0, 35.0, 36.0, 550.0, 552.0, 553.0, 556.0, 46.0, 47.0, 48.0, 49.0, 564.0, 567.0, 568.0, 61.0, 68.0, 580.0, 73.0, 75.0, 76.0, 589.0, 78.0, 77.0, 83.0, 600.0, 88.0, 89.0, 601.0, 604.0, 93.0, 92.0, 99.0, 100.0, 615.0, 616.0, 617.0, 106.0, 618.0, 619.0, 620.0, 624.0, 625.0, 626.0, 627.0, 116.0, 628.0, 630.0, 118.0, 120.0, 121.0, 117.0, 123.0, 635.0, 634.0, 135.0, 138.0, 657.0, 148.0, 150.0, 151.0, 153.0, 154.0, 155.0, 666.0, 667.0, 157.0, 159.0, 161.0, 162.0, 164.0, 167.0, 680.0, 169.0, 170.0, 168.0, 171.0, 173.0, 682.0, 681.0, 177.0, 182.0, 186.0, 188.0, 190.0, 191.0, 195.0, 201.0, 203.0, 211.0, 213.0, 726.0, 729.0, 217.0, 220.0, 222.0, 223.0, 229.0, 231.0, 238.0, 239.0, 240.0, 241.0, 242.0, 760.0, 254.0, 766.0, 255.0, 257.0, 260.0, 772.0, 262.0, 263.0, 264.0, 267.0, 269.0, 270.0, 271.0, 783.0, 272.0, 275.0, 279.0, 282.0, 283.0, 288.0, 289.0, 298.0, 302.0, 304.0, 308.0, 310.0, 311.0, 313.0, 314.0, 316.0, 317.0, 835.0, 324.0, 325.0, 838.0, 333.0, 334.0, 847.0, 337.0, 340.0, 342.0, 343.0, 348.0, 352.0, 356.0, 360.0, 362.0, 875.0, 374.0, 382.0, 386.0, 388.0, 410.0, 430.0, 431.0, 436.0, 950.0, 440.0, 953.0, 442.0, 955.0, 445.0, 446.0, 957.0, 449.0, 962.0, 451.0, 450.0, 969.0, 970.0, 459.0, 972.0, 467.0, 982.0, 472.0, 473.0, 475.0, 477.0, 482.0, 485.0, 487.0, 488.0, 493.0, 495.0, 506.0, 508.0]
low_cor = [0.0, 15.0, 16.0, 18.0, 20.0, 21.0, 22.0, 23.0, 26.0, 27.0, 29.0, 30.0, 33.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 44.0, 45.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 69.0, 70.0, 71.0, 72.0, 74.0, 79.0, 80.0, 81.0, 82.0, 85.0, 87.0, 90.0, 91.0, 94.0, 95.0, 96.0, 97.0, 98.0, 101.0, 102.0, 103.0, 105.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 115.0, 122.0, 124.0, 125.0, 126.0, 127.0, 130.0, 133.0, 134.0, 136.0, 139.0, 140.0, 141.0, 142.0, 143.0, 144.0, 160.0, 174.0, 175.0, 176.0, 178.0, 179.0, 180.0, 181.0, 184.0, 185.0, 187.0, 189.0, 192.0, 193.0, 194.0, 196.0, 197.0, 198.0, 199.0, 200.0, 202.0, 204.0, 205.0, 206.0, 207.0, 208.0, 209.0, 210.0, 212.0, 214.0, 215.0, 216.0, 218.0, 219.0, 221.0, 224.0, 225.0, 226.0, 230.0, 232.0, 233.0, 234.0, 235.0, 236.0, 237.0, 243.0, 244.0, 245.0, 246.0, 247.0, 248.0, 249.0, 250.0, 251.0, 252.0, 253.0, 256.0, 258.0, 259.0, 261.0, 265.0, 266.0, 268.0, 273.0, 274.0, 276.0, 277.0, 280.0, 284.0, 285.0, 286.0, 287.0, 290.0, 292.0, 293.0, 294.0, 295.0, 296.0, 299.0, 300.0, 301.0, 305.0, 306.0, 307.0, 309.0, 318.0, 319.0, 320.0, 321.0, 323.0, 326.0, 328.0, 329.0, 336.0, 338.0, 341.0, 344.0, 346.0, 347.0, 349.0, 350.0, 354.0, 357.0, 358.0, 359.0, 361.0, 363.0, 364.0, 365.0, 366.0, 367.0, 370.0, 376.0, 378.0, 380.0, 381.0, 383.0, 384.0, 385.0, 387.0, 389.0, 390.0, 391.0, 392.0, 393.0, 394.0, 395.0, 397.0, 398.0, 399.0, 400.0, 401.0, 402.0, 403.0, 404.0, 405.0, 407.0, 408.0, 409.0, 411.0, 412.0, 413.0, 414.0, 415.0, 416.0, 417.0, 418.0, 419.0, 420.0, 421.0, 422.0, 423.0, 424.0, 425.0, 426.0, 427.0, 429.0, 432.0, 433.0, 434.0, 435.0, 441.0, 443.0, 444.0, 447.0, 452.0, 454.0, 456.0, 458.0, 461.0, 463.0, 465.0, 468.0, 470.0, 471.0, 478.0, 481.0, 484.0, 486.0, 490.0, 491.0, 494.0, 496.0, 497.0, 498.0, 499.0, 500.0, 502.0, 505.0, 509.0, 511.0, 512.0, 513.0, 515.0, 516.0, 517.0, 519.0, 521.0, 522.0, 523.0, 524.0, 526.0, 527.0, 528.0, 529.0, 530.0, 531.0, 533.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540.0, 541.0, 543.0, 544.0, 545.0, 546.0, 547.0, 548.0, 549.0, 554.0, 555.0, 558.0, 559.0, 562.0, 563.0, 565.0, 569.0, 573.0, 574.0, 575.0, 576.0, 577.0, 578.0, 581.0, 582.0, 583.0, 584.0, 586.0, 587.0, 588.0, 591.0, 592.0, 593.0, 594.0, 595.0, 596.0, 598.0, 602.0, 605.0, 606.0, 607.0, 609.0, 610.0, 613.0, 614.0, 622.0, 623.0, 631.0, 633.0, 638.0, 639.0, 640.0, 641.0, 642.0, 643.0, 644.0, 645.0, 646.0, 647.0, 648.0, 649.0, 650.0, 651.0, 652.0, 653.0, 654.0, 655.0, 656.0, 658.0, 659.0, 660.0, 661.0, 662.0, 663.0, 664.0, 665.0, 669.0, 670.0, 671.0, 672.0, 673.0, 674.0, 675.0, 676.0, 677.0, 678.0, 679.0, 683.0, 684.0, 685.0, 686.0, 688.0, 689.0, 690.0, 691.0, 692.0, 693.0, 694.0, 695.0, 696.0, 697.0, 698.0, 699.0, 700.0, 701.0, 702.0, 703.0, 704.0, 706.0, 707.0, 708.0, 709.0, 710.0, 711.0, 712.0, 713.0, 714.0, 715.0, 716.0, 717.0, 718.0, 719.0, 720.0, 721.0, 722.0, 723.0, 724.0, 725.0, 727.0, 728.0, 730.0, 731.0, 732.0, 733.0, 734.0, 735.0, 736.0, 737.0, 738.0, 739.0, 740.0, 741.0, 742.0, 743.0, 744.0, 745.0, 746.0, 747.0, 748.0, 749.0, 750.0, 752.0, 753.0, 754.0, 755.0, 756.0, 757.0, 758.0, 759.0, 761.0, 762.0, 763.0, 764.0, 765.0, 767.0, 768.0, 769.0, 771.0, 773.0, 774.0, 775.0, 776.0, 777.0, 778.0, 779.0, 780.0, 781.0, 782.0, 784.0, 785.0, 787.0, 788.0, 789.0, 790.0, 791.0, 792.0, 793.0, 794.0, 795.0, 796.0, 797.0, 798.0, 799.0, 800.0, 801.0, 802.0, 803.0, 804.0, 805.0, 806.0, 807.0, 808.0, 809.0, 810.0, 811.0, 812.0, 813.0, 814.0, 815.0, 816.0, 817.0, 818.0, 819.0, 820.0, 821.0, 822.0, 823.0, 825.0, 826.0, 827.0, 828.0, 834.0, 841.0, 842.0, 843.0, 844.0, 845.0, 846.0, 848.0, 850.0, 851.0, 852.0, 853.0, 854.0, 855.0, 856.0, 857.0, 859.0, 860.0, 861.0, 862.0, 864.0, 865.0, 868.0, 869.0, 870.0, 871.0, 873.0, 874.0, 876.0, 877.0, 878.0, 879.0, 881.0, 882.0, 883.0, 885.0, 886.0, 887.0, 888.0, 889.0, 891.0, 892.0, 894.0, 895.0, 896.0, 897.0, 898.0, 899.0, 900.0, 902.0, 903.0, 904.0, 905.0, 906.0, 907.0, 908.0, 909.0, 910.0, 911.0, 912.0, 915.0, 916.0, 917.0, 919.0, 920.0, 921.0, 922.0, 923.0, 924.0, 925.0, 927.0, 928.0, 929.0, 932.0, 933.0, 934.0, 935.0, 936.0, 938.0, 942.0, 944.0, 946.0, 947.0, 948.0, 949.0, 951.0, 952.0, 954.0, 958.0, 959.0, 961.0, 963.0, 964.0, 965.0, 966.0, 968.0, 973.0, 974.0, 978.0, 979.0, 980.0, 981.0, 984.0, 985.0, 986.0, 987.0, 988.0, 990.0, 992.0, 993.0, 994.0, 995.0, 996.0, 997.0, 998.0, 999.0, 1000.0, 1001.0, 1002.0, 1004.0, 1005.0, 1006.0, 1007.0, 1008.0, 1009.0, 1010.0, 1011.0, 1012.0, 1013.0, 1014.0, 1015.0, 1016.0, 1017.0, 1018.0, 1019.0, 1020.0, 1021.0, 1023.0, 1024.0, 1025.0, 1028.0, 1030.0, 1031.0, 1032.0, 1033.0, 1037.0, 1038.0, 1039.0, 1041.0, 1042.0, 1044.0, 1045.0, 1048.0, 1049.0, 1055.0, 1056.0, 1058.0, 1062.0, 1063.0, 1065.0, 1066.0, 1067.0, 1068.0, 1069.0, 1070.0, 1071.0, 1072.0, 1073.0, 1074.0, 1075.0, 1076.0, 1077.0, 1078.0, 1079.0, 1080.0, 1081.0, 1082.0, 1083.0, 1084.0, 1085.0, 1086.0, 1088.0, 1089.0, 1090.0, 1091.0, 1092.0, 1093.0, 1094.0, 1095.0, 1096.0, 1098.0, 1099.0, 1101.0, 1102.0, 1103.0, 1105.0, 1106.0, 1108.0, 1109.0, 1110.0, 1111.0, 1114.0]

df['lead_owner_rank'] = 0
high_cor_row = df['lead_owner'].isin(high_cor)
df.loc[high_cor_row,'lead_owner_rank'] = 2
medium_cor_row = df['lead_owner'].isin(medium_cor)
df.loc[medium_cor_row,'lead_owner_rank'] = 1
low_cor_row = df['lead_owner'].isin(low_cor)
df.loc[low_cor_row,'lead_owner'] = 0

# 과거 영업 전환 되었던 비율이 양수인 고객과 음수인 고객 구분
df['historical_convert_plus_minus'] = 0
tmp_idx = df['historical_existing_cnt'].sort_values(ascending=False)[:4706].index
df.loc[tmp_idx, 'historical_convert_plus_minus'] = 1
tmp_idx = df['historical_existing_cnt'].sort_values(ascending=False)[4706:].index
df.loc[tmp_idx, 'historical_convert_plus_minus'] = -1

# 파생변수 : 과거 영업 전환 되었던 횟수가 100번 이상인 고객 인덱스
tmp_idx = df['historical_existing_cnt'].sort_values(ascending=False).head(1400).index
df['historical_convert_over_100'] = 0
df.loc[tmp_idx, 'historical_convert_over_100'] = 1

# 영업 전환율 기준으로 상(20%~), 중(10%~20%), 하(~10%)로 담당 자사 구분
high_cor = ['LGEHK', 'LGETT', 'LGEAF', 'LGECZ', 'LGERO', 'LGEMC', 'LGELA']
medium_cor = ['LGEIN', 'LGEPH', 'LGETH', 'LGESA', 'LGEGF', 'LGEUS', 'LGEPT', 'LGEAR', 'LGEHS', 'LGEAP', 'LGESL']
low_cor = ['LGECB', 'LGEML', 'LGEPS', 'LGEIR', 'LGEEB', 'LGELF', 'LGEVH', 'LGEES', 'LGEPR', 'LGEIS', 'LGEAS', 'LGEUR', 'LGEYK', 'LGEEG', 'LGECH', 'LGECI', 'LGEIL', 'LGECL', 'LGESP', 'LGERA', 'LGEKR', 'LGESJ', 'LGEBT', 'LGEJP', 'LGEAG', 'LGEMK', 'LGEFS', 'LGEBN', 'LGEPL', 'LGEUK', 'LGETK', 'LGEMS', 'LGEEF', 'LGESW', 'LGEDG']
df['corporate_rank'] = 0
high_cor_row = df['response_corporate'].isin(high_cor)
df.loc[high_cor_row,'corporate_rank'] = 2
medium_cor_row = df['response_corporate'].isin(medium_cor)
df.loc[medium_cor_row,'corporate_rank'] = 1
low_cor_row = df['response_corporate'].isin(low_cor)
df.loc[low_cor_row,'corporate_rank'] = 0

In [57]:
high_convert_position= ['associate/analyst','consultant','decision influencer','manager']
df['important_customer_position'] = df['customer_position'].isin(high_convert_position).astype(int)

In [58]:
# 국적과 사업부 조합
df['feat_2'] = df['customer_country'] + df['business_unit']

# 고객 유형과 직업 조합
df['feat_3'] = df['customer_type'] + df['customer_job']

# 고객 회사명과 국가 조합
df['customer_idx'] = df['customer_idx'].astype(str)
df['customer_idx_country'] = df['customer_idx'] + df['customer_country']

# 영업 담당자 이름과 국가 조합
df['lead_owner'] = df['lead_owner'].astype(str)
df['lead_owner_country'] = df['lead_owner'] + df['customer_country']

In [59]:
df

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,is_converted,lead_owner_rank,historical_convert_plus_minus,historical_convert_over_100,corporate_rank,important_customer_position,feat_2,feat_3,customer_idx_country,lead_owner_country
0,1.277360,philippines,AS,-0.120277,32160,End-Customer,Enterprise,-0.163685,0.0,0.0,...,True,0,-1,0,1,0,philippinesAS,End-Customerbuyer,32160philippines,0philippines
1,1.277360,philippines,AS,-0.120277,23122,End-Customer,Enterprise,0.191067,0.0,0.0,...,True,2,-1,0,1,0,philippinesAS,End-Customermedia and communication,23122philippines,1philippines
2,1.277360,india,AS,0.062087,1755,End-Customer,Enterprise,6.044481,0.0,0.0,...,True,2,-1,1,0,0,indiaAS,End-Customerengineering,1755india,2india
3,1.277360,india,AS,0.062087,4919,End-Customer,Enterprise,-0.163685,0.0,0.0,...,True,1,-1,0,0,0,indiaAS,End-Customerentrepreneurship,4919india,3india
4,1.277360,india,AS,0.062087,17126,Specifier/ Influencer,Enterprise,-0.163685,0.0,0.0,...,True,2,-1,0,0,0,indiaAS,Specifier/ Influencerconsulting,17126india,4india
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,-0.470501,brazil,AS,-0.588290,40292,Others,Enterprise,0.102379,0.0,0.0,...,False,0,-1,0,0,1,brazilAS,Othersother,40292brazil,0brazil
5267,-1.344431,united states,IT,4.382139,47466,Others,Enterprise,-0.341062,0.0,0.0,...,False,2,-1,0,1,0,united statesIT,Othersother,47466united states,438united states
5268,0.403430,brazil,AS,-0.588290,46227,Specifier/ Influencer,Enterprise,-0.163685,0.0,0.0,...,True,0,-1,0,0,1,brazilAS,Specifier/ Influencerentrepreneurship,46227brazil,0brazil
5269,-2.218362,germany,IT,1.226414,45667,End-Customer,SMB,-0.163685,0.0,0.0,...,False,0,-1,0,0,0,germanyIT,End-Customerother,45667germany,0germany


## 4. 인코딩

In [60]:
df_train = df.iloc[: len(df_train)]
df_test = df.iloc[len(df_train) :]

- Label Encoding

In [61]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환"""
    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [62]:
# 레이블 인코딩할 칼럼들
label_columns  = df.select_dtypes(include=['object']).columns.to_list()
# label_columns.remove('enterprise')

for col in label_columns:
    df_train[col] = label_encoding(df_train[col])

for col in label_columns:
    df_test[col] = label_encoding(df_test[col])

## 5. 모델 학습

- Checking Feature Importance

In [63]:
#feature importance가 낮은 파생변수 drop.

df_train.drop(["historical_convert_over_100"], axis=1, inplace=True)
df_test.drop(["historical_convert_over_100"], axis=1, inplace=True)

- 학습, 검증 데이터 분리 + Under Sampling

In [64]:
x_train = df_train.drop(["is_converted"], axis=1)
x_test = df_test.drop(["is_converted"], axis=1)
y_train = df_train["is_converted"]
y_test = df_test["is_converted"]

# 언더샘플링을 위한 RandomUnderSampler 객체 생성
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
x_train_sample, y_train_sample= rus.fit_resample(x_train,y_train)
x_train, x_val, y_train, y_val = train_test_split(x_train_sample, y_train_sample,test_size=0.2,
                                                  shuffle=True,random_state=400,
                                                  stratify=y_train_sample)

In [65]:
dt_clf= DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train, y_train)
feature_importance = dt_clf.feature_importances_ * 100
result_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importance})
# Feature Importance를 Importance 기준으로 내림차순 정렬
result_df = result_df.sort_values(by='Importance', ascending=False)
print(result_df)

                          Feature  Importance
34             lead_owner_country   46.536288
33           customer_idx_country   14.608633
4                    customer_idx   10.715218
5                   customer_type    3.420007
32                         feat_3    2.997317
27                lead_owner_rank    2.679123
18             response_corporate    2.124817
12               lead_desc_length    2.083323
3            com_reg_ver_win_rate    1.537771
0                     bant_submit    1.158287
11                   customer_job    1.122062
24                  business_area    0.997467
7         historical_existing_cnt    0.953978
25               business_subarea    0.909173
19              expected_timeline    0.829191
31                         feat_2    0.722636
17              customer_position    0.710843
13                   inquiry_type    0.672031
15            product_subcategory    0.658462
26                     lead_owner    0.621038
16              product_modelname 

### Basic Model

- 모델 객체 모음

In [66]:
dt_clf= DecisionTreeClassifier(random_state=42)
# lr_clf = LogisticRegression(random_state=42)
# rf_clf = RandomForestClassifier(random_state=42)
# ada_clf = AdaBoostClassifier(random_state=42)
# xgb_clf = XGBClassifier(random_state=42)

- 모델 훈련

In [67]:
dt_clf.fit(x_train, y_train)
# lr_clf.fit(x_train, y_train)
# rf_clf.fit(x_train, y_train)
# ada_clf.fit(x_train, y_train)
# xgb_clf.fit(x_train, y_train)

- 기본 성능 평가

In [68]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [69]:
print('검증 데이터 성능')
pred = dt_clf.predict(x_val.fillna(0))
print(get_clf_eval(y_val, pred))

print('테스트 데이터 성능')
test_pred = dt_clf.predict(x_test)
sum(test_pred) # True로 예측된 개수
print(get_clf_eval(y_test, test_pred))

검증 데이터 성능
오차행렬:
 [[886  84]
 [ 93 877]]

정확도: 0.9088
정밀도: 0.9050
재현율: 0.9134
F1: 0.9092
None
테스트 데이터 성능
오차행렬:
 [[ 387  774]
 [1035 3075]]

정확도: 0.6568
정밀도: 0.2722
재현율: 0.3333
F1: 0.2997
None


## 6. Parameter Tuning, OOF

### optuna parameter tuning

XGBoost

In [70]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }

    classifier_obj = XGBClassifier(**params)

    score = cross_val_score(classifier_obj, x_train, y_train, scoring='f1', cv=5)
    f1_score = score.mean()
    return f1_score

# 최적화 실행
xgb_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0), direction="maximize")
xgb_study.optimize(objective, n_trials=18)

#최적화 결과 보기
print("Best score:", xgb_study.best_value)
print("Best parameters:", xgb_study.best_params)

[I 2024-03-01 11:54:44,540] A new study created in memory with name: no-name-bfe09eb9-37aa-4755-9e0c-91b9db37c308
[I 2024-03-01 11:54:46,533] Trial 0 finished with value: 0.9224871167740568 and parameters: {'num_leaves': 54, 'max_depth': 8, 'learning_rate': 0.006431172050131994, 'n_estimators': 109, 'class_weight': None, 'min_child_samples': 27, 'subsample': 0.9675319002346239, 'colsample_bytree': 0.9890988281503088, 'reg_alpha': 0.3834415188257777, 'reg_lambda': 7.917250380826646}. Best is trial 0 with value: 0.9224871167740568.
[I 2024-03-01 11:54:46,790] Trial 1 finished with value: 0.9229710974995129 and parameters: {'num_leaves': 47, 'max_depth': 6, 'learning_rate': 0.05981221901152557, 'n_estimators': 11, 'class_weight': 'balanced', 'min_child_samples': 44, 'subsample': 0.9334470252849552, 'colsample_bytree': 0.9610036444740457, 'reg_alpha': 0.978618342232764, 'reg_lambda': 7.9915856421672355}. Best is trial 1 with value: 0.9229710974995129.
[I 2024-03-01 11:54:50,225] Trial 2 fi

Best score: 0.9394335772792282
Best parameters: {'num_leaves': 10, 'max_depth': 9, 'learning_rate': 0.034067960632029944, 'n_estimators': 370, 'class_weight': None, 'min_child_samples': 21, 'subsample': 0.7669898676720104, 'colsample_bytree': 0.9492780685840049, 'reg_alpha': 0.8062666777599033, 'reg_lambda': 4.627373657351357}


LightGBM

In [71]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }

    classifier_obj = LGBMClassifier(**params)

    score = cross_val_score(classifier_obj, x_train, y_train, scoring='f1', cv=5)
    f1_score = score.mean()
    return f1_score

# 최적화 실행
lgbm_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0), direction="maximize")
lgbm_study.optimize(objective, n_trials=18)

#최적화 결과 보기
print("Best score:", lgbm_study.best_value)
print("Best parameters:", lgbm_study.best_params)

[I 2024-03-01 11:56:38,123] A new study created in memory with name: no-name-72a0f9ba-7280-44aa-b3d5-51d77d27f4ea


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:39,135] Trial 0 finished with value: 0.9174460948199993 and parameters: {'num_leaves': 54, 'max_depth': 8, 'learning_rate': 0.006431172050131994, 'n_estimators': 109, 'class_weight': None, 'min_child_samples': 27, 'subsample': 0.9675319002346239, 'colsample_bytree': 0.9890988281503088, 'reg_alpha': 0.3834415188257777, 'reg_lambda': 7.917250380826646}. Best is trial 0 with value: 0.9174460948199993.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:39,376] Trial 1 finished with value: 0.9127974308210469 and parameters: {'num_leaves': 47, 'max_depth': 6, 'learning_rate': 0.05981221901152557, 'n_estimators': 11, 'class_weight': 'balanced', 'min_child_samples': 44, 'subsample': 0.9334470252849552, 'colsample_bytree': 0.9610036444740457, 'reg_alpha': 0.978618342232764, 'reg_lambda': 7.9915856421672355}. Best is trial 0 with value: 0.9174460948199993.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:40,716] Trial 2 finished with value: 0.9162933916805491 and parameters: {'num_leaves': 30, 'max_depth': 8, 'learning_rate': 0.00022637229697395497, 'n_estimators': 174, 'class_weight': None, 'min_child_samples': 31, 'subsample': 0.8243985819971571, 'colsample_bytree': 0.7793666836313881, 'reg_alpha': 0.7742336894342167, 'reg_lambda': 4.5615033221654855}. Best is trial 0 with value: 0.9174460948199993.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:41,133] Trial 3 finished with value: 0.8391483274826396 and parameters: {'num_leaves': 61, 'max_depth': 1, 'learning_rate': 0.007126995609048582, 'n_estimators': 152, 'class_weight': None, 'min_child_samples': 37, 'subsample': 0.8078523701721357, 'colsample_bytree': 0.8311095861398023, 'reg_alpha': 0.6976311959272649, 'reg_lambda': 0.6022547162926983}. Best is trial 0 with value: 0.9174460948199993.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:41,470] Trial 4 finished with value: 0.9214565944533923 and parameters: {'num_leaves': 116, 'max_depth': 7, 'learning_rate': 0.0004277083049962072, 'n_estimators': 14, 'class_weight': None, 'min_child_samples': 33, 'subsample': 0.8315804540386961, 'colsample_bytree': 0.9965121514177678, 'reg_alpha': 0.10204481074802807, 'reg_lambda': 2.088767560948347}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:42,005] Trial 5 finished with value: 0.878272129336396 and parameters: {'num_leaves': 4, 'max_depth': 7, 'learning_rate': 0.000575274081411669, 'n_estimators': 74, 'class_weight': 'balanced', 'min_child_samples': 14, 'subsample': 0.896898876839582, 'colsample_bytree': 0.7414548854045842, 'reg_alpha': 0.1965823616800535, 'reg_lambda': 3.687251706609641}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:42,275] Trial 6 finished with value: 0.8384908887890885 and parameters: {'num_leaves': 319, 'max_depth': 1, 'learning_rate': 0.03264635677523523, 'n_estimators': 12, 'class_weight': 'balanced', 'min_child_samples': 50, 'subsample': 0.8814536559235138, 'colsample_bytree': 0.9217790738194904, 'reg_alpha': 0.039187792254320675, 'reg_lambda': 2.828069625764096}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 

[I 2024-03-01 11:56:42,667] Trial 7 finished with value: 0.8384908887890885 and parameters: {'num_leaves': 3, 'max_depth': 3, 'learning_rate': 0.00022708223335487552, 'n_estimators': 36, 'class_weight': 'balanced', 'min_child_samples': 38, 'subsample': 0.8699804362619725, 'colsample_bytree': 0.7796168472818336, 'reg_alpha': 0.5232480534666997, 'reg_lambda': 0.9394051075844168}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 

[I 2024-03-01 11:56:44,345] Trial 8 finished with value: 0.9159358717694228 and parameters: {'num_leaves': 64, 'max_depth': 10, 'learning_rate': 0.000903039728888594, 'n_estimators': 200, 'class_weight': None, 'min_child_samples': 21, 'subsample': 0.754957408602135, 'colsample_bytree': 0.8759538804430249, 'reg_alpha': 0.020107546187493552, 'reg_lambda': 8.289400292173632}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:44,980] Trial 9 finished with value: 0.8384908887890885 and parameters: {'num_leaves': 2, 'max_depth': 7, 'learning_rate': 0.0006456897906217104, 'n_estimators': 279, 'class_weight': 'balanced', 'min_child_samples': 33, 'subsample': 0.8776125793815517, 'colsample_bytree': 0.871675571737262, 'reg_alpha': 0.2230816326406183, 'reg_lambda': 9.52749011516985}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:49,307] Trial 10 finished with value: 0.9025847640497784 and parameters: {'num_leaves': 700, 'max_depth': 4, 'learning_rate': 0.00011542675973617445, 'n_estimators': 725, 'class_weight': None, 'min_child_samples': 11, 'subsample': 0.7178120820232871, 'colsample_bytree': 0.9247056876505003, 'reg_alpha': 0.4028635122986128, 'reg_lambda': 5.914865070833098}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:49,761] Trial 11 finished with value: 0.9153222294944101 and parameters: {'num_leaves': 188, 'max_depth': 9, 'learning_rate': 0.004853364390476444, 'n_estimators': 35, 'class_weight': None, 'min_child_samples': 24, 'subsample': 0.9921812768132974, 'colsample_bytree': 0.9921500351839189, 'reg_alpha': 0.34044514094885936, 'reg_lambda': 6.090349002831858}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2355
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:50,156] Trial 12 finished with value: 0.916202954022302 and parameters: {'num_leaves': 13, 'max_depth': 5, 'learning_rate': 0.013358710038685755, 'n_estimators': 39, 'class_weight': None, 'min_child_samples': 24, 'subsample': 0.9944749447332812, 'colsample_bytree': 0.996178829237443, 'reg_alpha': 0.5454891924731106, 'reg_lambda': 2.0116384380183217}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2355
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:50,901] Trial 13 finished with value: 0.9142142124585062 and parameters: {'num_leaves': 122, 'max_depth': 9, 'learning_rate': 0.0020738813360221425, 'n_estimators': 77, 'class_weight': None, 'min_child_samples': 27, 'subsample': 0.9405655672821595, 'colsample_bytree': 0.943070459113173, 'reg_alpha': 0.20545863511548002, 'reg_lambda': 6.800909004602219}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:51,179] Trial 14 finished with value: 0.9119580577645534 and parameters: {'num_leaves': 12, 'max_depth': 7, 'learning_rate': 0.0019622858794048764, 'n_estimators': 18, 'class_weight': None, 'min_child_samples': 37, 'subsample': 0.7925845240012799, 'colsample_bytree': 0.9980647985060505, 'reg_alpha': 0.3430454576572856, 'reg_lambda': 4.61173770720476}. Best is trial 4 with value: 0.9214565944533923.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2355
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2386
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:56:59,082] Trial 15 finished with value: 0.9378041531529975 and parameters: {'num_leaves': 519, 'max_depth': 10, 'learning_rate': 0.017864027861461247, 'n_estimators': 519, 'class_weight': None, 'min_child_samples': 18, 'subsample': 0.9386822352351896, 'colsample_bytree': 0.9016280399697625, 'reg_alpha': 0.12987243368736023, 'reg_lambda': 9.655399031799776}. Best is trial 15 with value: 0.9378041531529975.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:57:11,434] Trial 16 finished with value: 0.9393046835396731 and parameters: {'num_leaves': 930, 'max_depth': 10, 'learning_rate': 0.034067960632029944, 'n_estimators': 735, 'class_weight': None, 'min_child_samples': 16, 'subsample': 0.8417713845693325, 'colsample_bytree': 0.903746741663144, 'reg_alpha': 0.09596747056870118, 'reg_lambda': 9.916933285895103}. Best is trial 16 with value: 0.9393046835396731.


[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2350
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3104, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-03-01 11:57:24,234] Trial 17 finished with value: 0.9390422456407649 and parameters: {'num_leaves': 973, 'max_depth': 10, 'learning_rate': 0.0897443559305108, 'n_estimators': 775, 'class_weight': None, 'min_child_samples': 17, 'subsample': 0.9089179510674643, 'colsample_bytree': 0.890871994386616, 'reg_alpha': 0.12875819304262714, 'reg_lambda': 9.68097100884682}. Best is trial 16 with value: 0.9393046835396731.


Best score: 0.9393046835396731
Best parameters: {'num_leaves': 930, 'max_depth': 10, 'learning_rate': 0.034067960632029944, 'n_estimators': 735, 'class_weight': None, 'min_child_samples': 16, 'subsample': 0.8417713845693325, 'colsample_bytree': 0.903746741663144, 'reg_alpha': 0.09596747056870118, 'reg_lambda': 9.916933285895103}


### OOF(Out-Of-Fold) Prediction

xgbm

In [72]:
# Kaggle에서는 특정모형의 과대적합을 줄이기 위해 OOF(Out-Of-Fold) Prediction을 자주 사용한다.
xgb_models = cross_validate(XGBClassifier(**xgb_study.best_params), # 최적화된 hyperparameter 사용
                        x_train, y_train, cv=10, scoring='f1',
                        return_estimator=True)
xgb_oof_pred_val = np.array([m.predict_proba(x_val)[:,1] for m in xgb_models['estimator']]).mean(axis=0)
xgb_oof_pred = np.array([m.predict_proba(x_test)[:,1] for m in xgb_models['estimator']]).mean(axis=0)

In [73]:
xgb_preds_1d_val = xgb_oof_pred_val.flatten() # 차원 펴주기
xgb_val_pred = np.where(xgb_preds_1d_val > 0.5, True , False)

xgb_preds_1d = xgb_oof_pred.flatten() # 차원 펴주기
xgb_test_pred = np.where(xgb_preds_1d > 0.5, True , False)

In [74]:
f1_val = f1_score(y_val, xgb_val_pred, average='micro')
print(f'검증 데이터 F1 Score: {f1_val:.4f}')
f1_test = f1_score(y_test, xgb_test_pred, average='micro')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

검증 데이터 F1 Score: 0.9402
테스트 데이터 F1 Score: 0.7018


LightGBM

In [75]:
# Kaggle에서는 특정모형의 과대적합을 줄이기 위해 OOF(Out-Of-Fold) Prediction을 자주 사용한다.
lgbm_models = cross_validate(LGBMClassifier(**lgbm_study.best_params), # 최적화된 hyperparameter 사용
                        x_train, y_train, cv=10, scoring='f1',
                        return_estimator=True)
lgbm_oof_pred_val = np.array([m.predict_proba(x_val)[:,1] for m in lgbm_models['estimator']]).mean(axis=0)
lgbm_oof_pred = np.array([m.predict_proba(x_test)[:,1] for m in lgbm_models['estimator']]).mean(axis=0)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[LightGBM] [Info] Number of positive: 3492, number of negative: 3492
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 6984, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3492, number of negative: 3492
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2406
[LightGBM] [Info] Number of data points in the train set: 6984, number of used features: 35
[LightGBM] [Info] [binary:Boo

In [76]:
lgbm_preds_1d_val = lgbm_oof_pred_val.flatten() # 차원 펴주기
lgbm_val_pred = np.where(lgbm_preds_1d_val > 0.5, True , False)

lgbm_preds_1d = lgbm_oof_pred.flatten() # 차원 펴주기
lgbm_test_pred = np.where(lgbm_preds_1d > 0.5, True , False)

In [77]:
f1_val = f1_score(y_val, lgbm_val_pred, average='micro')
print(f'검증 데이터 F1 Score: {f1_val:.4f}')
f1_test = f1_score(y_test, lgbm_test_pred, average='micro')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

검증 데이터 F1 Score: 0.9433
테스트 데이터 F1 Score: 0.6879


## 7. 앙상블
XGBoost와 LightGBM을 블렌딩

In [78]:
val_ensemble = xgb_val_pred * 0.5 + lgbm_val_pred * 0.5
val_ensemble = np.where(val_ensemble > 0.5, True , False)

In [79]:
test_ensemble = xgb_test_pred * 0.5 + lgbm_test_pred * 0.5
test_ensemble = np.where(test_ensemble > 0.5, True , False)

In [80]:
f1_val = f1_score(y_val, val_ensemble, average='micro')
print(f'검증 데이터 F1 Score: {f1_val:.4f}')
f1_test = f1_score(y_test, test_ensemble, average='micro')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

검증 데이터 F1 Score: 0.9418
테스트 데이터 F1 Score: 0.7071


## 8. 제출

In [81]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_ensemble

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)