In [1]:
import numpy as np
import random
import os
def seed_everything(seed: int = 24):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything(24)

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
%matplotlib inline
import matplotlib.font_manager as fm
plt.rcParams['font.family'] = 'NanumGothic'

import random
import os
import sys
import joblib

import time
from tqdm import tqdm
import warnings                                              
warnings.filterwarnings('ignore')  

import sklearn
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from scipy.stats import skew
from scipy.stats import boxcox
from scipy.stats import zscore

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

In [3]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from xgboost import plot_importance

from lightgbm import LGBMClassifier
from lightgbm import plot_importance

import catboost
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix, accuracy_score  #분류- 성능지표
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.model_selection import train_test_split as tts
from hyperopt import hp, fmin, tpe, Trials
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold


In [4]:
train = pd.read_csv("train.csv") # 학습용 데이터
test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)
test = test.drop(columns="id")

In [5]:
train.shape, test.shape        #test는 id열 포함 

((59299, 29), (5271, 29))

In [14]:
                                                          #변수들의 결측치, 라벨 확인
dataFeatures = []
dataType     = []
null         = []
nullPCT      = []    #결측치 비율 
unique       = []
minValue     = []
maxValue     = []
uniqueSample = []    #값 예시 


for item in list(train):
    dataFeatures.append(item)

for item in dataFeatures:
    dataType.append(train[item].dtype.name)
    
for item in dataFeatures:
    null.append(len(train[train[item].isnull() == True]))

for item in dataFeatures:
    nullPCT.append(round(len(train[train[item].isnull() == True])/len(train[item])*100,2))
    
for item in dataFeatures:
    unique.append(train[item].nunique())

for item in dataFeatures:
    uniqueSample.append(train[item].unique()[0:5])

train_info = pd.DataFrame({
    'dataFeatures' : dataFeatures,
    'dataType' : dataType,
    'null' : null,
    'nullPCT':nullPCT,
    'unique' : unique,
    'uniqueSample':uniqueSample
})
train_info

Unnamed: 0,dataFeatures,dataType,null,nullPCT,unique,uniqueSample
0,bant_submit,float64,0,0.0,5,"[1.0, 0.75, 0.5, 0.25, 0.0]"
1,customer_country,object,982,1.66,15399,"[/Quezon City/Philippines, /PH-00/Philippines,..."
2,business_unit,object,0,0.0,5,"[AS, ID, IT, Solution, CM]"
3,com_reg_ver_win_rate,float64,44731,75.43,80,"[0.0666666666666666, 0.0888888888888888, 0.040..."
4,customer_idx,int64,0,0.0,35112,"[32160, 23122, 1755, 4919, 17126]"
5,customer_type,object,43961,74.13,33,"[End-Customer, Specifier/ Influencer, Service ..."
6,enterprise,object,0,0.0,2,"[Enterprise, SMB]"
7,historical_existing_cnt,float64,45543,76.8,136,"[nan, 12.0, 144.0, 3.0, 23.0]"
8,id_strategic_ver,float64,55855,94.19,1,"[nan, 1.0]"
9,it_strategic_ver,float64,58178,98.11,1,"[nan, 1.0]"


In [7]:
train["is_converted"].value_counts()

is_converted
False    54449
True      4850
Name: count, dtype: int64

In [8]:
numeric_columns = train.select_dtypes(include=['float64', 'int64']).columns                        #각 라벨의 변수 평균값
grouped_by_mean = train.groupby('is_converted')[numeric_columns].mean();grouped_by_mean

Unnamed: 0_level_0,bant_submit,com_reg_ver_win_rate,customer_idx,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,lead_desc_length,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,lead_owner
is_converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
False,0.634805,0.075727,27361.32794,19.957518,1.0,1.0,1.0,74.808371,0.037815,0.05023,0.001132,0.048497,255.73028
True,0.632216,0.25935,24344.150722,19.023952,1.0,1.0,1.0,129.378351,0.084124,0.05732,0.000928,0.061138,339.282062


In [9]:
numeric_columns = train.select_dtypes(include=['float64', 'int64']).columns                        #각 라벨의 변수 표준편차값
grouped_by_B = train.groupby('is_converted')[numeric_columns].mean();grouped_by_B

Unnamed: 0_level_0,bant_submit,com_reg_ver_win_rate,customer_idx,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,lead_desc_length,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,lead_owner
is_converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
False,0.634805,0.075727,27361.32794,19.957518,1.0,1.0,1.0,74.808371,0.037815,0.05023,0.001132,0.048497,255.73028
True,0.632216,0.25935,24344.150722,19.023952,1.0,1.0,1.0,129.378351,0.084124,0.05732,0.000928,0.061138,339.282062


## 전처리

In [10]:
# def label_encoding(series: pd.Series) -> pd.Series:                                    #예시 코드 
#     """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

#     my_dict = {}

#     # 모든 요소를 문자열로 변환
#     series = series.astype(str)

#     for idx, value in enumerate(sorted(series.unique())):
#         my_dict[value] = idx
#     series = series.map(my_dict)

#     return series

# # 레이블 인코딩할 칼럼들
# label_columns = [
#     "customer_country",
#     "business_subarea",
#     "business_area",
#     "business_unit",
#     "customer_type",
#     "enterprise",
#     "customer_job",
#     "inquiry_type",
#     "product_category",
#     "product_subcategory",
#     "product_modelname",
#     "customer_country.1",
#     "customer_position",
#     "response_corporate",
#     "expected_timeline",
# ]

# df_all = pd.concat([train[label_columns], test[label_columns]])

# for col in label_columns:
#     df_all[col] = label_encoding(df_all[col])
    
# for col in label_columns:  
#     train[col] = df_all.iloc[: len(train)][col]
#     test[col] = df_all.iloc[len(train) :][col]

In [28]:
train.shape, test.shape

((59299, 29), (5271, 29))

### 결측치 처리

In [6]:
def preprocess_data(train, test):                   #결측치 많은 변수 제거, 기초적인 방법
    # 결측치가 50% 이상인 변수 제거
    threshold = 0.5
    cols_to_drop = train.columns[train.isnull().mean() >= threshold]
    train = train.drop(cols_to_drop, axis=1)
    test = test.drop(cols_to_drop, axis=1)

    # 결측치가 있는 행 제거
    #train = train.dropna()

    return train, test

train, test = preprocess_data(train, test)


In [7]:
def drop_columns(train, test):                                    #국가 변수제거, 임시조치
    train = train.drop(['customer_country', 'customer_country.1'], axis=1)               
    test = test.drop(['customer_country', 'customer_country.1'], axis=1)

    return train, test 
    
train, test = drop_columns(train, test)

In [8]:
def drop_columns2(train, test):                                    #애매한 결측 변수 제거 
    train = train.drop(['customer_job', 'inquiry_type', 'product_category'], axis=1)               
    test = test.drop(['customer_job', 'inquiry_type', 'product_category'], axis=1)

    return train, test
    
train, test = drop_columns2(train, test)

### 변수 내부 변환

In [9]:
def preprocess_customer_idx(train, test):     #customer_idx 전처리. 예를들어 a,a,v,s,v라면 2,2,2,1,2 로 변경한다
    # 두 데이터프레임을 합쳐서 중복되는 값의 개수를 계산
    combined = pd.concat([train['customer_idx'], test['customer_idx']])
    value_counts = combined.value_counts()
    
    # train과 test 데이터셋에 새로운 열을 추가하고, 원래의 'customer_idx' 열은 제거
    train['customer_idx_counts'] = train['customer_idx'].map(value_counts).astype('int64')
    test['customer_idx_counts'] = test['customer_idx'].map(value_counts).astype('int64')
    # train.drop('customer_idx', axis=1, inplace=True)
    # test.drop('customer_idx', axis=1, inplace=True)
    
    return train, test

# 함수 적용
train, test = preprocess_customer_idx(train, test)

In [10]:
#이 변수는 라벨인코딩 해줘야함
def preprocess_customer_position(train, test):        #customer_position 전처리, 소수이거나 타겟이 F인 직급 무효화
    
    # train 데이터에서 customer_position의 값 분포 계산
    value_counts = train["customer_position"].value_counts()
    # 100개 이상인 값만 필터링
    major_values = value_counts[value_counts >= 100].index
    
    # train에서 각 customer_position 값에 대해 is_converted가 모두 True인지 확인
    all_true_converted = train.groupby("customer_position")['is_converted'].transform(lambda x: all(x))
    
    # train 데이터 전처리
    # is_converted가 모두 True이거나, major_values에 속하는 경우에는 그대로 유지, 아니면 'the_other'로 변경
    train["customer_position"] = train.apply(lambda x: x["customer_position"] if x["customer_position"] in major_values or all_true_converted[x.name] else "the_other", axis=1)
    
    # test 데이터에서 train 데이터의 범주에 없는 값은 "the_other"로 변경
    # train 데이터에 없는 값, 또는 is_converted가 모두 True이지 않은 'the_other' 값으로 변경
    test["customer_position"] = test["customer_position"].apply(lambda x: x if x in major_values else "the_other")
    
    return train, test
    
train, test = preprocess_customer_position(train, test)

### 인코딩

In [11]:
#범주형 변수 라벨 인코딩 

def encode_selected_columns(train, test, column_names):
    for column in column_names:
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])
        test[column] = le.transform(test[column])
    return train, test

# 선택된 컬럼 이름
column_names = [ "customer_position","business_unit","enterprise","response_corporate",
                "ver_cus","ver_pro"]

# 함수 호출
train, test = encode_selected_columns(train, test, column_names)


In [12]:
#lead_owner 변수는 test에 train에는 없는 라벨이 있기에 원앤핫 인코딩한다, 아니면 drop

def one_hot_encode_column(train, test, column_name):                           #오직 train기준으로 test 대출목적 인코딩 (원앤핫)
    # train 데이터에 대해 원-핫 인코딩 적용
    train_dummies = pd.get_dummies(train[column_name], prefix=column_name)
    train_encoded = pd.concat([train, train_dummies], axis=1)
    
    # 원-핫 인코딩으로 생성된 컬럼만 추출
    new_columns = train_dummies.columns.tolist()

    # test 데이터에 대해 동일한 컬럼에 대해 원-핫 인코딩 적용
    test_dummies = pd.get_dummies(test[column_name], prefix=column_name)
    
    # train에 있는 새로운 컬럼을 test에 추가
    for col in new_columns:
        test[col] = test_dummies[col] if col in test_dummies else 0

    # 원래 컬럼 삭제
    train_encoded = train_encoded.drop(column_name, axis=1)
    test = test.drop(column_name, axis=1)

    return train_encoded, test


train, test = one_hot_encode_column(train, test, 'lead_owner')


# def drop_columns2(train, test):           #lead_owner 변수 drop                                  
#     train = train.drop(['lead_owner'], axis=1)               
#     test = test.drop(['lead_owner'], axis=1)

#     return train, test 
    
# train, test = drop_columns2(train, test)


In [13]:
#customer_idx_counts 변수는 test에 train에는 없는 라벨이 있기에 원앤핫 인코딩한다, 아니면 drop

def one_hot_encode_column(train, test, column_name):                           #오직 train기준으로 test 대출목적 인코딩 (원앤핫)
    # train 데이터에 대해 원-핫 인코딩 적용
    train_dummies = pd.get_dummies(train[column_name], prefix=column_name)
    train_encoded = pd.concat([train, train_dummies], axis=1)
    
    # 원-핫 인코딩으로 생성된 컬럼만 추출
    new_columns = train_dummies.columns.tolist()

    # test 데이터에 대해 동일한 컬럼에 대해 원-핫 인코딩 적용
    test_dummies = pd.get_dummies(test[column_name], prefix=column_name)
    
    # train에 있는 새로운 컬럼을 test에 추가
    for col in new_columns:
        test[col] = test_dummies[col] if col in test_dummies else 0

    # 원래 컬럼 삭제
    train_encoded = train_encoded.drop(column_name, axis=1)
    test = test.drop(column_name, axis=1)

    return train_encoded, test


train, test = one_hot_encode_column(train, test, 'customer_idx_counts')


# def drop_columns2(train, test):           #lead_owner 변수 drop                                  
#     train = train.drop(['customer_idx_counts'], axis=1)               
#     test = test.drop(['customer_idx_counts'], axis=1)

#     return train, test 
    
# train, test = drop_columns2(train, test)


In [14]:
train.shape, test.shape

((59299, 1047), (5271, 1047))

In [16]:
train.to_csv("data1.csv", index=False)
test.to_csv("test_after_tune.csv", index=False)

## val set 분리 (valid_x, valid_y는 검증할때만 사용하는객체, train객체만을 사용하여 학습하기)

In [47]:
def sep_ml_xy(df, target):                                     #train valid 분리
    y = df[target]
    x = df.drop(columns=target)
    return x, y

train_x, train_y = sep_ml_xy(train, "is_converted")                                
train_x, valid_x, train_y, valid_y = tts(train_x, train_y, train_size=0.8, shuffle=True, stratify=train_y)

In [48]:
train = pd.concat([train_x, train_y], axis=1)

In [122]:

def oversample_train_data(train):                                                    #오버샘플링 (4가지 중 한개) 비추천!
    X = train.drop('is_converted', axis=1)
    y = train['is_converted']

    # 오버샘플링 방법 선택
#     smote = SMOTE()
#     X_resampled, y_resampled = smote.fit_resample(X, y)
    
#     adasyn = ADASYN()
#     X_resampled, y_resampled = adasyn.fit_resample(X, y)

    borderline_smote = BorderlineSMOTE()
    X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
    
#     ros = RandomOverSampler()
#     X_resampled, y_resampled = ros.fit_resample(X, y)

    train_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    train_resampled['is_converted'] = y_resampled

    return train_resampled


train = oversample_train_data(train)


In [123]:
#train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87118 entries, 0 to 87117
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   bant_submit          87118 non-null  float64
 1   business_unit        87118 non-null  int32  
 2   enterprise           87118 non-null  int32  
 3   lead_desc_length     87118 non-null  int64  
 4   customer_position    87118 non-null  int32  
 5   response_corporate   87118 non-null  int32  
 6   ver_cus              87118 non-null  int64  
 7   ver_pro              87118 non-null  int64  
 8   customer_idx_counts  87118 non-null  int64  
 9   is_converted         87118 non-null  bool   
dtypes: bool(1), float64(1), int32(4), int64(4)
memory usage: 4.7 MB


In [124]:
#test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   bant_submit          5271 non-null   float64
 1   business_unit        5271 non-null   int32  
 2   enterprise           5271 non-null   int32  
 3   lead_desc_length     5271 non-null   int64  
 4   customer_position    5271 non-null   int32  
 5   response_corporate   5271 non-null   int32  
 6   ver_cus              5271 non-null   int64  
 7   ver_pro              5271 non-null   int64  
 8   is_converted         0 non-null      float64
 9   customer_idx_counts  5271 non-null   int64  
dtypes: float64(2), int32(4), int64(4)
memory usage: 329.6 KB


customer_job inquiry_type product_category

## 기본모델 성능 확인

In [49]:

X_train = train.drop('is_converted', axis=1)
y_train = train['is_converted']
X_tr, X_val, y_tr, y_val = tts(X_train, y_train, test_size=0.3)

def train_and_evaluate_classifiers(X_tr, X_val, y_tr, y_val):
    classifiers = [
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ('LGBMClassifier', LGBMClassifier()),
        ('RandomForestClassifier', RandomForestClassifier()),
        #('CatBoostClassifier', CatBoostClassifier(verbose=0)),
        #('RidgeClassifier', RidgeClassifier()),
        #('SVC', SVC()),
    ]

    scores = []
    for name, model in tqdm(classifiers, desc="Training Classifiers"):
        model.fit(X_tr, y_tr)
        predictions = model.predict(X_val)
        score = f1_score(y_val, predictions) 
        scores.append((name, score, model))

    # Sort models by their F1 score
    scores.sort(key=lambda x: x[1], reverse=True)

    # Print models and their scores
    for name, score, _ in scores:
        print(f'{name}: F1 Score = {score:.4f}')

    # Return the best model
    best_model_name, best_score, best_model = scores[0]
    print(f'\nBest Model: {best_model_name} with F1 Score = {best_score:.4f}')
    print(f'Best Model Hyperparameters: {best_model.get_params()}')

    return best_model

# Train and evaluate
best_model = train_and_evaluate_classifiers(X_tr, X_val, y_tr, y_val)


Training Classifiers:  75%|██████████████████████████████████████████████▌               | 3/4 [00:11<00:02,  2.88s/it]

[LightGBM] [Info] Number of positive: 2718, number of negative: 30489
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1230
[LightGBM] [Info] Number of data points in the train set: 33207, number of used features: 329
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081850 -> initscore=-2.417470
[LightGBM] [Info] Start training from score -2.417470


Training Classifiers: 100%|██████████████████████████████████████████████████████████████| 4/4 [00:24<00:00,  6.17s/it]

RandomForestClassifier: F1 Score = 0.8261
DecisionTreeClassifier: F1 Score = 0.7950
LGBMClassifier: F1 Score = 0.7807
XGBClassifier: F1 Score = 0.7782

Best Model: RandomForestClassifier with F1 Score = 0.8261
Best Model Hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}





In [50]:
final_f1 = f1_score(valid_y, best_model.predict(valid_x))
print('Final F1 Score: {0:.4f}'.format(final_f1))

Final F1 Score: 0.8394


## 제출

In [145]:
x_test = test.drop(["is_converted"], axis=1)
test_pred = best_model.predict(x_test.fillna(0))

def convert_to_boolean_vector(input_vector):
    return np.array(input_vector == 1, dtype=bool)          #0,1로 제출해도 되는데 찜찜해서 True False로 변경 
    
result_vector = convert_to_boolean_vector(test_pred)

In [146]:
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = result_vector

# 제출 파일 저장
df_sub.to_csv("submission4.csv", index=False)

In [147]:
df = pd.read_csv("submission4.csv")                       #True가 10%정도라면 좋은것.
df["is_converted"].value_counts()

is_converted
False    4791
True      480
Name: count, dtype: int64