### Inquiry_type & customer type 변환

In [1]:
import pandas as pd
import numpy as np
import random 
import re 
import os 
import warnings
warnings.filterwarnings("ignore")
import category_encoders as ce 

from sklearn.metrics import (    accuracy_score,    confusion_matrix,f1_score,precision_score, recall_score,roc_auc_score)
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold,cross_val_score, StratifiedShuffleSplit

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE , ADASYN

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import VotingClassifier,StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import optuna
from optuna.samplers import TPESampler

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder

In [2]:
pd.set_option('display.max_rows', 500)
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

nation_corp = {
    'Austria': ['LGEAG'],    'Czech Republic': ['LGECZ'],    'France': ['LGEFS'],    'Germany': ['LGEDG'],    'Greece': ['LGEHS'],    'Hungary': ['LGEMK'],    'Italy': ['LGEIS'],    'Netherlands': ['LGESC', 'LGEEH', 'LGEBN'],    'Poland': ['LGEWR', 'LGEPL', 'LGEMA'],    'Portugal': ['LGEPT','LGEBT'],
    'EUs': ['LGEEB'],    'Romania': ['LGERO'],    'Spain': ['LGEES'],    'Sweden': ['LGENO', 'LGESW'],    'United Kingdom': ['LGEUK'],      'Kazakhstan': ['LGEAK'],    'Russia': ['LGERM', 'LGERI', 'LGERA'],
    'Ukraine': ['LGEUR'],    'Latvia': ['LGELV','LGELA'],    'Algeria': ['LGEAS'],
    'Egypt': ['LGEEG'],    'Jordan': ['LGELF'],    'Kenya': ['LGESK','LGEEF'],    'Morocco': ['LGEMC'],
    'Saudi Arabia': ['LGESJ'],    'Iran':['LGEIR'],     'Israel':['LGEYK'],     'The Republic of South Africa': ['LGESA'],
    'Tunisia': ['LGETU'],    'U.A.E': ['LGEOT', 'LGEDF', 'LGEGF', 'LGEME', 'LGEAF'],    'Nigeria': ['LGEAO', 'LGENI'],
    'Turkey': ['LGETK', 'LGEAT'],    'Australia': ['LGEAP'],
    'China': ['LGEQA', 'LGETL', 'LGECH', 'LGEYT', 'LGETR', 'LGETA', 'LGESY', 'LGESH', 'LGEQH', 'LGEQD', 'LGEPN', 'LGEND', 'LGEKS', 'LGEHZ', 'LGEHN', 'LGEHK'],
    'India': ['LGEIL'],    'Indonesia': ['LGEIN'],    'Japan': ['LGEJP'],    'Malaysia': ['LGEML'],    'Philippines': ['LGEPH'],
    'Singapore': ['LGESL'],    'Taiwan': ['LGETT'],    'Korea' :['LGEKR'],    'Thailand': ['LGETH'],    'Vietnam': ['LGEVN','LGEVH'],
     'Canada': ['LGECI'],    'Mexico': ['LGERS', 'LGEMX', 'LGEMS', 'LGEMM'],    'United States': ['LGEMR', 'LGEUS', 'LGEMU', 'LGEAI'],
    'Argentina': ['LGEAG','LGEAR'],    'Brazil': ['LGEBR','LGESP'],    'Chile': ['LGECL'],    'Colombia': ['LGEVZ', 'LGECB'],
    'Panama': ['Guatemala', 'LGEPS'],    'Peru': ['LGEPR']}
continent_nation={
    'Europe':['EUs','Austria', 'Czech Republic' ,'France' ,'Germany', 'Greece' ,'Hungary', 'Italy', 'Netherlands' ,'Poland' ,'Portugal' ,'Romania', 'Spain' ,'Sweden','United Kingdom'], 
    'Russia and CIS':['Kazakhstan','Russia', 'Ukraine', 'Latvia'],     'Africa and MiddleEast': ['Israel','Iran','Algeria', 'Egypt', 'Jordan', 'Kenya', 'Morocco','Saudi Arabia','The Republic of South Africa','Tunisia', 'U.A.E', 'Nigeria', 'Turkey'], 
    'Asia':['Korea','Australia','China','India','Indonesia','Japan','Malaysia','Philippines','Singapore','Taiwan','Thailand','Vietnam'], 
    'NorthAmerica' : ['Canada','Mexico','United States'],    'SouthAmerica' :['Argentina','Brazil','Chile','Colombia','Panama','Peru']
    
}
hemisphere = {
    'Northern': ['EUs', 'Austria', 'Czech Republic', 'France', 'Germany', 'Greece', 'Hungary', 'Italy', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Spain', 'Sweden', 'United Kingdom', 'Kazakhstan', 'Russia', 'Ukraine', 'Latvia', 'Israel', 'Iran', 'Jordan', 'Morocco', 'Saudi Arabia', 'Tunisia', 'Turkey', 'Korea', 'China', 'Japan', 'Taiwan', 'Canada', 'United States', 'Mexico', 'Panama'],
    'Southern': ['Algeria', 'Egypt', 'Kenya', 'The Republic of South Africa', 'U.A.E', 'Nigeria', 'Australia', 'India', 'Indonesia', 'Malaysia', 'Philippines', 'Singapore', 'Thailand', 'Vietnam', 'Argentina', 'Brazil', 'Chile', 'Colombia', 'Peru']
}
mapping_dict = {
    "others": "Other",
    "Others": "Other",
    "other_": "Other",
    "other": "Other",
    "Etc.": "ETC.",
}


In [3]:
# 데이터 생성 및 전처리 함수 
def get_datas():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("submission.csv").drop(['id','is_converted'], axis =1) # 테스트 데이터(제출파일의 데이터)
    train['is_converted']=np.where(train['is_converted']==True,1,0)
    return train, test 


def delete_cols(data, cols):
    data = data.drop(columns=cols)
    return data

def log_transform(data,cols):
    for col in cols :
        data[col+'log']=np.log1p(data[col]) 
    return data 


def eda_expected_timeline(df):
    
    def timeline_label(time):
    
        time = str(time).lower().replace(' ','').replace('_','').replace('/','').replace(',','').replace('~','').replace('&','').replace('-','').replace('.','')
        
        if time == 'lessthan3months':
            result = 'less than 3 months'
        elif time == '3months6months':
            result = '3 months ~ 6 months'
        elif time == '6months9months':
            result = '6 months ~ 9 months'
        elif time == '9months1year':
            result = '9 months ~ 1 year'
        elif time == 'morethanayear':
            result = 'more than a year'
        else:
            result = 'aimers_0203'
            
        return result
    
    df['expected_timeline'] = df['expected_timeline'].apply(timeline_label)
    
    return df


def inquiry_type_preprocessing(train, test, categorical_list):
    df = pd.concat([train, test], axis=0)
    df.reset_index(drop=True, inplace=True)
    idx = len(train)
    for feature in categorical_list:
        df[feature] = df[feature].str.lower()
        # replace() 함수를 사용하여 특수 문자 대체
        df[feature] = df[feature].str.replace(pat=r'[^\w\s]', repl=r' ', regex=True)
        df[feature] = df[feature].str.replace(pat=r'_', repl=r' ', regex=True)

    token_list = df['inquiry_type'].str.split()
    for i in range(len(df)):
        # nan 처리
        if type(token_list[i]) == float:
            continue
        if 'purchase' in token_list[i]  or 'quotation' in token_list[i]:
            df['inquiry_type'][i] = 'purchase or quotation' 

        if 'partnership' in token_list[i] or 'distributorship' in token_list[i] :
            df['inquiry_type'][i] = 'partnership' 
        
        if 'technical' in token_list[i] :
            df['inquiry_type'][i] = 'technical'

        if 'sales' in token_list[i] :
            df['inquiry_type'][i] = 'sales' 
            
    df.loc[df['inquiry_type'] == 'others', 'inquiry_type'] = 'other'
    df.loc[df['inquiry_type'] == 'other ', 'inquiry_type'] = 'other'
    df.loc[df['inquiry_type'] == 'etc ', 'inquiry_type'] = 'other'

    value_counts = df['inquiry_type'].value_counts()
    return df[:idx], df[idx:]


def customer_type_preprocessing(train, test):
    df = pd.concat([train, test], axis=0)
    df.reset_index(drop=True, inplace=True)
    idx = len(train)
    
    df["customer_type"] = df["customer_type"].str.lower()
    # replace() 함수를 사용하여 특수 문자 대체
    df["customer_type"] = df["customer_type"].str.replace(pat=r'[^\w\s]', repl=r' ', regex=True)
    df["customer_type"] = df["customer_type"].str.replace(pat=r'_', repl=r' ', regex=True)

    token_list = df["customer_type"].str.split()
    for i in range(len(df)):
        # nan 처리
        if type(token_list[i]) == float:
            continue
        if 'partner' in token_list[i]:
            df["customer_type"][i] = 'partner' 

        if 'influencer' in token_list[i]:
            df["customer_type"][i] = 'influencer' 
        
        if 'engineer' in token_list[i] or 'software' in token_list[i] or 'developer' in token_list[i] or "technician" in token_list[i]:
            df["customer_type"][i] = 'engineer' 

        if 'end' in token_list[i]:
            df["customer_type"][i] = 'end customer' 
        
        if 'homeowner' in token_list[i]:
            df["customer_type"][i] = 'home owner'

        if 'consultant' in token_list[i]:
            df["customer_type"][i] = 'consultant' 

        if 'others' in token_list[i] or 'etc' in token_list[i] or 'other ' in token_list[i]:
            df["customer_type"][i] = 'other' 

    return df[:idx], df[idx:]


# total_area 변수로 통일
def eda_business_area(df):
    for col in ['business_area','business_subarea']:
        df[col] = df[col].str.lower()
        df[col] = df[col].str.replace(" ", "") 
        df[col] = df[col].str.replace(r'[^\w\s]', "") 
        df[col] = df[col].fillna('nan') 
    df['total_area'] = df['business_area'].astype(str) + df['business_subarea'].astype(str)
    return df 

# 새로운 국가명, 대륙 열을 만들기 
def get_nation_continent(df):
    nation_corp_reverse ={v:k for k , values in nation_corp.items() for v in values }
    df['nation']=df['response_corporate'].map(nation_corp_reverse)
    continent_nation_reverse ={v:k for k , values in continent_nation.items() for v in values }
    df['continent']=df['nation'].map(continent_nation_reverse)
#     df = df.drop('customer_country',axis=1) 
    return df 

#라벨 인코딩 
def label_encoding(series: pd.Series) -> pd.Series:
    my_dict = {}
    series = series.astype(str)
    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)
    return series

# com_reg_ver_win_rate 최빈값으로 채우기 
def com_reg_fill(train,test):
    train['com_reg_ver_win_rate'] = train['com_reg_ver_win_rate'].fillna(train['com_reg_ver_win_rate'].mode()[0])
    test['com_reg_ver_win_rate'] = test['com_reg_ver_win_rate'].fillna(train['com_reg_ver_win_rate'].mode()[0])
    return train,test

def timeline_tonumber(row):
    if row['expected_timeline'] == 'less than 3 months':
        return int(1)
    elif row['expected_timeline'] == '3 months ~ 6 months':
        return int(3)
    elif row['expected_timeline'] == '6 months ~ 9 months':
        return int(6)
    elif row['expected_timeline'] == '9 months ~ 1 year':
        return int(9)    
    elif row['expected_timeline'] =='aimer_0203':
        return np.nan
    else : 
        return int(12)
    
def create_grouped_features(train, test, group, numeric_var):
    # 범주형 특성들에 대해서 다른 수치형 데이터의 중앙값, 최대, 합을 새로운 열로 추가하기 
    train = train.copy()
    test = test.copy()
    aggs = ['median', 'max','sum']
    for agg in aggs:
        # groupby 후 aggregation
        a1 = train.groupby([group])[numeric_var].agg(agg).to_dict()
        # 새로운 feature 생성
        train[numeric_var+'_'+group+'_'+agg] = train[group].map(a1)
        test[numeric_var+'_'+group+'_'+agg] = test[group].map(a1)
    return train, test

def do_scale(train,test, scale_cols) :
    for c in scale_cols:
        min_value = train[c].min()
        max_value = train[c].max()
        train[c] = (train[c] - min_value) / (max_value - min_value)
        test[c] = (test[c] - min_value) / (max_value - min_value)
    return train,test



In [4]:
groups = ['business_unit','customer_idx']
numeric_vars = ['historical_existing_cnt', 'lead_desc_length']
scale_cols = ['com_reg_ver_win_rate','historical_existing_cnt', 'lead_desc_length','ver_win_rate_x'] 

In [5]:
# data 갖고오기 
train,test= get_datas() 

# 스케일링 하기 
train,test =do_scale(train,test,scale_cols)
# 범주형 데이터에 대해 수치형 데이터 통계값 추가
for group in groups:
    for numeric_var in numeric_vars:
        train, test = create_grouped_features(train, test, group, numeric_var)
        
# 전처리, 로그변환 수행하기 
columns_to_log=['com_reg_ver_win_rate','lead_desc_length']
train,test= log_transform(train,columns_to_log ),log_transform(test,columns_to_log)
train,test =eda_business_area(train),eda_business_area(test)
train,test= get_nation_continent(train),get_nation_continent(test)
train,test=eda_expected_timeline(train) ,eda_expected_timeline(test)
train,test=customer_type_preprocessing(train, test)
train,test=inquiry_type_preprocessing(train, test, ['inquiry_type']) 

for col in ['customer_idx','customer_type',]:
    train[col+'count'] =train[col].map(train[col].value_counts())
    test[col+'count'] =test[col].map(train[col].value_counts())
    

train['idx_unit'] = train['customer_idx'].astype(str)+train['business_unit'].astype(str)
test['idx_unit'] = test['customer_idx'].astype(str)+test['business_unit'].astype(str)
train['idx_posi'] = train['customer_idx'].astype(str)+train['customer_position'].astype(str)
test['idx_posi'] = test['customer_idx'].astype(str)+test['customer_position'].astype(str)
train['conti_inquiry'] = train['continent'].astype(str)+train['inquiry_type'].astype(str)
test['conti_inquiry'] = test['continent'].astype(str)+test['inquiry_type'].astype(str)
# train['job_unit'] = train['business_unit'].astype(str)+train['customer_job'].astype(str)
# test['job_unit'] = test['business_unit'].astype(str)+test['customer_job'].astype(str)
#0.717 나옴 4개 변수 

In [6]:
# country.1지우지말기 
columns_to_delete=['customer_country']
# columns_to_delete=[]
train,test =delete_cols(train, columns_to_delete), delete_cols(test,columns_to_delete)

cols = [     'customer_country',    "business_subarea",    "business_area",    "business_unit",    "customer_type",    "enterprise",    "customer_job",    "inquiry_type",    "product_category",    "product_subcategory",    "product_modelname",    "customer_position",
      'customer_country.1', "response_corporate", "expected_timeline",
'nation','continent','lead_owner','idx_posi', 'conti_inquiry', 'idx_unit','bant_submit'
,'total_area'   ]
label_columns =list(set(cols)-set(columns_to_delete))

from category_encoders import CatBoostEncoder
enc = CatBoostEncoder(cols=label_columns)
enc.fit(train[label_columns], train['is_converted'])  # 'target'은 실제 데이터의 타겟 변수 이름에 맞게 변경
# 인코딩 적용
train[label_columns] = enc.transform(train[label_columns])
test[label_columns] = enc.transform(test[label_columns])

In [7]:
test = test.fillna(0)
train = train.fillna(0)
x = train.drop('is_converted', axis=1)
y = train.is_converted

In [8]:
test = test.drop('is_converted', axis = 1)

In [9]:
def LGBM_skfold(zero_wei,one_wei,seed) :
     #Decisiontree에 대해서만 skfold 적용하는 함수 
    real_preds = []
    class_weight={0:zero_wei , 1:one_wei}
    model = LGBMClassifier(random_state=seed ,class_weight =class_weight, verbose=-1)
    Skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  
#     Skfold = StratifiedShuffleSplit(n_splits=10, random_state=42)  
    cv_precision_scores, cv_recall_scores, cv_confusion_matrices, cv_f1_scores, cv_roc_auc_scores, cv_TN = [],[],[],[],[],[]
    tt = []
    for train_index, test_index in Skfold.split(x, y):  
        x_train, x_test, y_train, y_test= x.iloc[train_index], x.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        test_pred = model.predict(test)
        real_preds.append(test_pred)
        
        pred_proba = model.predict_proba(x_test)[:, 1]  
        
        f1 = np.round(f1_score(y_test, pred, average='binary'), 4)  
        precision = np.round(precision_score(y_test, pred, average='binary'), 4)  
        recall = np.round(recall_score(y_test, pred, average='binary'), 4)  
        conf_matrix = confusion_matrix(y_test, pred)  
        roc_auc = roc_auc_score(y_test, pred_proba)  
        
        TN = conf_matrix[1][1]  # TN 값 저장
        ttone = conf_matrix[0][0]
        cv_TN.append(TN)  # TN 값 저장
        tt.append(ttone)
        cv_f1_scores.append(f1)  
        cv_precision_scores.append(precision)  
        cv_recall_scores.append(recall)  
        cv_confusion_matrices.append(conf_matrix)  
        cv_roc_auc_scores.append(roc_auc)  
        
    average_conf_matrix = np.mean(np.array(cv_confusion_matrices), axis=0)
    print(f"> F1 Score: {np.mean(cv_f1_scores)}, 정밀도: {np.mean(cv_precision_scores)}, 재현율: {np.mean(cv_recall_scores)}, ROC-AUC: {np.mean(cv_roc_auc_scores)}")
    print('> 평균 검증 오차행렬: \n', average_conf_matrix)  
    
    return real_preds, np.mean(cv_f1_scores), np.mean(cv_roc_auc_scores), np.mean(cv_TN) ,np.mean(tt) # TN 평균 값 리턴

f1_avg,roc_avg,tt=0,0,0
avg_get_1 =0
for seed in [5,11,30,322,8940]:
    _,f1,roc,ones,tts =LGBM_skfold(1,1,seed)
    f1_avg+= f1 
    roc_avg+= roc 
    avg_get_1+=ones
    tt+= tts 
print(f1_avg/5,roc_avg/5,avg_get_1/5, tt/5)


> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
0.89961 0.9959477578725165 421.6 5414.2


In [10]:
LGBM_preds,_,_,_,_= LGBM_skfold(1,1,3)

LGBM_predicts_array = np.array(LGBM_preds)
LGBM_final_prediction = np.mean(LGBM_predicts_array, axis=0)
LGBM_final_prediction = np.where(LGBM_final_prediction < 0.0000000001, 0, 1)

count_0 = np.size(np.where(LGBM_final_prediction == 0))
count_2 = np.size(np.where(LGBM_final_prediction >0))
count_1 = np.size(np.where(LGBM_final_prediction == 1))

# 각 값을 출력
print("Count of 0:", count_0)
print("Count of 1:", count_1)

> F1 Score: 0.89961, 정밀도: 0.9322399999999998, 재현율: 0.86928, ROC-AUC: 0.9959477578725165
> 평균 검증 오차행렬: 
 [[5414.2   30.7]
 [  63.4  421.6]]
Count of 0: 4584
Count of 1: 687


In [11]:
def XGB_skfold(zero_wei,one_wei,seed) :
     #Decisiontree에 대해서만 skfold 적용하는 함수 
    real_preds = []
    class_weight={0:zero_wei , 1:one_wei}
    model = XGBClassifier(random_state=seed ,class_weight =class_weight, verbose=-1)
    Skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  
#     Skfold = StratifiedShuffleSplit(n_splits=10, random_state=42)  
    cv_precision_scores, cv_recall_scores, cv_confusion_matrices, cv_f1_scores, cv_roc_auc_scores, cv_TN = [],[],[],[],[],[]
    tt = []
    for train_index, test_index in Skfold.split(x, y):  
        x_train, x_test, y_train, y_test= x.iloc[train_index], x.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        test_pred = model.predict(test)
        real_preds.append(test_pred)
        
        pred_proba = model.predict_proba(x_test)[:, 1]  
        
        f1 = np.round(f1_score(y_test, pred, average='binary'), 4)  
        precision = np.round(precision_score(y_test, pred, average='binary'), 4)  
        recall = np.round(recall_score(y_test, pred, average='binary'), 4)  
        conf_matrix = confusion_matrix(y_test, pred)  
        roc_auc = roc_auc_score(y_test, pred_proba)  
        
        TN = conf_matrix[1][1]  # TN 값 저장
        ttone = conf_matrix[0][0]
        cv_TN.append(TN)  # TN 값 저장
        tt.append(ttone)
        cv_f1_scores.append(f1)  
        cv_precision_scores.append(precision)  
        cv_recall_scores.append(recall)  
        cv_confusion_matrices.append(conf_matrix)  
        cv_roc_auc_scores.append(roc_auc)  
        
    average_conf_matrix = np.mean(np.array(cv_confusion_matrices), axis=0)
    print(f"> F1 Score: {np.mean(cv_f1_scores)}, 정밀도: {np.mean(cv_precision_scores)}, 재현율: {np.mean(cv_recall_scores)}, ROC-AUC: {np.mean(cv_roc_auc_scores)}")
    print('> 평균 검증 오차행렬: \n', average_conf_matrix)  
    
    return real_preds, np.mean(cv_f1_scores), np.mean(cv_roc_auc_scores), np.mean(cv_TN) ,np.mean(tt) # TN 평균 값 리턴

f1_avg,roc_avg,tt=0,0,0
avg_get_1 =0
for seed in [5,11,30,322,8940]:
    _,f1,roc,ones,tts =XGB_skfold(1,1,seed)
    f1_avg+= f1 
    roc_avg+= roc 
    avg_get_1+=ones
    tt+= tts 
print(f1_avg/5,roc_avg/5,avg_get_1/5, tt/5)


> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
0.89606 0.9958321375367852 419.9 5412.6


In [12]:
XGB_preds,_,_,_,_= XGB_skfold(1,1,100)

XGB_predicts_array = np.array(XGB_preds)
XGB_final_prediction = np.mean(XGB_predicts_array, axis=0)
XGB_final_prediction = np.where(XGB_final_prediction < 0.0000000001, 0, 1)

count_0 = np.size(np.where(XGB_final_prediction == 0))
count_2 = np.size(np.where(XGB_final_prediction >0))
count_1 = np.size(np.where(XGB_final_prediction == 1))

# 각 값을 출력
print("Count of 0:", count_0)
print("Count of 1:", count_1)

> F1 Score: 0.89606, 정밀도: 0.9287000000000001, 재현율: 0.86578, ROC-AUC: 0.9958321375367852
> 평균 검증 오차행렬: 
 [[5412.6   32.3]
 [  65.1  419.9]]
Count of 0: 4496
Count of 1: 775


In [13]:
def dtc_skfold(zero_wei,one_wei,seed) :
     #Decisiontree에 대해서만 skfold 적용하는 함수 
    real_preds = []
    class_weight={0:zero_wei , 1:one_wei}
    model = DecisionTreeClassifier(random_state=seed ,class_weight =class_weight)
    Skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  
#     Skfold = StratifiedShuffleSplit(n_splits=10, random_state=42)  
    cv_precision_scores, cv_recall_scores, cv_confusion_matrices, cv_f1_scores, cv_roc_auc_scores, cv_TN = [],[],[],[],[],[]
    tt = []
    for train_index, test_index in Skfold.split(x, y):  
        x_train, x_test, y_train, y_test= x.iloc[train_index], x.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        test_pred = model.predict(test)
        real_preds.append(test_pred)
        
        pred_proba = model.predict_proba(x_test)[:, 1]  
        
        f1 = np.round(f1_score(y_test, pred, average='binary'), 4)  
        precision = np.round(precision_score(y_test, pred, average='binary'), 4)  
        recall = np.round(recall_score(y_test, pred, average='binary'), 4)  
        conf_matrix = confusion_matrix(y_test, pred)  
        roc_auc = roc_auc_score(y_test, pred_proba)  
        
        TN = conf_matrix[1][1]  # TN 값 저장
        ttone = conf_matrix[0][0]
        cv_TN.append(TN)  # TN 값 저장
        tt.append(ttone)
        cv_f1_scores.append(f1)  
        cv_precision_scores.append(precision)  
        cv_recall_scores.append(recall)  
        cv_confusion_matrices.append(conf_matrix)  
        cv_roc_auc_scores.append(roc_auc)  
        
    average_conf_matrix = np.mean(np.array(cv_confusion_matrices), axis=0)
    print(f"> F1 Score: {np.mean(cv_f1_scores)}, 정밀도: {np.mean(cv_precision_scores)}, 재현율: {np.mean(cv_recall_scores)}, ROC-AUC: {np.mean(cv_roc_auc_scores)}")
    print('> 평균 검증 오차행렬: \n', average_conf_matrix)  
    
    return real_preds, np.mean(cv_f1_scores), np.mean(cv_roc_auc_scores), np.mean(cv_TN) ,np.mean(tt) # TN 평균 값 리턴

f1_avg,roc_avg,tt=0,0,0
avg_get_1 =0
for seed in [5,11,30,322,8940]:
    _,f1,roc,ones,tts =dtc_skfold(1,1,seed)
    f1_avg+= f1 
    roc_avg+= roc 
    avg_get_1+=ones
    tt+= tts 
print(f1_avg/5,roc_avg/5,avg_get_1/5, tt/5)



> F1 Score: 0.8612299999999999, 정밀도: 0.8573999999999999, 재현율: 0.8651500000000001, ROC-AUC: 0.9275408477911682
> 평균 검증 오차행렬: 
 [[5375.    69.9]
 [  65.4  419.6]]
> F1 Score: 0.8632500000000001, 정밀도: 0.8630799999999998, 재현율: 0.86351, ROC-AUC: 0.9272356743221561
> 평균 검증 오차행렬: 
 [[5378.4   66.5]
 [  66.2  418.8]]
> F1 Score: 0.8630599999999999, 정밀도: 0.86047, 재현율: 0.86577, ROC-AUC: 0.9278020625220285
> 평균 검증 오차행렬: 
 [[5376.7   68.2]
 [  65.1  419.9]]
> F1 Score: 0.8627, 정밀도: 0.8598800000000001, 재현율: 0.86557, ROC-AUC: 0.9277747776911024
> 평균 검증 오차행렬: 
 [[5376.4   68.5]
 [  65.2  419.8]]
> F1 Score: 0.8609499999999999, 정밀도: 0.8575200000000001, 재현율: 0.86453, ROC-AUC: 0.9272422804050343
> 평균 검증 오차행렬: 
 [[5375.1   69.8]
 [  65.7  419.3]]
0.862238 0.9275191285462979 419.48 5376.32


In [14]:
dtc_preds,_,_,_,_= dtc_skfold(1,1,3)

dtc_predicts_array = np.array(dtc_preds)
dtc_final_prediction = np.mean(dtc_predicts_array, axis=0)
dtc_final_prediction = np.where(dtc_final_prediction < 0.1, 0, 1)

count_0 = np.size(np.where(dtc_final_prediction == 0))
count_2 = np.size(np.where(dtc_final_prediction >0))
count_1 = np.size(np.where(dtc_final_prediction == 1))

# 각 값을 출력
print("Count of 0:", count_0)
print("Count of 1:", count_1)

> F1 Score: 0.86283, 정밀도: 0.8610800000000001, 재현율: 0.8647500000000001, ROC-AUC: 0.9276289726627786
> 평균 검증 오차행렬: 
 [[5377.1   67.8]
 [  65.6  419.4]]
Count of 0: 3056
Count of 1: 2215


In [15]:
len(test)

5271

In [16]:
sum(LGBM_final_prediction)

687

In [17]:
LGBM_final_prediction

array([0, 1, 0, ..., 0, 0, 1])

In [18]:
sum(XGB_final_prediction)

775

In [19]:
XGB_final_prediction

array([0, 1, 0, ..., 0, 0, 1])

In [20]:
con_true = 0
for i in range(len(XGB_final_prediction)):
    if XGB_final_prediction[i] == 1 and dtc_final_prediction[i] == 1:
        con_true += 1

In [21]:
con_true

758

In [22]:
con_true = 0
for i in range(len(XGB_final_prediction)):
    if LGBM_final_prediction[i] == 1 and dtc_final_prediction[i] == 1:
        con_true += 1

In [23]:
con_true

671

In [25]:
for idx in range(len(dtc_final_prediction)):
    if dtc_final_prediction[idx] == 0:
        change = 0
        if LGBM_final_prediction[idx] == 1 or XGB_final_prediction[idx] == 1:
            change = 1
        dtc_final_prediction[idx] = change

print(sum(dtc_final_prediction))
         

2240


In [26]:
# seed(3)에 대해서만 제출 할 때 
sub=pd.read_csv('submission.csv')
sub['is_converted']= dtc_final_prediction
sub.to_csv('XGB_LGBM_DTC.csv',index= False)