In [205]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import copy
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import json
from fancyimpute import IterativeImputer
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_rows', None)
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wlsyo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wlsyo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [206]:
# Fix the random seeds so sampling and fold splits are repeatable
import os

SEED=42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes — confirm this is the intent
os.environ['PYTHONHASHSEED']=str(SEED)

In [207]:
train = pd.read_csv("train.csv") # training data
test = pd.read_csv("submission.csv") # test data (rows of the submission file)

In [208]:
# Training target column and the number of training rows (used later to split the merged frame)
targets=train['is_converted']
rows=train.shape[0]

# Lookup tables consumed by the preprocess_* helpers, keyed by column name
with open('permitted.json','r') as f:
    permitted=json.load(f)

# Merge train and test so that preprocessing is applied to both at once
train=train.drop('is_converted',axis=1)
test=test.drop('id',axis=1)
test=test.drop('is_converted',axis=1)
# concat keeps each frame's own index, so index labels repeat in total_data;
# later cells split it back by position (total_data[:rows])
total_data=pd.concat([train,test])
#total_data=total_data.drop('is_converted',axis=1)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64570 entries, 0 to 5270
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              64570 non-null  float64
 1   customer_country         63588 non-null  object 
 2   business_unit            64570 non-null  object 
 3   com_reg_ver_win_rate     16356 non-null  float64
 4   customer_idx             64570 non-null  int64  
 5   customer_type            19152 non-null  object 
 6   enterprise               64570 non-null  object 
 7   historical_existing_cnt  15031 non-null  float64
 8   id_strategic_ver         4037 non-null   float64
 9   it_strategic_ver         1174 non-null   float64
 10  idit_strategic_ver       5211 non-null   float64
 11  customer_job             44398 non-null  object 
 12  lead_desc_length         64570 non-null  int64  
 13  inquiry_type             62337 non-null  object 
 14  product_category       

In [209]:
# Group the training columns by dtype: object columns count as categorical,
# every other dtype counts as numerical
cols_by_type={}

cols_by_type['categorical']=train.columns[train.dtypes=='object'].tolist()
cols_by_type['numerical']=train.columns[train.dtypes!='object'].tolist()
#colsByType['numerical'].remove('is_converted')

# Report how many columns fall into each group
print('\nnumerical columns: '+str(len(cols_by_type['numerical'])))
print('categorical columns: '+str(len(cols_by_type['categorical'])))
print('total columns: '+str(len(cols_by_type['numerical'])+len(cols_by_type['categorical'])))


numerical columns: 13
categorical columns: 15
total columns: 28


In [210]:
# Columns removed from the merged frame before any feature engineering
del_cols=['ver_win_rate_x','ver_win_ratio_per_bu',
          'business_subarea', 'product_subcategory', 'product_modelname', 
          'customer_country.1']

# preserve
# preserve=pd.DataFrame()
# preserve['com_reg_ver_win_rate']=total_data['com_reg_ver_win_rate']

total_data=total_data.drop(del_cols,axis=1)

In [211]:
# id_strategic_ver it_strategic_ver idit_strategic_ver
# Collapse the three flags into one binary feature. Only idit_strategic_ver is tested;
# NOTE(review): presumably it already covers the other two columns — confirm
ver=['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
# Missing values compare as False, so they become 0
total_data['strategic_ver']=np.where(total_data['idit_strategic_ver']>0,1,0)
total_data=total_data.drop(ver,axis=1)

In [212]:
# country columns
# region
def preprocess_region(x,permitted):
    """Translate a raw value into a region label using the `permitted` lookup.

    Returns the mapped label when `x` is a str with a truthy entry in
    `permitted`, 'OT' for any other str, and NaN when `x` is not a str.
    """
    if type(x) is not str:
        return np.nan
    mapped=permitted.get(x)
    return mapped if mapped else 'OT'

# def preprocess_response_corporate(x,permitted):
#     if type(x)==type(''):
#         if permitted.get(x):
#             return x
#         return 'OT'
#     return np.nan


def country_encoding(train_data,test_data):
    """Build the whitelist of country tokens shared by train and test.

    Each string is lower-cased, stripped of spaces and split on '/'; only the
    last piece is kept. A token is permitted when it is among the 100 most
    frequent tokens of both series. The empty token is never permitted, and a
    'dump_key' entry is always added.

    Returns a dict mapping each permitted token (plus 'dump_key') to 1.
    """
    def last_token(value):
        # Non-string cells (missing values) become NaN and are ignored by value_counts
        if type(value)==type(''):
            return value.lower().replace(' ','').replace('/',' ').split(' ')[-1]
        return np.nan

    def ranked_labels(series):
        counts=series.apply(last_token).value_counts()
        return sorted(counts.items(),key=lambda item:item[1],reverse=True)

    frequent_in_train={label for label,_ in ranked_labels(train_data)[:100]}
    permit={f'{label}':1 for label,_ in ranked_labels(test_data)[:100] if label in frequent_in_train}

    permit.pop('',None)
    permit['dump_key']=1
    return permit

def preprocess_customer_country(x,permitted):
    """Reduce a raw country string to the first of its tokens found in `permitted`.

    The string is lower-cased, stripped of spaces and split on '/'. Returns the
    first token with a truthy entry in `permitted`, 'OT' when none matches, and
    NaN when `x` is not a str.
    """
    if type(x) is not str:
        return np.nan
    words=x.lower().replace(' ','').replace('/',' ').split(' ')
    return next((word for word in words if permitted.get(word)),'OT')

# region
# Derive a new 'region' column from response_corporate via the permitted['region'] lookup
total_data['region']=total_data['response_corporate'].apply(lambda x:preprocess_region(x,permitted=permitted['region']))

# response_corporate
# total_data['response_corporate']=total_data['response_corporate'].apply(lambda x:preprocess_response_corporate(x,permitted=permitted['response_corporate']))

# customer_country   
# The country whitelist is built from the raw train/test columns, then applied to the merged frame
permitted['customer_country']=country_encoding(train['customer_country'],test['customer_country'])
total_data['customer_country']=total_data['customer_country'].apply(lambda x:preprocess_customer_country(x,permitted=permitted['customer_country']))

In [213]:
# business_unit
# Fold the 'Solution' and 'CM' units into a single 'ETC' bucket
total_data['business_unit']=total_data['business_unit'].replace(['Solution','CM'],'ETC')

In [214]:
# customer_type
def preprocess_customer_type(x,permitted):
    """Normalise a customer-type string and map it through `permitted`.

    Strings are lower-cased with '-' and spaces removed before the lookup;
    unmapped strings become 'OT'. Non-string input is returned unchanged.
    """
    if type(x) is not str:
        return x
    key=x.lower().replace('-','').replace(' ','')
    return permitted.get(key) or 'OT'
    
# Replace customer_type with its normalised label; missing values are left as they are
total_data['customer_type']=total_data['customer_type'].apply(lambda x:preprocess_customer_type(x,permitted=permitted['customer_type']))

In [215]:
# business_area
# total_data['business_area']=total_data['business_area'].replace('hospital & health care','ETC')
# total_data['business_area']=total_data['business_area'].replace('factory','ETC')
# total_data['business_area']=total_data['business_area'].replace('government department','ETC')
# total_data['business_area']=total_data['business_area'].replace('public facility','ETC')
# total_data['business_area']=total_data['business_area'].replace('transportation','ETC')
# total_data['business_area']=total_data['business_area'].replace('power plant / renewable energy','ETC')
# Missing business_area becomes its own explicit 'UNK' category
total_data['business_area']=total_data['business_area'].fillna('UNK')

In [216]:
# ver_cus, ver_pro
# grant_weight is 1 when either flag is positive, otherwise 0; the source flags are then dropped
grant=['ver_cus', 'ver_pro']
total_data['grant_weight']=np.where((total_data['ver_cus']>0)|(total_data['ver_pro']>0),1,0)
total_data=total_data.drop(columns=grant)

In [217]:
# expected_timeline
def preprocess_expected_timeline(x,permitted):
    """Normalise an expected-timeline string and map it through `permitted`.

    Strings are lower-cased with spaces and underscores removed before the
    lookup; unmapped strings become 'OT'. Non-string input is returned unchanged.
    """
    if type(x) is not str:
        return x
    key=x.lower().replace(' ','').replace('_','')
    return permitted.get(key) or 'OT'

# Replace expected_timeline with its normalised label; missing values are left as they are
total_data['expected_timeline']=total_data['expected_timeline'].apply(lambda x:preprocess_expected_timeline(x,permitted=permitted['expected_timeline']))

In [218]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# lead_desc_length, historical_existing_cnt
numerical=['lead_desc_length','historical_existing_cnt']
#total_data[numerical]=scaler.fit_transform(total_data[numerical])
# Apply log(1+x) to both columns; missing values stay missing
total_data[numerical]=np.log1p(total_data[numerical])

In [219]:
# inquiry_type
def preprocess_inquiry_type(x,permitted):
    """Map an inquiry-type string to the label of its first recognised word.

    The string is lower-cased, underscores become spaces, and the words are
    scanned left to right. Returns the `permitted` value of the first word
    with a truthy entry, 'OT' when none matches, and NaN for non-str input.
    """
    if type(x) is not str:
        return np.nan
    words=x.lower().replace('_',' ').split(' ')
    return next((permitted[word] for word in words if permitted.get(word)),'OT')

# Replace inquiry_type with its mapped label; non-string (missing) values become NaN
total_data['inquiry_type']=total_data['inquiry_type'].apply(lambda x:preprocess_inquiry_type(x,permitted=permitted['inquiry_type']))

In [220]:
# customer_job
def preprocess_customer_job(x,permitted):
    """Map a job description to the label of its first recognised word stem.

    The text is tokenised with `word_tokenize` and each token is stemmed with
    a Porter stemmer. Returns the `permitted` value of the first stem with a
    truthy entry, 'OT' when none matches, and NaN for non-str input.
    """
    if type(x) is not str:
        return np.nan
    stemmer=PorterStemmer()
    stems=[stemmer.stem(token) for token in word_tokenize(x)]
    return next((permitted[stem] for stem in stems if permitted.get(stem)),'OT')

# Replace customer_job with its mapped label; non-string (missing) values become NaN
total_data['customer_job']=total_data['customer_job'].apply(lambda x:preprocess_customer_job(x,permitted=permitted['customer_job']))

In [221]:
# product_category
def preprocess_product_category(x,permitted):
    """Map a product description to the highest-priority category it mentions.

    The text is tokenised and Porter-stemmed; every stem with a truthy entry
    in `permitted` contributes that entry as a candidate category.
    `permitted['dump_key']` lists the categories in priority order, and the
    first listed category that is a candidate is returned. Returns 'OT' when
    no listed category matched and NaN for non-str input.
    """
    if type(x) is not str:
        return np.nan

    stemmer=PorterStemmer()
    stems=[stemmer.stem(token) for token in word_tokenize(x)]
    matched={permitted[stem] for stem in stems if permitted.get(stem)}

    # Walk the priority list, not the matches, so ties resolve by priority
    for category in permitted['dump_key']:
        if category in matched:
            return category
    return 'OT'

# Replace product_category with its mapped category; non-string (missing) values become NaN
total_data['product_category']=total_data['product_category'].apply(lambda x:preprocess_product_category(x,permitted=permitted['product_category']))

In [222]:
# customer_position
def preprocess_customer_position(x,permitted):
    """Map a position title to the label of its first recognised word.

    The title is lower-cased and '-' and '/' are treated as word separators.
    Returns the `permitted` value of the first word with a truthy entry, 'OT'
    when none matches, and NaN for non-str input.
    """
    if type(x) is not str:
        return np.nan
    words=x.lower().replace('-',' ').replace('/',' ').split(' ')
    return next((permitted[word] for word in words if permitted.get(word)),'OT')

# Replace customer_position with its mapped label; non-string (missing) values become NaN
total_data['customer_position']=total_data['customer_position'].apply(lambda x:preprocess_customer_position(x,permitted=permitted['customer_position']))

In [223]:
# Recompute the dtype grouping, this time on the merged and preprocessed frame
cols_by_type={}

cols_by_type['categorical']=total_data.columns[total_data.dtypes=='object'].tolist()
cols_by_type['numerical']=total_data.columns[total_data.dtypes!='object'].tolist()

# Report how many columns fall into each group
print('\nnumerical columns: '+str(len(cols_by_type['numerical'])))
print('categorical columns: '+str(len(cols_by_type['categorical'])))
print('total columns: '+str(len(cols_by_type['numerical'])+len(cols_by_type['categorical'])))


numerical columns: 8
categorical columns: 12
total columns: 20


In [224]:
# Split the merged frame back by position: the first `rows` rows are the
# training data, the remainder is the test data

train_split=total_data[:rows]
test_data=total_data[rows:]

print(total_data.shape)
print(train_split.shape)
print(test_data.shape)

(64570, 20)
(59299, 20)
(5271, 20)


In [225]:
# Build K training sets by under-sampling the negative class: each set holds
# one fold of the negative rows plus every positive row
from sklearn.model_selection import KFold

train_datas=[]
# Re-attach the target so rows can be separated by class
train_split['is_converted']=targets
train_data_false=train_split[train_split['is_converted']==0]
train_data_true=train_split[train_split['is_converted']==1]

# Only the negative rows are split into K shuffled folds
K=11
dkf=KFold(n_splits=K,shuffle=True,random_state=SEED)
# preserves[0..K-1] keep each set's com_reg_ver_win_rate; preserves[K] keeps the test set's
preserves=[pd.DataFrame() for x in range(0,K+1)]
for i,(_,index) in enumerate(dkf.split(train_data_false)):
    print(f'-{i+1} fold data-')
    # `index` is the held-out part of the split, i.e. one K-th of the negatives
    y_data_false=train_data_false['is_converted'].iloc[index]
    X_data_false=train_data_false.drop('is_converted',axis=1).iloc[index]
    
    # ignore_index=True gives every set a fresh 0..n-1 index
    y_data=pd.concat([y_data_false,train_data_true['is_converted']],ignore_index=True)
    X_data=pd.concat([X_data_false,train_data_true.drop('is_converted',axis=1)],ignore_index=True)

    # Set com_reg_ver_win_rate aside; it is removed from the features here and re-attached in a later cell
    preserves[i]['com_reg_ver_win_rate']=X_data['com_reg_ver_win_rate']
    X_data=X_data.drop('com_reg_ver_win_rate',axis=1)

    print(f'X data shape: {X_data.shape}')
    print(f'y data shape: {y_data.shape}')
    train_datas.append((X_data,y_data))

# Same treatment for the test set
preserves[K]['com_reg_ver_win_rate']=test_data['com_reg_ver_win_rate']
test_data=test_data.drop('com_reg_ver_win_rate',axis=1)
print(f'test data shape: {test_data.shape}')

-1 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-2 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-3 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-4 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-5 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-6 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-7 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-8 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-9 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-10 fold data-
X data shape: (9800, 19)
y data shape: (9800,)
-11 fold data-
X data shape: (9799, 19)
y data shape: (9799,)
test data shape: (5271, 19)


In [226]:
# Cast the two engineered flags to int and print the per-column missing-value
# counts of every training set and of the test set
for i,(train_data,target) in enumerate(train_datas):
    print(f'-{i+1} fold train data missing values-')
    train_data['grant_weight']=train_data['grant_weight'].astype(int)
    train_data['strategic_ver']=train_data['strategic_ver'].astype(int)
    print(train_data.isnull().sum())

test_data['grant_weight']=test_data['grant_weight'].astype(int)
test_data['strategic_ver']=test_data['strategic_ver'].astype(int)
print('test data missing values')
print(test_data.isnull().sum())

-1 fold train data missing values-
bant_submit                   0
customer_country            216
business_unit                 0
customer_idx                  0
customer_type              6591
enterprise                    0
historical_existing_cnt    7920
customer_job               3084
lead_desc_length              0
inquiry_type                232
product_category           2826
customer_position             0
response_corporate            0
expected_timeline          4891
business_area                 0
lead_owner                    0
strategic_ver                 0
region                        0
grant_weight                  0
dtype: int64
-2 fold train data missing values-
bant_submit                   0
customer_country            213
business_unit                 0
customer_idx                  0
customer_type              6630
enterprise                    0
historical_existing_cnt    7928
customer_job               3094
lead_desc_length              0
inquiry_type         

In [227]:
# impute data
from fancyimpute import KNN


def knn_impute_one_hot(frame):
    """One-hot encode `frame`, mark missing cells as NaN and fill them with KNN.

    `pd.get_dummies` encodes a missing category as an all-zero row, which the
    imputer would treat as observed data. Those dummy cells are therefore
    reset to NaN before imputation.

    Returns the one-hot frame (same index as `frame`) with imputed values.
    """
    dummies=pd.get_dummies(frame)
    object_cols=frame.columns[frame.dtypes=='object'].tolist()
    numeric_cols=frame.columns[frame.dtypes!='object'].tolist()

    # nan data in numerical data
    for col in numeric_cols:
        dummies[col]=np.where(pd.isnull(frame[col]),np.nan,dummies[col])

    # nan data in object data: blank out every dummy column of a row whose category is missing
    for col in object_cols:
        missing=pd.isnull(frame[col])
        for value in frame[col].value_counts().index:
            dummies[col+'_'+value]=np.where(missing,np.nan,dummies[col+'_'+value])

    dummies.iloc[:,:]=KNN(verbose=1).fit_transform(dummies)
    return dummies


# train data
train_dummys=[]
for i,(train_data,target) in enumerate(train_datas):
    print(f'-{i+1} fold train data imputation start-')
    train_dummys.append(knn_impute_one_hot(train_data))

# test data: handled by the same helper, so its columns are selected from
# test_data itself rather than from the last training frame of the loop above
print(f'test data imputation start-')
test_dummy=knn_impute_one_hot(test_data)

-1 fold train data imputation start-
Imputing row 1/9800 with 1 missing, elapsed time: 42.695
Imputing row 101/9800 with 25 missing, elapsed time: 42.723
Imputing row 201/9800 with 5 missing, elapsed time: 42.741
Imputing row 301/9800 with 5 missing, elapsed time: 42.779
Imputing row 401/9800 with 5 missing, elapsed time: 42.816
Imputing row 501/9800 with 16 missing, elapsed time: 42.857
Imputing row 601/9800 with 15 missing, elapsed time: 42.901
Imputing row 701/9800 with 16 missing, elapsed time: 42.941
Imputing row 801/9800 with 5 missing, elapsed time: 42.973
Imputing row 901/9800 with 25 missing, elapsed time: 43.017
Imputing row 1001/9800 with 5 missing, elapsed time: 43.053
Imputing row 1101/9800 with 1 missing, elapsed time: 43.074
Imputing row 1201/9800 with 19 missing, elapsed time: 43.102
Imputing row 1301/9800 with 19 missing, elapsed time: 43.148
Imputing row 1401/9800 with 9 missing, elapsed time: 43.197
Imputing row 1501/9800 with 8 missing, elapsed time: 43.242
Imputing

In [245]:
# decoding one hot datas
def decoding(origin_data,data_hot,object_columns):
    """Collapse a one-hot frame back to one column per original feature.

    Parameters:
        origin_data    -- frame before one-hot encoding; supplies the column
                          order and the category values of each categorical column
        data_hot       -- one-hot frame whose categorical columns are named
                          '<column>_<value>'
        object_columns -- names of the columns that were one-hot encoded

    For a categorical column the value whose dummy column is largest in each
    row is chosen. Other columns are copied from `data_hot` unchanged.

    Returns a new DataFrame with the columns of `origin_data`.
    """
    result=pd.DataFrame()
    for col in origin_data.columns:
        if col in object_columns:
            prefix=col+'_'
            target_columns=[prefix+value for value in origin_data[col].value_counts().index]
            winners=data_hot[target_columns].idxmax(axis=1)
            # Strip only the known '<column>_' prefix: splitting on '_' would
            # truncate category values that contain an underscore themselves
            result[col]=winners.apply(lambda name:name[len(prefix):])
        else:
            result[col]=data_hot[col]

    return result

# Columns that are not one-hot encoded: the numeric features, the two
# identifier-like columns, and com_reg_ver_win_rate. The last one is a float
# column, and treating it as categorical makes decoding() fail when it tries
# to concatenate a float category value to the column name.
non_categorical=['bant_submit','historical_existing_cnt','lead_desc_length',
                 'strategic_ver','grant_weight','customer_idx','lead_owner',
                 'com_reg_ver_win_rate']
object_columns=[col for col in total_data.columns if col not in non_categorical]

# Store each decoded frame back into train_datas; assigning to the loop
# variable alone would discard the imputed result
for i,(train_data,target) in enumerate(train_datas):
    train_datas[i]=(decoding(train_data,train_dummys[i],object_columns),target)

test_data=decoding(test_data,test_dummy,object_columns)

TypeError: can only concatenate str (not "float") to str

In [230]:
# com_reg_ver_win_rate
def com_reg_ver_win_rate_encoding(data,reference_column,impute_column):
    """Build a lookup from each `reference_column` value to the mean of `impute_column`.

    A value whose rows have no usable `impute_column` data (mean is NaN)
    falls back to the mean over the whole frame. A 'dump_key' entry with value
    1 is always added.

    Returns a dict keyed by the distinct non-null values of `reference_column`.
    """
    labels=data[reference_column].value_counts().index
    permit={
        label:data.loc[data[reference_column]==label,impute_column].mean()
        for label in labels
    }

    # Only values are replaced below, so iterating the dict while updating it is safe
    for label,rate in permit.items():
        if np.isnan(rate):
            permit[label]=data[impute_column].mean()

    permit['dump_key']=1
    return permit

def impute_com_reg_ver_win_rate(x,permitted):
    """Resolve a placeholder key to its value in `permitted`.

    A str `x` is looked up in `permitted` (KeyError if absent); any other
    value is returned unchanged.
    """
    return permitted[x] if type(x) is str else x
    
# Re-attach the preserved com_reg_ver_win_rate to every training set and fill
# its gaps with the per-country mean computed on that same set
for i,(train_data,target) in enumerate(train_datas):
    train_data['com_reg_ver_win_rate']=preserves[i]['com_reg_ver_win_rate']
    permitted['com_reg_ver_win_rate']=com_reg_ver_win_rate_encoding(train_data,'customer_country','com_reg_ver_win_rate')
    # Missing rates are temporarily replaced by the row's country string, which is
    # then resolved to a number; rows whose country is also missing stay NaN
    train_data['com_reg_ver_win_rate']=np.where(train_data['com_reg_ver_win_rate'].isnull(),train_data['customer_country'],train_data['com_reg_ver_win_rate'])
    train_data['com_reg_ver_win_rate']=train_data['com_reg_ver_win_rate'].apply(lambda x:impute_com_reg_ver_win_rate(x,permitted=permitted['com_reg_ver_win_rate']))

# Same procedure for the test set; its lookup is computed from the test rows themselves
test_data['com_reg_ver_win_rate']=preserves[K]['com_reg_ver_win_rate']
permitted['com_reg_ver_win_rate']=com_reg_ver_win_rate_encoding(test_data,'customer_country','com_reg_ver_win_rate')
test_data['com_reg_ver_win_rate']=np.where(test_data['com_reg_ver_win_rate'].isnull(),test_data['customer_country'],test_data['com_reg_ver_win_rate'])
test_data['com_reg_ver_win_rate']=test_data['com_reg_ver_win_rate'].apply(lambda x:impute_com_reg_ver_win_rate(x,permitted=permitted['com_reg_ver_win_rate']))

In [231]:
# process data type
# customer_idx and lead_owner are identifiers, so they are cast to str to be
# usable as categorical features
for i,(train_data,target) in enumerate(train_datas):
    train_data['customer_idx']=train_data['customer_idx'].astype(str)
    train_data['lead_owner']=train_data['lead_owner'].astype(str)
    # Write the 0/1 target back into train_datas; assigning to the loop
    # variable alone would leave the stored target unchanged
    train_datas[i]=(train_data,target.apply(lambda x:1 if x else 0))

test_data['customer_idx']=test_data['customer_idx'].astype(str)
test_data['lead_owner']=test_data['lead_owner'].astype(str)

In [232]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics         import mean_squared_error

In [233]:
# Model performance report
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of `y_pred`.

    The printed labels are Korean: 오차행렬 = confusion matrix, 정확도 = accuracy,
    정밀도 = precision, 재현율 = recall.

    Returns the F1 score.
    """
    both_labels = [True, False]
    confusion = confusion_matrix(y_test, y_pred, labels=both_labels)
    report = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=both_labels)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=both_labels)),
    )

    print("오차행렬:\n", confusion)
    for template, value in report:
        print(template.format(value))
    return report[-1][1]

In [234]:
class KMODEL:
    """Ensemble of CatBoost classifiers trained on several datasets.

    For each of `dataset_K` datasets, `train_K` models are fitted with
    stratified cross-validation. `predict` combines all of them by majority vote.

    Attributes:
        k_data     -- number of datasets
        k_fold     -- number of CV folds (models) per dataset
        models     -- fitted classifiers, one list per dataset
        scores     -- validation F1 of every classifier, one list per dataset
        thresholds -- probability cut-off chosen for every classifier
        cv_scores  -- mean validation F1 of each dataset, in training order
    """
    def __init__(self,dataset_K,train_K=5):
        self.k_data=dataset_K
        self.k_fold=train_K
        # Sized from dataset_K, not from a notebook-level global, so the
        # containers always match the number of datasets the caller asked for
        self.models=[[] for _ in range(dataset_K)]
        self.scores=[[] for _ in range(dataset_K)]
        self.thresholds=[[] for _ in range(dataset_K)]
        self.cv_scores=[]

    def modeling_kfold(self,iters,n_estimators,max_depth,learning_rate,cat_features,train_data,targets_data,core):
        """Fit `self.k_fold` stratified-CV models on one dataset and record their results.

        Parameters:
            iters        -- position of this dataset in models/scores/thresholds
            n_estimators, max_depth, learning_rate -- passed to CatBoostClassifier
            cat_features -- categorical feature names passed to `fit`
            train_data   -- feature frame of the dataset
            targets_data -- target series aligned with `train_data`
            core         -- passed to CatBoost as `task_type`
        """
        # k-fold
        kf=StratifiedKFold(n_splits=self.k_fold,shuffle=True,random_state=SEED)

        for i,(train_index,val_index) in enumerate(kf.split(train_data,targets_data)):
            print(f'-[{iters+1}-{i+1}] fold-')
            X_train,X_val=train_data.iloc[train_index],train_data.iloc[val_index]
            y_train,y_val=targets_data.iloc[train_index],targets_data.iloc[val_index]

            classifier=CatBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, eval_metric='F1', random_state=SEED, bootstrap_type ='Bernoulli',task_type=core)

            # The validation fold drives early stopping and best-iteration selection
            model=classifier.fit(X_train, y_train, eval_set=(X_val,y_val),verbose=100, early_stopping_rounds=100,cat_features=cat_features,use_best_model=True)

            # Scan 100 evenly spaced cut-offs and keep the one with the best validation F1
            pred=model.predict_proba(X_val)[:,1]
            coordinates = np.linspace(0, 1, 100)

            best_score=0
            best_coordinate=0
            for coordinate in coordinates:
                score=f1_score(y_val,pred>coordinate)
                if best_score<score:
                    best_score=score
                    best_coordinate=coordinate

            pred=(pred>best_coordinate)
            self.scores[iters].append(get_clf_eval(y_val,pred))
            self.thresholds[iters].append(best_coordinate)
            self.models[iters].append(model)

        self.cv_scores.append(np.mean(self.scores[iters]))
        # Report the score that was just appended
        print(f'[{iters}-{i}] F1 scores mean: {self.cv_scores[-1]}')

    def modeling_kdata(self,n_estimators,max_depth,learning_rate,cat_features,train_datas,core='CPU'):
        """Train on every (features, target) pair in `train_datas` and print the overall mean F1."""
        # enumerate supplies the dataset position; the list itself holds only (features, target) pairs
        for data_idx,(train_data,target) in enumerate(train_datas):
            self.modeling_kfold(data_idx,n_estimators,max_depth,learning_rate,cat_features,train_data,target,core=core)
        print(f'Total F1 scores mean: {np.mean(self.cv_scores)}')

    def predict(self,test_data):
        """Majority-vote prediction over all fitted models.

        Every model votes 1 when its positive-class probability exceeds its own
        threshold. Returns a Series (indexed like `test_data`) that is 1 where
        more than half of `k_data*k_fold` models voted 1, otherwise 0.
        """
        test_pred=pd.Series([0 for x in range(len(test_data))], index=test_data.index)
        for models,thresholds in zip(self.models,self.thresholds):
            for model,threshold in zip(models,thresholds):
                pred=model.predict_proba(test_data)[:,1]
                test_pred+=(pred>threshold)
        test_pred=test_pred/(self.k_data*self.k_fold)
        test_pred=test_pred.apply(lambda x:1 if x>0.5 else 0)
        return test_pred



In [239]:
# cat_features=train_datas[0][0].columns.to_list()
# cat_features.remove('bant_submit')
# cat_features.remove('lead_desc_length')
# cat_features.remove('historical_existing_cnt')
# cat_features.remove('com_reg_ver_win_rate')
# cat_features.remove('is_converted')
# cat_features
# Inspect dtypes and non-null counts of the first training set's feature frame
train_datas[0][0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              9800 non-null   float64
 1   customer_country         9584 non-null   object 
 2   business_unit            9800 non-null   object 
 3   customer_idx             9800 non-null   object 
 4   customer_type            3209 non-null   object 
 5   enterprise               9800 non-null   object 
 6   historical_existing_cnt  1880 non-null   float64
 7   customer_job             6716 non-null   object 
 8   lead_desc_length         9800 non-null   float64
 9   inquiry_type             9568 non-null   object 
 10  product_category         6974 non-null   object 
 11  customer_position        9800 non-null   object 
 12  response_corporate       9800 non-null   object 
 13  expected_timeline        4909 non-null   object 
 14  business_area           

In [None]:
# One model group per prepared training set
kmodel=KMODEL(dataset_K=len(train_datas))
# Treat every object-dtype column of the training frames as categorical.
# NOTE(review): CatBoost is expected to reject NaN in categorical features, so
# the frames should be fully imputed before this cell runs — confirm
cat_features=train_datas[0][0].columns[train_datas[0][0].dtypes=='object'].tolist()
kmodel.modeling_kdata(n_estimators=100,max_depth=10,learning_rate=0.05,cat_features=cat_features,train_datas=train_datas)