In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import copy
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download("punkt")
import json
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_rows', None)
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
# 시드 고정
import os

SEED=42
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)

In [3]:
train = pd.read_csv("train.csv") # 학습용 데이터
test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
#india_count = train['customer_country'].str.contains('india', case=False).sum()
#india_count_st = test['customer_country'].str.contains('india', case=False).sum()

#print("India가 포함된 행의 수 : ", india_count)
#print("India가 포함된 행의 수 from test : ",india_count_st)

In [5]:
# 학습 타겟 데이터
targets=train['is_converted']
rows=train.shape[0]

with open('permitted.json','r') as f:
    permitted=json.load(f)

#
train=train.drop('is_converted',axis=1)
test=test.drop('id',axis=1)
test=test.drop('is_converted',axis=1)

In [6]:
# columns
cols_by_type={}

cols_by_type['categorical']=train.columns[train.dtypes=='object'].tolist()
cols_by_type['numerical']=train.columns[train.dtypes!='object'].tolist()
#colsByType['numerical'].remove('is_converted')

print('\nnumerical columns: '+str(len(cols_by_type['numerical'])))
print('categorical columns: '+str(len(cols_by_type['categorical'])))
print('total columns: '+str(len(cols_by_type['numerical'])+len(cols_by_type['categorical'])))


numerical columns: 13
categorical columns: 15
total columns: 28


In [7]:
# delete cols
del_cols=['ver_win_rate_x','ver_win_ratio_per_bu',
          'business_subarea', 'product_subcategory', 'product_modelname',
          'customer_country.1']

# preserve
# preserve=pd.DataFrame()
# preserve['com_reg_ver_win_rate']=total_data['com_reg_ver_win_rate']

train_process=train.drop(del_cols,axis=1)
test_process=test.drop(del_cols,axis=1)

In [8]:
# id_strategic_ver it_strategic_ver idit_strategic_ver
ver=['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
train_process['strategic_ver']=np.where(train_process['idit_strategic_ver']>0,1,0)
test_process['strategic_ver']=np.where(test_process['idit_strategic_ver']>0,1,0)
train_process=train_process.drop(ver,axis=1)
test_process=test_process.drop(ver,axis=1)

In [9]:
selected_countries = ['india', 'brazil', 'unitedstates', 'mexico', 'philippines', 'colombia', 'u.a.e', 'unitedkingdom', 'saudiarabia', 'chile', 'italy', 'peru']

for country in selected_countries:
    country_count = train_process['customer_country'].str.contains(country, case=False).sum()
    country_count_st = test_process['customer_country'].str.contains(country, case=False).sum()
    print(f"{country.capitalize()}가 포함된 행의 수 (Train): {country_count}")
    print(f"{country.capitalize()}가 포함된 행의 수 (Test): {country_count_st}")
#Kfold후
#brazil                  1045 - 0
#india                    906 - 2
#unitedstates             782
#philippines              289 - 0
#OT                       254
#peru                     211 - 2
#mexico                   180 - 0
#colombia                 127 - 1
#u.a.e                    120
#italy                    120
#chile                    108
#saudiarabia              102


India가 포함된 행의 수 (Train): 16924
India가 포함된 행의 수 (Test): 908
Brazil가 포함된 행의 수 (Train): 8739
Brazil가 포함된 행의 수 (Test): 1045
Unitedstates가 포함된 행의 수 (Train): 0
Unitedstates가 포함된 행의 수 (Test): 0
Mexico가 포함된 행의 수 (Train): 2771
Mexico가 포함된 행의 수 (Test): 180
Philippines가 포함된 행의 수 (Train): 2570
Philippines가 포함된 행의 수 (Test): 289
Colombia가 포함된 행의 수 (Train): 1995
Colombia가 포함된 행의 수 (Test): 128
U.a.e가 포함된 행의 수 (Train): 1785
U.a.e가 포함된 행의 수 (Test): 129
Unitedkingdom가 포함된 행의 수 (Train): 0
Unitedkingdom가 포함된 행의 수 (Test): 0
Saudiarabia가 포함된 행의 수 (Train): 0
Saudiarabia가 포함된 행의 수 (Test): 0
Chile가 포함된 행의 수 (Train): 1234
Chile가 포함된 행의 수 (Test): 108
Italy가 포함된 행의 수 (Train): 1148
Italy가 포함된 행의 수 (Test): 120
Peru가 포함된 행의 수 (Train): 1149
Peru가 포함된 행의 수 (Test): 213


In [10]:
# country columns
# region
def preprocess_region(x,permitted):
    if type(x)==type(''):
        if permitted.get(x):
            return permitted[x]
        return 'OT'
    return np.nan

def response_corporate_encoding(train_data):
    permit={}

    for train_label in train_data.value_counts().index:
        permit[train_label]=1
    permit['OT']=1

    return permit

def preprocess_response_corporate(x,permitted):
    if type(x)==type(''):
        if permitted.get(x):
            return x
        return 'OT'
    return np.nan


def country_encoding(train_data):
    permit={}

    train_labels=train_data.apply(lambda x:x.lower().replace(' ','').replace('/',' ').split(' ')[-1] if type(x)==type('') else np.nan).value_counts()
    train_labels=sorted(train_labels.items(),key=lambda x:x[1],reverse=True)

    for train_label in train_labels[:100]:
        if train_label[0]:
            permit[f'{train_label[0]}']=1

    if permit.get(''):
        del permit['']

    #
    permit['OT']='OT'
    permit['dump_key']='dump_value'
    permit['türkiye']='turkey'
    del permit['700patrooncreekblvdalbanyny12206']
    del permit['uaedubai']

    return permit

def preprocess_customer_country(x,permitted):
    if type(x)==type(''):
        x=x.lower().replace(' ','').replace('/',' ')
        for word in x.split(' '):
            if permitted.get(word):
                return word
        return 'OT'
    return np.nan

# region
train_process['region']=train_process['response_corporate'].apply(lambda x:preprocess_region(x,permitted=permitted['region']))
test_process['region']=test_process['response_corporate'].apply(lambda x:preprocess_region(x,permitted=permitted['region']))

# response_corporate
permitted['response_corporate']=response_corporate_encoding(train['response_corporate'])
train_process['response_corporate']=train_process['response_corporate'].apply(lambda x:preprocess_response_corporate(x,permitted=permitted['response_corporate']))
test_process['response_corporate']=test_process['response_corporate'].apply(lambda x:preprocess_response_corporate(x,permitted=permitted['response_corporate']))

# customer_country
permitted['customer_country']=country_encoding(train['customer_country'])
train_process['customer_country']=train_process['customer_country'].apply(lambda x:preprocess_customer_country(x,permitted=permitted['customer_country']))
test_process['customer_country']=test_process['customer_country'].apply(lambda x:preprocess_customer_country(x,permitted=permitted['customer_country']))

In [11]:
#region_value_counts = train_process['customer_country'].value_counts()
#print(region_value_counts)

selected_countries = ['india', 'brazil', 'unitedstates', 'mexico', 'philippines', 'colombia', 'u.a.e', 'unitedkingdom', 'saudiarabia', 'chile', 'italy', 'peru']

for country in selected_countries:
    country_count = train_process['customer_country'].str.contains(country, case=False).sum()
    country_count_st = test_process['customer_country'].str.contains(country, case=False).sum()
    print(f"{country.capitalize()}가 포함된 행의 수 (Train): {country_count}")
    print(f"{country.capitalize()}가 포함된 행의 수 (Test): {country_count_st}")


India가 포함된 행의 수 (Train): 16880
India가 포함된 행의 수 (Test): 906
Brazil가 포함된 행의 수 (Train): 8737
Brazil가 포함된 행의 수 (Test): 1045
Unitedstates가 포함된 행의 수 (Train): 3794
Unitedstates가 포함된 행의 수 (Test): 782
Mexico가 포함된 행의 수 (Train): 2756
Mexico가 포함된 행의 수 (Test): 180
Philippines가 포함된 행의 수 (Train): 2567
Philippines가 포함된 행의 수 (Test): 289
Colombia가 포함된 행의 수 (Train): 1985
Colombia가 포함된 행의 수 (Test): 127
U.a.e가 포함된 행의 수 (Train): 1624
U.a.e가 포함된 행의 수 (Test): 120
Unitedkingdom가 포함된 행의 수 (Train): 1488
Unitedkingdom가 포함된 행의 수 (Test): 43
Saudiarabia가 포함된 행의 수 (Train): 1472
Saudiarabia가 포함된 행의 수 (Test): 102
Chile가 포함된 행의 수 (Train): 1228
Chile가 포함된 행의 수 (Test): 108
Italy가 포함된 행의 수 (Train): 1142
Italy가 포함된 행의 수 (Test): 120
Peru가 포함된 행의 수 (Train): 1134
Peru가 포함된 행의 수 (Test): 211


In [12]:
# business_unit
train_process['business_unit']=train_process['business_unit'].replace('Solution','ETC')
train_process['business_unit']=train_process['business_unit'].replace('CM','ETC')

test_process['business_unit']=test_process['business_unit'].replace('Solution','ETC')
test_process['business_unit']=test_process['business_unit'].replace('CM','ETC')

In [13]:
type_value_counts = train_process['customer_type'].value_counts()
print(type_value_counts)

End-Customer                    6647
End Customer                    3996
Specifier/ Influencer           2525
Channel Partner                 1368
Service Partner                  349
Solution Eco-Partner             146
Installer/Contractor              52
Specifier / Influencer            43
Corporate                         31
HVAC Engineer                     23
Engineer                          20
Developer                         17
Technician                        16
Consultant                        15
Other                             10
Home Owner                        10
End-user                           8
Manager / Director                 8
Software/Solution Provider         7
Etc.                               6
Architect/Consultant               5
Homeowner                          5
Reseller                           5
Installer                          5
Interior Designer                  5
Distributor                        4
Others                             4
D

In [14]:
selected_type = ["endcustomer","Specifier / Influencer","channelpartner"]

for ty in selected_type:
    typ = train_process['customer_type'].str.contains(ty, case=False).sum()
    typ_st = test_process['customer_type'].str.contains(ty, case=False).sum()
    print(f"{country.capitalize()}가 포함된 행의 수 (Train): {ty}")
    print(f"{country.capitalize()}가 포함된 행의 수 (Test): {typ_st}")


Peru가 포함된 행의 수 (Train): endcustomer
Peru가 포함된 행의 수 (Test): 0
Peru가 포함된 행의 수 (Train): Specifier / Influencer
Peru가 포함된 행의 수 (Test): 0
Peru가 포함된 행의 수 (Train): channelpartner
Peru가 포함된 행의 수 (Test): 0


In [15]:
# customer_type
def preprocess_customer_type(x,permitted):
    if type(x)==type(''):
        x=x.lower().replace('-','').replace(' ','')
        if permitted.get(x):
            return permitted[x]
        else:
            return 'OT'
    return x

train_process['customer_type']=train_process['customer_type'].apply(lambda x:preprocess_customer_type(x,permitted=permitted['customer_type']))
test_process['customer_type']=test_process['customer_type'].apply(lambda x:preprocess_customer_type(x,permitted=permitted['customer_type']))

In [16]:
type_value_counts = train_process['customer_type'].value_counts()
print(type_value_counts)

EC    10643
SI     2568
CP     1368
OT      759
Name: customer_type, dtype: int64


In [17]:
# business_area
# total_data['business_area']=total_data['business_area'].replace('hospital & health care','ETC')
# total_data['business_area']=total_data['business_area'].replace('factory','ETC')
# total_data['business_area']=total_data['business_area'].replace('government department','ETC')
# total_data['business_area']=total_data['business_area'].replace('public facility','ETC')
# total_data['business_area']=total_data['business_area'].replace('transportation','ETC')
# total_data['business_area']=total_data['business_area'].replace('power plant / renewable energy','ETC')
train_process['business_area']=train_process['business_area'].fillna('UNK')
test_process['business_area']=test_process['business_area'].fillna('UNK')

In [18]:
# ver_cus, ver_pro
grant=['ver_cus', 'ver_pro']
train_process['grant_weight']=np.where(train_process['ver_cus']>0,1,0)
train_process['grant_weight']=np.where(train_process['ver_pro']>0,1,train_process['grant_weight'])
train_process=train_process.drop(grant,axis=1)

test_process['grant_weight']=np.where(test_process['ver_cus']>0,1,0)
test_process['grant_weight']=np.where(test_process['ver_pro']>0,1,test_process['grant_weight'])
test_process=test_process.drop(grant,axis=1)

In [19]:
# expected_timeline
def preprocess_expected_timeline(x,permitted):
    if type(x)==type(''):
        x=x.lower().replace(' ','').replace('_','')
        if permitted.get(x):
            return permitted[x]
        return 'OT'
    return x

train_process['expected_timeline']=train_process['expected_timeline'].apply(lambda x:preprocess_expected_timeline(x,permitted=permitted['expected_timeline']))
test_process['expected_timeline']=test_process['expected_timeline'].apply(lambda x:preprocess_expected_timeline(x,permitted=permitted['expected_timeline']))

In [20]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# lead_desc_length, historical_existing_cnt
numerical=['lead_desc_length','historical_existing_cnt']
#total_data[numerical]=scaler.fit_transform(total_data[numerical])
train_process[numerical]=np.log1p(train_process[numerical])
test_process[numerical]=np.log1p(test_process[numerical])

In [21]:
# inquiry_type
def preprocess_inquiry_type(x,permitted):
    if type(x)==type(''):
        x=x.lower().replace('_',' ')
        for word in x.split(' '):
            if permitted.get(word):
                return permitted[word]
        return 'OT'
    return np.nan

train_process['inquiry_type']=train_process['inquiry_type'].apply(lambda x:preprocess_inquiry_type(x,permitted=permitted['inquiry_type']))
test_process['inquiry_type']=test_process['inquiry_type'].apply(lambda x:preprocess_inquiry_type(x,permitted=permitted['inquiry_type']))

In [22]:
# customer_job
def preprocess_customer_job(x,permitted):
    if type(x)==type(''):
        porter=PorterStemmer()
        tokens=word_tokenize(x)
        stems=[porter.stem(token) for token in tokens]
        for stem in stems:
            if permitted.get(stem):
                return permitted[stem]
        return 'OT'
    return np.nan

train_process['customer_job']=train_process['customer_job'].apply(lambda x:preprocess_customer_job(x,permitted=permitted['customer_job']))
test_process['customer_job']=test_process['customer_job'].apply(lambda x:preprocess_customer_job(x,permitted=permitted['customer_job']))

In [23]:
# product_category
def preprocess_product_category(x,permitted):
    if type(x)==type(''):
        porter=PorterStemmer()
        tokens=word_tokenize(x)
        stems=[porter.stem(token) for token in tokens]

        prefer={}
        for pf in permitted['dump_key']:
            prefer[pf]=0

        for stem in stems:
            if permitted.get(stem):
                prefer[permitted[stem]]=1

        for pf in permitted['dump_key']:
            if prefer[pf]>0:
                return pf
        return 'OT'
    return np.nan

train_process['product_category']=train_process['product_category'].apply(lambda x:preprocess_product_category(x,permitted=permitted['product_category']))
test_process['product_category']=test_process['product_category'].apply(lambda x:preprocess_product_category(x,permitted=permitted['product_category']))

In [24]:
# customer_poisition
def preprocess_customer_position(x,permitted):
    if type(x)==type(''):
        x=x.lower().replace('-',' ').replace('/',' ')
        for word in x.split(' '):
            if permitted.get(word):
                return permitted[word]
        return 'OT'
    return np.nan

train_process['customer_position']=train_process['customer_position'].apply(lambda x:preprocess_customer_position(x,permitted=permitted['customer_position']))
test_process['customer_position']=test_process['customer_position'].apply(lambda x:preprocess_customer_position(x,permitted=permitted['customer_position']))

In [25]:
# one hot encoding columns
origin_data=train_process.drop('com_reg_ver_win_rate',axis=1)
origin_columns=origin_data.columns.to_list()
object_columns=origin_data.columns.to_list()
object_columns.remove('bant_submit')
object_columns.remove('historical_existing_cnt')
object_columns.remove('lead_desc_length')
object_columns.remove('strategic_ver')
object_columns.remove('grant_weight')
object_columns.remove('customer_idx')
object_columns.remove('lead_owner')
for col in object_columns:
    permitted['values'][col]=train_process[col].value_counts().index
permitted['values']

{'customer_country': Index(['india', 'brazil', 'unitedstates', 'OT', 'mexico', 'philippines',
        'colombia', 'u.a.e', 'unitedkingdom', 'saudiarabia', 'chile', 'italy',
        'peru', 'germany', 'poland', 'egypt', 'vietnam', 'spain', 'argentina',
        'hongkong', 'australia', 'panama', 'france', 'canada', 'turkey',
        'ecuador', 'indonesia', 'türkiye', 'singapore', 'southafrica', 'iraq',
        'nigeria', 'thailand', 'hungary', 'portugal', 'kenya', 'malaysia',
        'bulgaria', 'costarica', 'dominicanrepublic', 'israel', 'oman',
        'elsalvador', 'pakistan', 'guatemala', 'kuwait', 'bangladesh', 'qatar',
        'switzerland', 'china', 'bolivia', 'honduras', 'lebanon', 'taiwan',
        'netherlands', 'belgium', 'bahrain', 'venezuela', 'puertorico',
        'greece', 'japan', 'afghanistan', 'algeria', 'morocco', 'romania',
        'ghana', 'jordan', 'croatia', 'nicaragua', 'ireland', 'maldives',
        'serbia', 'srilanka', 'uruguay', 'albania', 'jamaica', 'southkor

In [26]:
# encoder
class Encoder():
    def __init__(self):
        self.classes=[]

    def fit(self,data):
        for value in data.value_counts().index:
            self.classes.append(value)

    def transform(self,data):
        result=data.copy(deep=True)
        for i,value in enumerate(self.classes):
            result=result.replace(value,i)
        return result

    def inverse_transform(self,data):
        result=data.copy(deep=True)
        for i in range(0,len(self.classes)):
            result=result.replace(i,self.classes[i])
        return result

In [27]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

imputer=IterativeImputer(estimator=RandomForestRegressor(random_state=SEED),initial_strategy='most_frequent',max_iter=10,random_state=SEED,skip_complete=False,verbose=1)

# encoder
encoders={}
for object_column in object_columns:
    encoders[object_column]=Encoder()
    encoders[object_column].fit(train_process[object_column])


def order_encoding(target_data,object_columns,encoders):
    result=pd.DataFrame()
    for col in target_data.columns:
        if col in object_columns:
            result[col]=encoders[col].transform(target_data[col])
        else:
            result[col]=target_data[col]

    return result

def order_decoding(target_data,object_columns,encoders):
    result=pd.DataFrame()
    for col in target_data.columns:
        if col in object_columns:
            result[col]=target_data[col].apply(lambda x:round(x)).astype(int)
            result[col]=encoders[col].inverse_transform(result[col])
        else:
            result[col]=target_data[col]
    return result

# train data
train_dummy=order_encoding(train_process,object_columns,encoders)
imputer.fit(train_dummy)
train_dummy_imputed=pd.DataFrame(data=imputer.transform(train_dummy),columns=train_process.columns)
train_imputed=order_decoding(train_dummy_imputed,object_columns,encoders)

# test data
test_dummy=order_encoding(test_process,object_columns,encoders)
test_dummy_imputed=pd.DataFrame(data=imputer.transform(test_dummy),columns=test_process.columns)
test_imputed=order_decoding(test_dummy_imputed,object_columns,encoders)


[IterativeImputer] Completing matrix with shape (59299, 20)
[IterativeImputer] Change: 84.68516837832195, scaled tolerance: 47.466 
[IterativeImputer] Change: 32.199999999999996, scaled tolerance: 47.466 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (59299, 20)
[IterativeImputer] Completing matrix with shape (5271, 20)


In [28]:
# data kfold
from sklearn.model_selection import KFold

train_datas=[]
train_imputed['is_converted']=targets
train_data_false=train_imputed[train_imputed['is_converted']==0]
train_data_true=train_imputed[train_imputed['is_converted']==1]

# kfold
K=11
dkf=KFold(n_splits=K,shuffle=True,random_state=SEED)
preserves=[pd.DataFrame() for x in range(0,K+1)]
for i,(_,index) in enumerate(dkf.split(train_data_false)):
    print(f'-{i+1} fold data-')
    y_data_false=train_data_false['is_converted'].iloc[index]
    X_data_false=train_data_false.drop('is_converted',axis=1).iloc[index]

    y_data=pd.concat([y_data_false,train_data_true['is_converted']],ignore_index=True)
    X_data=pd.concat([X_data_false,train_data_true.drop('is_converted',axis=1)],ignore_index=True)

    # preserve
    # preserves[i]['com_reg_ver_win_rate']=X_data['com_reg_ver_win_rate']
    # X_data=X_data.drop('com_reg_ver_win_rate',axis=1)
    # preserves[i]['customer_idx']=X_data['customer_idx']
    # X_data['customer_idx']=np.log1p(X_data['customer_idx'])
    # preserves[i]['lead_owner']=X_data['lead_owner']
    # X_data['lead_owner']=np.log1p(X_data['lead_owner'])


    print(f'X data shape: {X_data.shape}')
    print(f'y data shape: {y_data.shape}')
    train_datas.append((X_data,y_data))

# preserve
# preserves[K]['com_reg_ver_win_rate']=test_process['com_reg_ver_win_rate']
# test_data=test_process.drop('com_reg_ver_win_rate',axis=1)
# preserves[K]['customer_idx']=X_data['customer_idx']
# test_data['customer_idx']=np.log1p(test_data['customer_idx'])
# preserves[K]['lead_owner']=X_data['lead_owner']
# test_data['lead_owner']=np.log1p(test_data['lead_owner'])
test_data=test_imputed
print(f'test data shape: {test_data.shape}')

-1 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-2 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-3 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-4 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-5 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-6 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-7 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-8 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-9 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-10 fold data-
X data shape: (9800, 20)
y data shape: (9800,)
-11 fold data-
X data shape: (9799, 20)
y data shape: (9799,)
test data shape: (5271, 20)


In [33]:
type_value_counts = test_data['customer_type'].value_counts()
print(type_value_counts)

country_value_counts = test_data['customer_country'].value_counts()
print(country_value_counts)

EC    2826
SI    1661
CP     539
OT     245
Name: customer_type, dtype: int64
brazil                  1045
india                    906
unitedstates             782
philippines              289
OT                       254
peru                     211
mexico                   180
colombia                 127
u.a.e                    120
italy                    120
chile                    108
saudiarabia              102
indonesia                 95
egypt                     95
vietnam                   69
germany                   68
australia                 66
argentina                 65
türkiye                   51
hongkong                  48
unitedkingdom             43
poland                    42
singapore                 41
thailand                  38
panama                    27
serbia                    20
greece                    19
southafrica               19
canada                    17
oman                      16
france                    16
spain                  

In [34]:
# process data type
for (train_data,target) in train_datas:
    train_data['customer_idx']=train_data['customer_idx'].astype(str)
    train_data['lead_owner']=train_data['lead_owner'].astype(str)
    train_data['strategic_ver']=train_data['strategic_ver'].astype(int)
    train_data['grant_weight']=train_data['grant_weight'].astype(int)
    target=target.apply(lambda x:1 if x else 0)

test_data['customer_idx']=test_data['customer_idx'].astype(str)
test_data['lead_owner']=test_data['lead_owner'].astype(str)
test_data['strategic_ver']=test_data['strategic_ver'].astype(int)
test_data['grant_weight']=test_data['grant_weight'].astype(int)

In [35]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics         import mean_squared_error

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [36]:
# 모델 성능 테스트
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))
    return F1

In [37]:
class KMODEL:
    def __init__(self,dataset_K,train_K=5):
        self.k_data=dataset_K
        self.k_fold=train_K
        self.models=[[] for i in range(0,K)]
        self.scores=[[] for i in range(0,K)]
        self.thresholds=[[] for i in range(0,K)]
        self.cv_scores=[]

    def modeling_kfold(self,iters,n_estimators,max_depth,learning_rate,cat_features,train_data,targets_data,core):
        # k-fold
        kf=StratifiedKFold(n_splits=self.k_fold,shuffle=True,random_state=SEED)

        for i,(train_index,val_index) in enumerate(kf.split(train_data,targets_data)):
            print(f'-[{iters+1}-{i+1}] fold-')
            X_train,X_val=train_data.iloc[train_index],train_data.iloc[val_index]
            y_train,y_val=targets_data.iloc[train_index],targets_data.iloc[val_index]

            # logloss
            classifier=CatBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, eval_metric='F1', random_state=SEED, bootstrap_type ='Bernoulli',task_type=core) # default:logloss

            # randomforst
            #classifier=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=SEED)

            # regressor=CatBoostRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, eval_metric='RMSE',random_state=SEED, bootstrap_type ='Bernoulli',task_type=core)

            model=classifier.fit(X_train, y_train, eval_set=(X_val,y_val),verbose=100, early_stopping_rounds=100,cat_features=cat_features,use_best_model=True)
            #model=classifier.fit(X_train,y_train)
            # model=regressor.fit(X_train, y_train, eval_set=(X_val,y_val),verbose=100, early_stopping_rounds=100,cat_features=cat_features)


            # pred=model.predict(X_val,prediction_type='RawFormulaVal')  # focal loss
            # coordinates=np.linspace(pred.min(),pred.max(),100)   # focal loss
            pred=model.predict_proba(X_val)[:,1]
            coordinates = np.linspace(0, 1, 100)

            best_score=0
            best_coordinate=0
            for coordinate in coordinates:
                pred_value=pred>coordinate
                score=f1_score(y_val,pred_value)
                if best_score<score:
                    best_score=score
                    best_coordinate=coordinate

            pred=(pred>best_coordinate)
            self.scores[iters].append(get_clf_eval(y_val,pred))
            self.thresholds[iters].append(best_coordinate)
            # scores.append(np.sqrt(mean_squared_error(y_val,pred)))
            self.models[iters].append(model)

        self.cv_scores.append(np.mean(self.scores[iters]))
        print(f'[{iters+1}] F1 scores mean: {self.cv_scores[iters]}')

    def modeling_kdata(self,n_estimators,max_depth,learning_rate,cat_features,train_datas,core='CPU'):
        for iter,(train_data,target) in enumerate(train_datas):
            self.modeling_kfold(iter,n_estimators,max_depth,learning_rate,cat_features,train_data,target,core=core)
        print(f'Total F1 scores mean: {np.mean(self.cv_scores)}')

    def predict(self,test_data):
        test_pred=pd.Series([0 for x in range(len(test_data))], index=test_data.index)
        for models,thresholds in zip(self.models,self.thresholds):
            for model,threshold in zip(models,thresholds):
                pred=model.predict_proba(test_data)[:,1]
                test_pred+=(pred>threshold)
        test_pred=test_pred/(self.k_data*self.k_fold)
        test_pred=test_pred.apply(lambda x:1 if x>0.5 else 0)
        return test_pred



In [38]:
cat_features=train_datas[0][0].columns.to_list()
cat_features.remove('bant_submit')
cat_features.remove('lead_desc_length')
cat_features.remove('historical_existing_cnt')
cat_features.remove('com_reg_ver_win_rate')
cat_features

['customer_country',
 'business_unit',
 'customer_idx',
 'customer_type',
 'enterprise',
 'customer_job',
 'inquiry_type',
 'product_category',
 'customer_position',
 'response_corporate',
 'expected_timeline',
 'business_area',
 'lead_owner',
 'strategic_ver',
 'region',
 'grant_weight']

In [39]:
kmodel=KMODEL(dataset_K=11)
kmodel.modeling_kdata(n_estimators=1000,max_depth=10,learning_rate=0.05,cat_features=cat_features,train_datas=train_datas)

-[1-1] fold-
0:	learn: 0.8409231	test: 0.8518519	best: 0.8518519 (0)	total: 107ms	remaining: 1m 47s
100:	learn: 0.9597848	test: 0.9282103	best: 0.9287519 (90)	total: 11s	remaining: 1m 38s


KeyboardInterrupt: 

In [None]:
pred=kmodel.predict(test_data)
pred.value_counts()