In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_rows', None)
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wlsyo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wlsyo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Fix the random seeds so runs are repeatable
import os

SEED=42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably does not change hashing in the already-running kernel — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)

In [3]:
train = pd.read_csv("train.csv") # training data
test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Training labels, kept aside before the label column is removed from the features
targets=train['is_converted']
rows=len(train.index)

# Feature frames: the label leaves both frames, and the test frame also loses 'id'
train=train.drop(columns='is_converted')
test=test.drop(columns='id').drop(columns='is_converted')

In [None]:
# Lookup tables used by the preprocess_* helpers below.
# - Most per-column dicts map a normalised token (or word stem) to a short category code.
# - "product_category"["dump_key"] is NOT a code: it is the priority-ordered list of codes
#   that preprocess_product_category walks to pick one code when several stems match.
#   Every code produced by the stem mapping must therefore appear in that list; the
#   list previously started with "DB" while "board" maps to "BD", which made "BD"
#   unreachable.
# - "customer_country" starts as a set-like dict (value 1) and is rebuilt by
#   country_encoding() further down.
permitted={
    "customer_job":{"engin":"EN", "administr":"AD", "educ":"ED", "sale":"SA", "inform":"IF", "oper":"OP", "purchas":"PA", "art":"AT", "busi":"BS", "manag":"MA", "dump_key":"OT", "technolog":"TC", "develop":"DV", "consult":"CS", "media":"ME"},
    
    "product_category":{"signag":"SG", "board":"BD", "multi-split":"MS", "single-split":"SS", "tv":"TV", "vrf":"VR", "dump_key":["BD","VR","SG","SS","MS","TV","IT","DT","VI","WL","LD","SE"], "interact":"IT", "digit":"DT","video":"VI","wall":"WL","led":"LD","seri":"SE"},
    
    "customer_country":{
        "india": 1,
        "brazil": 1,
        "unitedstates": 1,
        "OT": 1,
        "mexico": 1,
        "philippines": 1,
        "colombia": 1,
        "u.a.e": 1,
        "unitedkingdom": 1,
        "saudiarabia": 1,
        "chile": 1,
        "italy": 1,
        "peru": 1,
        "germany": 1,
        "poland": 1,
        "egypt": 1,
        "vietnam": 1,
        "spain": 1,
        "argentina": 1,
        "hongkong": 1,
        "australia": 1,
        "panama": 1,
        "france": 1,
        "canada": 1,
        "turkey": 1,
        "ecuador": 1,
        "indonesia": 1,
        "t\u00fcrkiye": 1,
        "singapore": 1,
        "southafrica": 1,
        "iraq": 1,
        "nigeria": 1,
        "thailand": 1,
        "hungary": 1,
        "portugal": 1,
        "kenya": 1,
        "malaysia": 1,
        "bulgaria": 1,
        "costarica": 1,
        "dominicanrepublic": 1,
        "israel": 1,
        "oman": 1,
        "elsalvador": 1,
        "pakistan": 1,
        "guatemala": 1,
        "kuwait": 1,
        "bangladesh": 1,
        "qatar": 1,
        "switzerland": 1,
        "china": 1,
        "bolivia": 1,
        "honduras": 1,
        "lebanon": 1,
        "taiwan": 1,
        "netherlands": 1,
        "belgium": 1,
        "bahrain": 1,
        "venezuela": 1,
        "puertorico": 1,
        "greece": 1,
        "japan": 1,
        "afghanistan": 1,
        "algeria": 1,
        "morocco": 1,
        "romania": 1,
        "ghana": 1,
        "jordan": 1,
        "croatia": 1,
        "nicaragua": 1,
        "ireland": 1,
        "maldives": 1,
        "serbia": 1,
        "srilanka": 1,
        "uruguay": 1,
        "albania": 1,
        "jamaica": 1,
        "southkorea": 1,
        "sweden": 1,
        "anguilla": 1,
        "paraguay": 1,
        "malta": 1,
        "azerbaijan": 1,
        "russia": 1,
        "cambodia": 1,
        "mozambique": 1,
        "yemen": 1,
        "bosniaandherzegovina": 1,
        "zimbabwe": 1,
        "iran": 1,
        "slovenia": 1,
        "ethiopia": 1,
        "botswana": 1,
        "papuanewguinea": 1,
        "senegal": 1,
        "denmark": 1,
        "angola": 1,
        "uganda": 1,
        "barbados": 1,
        "laos": 1,
        "burkinafaso": 1,
        "congo": 1,
        "unitedarabemirates": 1,
        "gambia": 1,
        "myanmar": 1,
        "togo": 1,
        "suriname": 1,
        "mauritius": 1,
        "czechrepublic": 1,
        "montenegro": 1,
        "cameroon": 1,
        "sierraleone": 1,
        "ivorycoast": 1,
        "namibia": 1,
        "mali": 1,
        "bahamas": 1,
        "sudan": 1,
        "benin": 1,
        "latvia": 1,
        "tunisia": 1,
        "guyana": 1,
        "gabon": 1,
        "cyprus": 1,
        "syria": 1,
        "georgia": 1,
        "libya": 1,
        "bermuda": 1,
        "austria": 1,
        "zambia": 1,
        "fiji": 1,
        "macedonia": 1,
        "brunei": 1,
        "norway": 1,
        "caymanislands": 1,
        "kazakhstan": 1,
        "newzealand": 1
    },

    # response_corporate code -> region code
    "region":{
        "LGEAG": "LA",
        "LGECZ": "EU",
        "LGEFS": "EU",
        "LGEDG": "EU",
        "LGEHS": "EU",
        "LGEMK": "EU",
        "LGEIS": "EU",
        "LGESC": "EU",
        "LGEEH": "EU",
        "LGEBN": "EU",
        "LGEWR": "EU",
        "LGEPL": "EU",
        "LGEMA": "EU",
        "LGEPT": "EU",
        "LGERO": "EU",
        "LGEES": "EU",
        "LGENO": "EU",
        "LGESW": "EU",
        "LGEUK": "EU",
        "LGEAK": "OT",
        "LGERM": "OT",
        "LGERI": "OT",
        "LGERA": "OT",
        "LGEUR": "OT",
        "LGELV": "OT",
        "LGEAS": "OT",
        "LGEEG": "OT",
        "LGELF": "OT",
        "LGESK": "OT",
        "LGEMC": "OT",
        "LGESA": "OT",
        "LGETU": "OT",
        "LGEOT": "OT",
        "LGEDF": "OT",
        "LGEGF": "OT",
        "LGEME": "OT",
        "LGEAF": "OT",
        "LGEAO": "OT",
        "LGENI": "OT",
        "LGETK": "OT",
        "LGEAT": "OT",
        "LGESJ": "OT",
        "LGEEF": "OT",
        "LGEYK": "OT",
        "LGEIR": "OT",
        "LGEEB": "OT",
        "LGELA": "OT",
        "LGEBT": "OT",
        "LGEAP": "AP",
        "LGEQA": "AP",
        "LGETL": "AP",
        "LGECH": "AP",
        "LGEYT": "AP",
        "LGETR": "AP",
        "LGETA": "AP",
        "LGESY": "AP",
        "LGESH": "AP",
        "LGEQH": "AP",
        "LGEQD": "AP",
        "LGEPN": "AP",
        "LGENE": "AP",
        "LGEKS": "AP",
        "LGEHZ": "AP",
        "LGEHN": "AP",
        "LGEHK": "AP",
        "LGEIL": "AP",
        "LGEPH": "AP",
        "LGEVH": "AP",
        "LGEKR": "AP",
        "LGESL": "AP",
        "LGEIN": "AP",
        "LGETH": "AP",
        "LGEML": "AP",
        "LGETT": "AP",
        "LGEJP": "AP",
        "LGECI": "NA",
        "LGERS": "NA",
        "LGEMX": "NA",
        "LGEMS": "NA",
        "LGEMM": "NA",
        "LGEMR": "NA",
        "LGEUS": "NA",
        "LGEMU": "NA",
        "LGEAI": "NA",
        "LGEBR": "LA",
        "LGECL": "LA",
        "LGEVZ": "LA",
        "LGECB": "LA",
        "LGEPS": "LA",
        "LGEPR": "LA",
        "LGESP": "LA",
        "LGEAR": "LA"
    },

    "inquiry_type":{"quotation":"QP", "purchase":"QP", "sales":"SA", "dump_key":"OT"},

    "customer_position":{"none":"NO", "manager":"MA", "founder":"FD", "director":"DR", "entry":"EN", "analyst":"AN", "partner":"PA", "level":"LV", "execut":"EX", "c-level":"CL", "traine":"TR", "presid":"PR", "vice":"VI", "intern":"IN"},

    # Known category values per column; entries for the categorical columns are
    # overwritten later from the training data's value_counts().
    "values":{
        "customer_country":"dump_value",
        "business_unit":["ID", "AS", "IT", "ETC"],
        "customer_job":["OT", "EN", "AD", "ED", "SA", "PA", "OP", "IF", "AT", "BS", "MA", "OT"],
        "inquiry_type":["QP", "SA", "OT"],
        "product_category":["OT", "SG", "VR", "MS", "SS", "TV", "OT"],
        "customer_position":["NO", "OT", "MA", "FD", "DR", "AN", "PA", "EN", "OT"],
        "response_corporate":["AP", "LA", "NA", "EU", "OT"]
    },
    "response_corporate":"dump_value",
    "business_area":"dump_value",
    "enterprise":"dump_value",
    "business_unit":"dump_value",

    "com_reg_ver_win_rate":"dumpy_value",
    "customer_type":{"endcustomer":"EC", "specifier/influencer":"SI", "channelpartner":"CP", "dump_key":"OT"},
    "expected_timeline":{"lessthan3months":"L3", "3months~6months":"36", "morethanayear":"MY", "9months~1year":"91", "6months~9months":"69", "dump_key":"OT"}
}



In [None]:
# Split the training columns by dtype: 'object' columns count as categorical,
# every other dtype as numerical.
cols_by_type={
    'categorical':train.columns[train.dtypes=='object'].tolist(),
    'numerical':train.columns[train.dtypes!='object'].tolist(),
}

print(f"\nnumerical columns: {len(cols_by_type['numerical'])}")
print(f"categorical columns: {len(cols_by_type['categorical'])}")
print(f"total columns: {len(cols_by_type['numerical'])+len(cols_by_type['categorical'])}")


numerical columns: 13
categorical columns: 15
total columns: 28


In [None]:
# Columns removed from both frames before any feature engineering
del_cols=[
    'business_subarea',
    'product_subcategory',
    'product_modelname',
    'customer_country.1',
]

train_process=train.drop(columns=del_cols)
test_process=test.drop(columns=del_cols)

In [4]:
# Collapse the three *_strategic_ver columns into one 0/1 flag.
# Only idit_strategic_ver is read: the flag is 1 where it is > 0, else 0.
ver=['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
train_process=train_process.assign(
    strategic_ver=np.where(train_process['idit_strategic_ver']>0,1,0)
).drop(columns=ver)
test_process=test_process.assign(
    strategic_ver=np.where(test_process['idit_strategic_ver']>0,1,0)
).drop(columns=ver)

NameError: name 'train_process' is not defined

In [5]:
# country columns
# region
def preprocess_region(x,permitted):
    """Map one response_corporate value to its region code.

    x: a raw cell value. permitted: dict of corporate code -> region code.
    Returns the mapped region, 'OT' for a string with no (truthy) mapping,
    and NaN for anything that is not a plain str.
    """
    if type(x) is not str:
        return np.nan
    region=permitted.get(x)
    return region if region else 'OT'

def response_corporate_encoding(train_data):
    """Build the set-like dict of response_corporate labels seen in training.

    Every distinct non-null value of train_data becomes a key with value 1,
    and the fallback label 'OT' is always present.
    """
    permit=dict.fromkeys(train_data.value_counts().index,1)
    permit['OT']=1
    return permit

def preprocess_response_corporate(x,permitted):
    """Keep a response_corporate label only if it is listed in permitted.

    Returns x itself when permitted has a truthy entry for it, 'OT' for any
    other string, and NaN when x is not a plain str.
    """
    if type(x) is not str:
        return np.nan
    return x if permitted.get(x) else 'OT'


def country_encoding(permitted):
    """Turn the country whitelist into a name -> canonical-name mapping.

    Each key of permitted maps to itself; then the fixed entries are applied:
    'OT' -> 'OT', 'dump_key' -> 'dump_value' and 'türkiye' -> 'turkey'.
    """
    permit={name:name for name in permitted.keys()}
    permit.update({
        'OT':'OT',
        'dump_key':'dump_value',
        'türkiye':'turkey',
    })
    return permit

def preprocess_customer_country(x,permitted):
    """Extract a canonical country name from a raw customer_country value.

    The value is lower-cased, spaces are removed and '/' is treated as the
    separator between parts. The first part found in permitted wins.

    x: raw cell value.
    permitted: dict of normalised country token -> canonical country name
        (as built by country_encoding).
    Returns the canonical name from permitted (so aliases such as
    'türkiye' -> 'turkey' are honoured), 'OT' when no part matches, and NaN
    when x is not a string.
    """
    if not isinstance(x,str):
        return np.nan
    normalized=x.lower().replace(' ','').replace('/',' ')
    for word in normalized.split(' '):
        canonical=permitted.get(word)
        if canonical:
            # Return the mapped value, not the matched token: returning the
            # token would silently ignore every alias in the mapping.
            return canonical
    return 'OT'

# region: derived from the raw response_corporate codes. This must run before
# response_corporate itself is overwritten below.
train_process['region']=train_process['response_corporate'].apply(lambda x:preprocess_region(x,permitted=permitted['region']))
test_process['region']=test_process['response_corporate'].apply(lambda x:preprocess_region(x,permitted=permitted['region']))

# response_corporate: the whitelist is rebuilt from the labels present in the
# training data; any other string becomes 'OT'.
permitted['response_corporate']=response_corporate_encoding(train['response_corporate'])
train_process['response_corporate']=train_process['response_corporate'].apply(lambda x:preprocess_response_corporate(x,permitted=permitted['response_corporate']))
test_process['response_corporate']=test_process['response_corporate'].apply(lambda x:preprocess_response_corporate(x,permitted=permitted['response_corporate']))

# customer_country: replace the whitelist with the name -> canonical-name mapping,
# then normalise both frames with it.
permitted['customer_country']=country_encoding(permitted['customer_country'])
train_process['customer_country']=train_process['customer_country'].apply(lambda x:preprocess_customer_country(x,permitted=permitted['customer_country']))
test_process['customer_country']=test_process['customer_country'].apply(lambda x:preprocess_customer_country(x,permitted=permitted['customer_country']))

NameError: name 'train_process' is not defined

In [6]:
# business_unit: fold the 'Solution' and 'CM' units into a single 'ETC' bucket
train_process['business_unit']=train_process['business_unit'].replace(['Solution','CM'],'ETC')
test_process['business_unit']=test_process['business_unit'].replace(['Solution','CM'],'ETC')

NameError: name 'train_process' is not defined

In [11]:
# customer_type
def preprocess_customer_type(x,permitted):
    """Map a raw customer_type string to its short code.

    The string is lower-cased and stripped of '-' and spaces before lookup.
    Returns the code from permitted, 'OT' when there is no truthy mapping,
    and x unchanged when it is not a plain str (e.g. a missing value).
    """
    if type(x) is not str:
        return x
    key=x.lower().replace('-','').replace(' ','')
    code=permitted.get(key)
    return code if code else 'OT'
    
# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['customer_type']=train_process['customer_type'].apply(preprocess_customer_type,permitted=permitted['customer_type'])
test_process['customer_type']=test_process['customer_type'].apply(preprocess_customer_type,permitted=permitted['customer_type'])

In [12]:
# ver_cus, ver_pro
# grant_weight is 1 when either ver_cus or ver_pro is positive, else 0;
# the two source columns are dropped afterwards.
grant=['ver_cus', 'ver_pro']

train_process['grant_weight']=np.where((train_process['ver_cus']>0)|(train_process['ver_pro']>0),1,0)
train_process=train_process.drop(columns=grant)

test_process['grant_weight']=np.where((test_process['ver_cus']>0)|(test_process['ver_pro']>0),1,0)
test_process=test_process.drop(columns=grant)

In [13]:
# expected_timeline
def preprocess_expected_timeline(x,permitted):
    """Map a raw expected_timeline string to its short code.

    The string is lower-cased and stripped of spaces and underscores before
    lookup. Returns the code from permitted, 'OT' when there is no truthy
    mapping, and x unchanged when it is not a plain str.
    """
    if type(x) is not str:
        return x
    code=permitted.get(x.lower().replace(' ','').replace('_',''))
    if not code:
        return 'OT'
    return code

# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['expected_timeline']=train_process['expected_timeline'].apply(preprocess_expected_timeline,permitted=permitted['expected_timeline'])
test_process['expected_timeline']=test_process['expected_timeline'].apply(preprocess_expected_timeline,permitted=permitted['expected_timeline'])

In [14]:
# Scale the continuous features with a RobustScaler fitted on the training
# frame only, then apply the same fitted scaler to the test frame.
from sklearn.preprocessing import RobustScaler

numerical=[
    'lead_desc_length',
    'historical_existing_cnt',
    'com_reg_ver_win_rate',
    'ver_win_rate_x',
    'ver_win_ratio_per_bu',
]
scaler=RobustScaler().fit(train_process[numerical])
train_process[numerical]=scaler.transform(train_process[numerical])
test_process[numerical]=scaler.transform(test_process[numerical])

In [15]:
# inquiry_type
def preprocess_inquiry_type(x,permitted):
    """Map a raw inquiry_type string to a short code.

    The string is lower-cased, underscores become spaces, and the first
    space-separated word with a truthy entry in permitted decides the code.
    Returns 'OT' when no word matches and NaN when x is not a plain str.
    """
    if type(x) is not str:
        return np.nan
    words=x.lower().replace('_',' ').split(' ')
    return next((permitted[word] for word in words if permitted.get(word)),'OT')

# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['inquiry_type']=train_process['inquiry_type'].apply(preprocess_inquiry_type,permitted=permitted['inquiry_type'])
test_process['inquiry_type']=test_process['inquiry_type'].apply(preprocess_inquiry_type,permitted=permitted['inquiry_type'])

In [16]:
# customer_job
def preprocess_customer_job(x,permitted):
    """Map a raw customer_job string to a short code via Porter stems.

    The text is tokenised with word_tokenize and each token is stemmed; the
    first stem with a truthy entry in permitted decides the code.
    Returns 'OT' when no stem matches and NaN when x is not a plain str.
    """
    if type(x) is not str:
        return np.nan
    stemmer=PorterStemmer()
    # All tokens are stemmed up front, before any lookup happens
    stems=list(map(stemmer.stem,word_tokenize(x)))
    return next((permitted[stem] for stem in stems if permitted.get(stem)),'OT')

# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['customer_job']=train_process['customer_job'].apply(preprocess_customer_job,permitted=permitted['customer_job'])
test_process['customer_job']=test_process['customer_job'].apply(preprocess_customer_job,permitted=permitted['customer_job'])

In [17]:
# product_category
def preprocess_product_category(x,permitted):
    """Map a raw product_category string to one short code.

    The text is tokenised and Porter-stemmed; every stem with a truthy entry
    in permitted contributes its code. permitted['dump_key'] is the
    priority-ordered list of codes: the first listed code that was matched is
    returned. Codes that are matched but absent from that list are never
    returned. Returns 'OT' when nothing qualifies and NaN when x is not a
    plain str.
    """
    if type(x) is not str:
        return np.nan
    stemmer=PorterStemmer()
    stems=[stemmer.stem(token) for token in word_tokenize(x)]

    # Codes hit by at least one stem
    matched={permitted[stem] for stem in stems if permitted.get(stem)}

    # Walk the priority list and return the first code that was hit
    for code in permitted['dump_key']:
        if code in matched:
            return code
    return 'OT'

# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['product_category']=train_process['product_category'].apply(preprocess_product_category,permitted=permitted['product_category'])
test_process['product_category']=test_process['product_category'].apply(preprocess_product_category,permitted=permitted['product_category'])

In [18]:
# customer_poisition
def preprocess_customer_position(x,permitted):
    """Map a raw customer_position string to a short code.

    The string is lower-cased, '-' and '/' become spaces, and the first
    space-separated word with a truthy entry in permitted decides the code.
    Returns 'OT' when no word matches and NaN when x is not a plain str.

    NOTE(review): words are compared verbatim (no stemming) and '-' is removed
    before the lookup, so keys that are stems or contain '-' cannot be hit by
    whole words such as 'executive' or 'c-level' — confirm whether intended.
    """
    if type(x) is not str:
        return np.nan
    words=x.lower().replace('-',' ').replace('/',' ').split(' ')
    for word in words:
        code=permitted.get(word)
        if code:
            return code
    return 'OT'

# Series.apply forwards extra keyword arguments to the function, so no lambda is needed
train_process['customer_position']=train_process['customer_position'].apply(preprocess_customer_position,permitted=permitted['customer_position'])
test_process['customer_position']=test_process['customer_position'].apply(preprocess_customer_position,permitted=permitted['customer_position'])

In [19]:
# Columns to be integer-encoded: everything except com_reg_ver_win_rate and the
# columns removed below. The observed training categories of each one are
# recorded in permitted['values'].
origin_data=train_process.drop(columns='com_reg_ver_win_rate')
origin_columns=origin_data.columns.to_list()
object_columns=origin_data.columns.to_list()
for col in (
    'bant_submit',
    'historical_existing_cnt',
    'lead_desc_length',
    'strategic_ver',
    'grant_weight',
    'customer_idx',
    'lead_owner',
    'ver_win_rate_x',
    'ver_win_ratio_per_bu',
):
    object_columns.remove(col)

for col in object_columns:
    permitted['values'][col]=train_process[col].value_counts().index
permitted['values']

{'customer_country': Index(['india', 'brazil', 'unitedstates', 'mexico', 'OT', 'philippines',
        'colombia', 'u.a.e', 'unitedkingdom', 'saudiarabia',
        ...
        'fiji', 'caymanislands', 'laos', 'benin', 'sierraleone', 'congo',
        'unitedarabemirates', 'gambia', 'mali', 'ivorycoast'],
       dtype='object', name='customer_country', length=134),
 'business_unit': Index(['ID', 'AS', 'IT', 'ETC'], dtype='object', name='business_unit'),
 'customer_job': Index(['OT', 'EN', 'AD', 'ED', 'SA', 'PA', 'OP', 'IF', 'AT', 'BS', 'MA', 'CS',
        'ME', 'DV', 'TC'],
       dtype='object', name='customer_job'),
 'inquiry_type': Index(['QP', 'SA', 'OT'], dtype='object', name='inquiry_type'),
 'product_category': Index(['OT', 'SG', 'IT', 'VR', 'MS', 'SS', 'SE', 'TV', 'VI', 'LD'], dtype='object', name='product_category'),
 'customer_position': Index(['NO', 'OT', 'MA', 'FD', 'DR', 'AN', 'PA', 'EN', 'LV', 'VI', 'IN'], dtype='object', name='customer_position'),
 'response_corporate': Ind

In [20]:
# encoder
class Encoder():
    """Ordinal encoder: category -> integer code, ordered by training frequency.

    Code 0 is the most frequent category seen by fit(), code 1 the next, etc.
    Missing values stay missing in both directions.
    """
    def __init__(self):
        # Categories in code order (index == code)
        self.classes=[]

    def fit(self,data):
        """Learn the categories of a Series, most frequent first.

        The class list is rebuilt on every call, so fitting twice does not
        duplicate categories.
        """
        self.classes=list(data.value_counts().index)

    def transform(self,data):
        """Return a new Series with each category replaced by its code.

        A single dictionary lookup per element is used instead of one
        replace() pass per class: codes can no longer cascade into each other
        when the categories are themselves numbers, and categories that were
        not seen by fit() become NaN instead of being left as raw labels.
        """
        mapping={value:code for code,value in enumerate(self.classes)}
        return data.map(mapping)
    
    def inverse_transform(self,data):
        """Return a new Series with each code replaced by its category.

        Codes outside the fitted range become NaN.
        """
        return data.map(dict(enumerate(self.classes)))

In [21]:
# enable_iterative_imputer must be imported before IterativeImputer can be imported
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Iterative imputer driven by a random-forest regressor; missing cells start from
# the column's most frequent value and both the forest and the imputer are seeded.
imputer=IterativeImputer(estimator=RandomForestRegressor(random_state=SEED),initial_strategy='most_frequent',max_iter=10,random_state=SEED,skip_complete=False,verbose=1)

# One Encoder per categorical column, fitted on the training frame only
encoders={}
for object_column in object_columns:
    encoders[object_column]=Encoder()
    encoders[object_column].fit(train_process[object_column])


def order_encoding(target_data,object_columns,encoders):
    """Return a copy of target_data with the categorical columns encoded.

    Columns named in object_columns are passed through encoders[col].transform;
    all other columns are carried over unchanged. Column order is preserved.
    """
    def _encode(col):
        column=target_data[col]
        return encoders[col].transform(column) if col in object_columns else column

    return pd.DataFrame({col:_encode(col) for col in target_data.columns})

def order_decoding(target_data,object_columns,encoders):
    """Return a copy of target_data with the categorical columns decoded.

    Columns named in object_columns are rounded to the nearest integer code
    (Python round) and passed through encoders[col].inverse_transform; all
    other columns are carried over unchanged. Column order is preserved.
    """
    def _decode(col):
        column=target_data[col]
        if col not in object_columns:
            return column
        codes=column.apply(round).astype(int)
        return encoders[col].inverse_transform(codes)

    return pd.DataFrame({col:_decode(col) for col in target_data.columns})

# train data: encode -> fit the imputer -> impute -> decode back to labels.
# The imputer returns a bare array, so the column names are re-attached from train_process.
train_dummy=order_encoding(train_process,object_columns,encoders)
imputer.fit(train_dummy)  
train_dummy_imputed=pd.DataFrame(data=imputer.transform(train_dummy),columns=train_process.columns)
train_imputed=order_decoding(train_dummy_imputed,object_columns,encoders)

# test data: only transformed with the imputer fitted on the training data above
test_dummy=order_encoding(test_process,object_columns,encoders)
test_dummy_imputed=pd.DataFrame(data=imputer.transform(test_dummy),columns=test_process.columns)
test_imputed=order_decoding(test_dummy_imputed,object_columns,encoders)


[IterativeImputer] Completing matrix with shape (59299, 22)
[IterativeImputer] Change: 80.05833333333334, scaled tolerance: 47.466 
[IterativeImputer] Change: 26.809685928239805, scaled tolerance: 47.466 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (59299, 22)
[IterativeImputer] Completing matrix with shape (5271, 22)


In [22]:
# Build K training sets: the rows with is_converted==0 are split into K shuffled
# folds, and each set is one such fold plus ALL rows with is_converted==1,
# reshuffled with the fixed seed.
from sklearn.model_selection import KFold

train_datas=[]
train_imputed['is_converted']=targets
train_data_false=train_imputed[train_imputed['is_converted']==0]
train_data_true=train_imputed[train_imputed['is_converted']==1]

K=11
dkf=KFold(n_splits=K,shuffle=True,random_state=SEED)
for i,(_,index) in enumerate(dkf.split(train_data_false)):
    print(f'-{i+1} fold data-')
    data_false=train_data_false.iloc[index]

    full_data=pd.concat([data_false,train_data_true],ignore_index=True).sample(frac=1,random_state=SEED)
    X_data,y_data=full_data.drop(columns='is_converted'),full_data['is_converted']

    print(f'X data shape: {X_data.shape}')
    print(f'y data shape: {y_data.shape}')
    train_datas.append((X_data,y_data))

# The imputed test frame is used as-is
test_data=test_imputed
print(f'test data shape: {test_data.shape}')

-1 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-2 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-3 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-4 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-5 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-6 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-7 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-8 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-9 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-10 fold data-
X data shape: (9800, 22)
y data shape: (9800,)
-11 fold data-
X data shape: (9799, 22)
y data shape: (9799,)
test data shape: (5271, 22)


In [23]:
# Cast feature dtypes for modelling: the two id-like columns become strings,
# the two flag columns become ints, and the labels become 0/1 integers.
for i,(train_data,target) in enumerate(train_datas):
    train_data['customer_idx']=train_data['customer_idx'].astype(str)
    train_data['lead_owner']=train_data['lead_owner'].astype(str)
    train_data['strategic_ver']=train_data['strategic_ver'].astype(int)
    train_data['grant_weight']=train_data['grant_weight'].astype(int)
    # Store the converted labels back into the list: assigning to the loop
    # variable alone only rebinds the local name and the result is lost.
    train_datas[i]=(train_data,target.apply(lambda x:1 if x else 0))

test_data['customer_idx']=test_data['customer_idx'].astype(str)
test_data['lead_owner']=test_data['lead_owner'].astype(str)
test_data['strategic_ver']=test_data['strategic_ver'].astype(int)
test_data['grant_weight']=test_data['grant_weight'].astype(int)

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics         import mean_squared_error

In [25]:
# Model performance report
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1; return F1.

    y_test: true labels. y_pred: predicted labels.
    The confusion matrix rows/columns are ordered [True, False].
    """
    label_order = [True, False]
    confusion = confusion_matrix(y_test, y_pred, labels=label_order)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=label_order)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=label_order)

    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {F1:.4f}")
    return F1

In [26]:
class KMODEL:
    """Ensemble of CatBoost regressors: K-fold models for each of several datasets.

    ``modeling_kdata`` trains ``train_K`` stratified folds on every
    ``(features, target)`` pair it receives. Each fold stores its fitted model,
    the decision threshold that maximised F1 on its validation split, and that
    F1 score. ``predict`` combines all stored models by majority vote.
    """

    def __init__(self,dataset_K,train_K=5,random_state=SEED):
        """Create empty per-dataset slots.

        Args:
            dataset_K: number of training datasets the ensemble has room for.
            train_K: number of stratified folds trained per dataset.
            random_state: seed passed to the fold splitter and to CatBoost.
        """
        self.k_data=dataset_K
        self.k_fold=train_K
        # One inner list per dataset; each collects one entry per fold.
        self.models=[[] for _ in range(self.k_data)]
        self.scores=[[] for _ in range(self.k_data)]
        self.thresholds=[[] for _ in range(self.k_data)]
        # Mean validation F1 per dataset, in training order.
        self.cv_scores=[]
        # Placeholder until modeling_kdata replaces it with the mean fold
        # threshold. predict() does not read it; it is kept for callers that
        # average raw predictions and cut them at a single threshold.
        self.final_threshold=0.45
        self.seed=random_state

    @staticmethod
    def _best_f1_threshold(y_true,scores,n_grid=100):
        """Return the first grid point in [0, 1] with the highest F1, or 0 if F1 is never positive."""
        best_score=0
        best_threshold=0
        for threshold in np.linspace(0,1,n_grid):
            score=f1_score(y_true,scores>threshold)
            # Strict comparison keeps the lowest threshold among ties.
            if best_score<score:
                best_score=score
                best_threshold=threshold
        return best_threshold

    def modeling_kfold(self,iters,n_estimators,max_depth,learning_rate,cat_features,train_data,targets_data,noise,core):
        """Train and evaluate one regressor per stratified fold of a single dataset.

        Args:
            iters: zero-based dataset index; selects the slot in ``models``,
                ``scores`` and ``thresholds`` that receives the fold results.
            n_estimators, max_depth, learning_rate: CatBoost hyper-parameters.
            cat_features: names of the categorical feature columns.
            train_data: feature frame of this dataset.
            targets_data: target Series of this dataset (also used to stratify).
            noise: per-row values added to the targets before training; must
                have one entry per row of ``targets_data``.
            core: CatBoost ``task_type``.

        Side effects:
            Appends to ``self.models[iters]``, ``self.scores[iters]``,
            ``self.thresholds[iters]`` and ``self.cv_scores``, and prints the
            evaluation report of every fold.
        """
        kf=StratifiedKFold(n_splits=self.k_fold,shuffle=True,random_state=self.seed)

        # The noised, clipped targets do not depend on the fold, so build them once.
        targets_noised=(targets_data+noise).apply(self.value_scale)

        for fold,(train_index,val_index) in enumerate(kf.split(train_data,targets_data)):
            print(f'-[{iters+1}-{fold+1}] fold-')

            X_train,X_val=train_data.iloc[train_index],train_data.iloc[val_index]
            y_train,y_val=targets_noised.iloc[train_index],targets_noised.iloc[val_index]

            regressor=CatBoostRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, eval_metric='RMSE',random_state=self.seed, bootstrap_type ='Bernoulli',task_type=core)
            model=regressor.fit(X_train, y_train, eval_set=(X_val,y_val),verbose=100, early_stopping_rounds=100,cat_features=cat_features,use_best_model=True)

            pred=model.predict(X_val)
            # Turn the noised validation targets back into class labels.
            # Assumes the noise is small enough that 0.5 still separates the
            # classes (modeling_kdata draws it from [-0.1, 0.1]).
            y_val_labels=y_val>0.5

            best_threshold=self._best_f1_threshold(y_val_labels,pred)

            self.scores[iters].append(get_clf_eval(y_val_labels,pred>best_threshold))
            self.thresholds[iters].append(best_threshold)
            self.models[iters].append(model)

        dataset_score=np.mean(self.scores[iters])
        self.cv_scores.append(dataset_score)
        print(f'[{iters+1}] F1 scores mean: {dataset_score}')

    def modeling_kdata(self,n_estimators,max_depth,learning_rate,cat_features,train_datas,core='CPU'):
        """Train the K-fold models for every dataset in ``train_datas``.

        Args:
            n_estimators, max_depth, learning_rate: CatBoost hyper-parameters.
            cat_features: names of the categorical feature columns.
            train_datas: sequence of ``(features, target)`` pairs.
            core: CatBoost ``task_type`` (default ``'CPU'``).

        Raises:
            ValueError: if more datasets are given than ``dataset_K`` slots exist.
        """
        if len(train_datas)>self.k_data:
            raise ValueError(
                f'received {len(train_datas)} datasets but KMODEL was created with dataset_K={self.k_data}'
            )

        # Draw all noise values up front (one per training row across every
        # dataset) from the global ``random`` generator, then hand each
        # dataset its own consecutive slice.
        total_rows=sum(target.shape[0] for _,target in train_datas)
        noises=[round(random.uniform(-0.1,0.1),5) for _ in range(total_rows)]

        offset=0
        for dataset_idx,(train_data,target) in enumerate(train_datas):
            n_rows=target.shape[0]
            noise=noises[offset:offset+n_rows]
            self.modeling_kfold(dataset_idx,n_estimators,max_depth,learning_rate,cat_features,train_data,target,noise,core=core)
            offset+=n_rows

        # Flatten before averaging so unused (empty) dataset slots are harmless.
        fold_thresholds=[t for per_dataset in self.thresholds for t in per_dataset]
        if fold_thresholds:
            self.final_threshold=np.mean(fold_thresholds)
        print(f'-----Total F1 scores mean: {np.mean(self.cv_scores)}-----')

    def predict(self,test_data):
        """Predict 0/1 labels for ``test_data`` by majority vote of all stored models.

        Every model votes 1 where its raw prediction exceeds that model's own
        threshold. A row is labelled 1 when more than half of the models vote 1.

        Args:
            test_data: feature frame with the columns used for training.

        Returns:
            Series of 0/1 integers indexed like ``test_data``.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        votes=pd.Series(0,index=test_data.index,dtype='int64')
        n_models=0

        for models,thresholds in zip(self.models,self.thresholds):
            for model,threshold in zip(models,thresholds):
                votes+=model.predict(test_data)>threshold
                n_models+=1

        if n_models==0:
            raise RuntimeError('KMODEL.predict called before any model was trained')

        # Normalise by the number of models that actually voted rather than by
        # the configured k_data*k_fold, which differs when slots are unused.
        return votes.div(n_models).gt(0.5).astype('int64')

    def value_scale(self,x):
        """Clamp ``x`` to the interval [0, 1]."""
        if x<0:
            return 0
        if x>1:
            return 1
        return x

In [27]:
# Every column of the first training frame is treated as categorical except the
# numeric ones listed here; ``list.remove`` raises if one of them is missing.
cat_features=list(train_datas[0][0].columns)
for numeric_column in (
    'bant_submit',
    'lead_desc_length',
    'historical_existing_cnt',
    'com_reg_ver_win_rate',
    'ver_win_rate_x',
    'ver_win_ratio_per_bu',
):
    cat_features.remove(numeric_column)
cat_features

['customer_country',
 'business_unit',
 'customer_idx',
 'customer_type',
 'enterprise',
 'customer_job',
 'inquiry_type',
 'product_category',
 'customer_position',
 'response_corporate',
 'expected_timeline',
 'business_area',
 'lead_owner',
 'strategic_ver',
 'region',
 'grant_weight']

In [28]:
# Size the ensemble from the data instead of a hard-coded count, so the number
# of per-dataset slots always matches the datasets that are actually trained.
kmodel=KMODEL(dataset_K=len(train_datas))
kmodel.modeling_kdata(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.05,
    cat_features=cat_features,
    train_datas=train_datas,
)


-[1-1] fold-
0:	learn: 0.4640228	test: 0.4644977	best: 0.4644977 (0)	total: 82.5ms	remaining: 1m 22s
100:	learn: 0.2292240	test: 0.2307925	best: 0.2307925 (100)	total: 2.1s	remaining: 18.7s
200:	learn: 0.2128958	test: 0.2266238	best: 0.2266197 (199)	total: 4.05s	remaining: 16.1s
300:	learn: 0.1986051	test: 0.2233828	best: 0.2233828 (300)	total: 5.92s	remaining: 13.8s
400:	learn: 0.1861259	test: 0.2213295	best: 0.2212836 (395)	total: 7.8s	remaining: 11.7s
500:	learn: 0.1754618	test: 0.2204764	best: 0.2204733 (499)	total: 9.69s	remaining: 9.65s
600:	learn: 0.1656358	test: 0.2195620	best: 0.2195620 (600)	total: 11.6s	remaining: 7.68s
700:	learn: 0.1560251	test: 0.2185678	best: 0.2185678 (700)	total: 13.5s	remaining: 5.75s
800:	learn: 0.1482133	test: 0.2178675	best: 0.2178329 (795)	total: 15.4s	remaining: 3.81s
900:	learn: 0.1413401	test: 0.2175148	best: 0.2174643 (887)	total: 17.3s	remaining: 1.9s
999:	learn: 0.1346592	test: 0.2169217	best: 0.2168873 (989)	total: 19.1s	remaining: 0us

bes

200:	learn: 0.2142893	test: 0.2331248	best: 0.2331248 (200)	total: 3.6s	remaining: 14.3s
300:	learn: 0.1998247	test: 0.2300493	best: 0.2300493 (300)	total: 5.54s	remaining: 12.9s
400:	learn: 0.1867791	test: 0.2281456	best: 0.2280872 (388)	total: 7.47s	remaining: 11.2s
500:	learn: 0.1760375	test: 0.2270518	best: 0.2270518 (500)	total: 9.42s	remaining: 9.38s
600:	learn: 0.1660792	test: 0.2263367	best: 0.2262392 (588)	total: 11.3s	remaining: 7.52s
700:	learn: 0.1565511	test: 0.2256266	best: 0.2255745 (693)	total: 13.3s	remaining: 5.67s
800:	learn: 0.1491229	test: 0.2247891	best: 0.2247291 (793)	total: 15.2s	remaining: 3.78s
900:	learn: 0.1409337	test: 0.2242988	best: 0.2242101 (872)	total: 17.2s	remaining: 1.89s
999:	learn: 0.1341778	test: 0.2241071	best: 0.2240291 (968)	total: 19.1s	remaining: 0us

bestTest = 0.2240290716
bestIteration = 968

Shrink model to first 969 iterations.
오차행렬:
 [[907  63]
 [ 73 917]]

정확도: 0.9306
정밀도: 0.9255
재현율: 0.9351
F1: 0.9303
-[2-4] fold-
0:	learn: 0.465467

400:	learn: 0.1859271	test: 0.2129294	best: 0.2129294 (400)	total: 7.48s	remaining: 11.2s
500:	learn: 0.1753291	test: 0.2119081	best: 0.2118696 (491)	total: 9.42s	remaining: 9.39s
600:	learn: 0.1642861	test: 0.2108281	best: 0.2107970 (599)	total: 11.4s	remaining: 7.55s
700:	learn: 0.1544572	test: 0.2100572	best: 0.2100572 (700)	total: 13.3s	remaining: 5.69s
800:	learn: 0.1457014	test: 0.2094580	best: 0.2093686 (782)	total: 15.3s	remaining: 3.81s
900:	learn: 0.1382369	test: 0.2094193	best: 0.2093196 (828)	total: 17.3s	remaining: 1.9s
999:	learn: 0.1309317	test: 0.2091498	best: 0.2091498 (999)	total: 19.2s	remaining: 0us

bestTest = 0.2091498399
bestIteration = 999

오차행렬:
 [[909  61]
 [ 64 926]]

정확도: 0.9362
정밀도: 0.9342
재현율: 0.9371
F1: 0.9357
[3] F1 scores mean: 0.9355701263810137
-[4-1] fold-
0:	learn: 0.4640764	test: 0.4637821	best: 0.4637821 (0)	total: 25.5ms	remaining: 25.5s
100:	learn: 0.2290502	test: 0.2279903	best: 0.2279903 (100)	total: 1.85s	remaining: 16.5s
200:	learn: 0.212189

오차행렬:
 [[903  67]
 [ 77 913]]

정확도: 0.9265
정밀도: 0.9214
재현율: 0.9309
F1: 0.9262
-[5-3] fold-
0:	learn: 0.4651176	test: 0.4638939	best: 0.4638939 (0)	total: 27.7ms	remaining: 27.7s
100:	learn: 0.2285580	test: 0.2274539	best: 0.2274539 (100)	total: 1.68s	remaining: 14.9s
200:	learn: 0.2140877	test: 0.2236875	best: 0.2236875 (200)	total: 3.43s	remaining: 13.6s
300:	learn: 0.2020509	test: 0.2216776	best: 0.2216776 (300)	total: 5.29s	remaining: 12.3s
400:	learn: 0.1906933	test: 0.2202437	best: 0.2201876 (399)	total: 7.18s	remaining: 10.7s
500:	learn: 0.1806754	test: 0.2195275	best: 0.2194245 (482)	total: 9.1s	remaining: 9.06s
600:	learn: 0.1699898	test: 0.2185882	best: 0.2185578 (599)	total: 11s	remaining: 7.31s
700:	learn: 0.1610882	test: 0.2179591	best: 0.2179323 (697)	total: 12.9s	remaining: 5.5s
800:	learn: 0.1533132	test: 0.2179547	best: 0.2178731 (735)	total: 14.8s	remaining: 3.68s
900:	learn: 0.1457062	test: 0.2177284	best: 0.2176518 (866)	total: 16.7s	remaining: 1.83s
999:	learn: 0.13

100:	learn: 0.2308709	test: 0.2256748	best: 0.2256748 (100)	total: 1.68s	remaining: 14.9s
200:	learn: 0.2142341	test: 0.2206528	best: 0.2206528 (200)	total: 3.5s	remaining: 13.9s
300:	learn: 0.1985890	test: 0.2171310	best: 0.2171010 (298)	total: 5.38s	remaining: 12.5s
400:	learn: 0.1858176	test: 0.2152274	best: 0.2152274 (400)	total: 7.27s	remaining: 10.9s
500:	learn: 0.1751561	test: 0.2142285	best: 0.2141985 (496)	total: 9.19s	remaining: 9.15s
600:	learn: 0.1658422	test: 0.2131553	best: 0.2131553 (600)	total: 11.1s	remaining: 7.37s
700:	learn: 0.1566716	test: 0.2125819	best: 0.2125284 (676)	total: 13s	remaining: 5.55s
800:	learn: 0.1484740	test: 0.2114824	best: 0.2114824 (800)	total: 14.9s	remaining: 3.71s
900:	learn: 0.1405295	test: 0.2110564	best: 0.2110549 (869)	total: 16.9s	remaining: 1.85s
999:	learn: 0.1332842	test: 0.2108696	best: 0.2107210 (969)	total: 18.7s	remaining: 0us

bestTest = 0.210721018
bestIteration = 969

Shrink model to first 970 iterations.
오차행렬:
 [[932  38]
 [ 9

800:	learn: 0.1448820	test: 0.2170795	best: 0.2170012 (753)	total: 14.8s	remaining: 3.69s
900:	learn: 0.1371345	test: 0.2170265	best: 0.2167653 (848)	total: 16.7s	remaining: 1.84s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.216765329
bestIteration = 848

Shrink model to first 849 iterations.
오차행렬:
 [[920  50]
 [ 76 914]]

정확도: 0.9357
정밀도: 0.9237
재현율: 0.9485
F1: 0.9359
-[8-3] fold-
0:	learn: 0.4646333	test: 0.4623413	best: 0.4623413 (0)	total: 28.3ms	remaining: 28.3s
100:	learn: 0.2261337	test: 0.2265455	best: 0.2265455 (100)	total: 1.78s	remaining: 15.9s
200:	learn: 0.2121392	test: 0.2232499	best: 0.2232499 (200)	total: 3.55s	remaining: 14.1s
300:	learn: 0.2006291	test: 0.2206796	best: 0.2206796 (300)	total: 5.38s	remaining: 12.5s
400:	learn: 0.1892587	test: 0.2187119	best: 0.2187119 (400)	total: 7.25s	remaining: 10.8s
500:	learn: 0.1797502	test: 0.2170175	best: 0.2170074 (499)	total: 9.11s	remaining: 9.07s
600:	learn: 0.1693648	test: 0.2161215	best: 0.2161199 

999:	learn: 0.1340492	test: 0.2244132	best: 0.2244132 (999)	total: 18.7s	remaining: 0us

bestTest = 0.2244131955
bestIteration = 999

오차행렬:
 [[911  59]
 [ 85 905]]

정확도: 0.9265
정밀도: 0.9147
재현율: 0.9392
F1: 0.9268
-[9-5] fold-
0:	learn: 0.4644252	test: 0.4633818	best: 0.4633818 (0)	total: 27.2ms	remaining: 27.2s
100:	learn: 0.2356991	test: 0.2262958	best: 0.2262958 (100)	total: 1.72s	remaining: 15.3s
200:	learn: 0.2205618	test: 0.2235304	best: 0.2235304 (200)	total: 3.52s	remaining: 14s
300:	learn: 0.2048428	test: 0.2205430	best: 0.2205276 (295)	total: 5.37s	remaining: 12.5s
400:	learn: 0.1902138	test: 0.2188443	best: 0.2187260 (396)	total: 7.24s	remaining: 10.8s
500:	learn: 0.1792457	test: 0.2174502	best: 0.2174099 (498)	total: 9.12s	remaining: 9.08s
600:	learn: 0.1681469	test: 0.2168378	best: 0.2168378 (600)	total: 11s	remaining: 7.32s
700:	learn: 0.1591416	test: 0.2163354	best: 0.2162329 (675)	total: 12.9s	remaining: 5.52s
800:	learn: 0.1507462	test: 0.2162039	best: 0.2160651 (760)	to

오차행렬:
 [[903  67]
 [ 69 921]]

정확도: 0.9306
정밀도: 0.9290
재현율: 0.9309
F1: 0.9300
-[11-2] fold-
0:	learn: 0.4636808	test: 0.4623811	best: 0.4623811 (0)	total: 32.5ms	remaining: 32.4s
100:	learn: 0.2274788	test: 0.2222814	best: 0.2222814 (100)	total: 1.76s	remaining: 15.6s
200:	learn: 0.2118354	test: 0.2179871	best: 0.2179871 (200)	total: 3.58s	remaining: 14.2s
300:	learn: 0.1994114	test: 0.2156224	best: 0.2156224 (300)	total: 5.45s	remaining: 12.7s
400:	learn: 0.1880835	test: 0.2140717	best: 0.2140717 (400)	total: 7.36s	remaining: 11s
500:	learn: 0.1770430	test: 0.2131536	best: 0.2130622 (496)	total: 9.27s	remaining: 9.23s
600:	learn: 0.1671753	test: 0.2124753	best: 0.2124508 (598)	total: 11.2s	remaining: 7.41s
700:	learn: 0.1580558	test: 0.2116870	best: 0.2116088 (691)	total: 13.1s	remaining: 5.59s
800:	learn: 0.1502487	test: 0.2113113	best: 0.2113113 (800)	total: 15s	remaining: 3.73s
900:	learn: 0.1428612	test: 0.2108133	best: 0.2107717 (891)	total: 16.9s	remaining: 1.86s
999:	learn: 0.1

In [32]:
# Predict labels for the test frame with the trained ensemble, then display how
# many rows received each predicted value.
pred=kmodel.predict(test_data)
pred.value_counts()

False    3483
True     1788
Name: count, dtype: int64

In [33]:
# Store the predictions as 0/1 in the submission file.
pred=pred.apply(lambda x:1 if x else 0)
submission=pd.read_csv('submission.csv')

# Column assignment aligns on index labels, so a mismatch between the index of
# ``pred`` and the freshly read file would silently produce NaN rows. Check
# before the file is overwritten.
assert len(pred)==len(submission), (
    f'prediction count {len(pred)} does not match submission rows {len(submission)}'
)
submission['is_converted']=pred
assert submission['is_converted'].notna().all(), (
    'prediction index does not line up with submission.csv; some rows have no prediction'
)

submission.to_csv('submission.csv',index=False)
submission.head()

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.0,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.04984,retail,Electronics & Telco,278,1
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,1.3e-05,,transportation,Others,437,1
2,8491,1.0,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,6e-05,0.131148,hospital & health care,General Hospital,874,0
3,19895,0.5,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.04984,retail,,194,0
4,10465,1.0,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,1
