In [20]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [21]:
train = pd.read_csv('train.csv')
test = pd.read_csv('submission.csv')

# preprocess func

In [22]:
# drop
def drop_columns(df):
    df.drop(columns=['product_subcategory', 'product_modelname', 'customer_country.1', 'business_subarea'], inplace=True, axis=1)
    if 'id' in df.columns:
        df.drop(columns=['id'], inplace=True, axis=1)

    return df

In [23]:
# 결측값 0으로 처리 : 'ver_win_ratio_per_bu', 'com_reg_ver_win_rate', 'ver_win_rate_x' 결측값은 모델로 예측
def fill_missing_values_with_0(df):
    column = ['it_strategic_ver', 'id_strategic_ver', 'idit_strategic_ver', 'historical_existing_cnt']
    df[column] = df[column].fillna(0)
    
    return df

In [24]:
# customer_country
import googlemaps
gmaps = googlemaps.Client(key='AIzaSyAVUPrLICIAfdLfYEJDlc84qgzFX8noGWg')

def preprocess_country(df):
    primary_countries = [
       'Philippines', 'India', 'Nigeria', 'Saudi Arabia', 'Singapore', 'Brazil', 'South Africa', 'United States', 'Colombia',
       'Mexico', 'Ghana', 'Egypt', 'Rwanda', 'Ethiopia', 'Australia', 'Kenya', 'Indonesia', 'Oman', 'Pakistan', 'United Kingdom',
       'Guatemala', 'Panama', 'Canada', 'Bangladesh', 'Guinea', 'United Republic of Tanzania', 'Qatar', 'Afghanistan', 'Chile',
       'Mozambique', 'Türkiye', 'El Salvador', 'Togo', 'Jordan', 'Iraq', 'Israel', 'Sri Lanka', 'South Korea', 'Portugal', 'Mauritania',
       'Uruguay', 'Peru', 'Germany', 'Romania', 'Norway', 'Jamaica', 'Hungary', 'Poland', 'Spain', 'Argentina', 'Ecuador',
       'Senegal', 'Hong Kong', 'Malaysia', 'Japan', 'Kuwait', 'Ireland', 'Albania', 'Greece', 'Algeria', 'Nicaragua', 'Slovenia', 'Italy',
       'Netherlands', 'Dominican Republic', 'France', 'Uganda', 'Iran', 'Paraguay', 'Bolivia', 'Namibia', 'Tunisia', 'Puerto Rico',
       'Anguilla', 'Croatia', 'Fiji', 'Denmark', 'Sweden', 'Cyprus', 'Belgium', 'Venezuela', 'Maldives', 'Morocco', 'Switzerland',
       'Honduras', 'Austria', 'Russia', 'Burkina Faso', 'Thailand', 'Bahamas', "Côte d'Ivoire", 'Saint Lucia',
       'Democratic Republic of the Congo', 'Cambodia', 'Zimbabwe', 'Vietnam', 'Barbados', 'Suriname', 'Costa Rica', 'Botswana',
       'Curaçao', 'Guyana', 'Mali', 'China', 'Latvia', 'Libya', 'Central African Republic', 'Turks and Caicos Islands',
       'Azerbaijan', 'Yemen', 'Antigua', 'Lebanon', 'Angola', 'Bulgaria', 'Mongolia', 'Armenia', 'Trinidad and Tobago', 'Northern Mariana Islands', 
       'Nepal', 'Luxembourg', 'Somalia', 'Bahrain', 'Georgia', 'Mauritius', 'Uzbekistan', 'Taiwan', 'Iceland', 'Czechia', 'Monaco', 'Brunei', 'Malta',
       'Saint Kitts and Nevis', 'Myanmar', 'Sierra Leone', 'Sudan', 'Cameroon', 'Syria', 'The Gambia', 'Gabon', 'Montenegro', 'Laos',
       'Lithuania', 'Zambia', 'Estonia', 'Serbia', 'Benin', 'Macedonia', 'Bosnia and Herzegovina', 'Bermuda', 'Lesotho',
       'New Zealand', 'Ukraine', 'Republic of the Congo',  'Kazakhstan', 'Belarus', 'Palestine',  'Cayman Islands', 'Eswatini', 'Finland',  'Kosovo',
       'Djibouti', 'Belize', 'Saint Martin', 'U.S. Virgin Islands', 'United Arab Emirates', 'Aruba', 'Cuba', 'Haiti', 'Isle of Man', 'Slovakia'
    ]
    
    df['customer_country'] = df['customer_country'].replace('//', np.nan)
    
    for country in primary_countries:
        df.loc[train['customer_country'].str.contains(country, na=False), 'customer_country'] = country
    
    for index, loc in df.loc[df['customer_country'].isna() | ~df['customer_country'].isin(primary_countries), 'customer_country'].items():
        if pd.isna(loc):
            continue

        geocode_result = gmaps.geocode(loc)
        if geocode_result:
            for component in geocode_result[0]['address_components']:
                if 'country' in component['types']:
                    country_name = component['long_name']
                    train.at[index, 'customer_country'] = country_name
                    break
        else:
            continue
    
    df['customer_country'].fillna('Others', inplace=True)
    df.loc[~df['customer_country'].isin(primary_countries + ['Others']), 'customer_country'] = 'Others'

    return df

In [25]:
# customer_continent (파생변수)
def get_continent(country):
    customer_continent_mapping = {
        'Asia' : ['Philippines',  'Saudi Arabia', 'Singapore', 'United Arab Emirates', 
                  'Indonesia', 'Qatar','Israel', 'Sri Lanka', 'Malaysia', 'Kuwait', 
                  'Hong Kong', 'Uzbekistan', 'Brunei', 'Nepal', 'Maldives', 'Armenia', 'Myanmar', 'Cambodia', 
                  'Vietnam', 'Laos', 'Kazakhstan', 'Thailand', 'Syria'],
        'Africa' : ['Nigeria', 'South Africa', 'Ghana', 'Egypt', 'Rwanda', 'Ethiopia', 'Kenya', 'Guinea', 'Morocco',
                    'United Republic of Tanzania', 'Mozambique', 'Mauritania', 'Senegal', 'Algeria', 'Uganda', 'Mauritius',
                    'Namibia', 'Tunisia', 'Angola', 'Burkina Faso', "Côte d'Ivoire", 'Democratic Republic of the Congo', 
                    'Republic of the Congo', 'Zimbabwe', 'Botswana', 'Mali', 'Libya', 'Central African Republic', 'Somalia', 'Sierra Leone', 
                    'Sudan', 'Cameroon', 'The Gambia', 'Gabon', 'Zambia', 'Eswatini', 'Djibouti', 'Lesotho', 'Benin'],
        'Europe' : ['United Kingdom', 'Portugal', 'Germany', 'Romania', 'Norway', 'Hungary', 'Poland', 'Slovakia',
                    'Czechia', 'Spain', 'Ireland', 'Albania', 'Greece', 'Slovenia', 'Italy', 'Netherlands', 
                    'Croatia', 'Denmark', 'Sweden', 'Cyprus', 'Belgium', 'Switzerland', 'Austria', 'Russia', 
                    'Bulgaria', 'Luxembourg', 'Iceland', 'Monaco', 'Malta', 'Estonia', 'Serbia', 'France', 'Latvia',
                    'Macedonia', 'Bosnia and Herzegovina', 'Montenegro', 'Lithuania', 'Finland', 'Kosovo', 'Belarus', 'Ukraine'],
        'North America' : ['United States', 'Canada', 'Guatemala', 'Panama', 'Mexico', 'Colombia', 'Jamaica', 'Saint Martin',
                           'Puerto Rico', 'Anguilla', 'Dominican Republic', 'Bahamas', 'Barbados', 'Costa Rica', 'Aruba',
                           'Curaçao', 'Guyana', 'Northern Mariana Islands', 'U.S. Virgin Islands', 'Cayman Islands', 
                           'Bermuda', 'Belize', 'Cuba', 'Haiti', 'Isle of Man', 'Nicaragua', 'Honduras', 'Saint Lucia',
                           'Turks and Caicos Islands', 'Antigua', 'Saint Kitts and Nevis', 'Trinidad and Tobago'],
        'South America' : ['Brazil', 'Chile', 'El Salvador', 'Togo', 'Uruguay', 'Peru', 'Argentina', 'Ecuador', 
                           'Paraguay', 'Bolivia', 'Venezuela', 'Suriname'],
        'Oceania' : ['Australia', 'Fiji', 'New Zealand'],
        'Others' : ['Others'],
    }
    
    for continent, countries in customer_continent_mapping.items():
        if country in countries:
            return continent
    return 'Others'

In [26]:
# customer_type
def preprocess_customer_type(customer_type):
    customer_type_mapping = {
        'End-User' : ['End-user'],
        'End-Customer' : ['End-Customer', 'End Customer'],
        'Specifier/Influencer' : ['Specifier/Influencer', 'Specifier / Influencer', 'Specifier/ Influencer'],
        'Others' : ['Other', 'Others', 'Etc.', np.nan],
        'Software/Solution Provider' : ['Software/Solution Provider', 'Software / Solution Provider'],
        'Home Owner' : ['Homeowner', 'Home Owner'],
        'Manager/Director' : ['Manager / Director']
    }
    
    for category, jobs in customer_type_mapping.items():
        if customer_type in jobs:
            return category
    return customer_type

In [27]:
# customer_job
def get_customer_job(customer_job):
    customer_job_mapping = {
        'purchase' : ['purchasing', 'purchase', 'purchasing manager', 'purchaser', 'purchasing agent', 'drop, purchase maxhub', 'purchasing authority', 'purchasers', 'purchase dept', 'purchsing', 'requirements and buyer', 'buyer'],
        'director/purchase' : ['director purchaser', 'purchasing director', 'director purchaser', 'purchasing supervisor'],
        'coordinator/purchase' : ['purchasing coordinator', 'buyer, coordinating'],
        'install/purchase' : ['purchase and install', 'installation and purchaser'],
        'design/purchase' : ['designer purchaser', 'design/purchaser'],
        'install/designer' : ['design and install', 'designer/installer'],
        'media/communication' : ['media and communication', 'media and communications', 'broadcasting & media', 'media_e_comunicazione', 'média_és_kommunikáció', 'media_and_communication', 'medien_und_kommunikation', 'medios_de_comunicación'],
        'engineering' : ['engineering', 'engineer', 'engineering & technical', 'project engineer'],
        'director/engineering' : ['engineering director', 'director of engineering', 'chief of engineering', 'lead engineer', 'engineering & technical executive', 'chief engineer', 'principal engineer'],
        'system/engineering' : ['systems engineer', 'system engineer', 'systems administrator', 'systems design'],
        'design/engineering' : ['designer/ engineer', 'design engineer'],
        'consulting' : ['consulting', 'consultant', 'consultent', 'content creation, eq consultant'],
        'project_manager' : ['program and project management', 'project manager', 'project coordinator', 'project lead', 'project facilitator', 'producer/project manager', 'project director', 'gestión_de_proyectos', 
                             'project head', 'programm-_und_projektmanagement', 'program_and_project_management', 'program_and_project_manager', 'projectr mgmt', 'owner / project manager', 'project manage', 
                             'project sales/manage', 'project administrator', 'programm- und projektmanagement', 'projektmenedzsment\tprogram and project management', 'digital project manager', 'program-_és_projektmenedzsment', 
                             'projection manager'],
        'designer/project_manager' : ['designer/ project manager', 'project manager/designer'],
        'project_architect' : ['project architect', 'project designer'],
        'member' : ['project team member', 'mindenes'],
        'sales' : ['sales', 'sales manager', 'sales executive', 'salesman', 'technical sales', 'sale', 'sales rep', 'sales operations', 'field / outside sales', 'vendite', 'vertrieb', 'értékesítés'],
        'operation' : ['operations', 'strategy & operations specialist', 'facilities and operations', 'regional director of operations', 'operations executive', 'operaciones', 'üzemeltetés'],
        'director/operation' : ['operations manager', 'director of operations'],
        'administrative' : ['administrative', 'admin', 'administration', 'authorize (you are responsible for making the final decision)', 'adminisztráció', 'amministrativo', 'administración'],
        'administrative assistant' : ['administrative assistant', 'admin assistant'],
        'it' : ['information technology', 'it integrator', 'it department', 'it - information technology', 'computing & it', 'it/software', 'it',  'it tech.', 'it support', 'information technology\u200b', 'information_technology'],
        'director/it' : ['director it', 'it director', 'it specialist', 'it manager', 'director,it', 'director of it', "i'm directing it", 'it dairector', 'it project lead', 'it admin', 'it administrator', 
                         'deputy cio', 'it project lead'],
        'account manager' : ['account management', 'account exec/manager'],
        'education' : ['education', 'educator', 'higher education (college & university)', 'teacher', 'teaching', 'institute & academy'],
        'hr' : ['human resources', 'human_resources', 'hr posting', 'hr'],
        'finance' : ['finance', 'finanzen', 'finanzas', 'pénzügy'],
        'finance manager' : ['director of finance', 'finance executive'],
        'marketing' : ['marketing', 'marketing coordinator', 'event marketing', 'field marketing', 'marketing operations', 'marketing executive', 'technical marketing', 'product marketing'],
        'si' : ['si', 'system installer', 'installer/ system integrater'],
        'general manger' : ['general manager', 'gm', 'general manager - project manager', 'general manager (decision maker)', 'general management', 'genel müdür', 'genera manager'],
        'manager' : ['managgere', 'ordering manager', 'comanager', 'managing director', 'management', 'manger', 'managing contractor', 'managing partner', 'ops mgr'],
        'contractor' : ['general contractor', 'sub contractor', 'federal government contractor', 'contractor', 'electrical contractor', 'cintractor', 'managing contractor'],
        'owner' : ['owning company', 'owner', 'gm/part owner', 'product owner', 'business owner', 'owner representation'],
        'military and protective services' : ['military and protective services', 'military_and_protective_services'],
        'artist' : ['artist, lead on equipment selection','3d/vfx art'],
        'art/design' : ['arts and design', 'arts_and_design', 'art and design', 'arte_e_design', 'arte y diseño', 'művészet_és_design'],
        'medical imaging' : ['medical imaging specialist', 'spécialiste_en_imagerie_médicale', 'medical imaging  specialist', 'radiology professional', 'radiology  professional', 
                             'radiology_professional', 'profesional de radiología'],
        'medical solution' : ['medical solution provider', 'medical solution  provider', 'medical solution provider\u200b', 'medical solution'],
        'doctor' : ['surgery professional', 'doctor', 'surgery professional\u200b', 'főorvos', 'profesional de cirugía', 'cirugano', 'chirurgien'],
        'property owner' : ['property owner' 'building owner', 'proprietário(a)', 'building owner'],
        'ceo' : ['ceo', 'ceo/founder', 'chief eng.''c-level executive'],
        'end-user' : ['end user', 'primary end-user', 'main end user of the product', 'user', 'cliente final'],
        'recommender' : ['recommend', 'recommendation', 'recommend (you recommend specific products or technologies for the solution)', 'recommender'],
        'purchase/planner' : ['planner/purchaser', 'purchase/planner'],
        'install/planner' : ['planning and installation', 'planning and installation', 'install/planner'],
        'technical' : ['technical', 'tech service', 'tech', 'maintenance technician'],
        'technical/director' : ['head of technology', 'technical director', 'directeur technique'],
        'technical/designer' : ['technology designer', 'designer, creative technologist'],
        'av' : ['av technician', 'av tech','costar av team'],
        'av manager' : [ 'av project manager', 'av estimator', 'a/v project manager'],
        'bidder' : ['public bidder', 'bidder'],
        'installer' : ['installer.', 'installer','facilitator installation services'],
        'design/install' : ['install/designer', 'design and installation company', 'install/designer'],
        'research/install' : ['research/install', 'research and instalaltion'],
        'advertising' : ['advertising and promotions team', 'advertising'],
        'reseller' : ['vendor / reseller', 'revendedor', 'reseller', 'reseller/integrator', 'var'],
        'community/social services' : ['community and social services', 'community_and_social_services'],
        'video wall' : ['wall mounted screen mirroring', 'video wall', 'part of video wall', 'component of video wall', 'videowall'],
        'tv' : ['need 1 tv 55" edge led 4k uhd', 'replacing tv', 'tv studio manager', 'change tv', 'need one tv', 'hotel tv', 'fixing tv', 'replacement tv', 'guestroom tv'],
        'cctv' : ['cctv monetoring', 'cctv view'],
        'display/signage' : ['signage subcontractor p/m', 'digital signage', 'signage manager', 'signage for an attraction', 'sliding pictures of beauty salon', 'using for window display', 'signage subcontractor p/m',
                             'display screen from control', 'display our products', 'display screen', 'display screen from control', 'restaurant display', 'display', 'sign company', 'informatics, touch capability'],
        'repair' : ['repair uhd 120 hz units'],
        'manufacturer' : ['manufacturer', 'manufacturing factory / plant'],
        'procurement' : ['procurement', 'procurement specialist', 'procurment'],
        'sourcing/procurement' : ['sourcing/procurement', 'sourcing / procurement'],
        'supervisor' : ['maintenance supervisor', 'supervisor', 'overseer'],
        'testing' : ['testing and troubleshooting', 'tester', 'inquiry-to-buy/contact-us test', 'test4'],
        'solution' : ['solution provider', 'solution advisor', 'software solution', 'solution engineer'],
        'r&d' : ['research and developement',  'research & development', 'r&d project manager'],
        'research' : ['research', 'product research', 'product research', 'research products and prices', 'product researcher', 'project researcher'],
        'architect' : ['solutions architect', 'architect ass interiores'],
        'interior designer' : ['interior designer', 'interior stylist'],
        'integrator' : ['specifier/integrator', 'integration', 'integrator', 'integrador', 'intergrator'],
        'quoter' : ['sourcing & quoting for end user', 'asking for quote for client', 'quotation curator', 'quote gathering/proposer to owner', 'distributor quotation', 'customer experience', 'quoting project'],
        'leader' : ['lead', 'team leader', 'leader', 'team lead'],
        'technical design' : ['technical designer', 'technical design'],
        'creation and design' : ['kreation und design', 'kreation_und_design'],
        'designer' : ['designer', 'designer, producer', 'designers', 'graphic design'],
        'helpdesk' : ['helpdesk specialist', 'helpdesk specialist', 'help desk / desktop services'],
        'energy' : ['energy', 'renewable energy'],
        'distributor' : ['distributor', 'distribuidor'],
        'theater' : ['community theater', 'home theater'],
        'vice president' : ['vp/gm', 'vice president', 'underboss'],
        'distributor' : ['distribuidor', 'distributor'],
        'decision maker' : ['decision maker', 'design/decision maker', 'decider'],
        'equipment' : ['equipment custodian', 'equipment and app provider', 'equipment selection'],
        'photographer' : ['photos', 'photographer'],
        'quality assurance' : ['quality assurance', 'quality_assurance'],
        'healthcare services' : ['healthcare services', 'healthcare_services', 'mental health', 'healthcare professionals', 'healthcare'],
        'conference' : ['conference room', 'conference room', 'conference table', 'for confrence', 'for presentations'],
        'electronics' : ['electronics & telco', 'electronics evaluator'],
        'facilitator' : ['facilitator', 'facility administrator', 'facilities', 'facilitator installation services'],
        'coordinator' : ['coordinator', 'service coordinator', 'parts coordinator'],
        'developer' : ['application development', 'software developer', 'developer'],
        'business development' : ['business development', 'business_development'],
        'serving' : ['serving', 'serving robot', 'serving food', 'assist in serving food', 'waiter'],
        'exhibition' : ['museum / gallery', 'exhibition / convention center'],
        'clinic' : ['clinical specialist', 'clinic'],
        'office' : ['office', 'corporate / office', 'office it'],
        'executive' : ['execution', 'engagement executive'],
        'veterinarian' : ['tierarzt'],
        'principal' : ['principal', 'principal in charge'],
        'events' : ['store promotions', 'tradeshow event'],
        'others' : ['others', 'other', '5% of hotel needs', 'otro', 'otros', 'n.a', 'digital display vs signage need', 'no respoxse on phone will try again', 'we are in iceland', 'no requirment', 
                    'requirement close', 'the person with the credit card', 'nothing', 'other stores', 'sho lyrics', 'sonstiges', 'altro', 'autres', 'egyéb', 'ranger 2', 'menu', np.nan]
    }
    
    for category, jobs in customer_job_mapping.items():
        if customer_job in jobs:
            return category
    return customer_job

In [28]:
# customer_position
def get_customer_position_category(customer_position):
    customer_position_mapping = {
        'entry level' : ['entry level', 'entrylevel'], 
        'none' : ['none', 'this is a consume display requirement for home purpose.', 'not applicable', 'no influence', 'other - please specify - cedia association'], 
        'teacher' : ['teacher', 'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)'],
        'math/physics teacher' : ['math and physics teacher', 'physics and mathematics teacher'],
        'professor' : ['professor', 'prof.', 'education professional'],
        'assistant professor' : ['asst prof.', 'assistant professor'],
        'associate professor' : ['associate professor', 'associate professor in electronics engg'],
        'ceo/founder' : ['ceo/founder', 'ceo/fundador'],
        'c-level executive' : ['c-level executive', 'c-levelexecutive'],
        'architecture/consult' : ['architecture/consult', 'architect/consultant'],
        'decision-maker' : ['decision maker', 'decision-maker'],
        'decision-influencer' : ['decision-influencer', 'decision influencer'],
        'partner' : ['partner', 'business partner'],
        'vice president' : ['vice president', 'vicepresident', 'vp'], 
        'consultant' : ['consultant', 'consulting'],
        'business development' : ['business development', 'business development'],
        'president' : ['president', 'the big boss', 'chairman'],
        'exhibition' : ['exhibitiontv', 'exhibition'],
        'technical' : ['technical', 'técnico'],
        'owner' : ['proprietário(a)'],
        'sales' : ['subsidiary sales (ise)', 'sales'],
        'other' : ['other', 'others', 'bulgaria'],
        'developer' : ['lider de desarrollo'],
        'employee' : ['employee', 'mindenes'],
        'administrative' : ['administrative', 'gerente', 'genel müdür'], 
        'hospital' : ['hospital', 'főorvos'],
        'veterinarian' : ['tierarzt']
    }
    
    for category, customer_positions in customer_position_mapping.items():
        if customer_position in customer_positions:
            return category
    return customer_position

In [29]:
# product_category
def get_product_category(product_category):
    product_category_mapping = {
        'sinage' : ['signage', 'tv', 'ur640', '43us660h0sd.awz', '32lq621cbsb.awz', '32lq621cbsb.awz'],
        'special display' : ['特別顯示屏'],
        'standard display' : ['標準顯示屏'],
        'hospital display' : ['醫院電視'],
        'hotel display' : ['酒店電視'],
        'high brightness' : ['互動式顯示屏', 'high brightness', '高亮度顯示屏'],
        'multi divisions' : ['פיצול מרובה'],
        'board' : ['idb', 'board'],
        'monitor' : ['monitor', '28mq780'],
        'software' : ['軟體'],
        'all-in-one' : ['aio', 'allinone', 'leadallin'],
        'digital retail' : ['retaildigital'],
        'air conditioner' : ['air condition', 'split', 'ac', 'מזגנים', 'تكييفات', 'điều hòa', 'standard'],
        'residential air conditioner' : ['rac', 'ar condicionado residencial', 'résidentiel', 'เครื่องปรับอากาศเผื่อที่อยู่อาศัย'],
        'air handling unit' : ['ahu'],
        'multi air conditioner' : ['multi'],
        'single air conditioner' : ['single package'],
        'cassete air conditioner' : ['teto ou cassete inverter'],
        'heat pump' : ['pompy ciepła'],
        'heater' : ['heating', 'heater', 'isıtma', 'calefacción', 'حلول التدفئة', 'חימום', 'aquecimento'],
        'refrigerator' : ['refrigerator', 'soğutucu'],
        'cooling' : ['réfrigérant', 'pendingin'],
        'air conditiner/cooling' : ['تكييف وتبريد', 'مبرد'],
        'others': ['other', 'otros', 'outros', 'אחר', 'ฯลฯ', 'آخر', 'lainnya', 'not specified', 'inne', 'autre', 'khác', 'etc', np.nan]
    }
    
    for category, product_categories in product_category_mapping.items():
        if product_category in product_categories:
            return category
    return product_category

In [30]:
# lead_desc_length
from scipy import stats

def get_lead_desc_length_transformed(df):
    df['lead_desc_length'], fitted_lambda = stats.boxcox(df['lead_desc_length'])
    return df

In [31]:
# inquiry_type
def get_inquiry_type(inquiry_type):
    inquiry_type_mapping = {
        'others' : ['Other', 'other', 'other_', 'Others', 'others', 'ETC.', 'not specified', 'Not specified', '(Select ID_Needs)', np.nan],
        'quotation or purchase consultation' : ['Quotation or Purchase Consultation', 'Quotation or purchase consultation', 'Quotation or Purchase consultation', 'quotation_or_purchase_consultation', 
                                                'Request for quotation or purchase', 'Purchase or Quotation'],
        'usage or technical consultation' : ['Usage or technical consultation', 'Usage or Technical Consultation', 'usage or technical consultation', 'usage_or_technical_consultation'],
        'event inquiry' : ['Event Inquiry', 'Evento_SdelEstero'],
        'technical consultation' : ['Technical Consultation', 'Request for technical consulting', 'technical_consultation'],
        'lg magnit micro led inquiry' : ['estoy buscando para ecuador este producto lg magnit micro led, para un cliente de 138 pulgadas, con envió marítimo.'],
        'interactive screens quotation' : ['hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en guayaquil -ecuador.'],
        'body temperature measurement device inquiry' : ['Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn'],
        'probeam pricing inquiry' : ['probeam precio', 'Probeam precio'],
        'interactive screens for clinics' : ['Pantallas Interactivas para Clinicas'],
        'one quick support' : ['solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución one quick:', 'One Quick:Flex', 
                               'Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución\xa0One Quick:\xa0'],
        'george v historical integrator' : ['intégrateur historique du george v'],
        'school inquiry' : ['for school'],
        'sales inquiry' : ['Sales Inquiry', 'Sales inquiry', 'sales'],
        'technical information and pricing inquiry' : ['toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'],
        'lg product pricing and solutions inquiry' : ['tôi cần tham khảo giá và giải pháp từ lg'],
        'medical monitor for conventional and tomography inquiry' : ['preciso de um monitor médico para radiografia convencional e tomogrtafia.'],
        'lg magnit micro led inquiry' : ['estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.'],
        'george v historical integrator' : ['Intégrateur historique du George V'],
        'technical information and pricing inquiry' : ['Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'],
        'lg product pricing and solutions inquiry' : ['tôi cần tham khảo giá và giải pháp từ LG'],
        'medical monitor for conventional and tomography inquiry' : ['Preciso de um monitor médico para radiografia convencional e tomogrtafia.'],
        'interactive screens quotation' : ['Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.']
    }
    for category, inquiry_types in inquiry_type_mapping.items():
        if inquiry_type in inquiry_types:
            return category
    return inquiry_type

In [32]:
# expected_timeline
def get_expected_timeline(df):
    mapping_dict = {
        'less_than_3_months': 'less than 3 months',
        '3_months_~_6_months': '3 months ~ 6 months',
        '9_months_~_1_year': '9 months ~ 1 year',
        '6_months_~_9_months': '9 months ~ 1 year',
        'more_than_a_year': 'more than a year',
        'less then 6 months': '3 months ~ 6 months',
        'less than 5 months': '3 months ~ 6 months',
        'more then 3 months': '3 months ~ 6 months',
        'less than 3 months. customer not answered . to call back': 'less than 3 months',
        'one month': 'less than 3 months',
        'duplicate lead - il220100042906. less than 3 months': 'less than 3 months',
        '9 months - 1 year': '9 months ~ 1 year',
        'less than 3 months ,meeting with the customer for the more details and tentative boq will ne 32 and 43': 'less than 3 months',
        'less than 3 months- outdoor led requiment': 'less than 3 months'
    }

    valid_values = ['less than 3 months', '3 months ~ 6 months', '6 months ~ 9 months', '9 months ~ 1 year', 'more than a year']

    df['expected_timeline'] = df['expected_timeline'].replace(mapping_dict)
    df['expected_timeline'] = df['expected_timeline'].apply(lambda x: x if x in valid_values else 'unknown')

    return df

In [33]:
# converted_rate 한번에 생성하는 함수
def get_converted_rate(columns, train, test):
    for col in columns:
        conversion_rates = {}
        for uni in train[f'{col}'].unique():
            conversions = train[(train[f'{col}'] == uni) & (train['is_converted'] == True)].shape[0]
            total = train[train[f'{col}'] == uni].shape[0]
            conversion_rates[uni] = conversions / total if total > 0 else 0

        train[f'{col}_converted_rate'] = train[f'{col}'].map(conversion_rates).fillna(0)
        test[f'{col}_converted_rate'] = test[f'{col}'].map(conversion_rates).fillna(0)
    
    return train, test

In [34]:
# business_area
def get_business_area(df):
    df['business_area'].fillna('others', inplace=True)
    return df

In [35]:
# business_area 가중치 부여 : 'corporate / office', 'retail', 'hotel & accommodation' (파생변수)
def get_ver_business_area(df):
    ver_business_area = ['corporate / office', 'retail', 'hotel & accommodation']
    df['ver_business_area'] = np.where(df['business_area'].isin(ver_business_area), 1, 0)
    return df

# preprocess run

In [36]:
def preprocess(train, test):
    train = drop_columns(train)
    test = drop_columns(test)
    print("== [ drop ] complete == ", flush=True)
    
    train = fill_missing_values_with_0(train)
    test = fill_missing_values_with_0(test)
    print("== [ fill values with 0 ] complete == ", flush=True)
    
    train = preprocess_country(train)
    test = preprocess_country(test)
    print("== [ preprocess_country ] complete == ", flush=True)
    
    train['customer_continent'] = train['customer_country'].apply(get_continent)
    test['customer_continent'] = test['customer_country'].apply(get_continent)
    print("== [ customer_continent ] complete == ", flush=True)
    
    train['customer_type'] = train['customer_type'].apply(preprocess_customer_type)
    test['customer_type'] = test['customer_type'].apply(preprocess_customer_type)
    print("== [ customer_type ] complete == ", flush=True)
    
    train['customer_job'] = train['customer_job'].apply(get_customer_job)
    test['customer_job'] = test['customer_job'].apply(get_customer_job)
    print("== [ customer_job ] complete == ", flush=True)
    
    train['customer_position'] = train['customer_position'].apply(get_customer_position_category)
    test['customer_position'] = test['customer_position'].apply(get_customer_position_category)
    print("== [ customer_position ] complete == ", flush=True)
    
    train['product_category'] = train['product_category'].apply(get_product_category)
    test['product_category'] = test['product_category'].apply(get_product_category)
    print("== [ product_category ] complete == ", flush=True)
    
    train = get_lead_desc_length_transformed(train)
    test = get_lead_desc_length_transformed(test)
    print("== [ lead_desc_length ] complete == ", flush=True)
    
    train['inquiry_type'] = train['inquiry_type'].apply(get_inquiry_type)
    test['inquiry_type'] = test['inquiry_type'].apply(get_inquiry_type)
    print("== [ inquiry_type ] complete == ", flush=True)
    
    train = get_expected_timeline(train)
    test = get_expected_timeline(test)
    print("== [ expected_timeline ] complete == ", flush=True)
    
    train = get_business_area(train)
    test = get_business_area(test)
    print("== [ business_area ] complete == ", flush=True)
    
    train = get_ver_business_area(train)
    test = get_ver_business_area(test)
    print("== [ ver_business_area ] complete == ", flush=True)
    
    # 전환율 한번에 구하기
    columns = ['customer_continent', 'customer_idx', 'customer_type', 'customer_position', 'business_unit', 'response_corporate', 'lead_owner', 'product_category', 'inquiry_type', 'business_area',
               'bant_submit', 'expected_timeline', 'enterprise'] # 'ver_win_rate_x', 'ver_win_ratio_per_bu'는 학습으로 결측 처리한 후에 전환율 구하도록.
    train, test = get_converted_rate(columns, train, test)
    print("== [ converted_rate ] complete == ", flush=True)
    
    return train, test

train, test = preprocess(train, test)

== [ drop ] complete == 
== [ fill values with 0 ] complete == 
== [ preprocess_country ] complete == 
== [ customer_continent ] complete == 
== [ customer_type ] complete == 
== [ customer_job ] complete == 
== [ customer_position ] complete == 
== [ product_category ] complete == 
== [ lead_desc_length ] complete == 
== [ inquiry_type ] complete == 
== [ expected_timeline ] complete == 
== [ business_area ] complete == 
== [ ver_business_area ] complete == 
== [ converted_rate ] complete == 


# 

In [37]:
train.to_csv('train_preprocessed.csv', index=False)
test.to_csv('test_preprocessed.csv', index=False)

# process finish

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
%matplotlib inline
import matplotlib.font_manager as fm
plt.rcParams['font.family'] = 'NanumGothic'

import random
import os
import sys
import joblib

import time
from tqdm import tqdm
import warnings                                              
warnings.filterwarnings('ignore')  

import sklearn
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from scipy.stats import skew
from scipy.stats import boxcox
from scipy.stats import zscore

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

In [2]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from xgboost import plot_importance

from lightgbm import LGBMClassifier
from lightgbm import plot_importance
import lightgbm as lgb

import catboost
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.metrics import confusion_matrix, accuracy_score  #분류- 성능지표
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.model_selection import train_test_split
from hyperopt import hp, fmin, tpe, Trials
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold

In [3]:
import numpy as np
import random
import os
def seed_everything(seed: int = 24):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything(24)

import pandas as pd
import sklearn
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split as tts
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [4]:
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

2024-02-19 06:41:06.465793: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-19 06:41:06.523892: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [6]:
train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)
test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)

In [7]:
# 유지하고자 하는 컬럼 리스트
#columns_to_keep = ['bant_submit', 'customer_country', 'business_unit', 'customer_idx',
#       'customer_type', 'enterprise', 'historical_existing_cnt',
#       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
#       'customer_job', 'lead_desc_length', 'inquiry_type', 'product_category',
#       'customer_position', 'response_corporate', 'expected_timeline',
#       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'business_area', 'lead_owner',
#       'is_converted']

# 주어진 컬럼만을 포함하는 새로운 DataFrame 생성
#train_normal = train[columns_to_keep]
#test_normal = test[columns_to_keep]

# 결과 확인
#train_normal

Unnamed: 0,bant_submit,customer_country,business_unit,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,...,product_category,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,business_area,lead_owner,is_converted
0,1.0,Philippines,AS,32160,End-Customer,Enterprise,0.0,0.0,0.0,0.0,...,multi-split,entry level,LGEPH,less than 3 months,1,0,0.003079,corporate / office,0,True
1,1.0,Philippines,AS,23122,End-Customer,Enterprise,12.0,0.0,0.0,0.0,...,multi-split,ceo/founder,LGEPH,less than 3 months,1,0,0.003079,corporate / office,1,True
2,1.0,India,AS,1755,End-Customer,Enterprise,144.0,0.0,0.0,0.0,...,single-split,partner,LGEIL,less than 3 months,1,0,0.003079,corporate / office,2,True
3,1.0,India,AS,4919,End-Customer,Enterprise,0.0,0.0,0.0,0.0,...,vrf,ceo/founder,LGEIL,less than 3 months,1,0,0.003079,corporate / office,3,True
4,1.0,India,AS,17126,Specifier/Influencer,Enterprise,0.0,0.0,0.0,0.0,...,multi-split,partner,LGEIL,less than 3 months,0,0,0.003079,corporate / office,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30832,0.0,Others,ID,9320,Others,Enterprise,0.0,1.0,0.0,1.0,...,laec15,none,LGESP,unknown,0,0,0.003079,corporate / office,167,False
30833,0.0,Others,ID,40255,Others,SMB,0.0,1.0,0.0,1.0,...,others,none,LGESP,unknown,0,0,0.003079,corporate / office,225,False
30834,0.0,Others,ID,30387,Others,SMB,0.0,1.0,0.0,1.0,...,others,none,LGESP,unknown,0,0,0.003079,corporate / office,167,False
30835,0.0,Others,ID,33600,Others,SMB,0.0,1.0,0.0,1.0,...,others,none,LGESP,unknown,0,0,0.003079,corporate / office,167,False


In [9]:
train_normal['bant_submit'] = train_normal['bant_submit'].astype('str')
train_normal['customer_idx'] = train_normal['customer_idx'].astype('str')
train_normal['id_strategic_ver'] = train_normal['id_strategic_ver'].astype('str')
train_normal['it_strategic_ver'] = train_normal['it_strategic_ver'].astype('str')
train_normal['idit_strategic_ver'] = train_normal['idit_strategic_ver'].astype('str')
train_normal['ver_cus'] = train_normal['ver_cus'].astype('str')
train_normal['ver_pro'] = train_normal['ver_pro'].astype('str')
train_normal['lead_owner'] = train_normal['lead_owner'].astype('str')

In [10]:
test_normal['bant_submit'] = test_normal['bant_submit'].astype('str')
test_normal['customer_idx'] = test_normal['customer_idx'].astype('str')
test_normal['id_strategic_ver'] = test_normal['id_strategic_ver'].astype('str')
test_normal['it_strategic_ver'] = test_normal['it_strategic_ver'].astype('str')
test_normal['idit_strategic_ver'] = test_normal['idit_strategic_ver'].astype('str')
test_normal['ver_cus'] = test_normal['ver_cus'].astype('str')
test_normal['ver_pro'] = test_normal['ver_pro'].astype('str')
test_normal['lead_owner'] = test_normal['lead_owner'].astype('str')

In [11]:
train_normal.columns

Index(['bant_submit', 'customer_country', 'business_unit', 'customer_idx',
       'customer_type', 'enterprise', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'customer_job', 'lead_desc_length', 'inquiry_type', 'product_category',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'business_area', 'lead_owner',
       'is_converted'],
      dtype='object')

In [12]:
#수치형 변수 정의
num_sub = ['com_reg_ver_win_rate', 'historical_existing_cnt','lead_desc_length','ver_win_rate_x','ver_win_ratio_per_bu','customer_continent_converted_rate','customer_idx_converted_rate','customer_type_converted_rate','customer_position_converted_rate','business_unit_converted_rate','response_corporate_converted_rate','lead_owner_converted_rate','product_category_converted_rate', 'inquiry_type_converted_rate','business_area_converted_rate','bant_submit_converted_rate','expected_timeline_converted_rate','enterprise_converted_rate']
#범주형 변수 정의
cat_sub = ['bant_submit', 'customer_country','business_unit','customer_idx','customer_type','enterprise','id_strategic_ver','it_strategic_ver','idit_strategic_ver','customer_job','inquiry_type','product_category','customer_position', 'response_corporate','expected_timeline','ver_cus','ver_pro','business_area','lead_owner','customer_continent','ver_business_area']

In [13]:
sparse_features = cat_sub
dense_features = num_sub

target = ['is_converted']

In [14]:
test_sparse_features = cat_sub
test_dense_features = num_sub

In [15]:
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
scaler = MinMaxScaler(feature_range=(0,1))
encoder = LabelEncoder()

In [16]:
for feat in sparse_features:
    train[feat] = encoder.fit_transform(train[feat])
    
train[dense_features] = scaler.fit_transform(train[dense_features])

In [17]:
for feat in test_sparse_features:
    test[feat] = encoder.fit_transform(test[feat])
test[dense_features] = scaler.fit_transform(test[test_dense_features])

In [18]:
fixlen_feature_columns = [SparseFeat(feat,train[feat].nunique()) 
                          for feat in sparse_features] + [DenseFeat(feat,1,)
                                                         for feat in dense_features]

In [19]:
fixlen_feature_columns

[SparseFeat(name='bant_submit', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='bant_submit', group_name='default_group'),
 SparseFeat(name='customer_country', vocabulary_size=165, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='customer_country', group_name='default_group'),
 SparseFeat(name='business_unit', vocabulary_size=4, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='business_unit', group_name='default_group'),
 SparseFeat(name='customer_idx', vocabulary_size=18514, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='customer_idx', group_name='default_group'),
 SparseFeat(name='customer_type', vocabulary_size=22, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='customer_type', group_name='default_group'),
 SparseFeat(name='enterprise', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='enterprise', group_name='default_group'),
 SparseFeat(name='id_strate

In [20]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [21]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['bant_submit',
 'customer_country',
 'business_unit',
 'customer_idx',
 'customer_type',
 'enterprise',
 'id_strategic_ver',
 'it_strategic_ver',
 'idit_strategic_ver',
 'customer_job',
 'inquiry_type',
 'product_category',
 'customer_position',
 'response_corporate',
 'expected_timeline',
 'ver_cus',
 'ver_pro',
 'business_area',
 'lead_owner',
 'historical_existing_cnt',
 'lead_desc_length',
 'ver_win_rate_x']

In [22]:
train_model_input = {name: train[name]for name in feature_names}

In [23]:
test_model_input = {name: test[name] for name in feature_names}

In [24]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128, 64),
           l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0.5,
           dnn_activation='relu', dnn_use_bn=False, task='binary')
model

DeepFM(
  (embedding_dict): ModuleDict(
    (bant_submit): Embedding(5, 4)
    (customer_country): Embedding(165, 4)
    (business_unit): Embedding(4, 4)
    (customer_idx): Embedding(18514, 4)
    (customer_type): Embedding(22, 4)
    (enterprise): Embedding(2, 4)
    (id_strategic_ver): Embedding(2, 4)
    (it_strategic_ver): Embedding(2, 4)
    (idit_strategic_ver): Embedding(2, 4)
    (customer_job): Embedding(194, 4)
    (inquiry_type): Embedding(32, 4)
    (product_category): Embedding(243, 4)
    (customer_position): Embedding(46, 4)
    (response_corporate): Embedding(51, 4)
    (expected_timeline): Embedding(6, 4)
    (ver_cus): Embedding(2, 4)
    (ver_pro): Embedding(2, 4)
    (business_area): Embedding(13, 4)
    (lead_owner): Embedding(862, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (bant_submit): Embedding(5, 1)
      (customer_country): Embedding(165, 1)
      (business_unit): Embedding(4, 1)
      (customer_idx): Embedding(18514, 1)
      (

In [26]:
model.compile(optimizer=torch.optim.Adam(model.parameters(),lr=1e-4, weight_decay=1e-2),
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [27]:
%%time
history = model.fit(train_model_input, train[target].values, shuffle=True, batch_size=512, epochs=100, verbose=1,
                        validation_split=0.2)

cpu
Train on 24669 samples, validate on 6168 samples, 49 steps per epoch


49it [00:01, 40.51it/s]


Epoch 1/100
1s - loss:  0.6645 - binary_crossentropy:  0.6641 - auc:  0.5389 - val_binary_crossentropy:  0.6229 - val_auc:  0.6312


49it [00:01, 38.03it/s]


Epoch 2/100
1s - loss:  0.5800 - binary_crossentropy:  0.5791 - auc:  0.8057 - val_binary_crossentropy:  0.5138 - val_auc:  0.5873


49it [00:00, 61.28it/s]


Epoch 3/100
0s - loss:  0.4501 - binary_crossentropy:  0.4483 - auc:  0.7181 - val_binary_crossentropy:  0.3385 - val_auc:  0.5461


49it [00:00, 69.10it/s]


Epoch 4/100
0s - loss:  0.3357 - binary_crossentropy:  0.3338 - auc:  0.7081 - val_binary_crossentropy:  0.2379 - val_auc:  0.5875


49it [00:00, 69.08it/s]


Epoch 5/100
0s - loss:  0.3150 - binary_crossentropy:  0.3167 - auc:  0.7851 - val_binary_crossentropy:  0.2298 - val_auc:  0.6592


49it [00:00, 68.67it/s]


Epoch 6/100
0s - loss:  0.3067 - binary_crossentropy:  0.3057 - auc:  0.8587 - val_binary_crossentropy:  0.2264 - val_auc:  0.7028


49it [00:00, 67.85it/s]


Epoch 7/100
0s - loss:  0.2967 - binary_crossentropy:  0.2976 - auc:  0.9134 - val_binary_crossentropy:  0.2221 - val_auc:  0.7346


49it [00:00, 67.37it/s]


Epoch 8/100
0s - loss:  0.2841 - binary_crossentropy:  0.2839 - auc:  0.9458 - val_binary_crossentropy:  0.2164 - val_auc:  0.7504


49it [00:00, 68.86it/s]


Epoch 9/100
0s - loss:  0.2678 - binary_crossentropy:  0.2681 - auc:  0.9618 - val_binary_crossentropy:  0.2091 - val_auc:  0.7598


49it [00:00, 65.71it/s]


Epoch 10/100
0s - loss:  0.2459 - binary_crossentropy:  0.2457 - auc:  0.9698 - val_binary_crossentropy:  0.2016 - val_auc:  0.7672


49it [00:01, 47.10it/s]


Epoch 11/100
1s - loss:  0.2169 - binary_crossentropy:  0.2176 - auc:  0.9749 - val_binary_crossentropy:  0.1908 - val_auc:  0.7765


49it [00:00, 68.30it/s]


Epoch 12/100
0s - loss:  0.1832 - binary_crossentropy:  0.1820 - auc:  0.9788 - val_binary_crossentropy:  0.1777 - val_auc:  0.7875


49it [00:00, 67.74it/s]


Epoch 13/100
0s - loss:  0.1511 - binary_crossentropy:  0.1497 - auc:  0.9825 - val_binary_crossentropy:  0.1683 - val_auc:  0.8023


49it [00:00, 67.38it/s]


Epoch 14/100
0s - loss:  0.1264 - binary_crossentropy:  0.1262 - auc:  0.9853 - val_binary_crossentropy:  0.1632 - val_auc:  0.8181


49it [00:00, 64.75it/s]


Epoch 15/100
0s - loss:  0.1081 - binary_crossentropy:  0.1075 - auc:  0.9888 - val_binary_crossentropy:  0.1593 - val_auc:  0.8299


49it [00:00, 67.52it/s]


Epoch 16/100
0s - loss:  0.0926 - binary_crossentropy:  0.0923 - auc:  0.9914 - val_binary_crossentropy:  0.1553 - val_auc:  0.8419


49it [00:00, 61.33it/s]


Epoch 17/100
0s - loss:  0.0795 - binary_crossentropy:  0.0787 - auc:  0.9932 - val_binary_crossentropy:  0.1514 - val_auc:  0.8517


49it [00:00, 68.74it/s]


Epoch 18/100
0s - loss:  0.0687 - binary_crossentropy:  0.0690 - auc:  0.9940 - val_binary_crossentropy:  0.1487 - val_auc:  0.8608


49it [00:00, 68.70it/s]


Epoch 19/100
0s - loss:  0.0597 - binary_crossentropy:  0.0592 - auc:  0.9957 - val_binary_crossentropy:  0.1462 - val_auc:  0.8686


49it [00:00, 53.07it/s]


Epoch 20/100
0s - loss:  0.0518 - binary_crossentropy:  0.0516 - auc:  0.9964 - val_binary_crossentropy:  0.1455 - val_auc:  0.8749


49it [00:00, 69.10it/s]


Epoch 21/100
0s - loss:  0.0454 - binary_crossentropy:  0.0455 - auc:  0.9972 - val_binary_crossentropy:  0.1425 - val_auc:  0.8801


49it [00:00, 69.07it/s]


Epoch 22/100
0s - loss:  0.0399 - binary_crossentropy:  0.0400 - auc:  0.9977 - val_binary_crossentropy:  0.1415 - val_auc:  0.8842


49it [00:00, 68.94it/s]


Epoch 23/100
0s - loss:  0.0352 - binary_crossentropy:  0.0351 - auc:  0.9979 - val_binary_crossentropy:  0.1412 - val_auc:  0.8870


49it [00:00, 65.81it/s]


Epoch 24/100
0s - loss:  0.0311 - binary_crossentropy:  0.0310 - auc:  0.9984 - val_binary_crossentropy:  0.1411 - val_auc:  0.8891


49it [00:00, 56.48it/s]


Epoch 25/100
0s - loss:  0.0276 - binary_crossentropy:  0.0276 - auc:  0.9985 - val_binary_crossentropy:  0.1411 - val_auc:  0.8910


49it [00:01, 47.96it/s]


Epoch 26/100
1s - loss:  0.0246 - binary_crossentropy:  0.0248 - auc:  0.9988 - val_binary_crossentropy:  0.1409 - val_auc:  0.8924


49it [00:01, 31.88it/s]


Epoch 27/100
1s - loss:  0.0219 - binary_crossentropy:  0.0225 - auc:  0.9990 - val_binary_crossentropy:  0.1415 - val_auc:  0.8938


49it [00:01, 32.47it/s]


Epoch 28/100
1s - loss:  0.0197 - binary_crossentropy:  0.0196 - auc:  0.9993 - val_binary_crossentropy:  0.1423 - val_auc:  0.8947


49it [00:02, 21.21it/s]


Epoch 29/100
2s - loss:  0.0177 - binary_crossentropy:  0.0175 - auc:  0.9994 - val_binary_crossentropy:  0.1442 - val_auc:  0.8948


49it [00:02, 19.97it/s]


Epoch 30/100
2s - loss:  0.0159 - binary_crossentropy:  0.0158 - auc:  0.9995 - val_binary_crossentropy:  0.1448 - val_auc:  0.8951


49it [00:02, 17.48it/s]


Epoch 31/100
3s - loss:  0.0144 - binary_crossentropy:  0.0144 - auc:  0.9996 - val_binary_crossentropy:  0.1469 - val_auc:  0.8953


49it [00:02, 17.84it/s]


Epoch 32/100
3s - loss:  0.0131 - binary_crossentropy:  0.0130 - auc:  0.9997 - val_binary_crossentropy:  0.1475 - val_auc:  0.8956


49it [00:02, 17.14it/s]


Epoch 33/100
3s - loss:  0.0121 - binary_crossentropy:  0.0124 - auc:  0.9997 - val_binary_crossentropy:  0.1493 - val_auc:  0.8956


49it [00:02, 16.86it/s]


Epoch 34/100
3s - loss:  0.0111 - binary_crossentropy:  0.0109 - auc:  0.9998 - val_binary_crossentropy:  0.1506 - val_auc:  0.8957


49it [00:02, 17.94it/s]


Epoch 35/100
3s - loss:  0.0101 - binary_crossentropy:  0.0103 - auc:  0.9998 - val_binary_crossentropy:  0.1530 - val_auc:  0.8954


49it [00:02, 18.11it/s]


Epoch 36/100
3s - loss:  0.0095 - binary_crossentropy:  0.0100 - auc:  0.9999 - val_binary_crossentropy:  0.1539 - val_auc:  0.8949


49it [00:02, 18.19it/s]


Epoch 37/100
3s - loss:  0.0088 - binary_crossentropy:  0.0087 - auc:  0.9999 - val_binary_crossentropy:  0.1549 - val_auc:  0.8949


49it [00:02, 16.69it/s]


Epoch 38/100
3s - loss:  0.0085 - binary_crossentropy:  0.0084 - auc:  0.9999 - val_binary_crossentropy:  0.1590 - val_auc:  0.8943


49it [00:02, 18.87it/s]


Epoch 39/100
2s - loss:  0.0079 - binary_crossentropy:  0.0078 - auc:  0.9999 - val_binary_crossentropy:  0.1593 - val_auc:  0.8941


49it [00:02, 20.17it/s]


Epoch 40/100
2s - loss:  0.0075 - binary_crossentropy:  0.0074 - auc:  0.9999 - val_binary_crossentropy:  0.1594 - val_auc:  0.8937


49it [00:02, 22.34it/s]


Epoch 41/100
2s - loss:  0.0072 - binary_crossentropy:  0.0071 - auc:  0.9999 - val_binary_crossentropy:  0.1612 - val_auc:  0.8937


49it [00:02, 23.09it/s]


Epoch 42/100
2s - loss:  0.0069 - binary_crossentropy:  0.0070 - auc:  0.9999 - val_binary_crossentropy:  0.1625 - val_auc:  0.8930


49it [00:01, 25.17it/s]


Epoch 43/100
2s - loss:  0.0067 - binary_crossentropy:  0.0066 - auc:  0.9999 - val_binary_crossentropy:  0.1640 - val_auc:  0.8929


49it [00:01, 30.60it/s]


Epoch 44/100
1s - loss:  0.0064 - binary_crossentropy:  0.0064 - auc:  0.9999 - val_binary_crossentropy:  0.1654 - val_auc:  0.8922


49it [00:01, 31.62it/s]


Epoch 45/100
1s - loss:  0.0063 - binary_crossentropy:  0.0062 - auc:  0.9999 - val_binary_crossentropy:  0.1674 - val_auc:  0.8921


49it [00:01, 30.21it/s]


Epoch 46/100
1s - loss:  0.0060 - binary_crossentropy:  0.0060 - auc:  0.9999 - val_binary_crossentropy:  0.1703 - val_auc:  0.8916


49it [00:01, 32.37it/s]


Epoch 47/100
1s - loss:  0.0061 - binary_crossentropy:  0.0062 - auc:  0.9999 - val_binary_crossentropy:  0.1702 - val_auc:  0.8915


49it [00:01, 33.45it/s]


Epoch 48/100
1s - loss:  0.0058 - binary_crossentropy:  0.0057 - auc:  0.9999 - val_binary_crossentropy:  0.1722 - val_auc:  0.8909


49it [00:01, 36.44it/s]


Epoch 49/100
1s - loss:  0.0056 - binary_crossentropy:  0.0056 - auc:  0.9999 - val_binary_crossentropy:  0.1738 - val_auc:  0.8908


49it [00:01, 32.97it/s]


Epoch 50/100
1s - loss:  0.0054 - binary_crossentropy:  0.0054 - auc:  1.0000 - val_binary_crossentropy:  0.1738 - val_auc:  0.8903


49it [00:01, 39.10it/s]


Epoch 51/100
1s - loss:  0.0054 - binary_crossentropy:  0.0054 - auc:  1.0000 - val_binary_crossentropy:  0.1752 - val_auc:  0.8898


49it [00:01, 39.50it/s]


Epoch 52/100
1s - loss:  0.0053 - binary_crossentropy:  0.0052 - auc:  1.0000 - val_binary_crossentropy:  0.1771 - val_auc:  0.8893


49it [00:01, 41.58it/s]


Epoch 53/100
1s - loss:  0.0053 - binary_crossentropy:  0.0053 - auc:  1.0000 - val_binary_crossentropy:  0.1798 - val_auc:  0.8893


49it [00:01, 41.10it/s]


Epoch 54/100
1s - loss:  0.0051 - binary_crossentropy:  0.0051 - auc:  1.0000 - val_binary_crossentropy:  0.1800 - val_auc:  0.8888


49it [00:01, 41.90it/s]


Epoch 55/100
1s - loss:  0.0050 - binary_crossentropy:  0.0049 - auc:  1.0000 - val_binary_crossentropy:  0.1804 - val_auc:  0.8887


49it [00:01, 41.59it/s]


Epoch 56/100
1s - loss:  0.0050 - binary_crossentropy:  0.0054 - auc:  1.0000 - val_binary_crossentropy:  0.1814 - val_auc:  0.8882


49it [00:01, 34.89it/s]


Epoch 57/100
1s - loss:  0.0049 - binary_crossentropy:  0.0049 - auc:  1.0000 - val_binary_crossentropy:  0.1830 - val_auc:  0.8877


49it [00:01, 42.03it/s]


Epoch 58/100
1s - loss:  0.0048 - binary_crossentropy:  0.0047 - auc:  1.0000 - val_binary_crossentropy:  0.1851 - val_auc:  0.8874


49it [00:01, 43.22it/s]


Epoch 59/100
1s - loss:  0.0047 - binary_crossentropy:  0.0047 - auc:  1.0000 - val_binary_crossentropy:  0.1873 - val_auc:  0.8869


49it [00:01, 43.70it/s]


Epoch 60/100
1s - loss:  0.0047 - binary_crossentropy:  0.0048 - auc:  1.0000 - val_binary_crossentropy:  0.1865 - val_auc:  0.8867


49it [00:01, 44.29it/s]


Epoch 61/100
1s - loss:  0.0046 - binary_crossentropy:  0.0045 - auc:  1.0000 - val_binary_crossentropy:  0.1879 - val_auc:  0.8861


49it [00:01, 45.78it/s]


Epoch 62/100
1s - loss:  0.0045 - binary_crossentropy:  0.0045 - auc:  1.0000 - val_binary_crossentropy:  0.1901 - val_auc:  0.8858


49it [00:01, 44.94it/s]


Epoch 63/100
1s - loss:  0.0046 - binary_crossentropy:  0.0045 - auc:  1.0000 - val_binary_crossentropy:  0.1896 - val_auc:  0.8857


49it [00:01, 37.16it/s]


Epoch 64/100
1s - loss:  0.0044 - binary_crossentropy:  0.0044 - auc:  1.0000 - val_binary_crossentropy:  0.1957 - val_auc:  0.8850


49it [00:01, 37.75it/s]


Epoch 65/100
1s - loss:  0.0045 - binary_crossentropy:  0.0045 - auc:  1.0000 - val_binary_crossentropy:  0.1920 - val_auc:  0.8849


49it [00:01, 35.47it/s]


Epoch 66/100
1s - loss:  0.0044 - binary_crossentropy:  0.0050 - auc:  0.9999 - val_binary_crossentropy:  0.1938 - val_auc:  0.8844


49it [00:01, 31.42it/s]


Epoch 67/100
1s - loss:  0.0043 - binary_crossentropy:  0.0043 - auc:  1.0000 - val_binary_crossentropy:  0.1955 - val_auc:  0.8840


49it [00:01, 36.12it/s]


Epoch 68/100
1s - loss:  0.0043 - binary_crossentropy:  0.0042 - auc:  1.0000 - val_binary_crossentropy:  0.1959 - val_auc:  0.8835


49it [00:01, 34.63it/s]


Epoch 69/100
1s - loss:  0.0043 - binary_crossentropy:  0.0043 - auc:  1.0000 - val_binary_crossentropy:  0.1962 - val_auc:  0.8831


49it [00:01, 31.99it/s]


Epoch 70/100
1s - loss:  0.0043 - binary_crossentropy:  0.0043 - auc:  1.0000 - val_binary_crossentropy:  0.1982 - val_auc:  0.8827


49it [00:01, 32.63it/s]


Epoch 71/100
1s - loss:  0.0042 - binary_crossentropy:  0.0041 - auc:  1.0000 - val_binary_crossentropy:  0.1989 - val_auc:  0.8825


49it [00:01, 29.18it/s]


Epoch 72/100
1s - loss:  0.0041 - binary_crossentropy:  0.0041 - auc:  1.0000 - val_binary_crossentropy:  0.1994 - val_auc:  0.8822


49it [00:01, 30.42it/s]


Epoch 73/100
1s - loss:  0.0042 - binary_crossentropy:  0.0043 - auc:  1.0000 - val_binary_crossentropy:  0.2005 - val_auc:  0.8816


49it [00:01, 28.17it/s]


Epoch 74/100
1s - loss:  0.0041 - binary_crossentropy:  0.0040 - auc:  1.0000 - val_binary_crossentropy:  0.2015 - val_auc:  0.8818


49it [00:02, 23.79it/s]


Epoch 75/100
2s - loss:  0.0041 - binary_crossentropy:  0.0046 - auc:  1.0000 - val_binary_crossentropy:  0.2029 - val_auc:  0.8812


49it [00:02, 22.54it/s]


Epoch 76/100
2s - loss:  0.0041 - binary_crossentropy:  0.0047 - auc:  1.0000 - val_binary_crossentropy:  0.2043 - val_auc:  0.8807


49it [00:02, 22.13it/s]


Epoch 77/100
2s - loss:  0.0041 - binary_crossentropy:  0.0041 - auc:  1.0000 - val_binary_crossentropy:  0.2055 - val_auc:  0.8804


49it [00:02, 21.84it/s]


Epoch 78/100
2s - loss:  0.0040 - binary_crossentropy:  0.0041 - auc:  1.0000 - val_binary_crossentropy:  0.2061 - val_auc:  0.8804


49it [00:02, 21.36it/s]


Epoch 79/100
2s - loss:  0.0039 - binary_crossentropy:  0.0039 - auc:  1.0000 - val_binary_crossentropy:  0.2073 - val_auc:  0.8799


49it [00:02, 23.05it/s]


Epoch 80/100
2s - loss:  0.0040 - binary_crossentropy:  0.0039 - auc:  1.0000 - val_binary_crossentropy:  0.2101 - val_auc:  0.8796


49it [00:02, 21.80it/s]


Epoch 81/100
2s - loss:  0.0040 - binary_crossentropy:  0.0040 - auc:  1.0000 - val_binary_crossentropy:  0.2091 - val_auc:  0.8789


49it [00:02, 21.43it/s]


Epoch 82/100
2s - loss:  0.0039 - binary_crossentropy:  0.0043 - auc:  1.0000 - val_binary_crossentropy:  0.2105 - val_auc:  0.8786


49it [00:02, 21.33it/s]


Epoch 83/100
2s - loss:  0.0039 - binary_crossentropy:  0.0041 - auc:  1.0000 - val_binary_crossentropy:  0.2109 - val_auc:  0.8783


49it [00:02, 21.16it/s]


Epoch 84/100
2s - loss:  0.0039 - binary_crossentropy:  0.0042 - auc:  1.0000 - val_binary_crossentropy:  0.2135 - val_auc:  0.8780


49it [00:02, 21.30it/s]


Epoch 85/100
2s - loss:  0.0039 - binary_crossentropy:  0.0038 - auc:  1.0000 - val_binary_crossentropy:  0.2127 - val_auc:  0.8780


49it [00:02, 19.88it/s]


Epoch 86/100
2s - loss:  0.0038 - binary_crossentropy:  0.0038 - auc:  1.0000 - val_binary_crossentropy:  0.2148 - val_auc:  0.8773


49it [00:02, 21.03it/s]


Epoch 87/100
2s - loss:  0.0038 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2152 - val_auc:  0.8769


49it [00:02, 21.23it/s]


Epoch 88/100
2s - loss:  0.0037 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2173 - val_auc:  0.8764


49it [00:02, 20.50it/s]


Epoch 89/100
2s - loss:  0.0038 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2171 - val_auc:  0.8765


49it [00:02, 20.67it/s]


Epoch 90/100
2s - loss:  0.0038 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2182 - val_auc:  0.8759


49it [00:02, 20.88it/s]


Epoch 91/100
2s - loss:  0.0037 - binary_crossentropy:  0.0038 - auc:  1.0000 - val_binary_crossentropy:  0.2196 - val_auc:  0.8756


49it [00:02, 21.35it/s]


Epoch 92/100
2s - loss:  0.0037 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2202 - val_auc:  0.8753


49it [00:02, 21.60it/s]


Epoch 93/100
2s - loss:  0.0037 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2209 - val_auc:  0.8752


49it [00:02, 20.41it/s]


Epoch 94/100
2s - loss:  0.0038 - binary_crossentropy:  0.0038 - auc:  1.0000 - val_binary_crossentropy:  0.2230 - val_auc:  0.8747


49it [00:02, 22.05it/s]


Epoch 95/100
2s - loss:  0.0039 - binary_crossentropy:  0.0038 - auc:  1.0000 - val_binary_crossentropy:  0.2225 - val_auc:  0.8748


49it [00:02, 22.00it/s]


Epoch 96/100
2s - loss:  0.0037 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2236 - val_auc:  0.8750


49it [00:02, 21.74it/s]


Epoch 97/100
2s - loss:  0.0036 - binary_crossentropy:  0.0036 - auc:  1.0000 - val_binary_crossentropy:  0.2249 - val_auc:  0.8747


49it [00:02, 22.11it/s]


Epoch 98/100
2s - loss:  0.0036 - binary_crossentropy:  0.0036 - auc:  1.0000 - val_binary_crossentropy:  0.2254 - val_auc:  0.8743


49it [00:02, 21.50it/s]


Epoch 99/100
2s - loss:  0.0036 - binary_crossentropy:  0.0037 - auc:  1.0000 - val_binary_crossentropy:  0.2297 - val_auc:  0.8740


49it [00:02, 21.79it/s]


Epoch 100/100
2s - loss:  0.0036 - binary_crossentropy:  0.0035 - auc:  1.0000 - val_binary_crossentropy:  0.2272 - val_auc:  0.8733
CPU times: user 5min 42s, sys: 704 ms, total: 5min 43s
Wall time: 3min


In [28]:
pred_ans = model.predict(test_model_input, 512)

In [29]:
sum(pred_ans)

array([257.94421162])

In [30]:
best_threshold= 0.009
pred_ans = model.predict(test_model_input, 512)
final_preds = pred_ans >= best_threshold

In [31]:
def convert_to_boolean_vector(input_vector):
    return np.array(input_vector == 1, dtype=bool)          #0,1로 제출해도 되는데 찜찜해서 True False로 변경 
    
result_vector = convert_to_boolean_vector(final_preds)

In [32]:
result_vector

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [ True]])

In [33]:
sum(final_preds)

array([693])

In [69]:
ones_ratio = sum(final_preds) / len(final_preds);ones_ratio 

array([0.33807627])

In [70]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = final_preds

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [108]:
df_sub["is_converted"]

0        True
1        True
2       False
3       False
4       False
        ...  
5266    False
5267     True
5268     True
5269     True
5270     True
Name: is_converted, Length: 5271, dtype: bool