In [1]:
import random
import os
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import importlib
import matplotlib.ticker as mticker
from matplotlib.ticker import FuncFormatter
from matplotlib.ticker import MaxNLocator
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")  # Set the plot style

import matplotlib
import matplotlib.font_manager
#[f.name for f in matplotlib.font_manager.fontManager.ttflist if 'Nanum' in f.name]
import platform

# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph
# (presumably because the fonts selected below do not provide that glyph).
plt.rcParams['axes.unicode_minus'] = False

# Select a font family per operating system so that non-ASCII (Korean) labels
# render in plots.
if platform.system() == 'Windows':
    path = "c:/Windows/Fonts/NanumBarunGothic.ttf"
    # The notebook imports `matplotlib.font_manager` but never binds a bare
    # `font_manager` name, so the class must be reached through the
    # `matplotlib` package; the bare name raised NameError on Windows.
    font_name = matplotlib.font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Linux':
    rc('font', family='NanumBarunGothic')
else:
    # No font override is applied on other platforms.
    print('Unknown system... sorry~~~~~~')

In [2]:
# Paths are relative to the notebook's working directory.
df_train = pd.read_csv("train.csv") # Training data
df_test = pd.read_csv("submission.csv") # Test data (the rows of the submission file)

## 1. customer_country (고객의 국적)
- `customer_country`를 `customer_city` (도시)와 `customer_nation` (나라) 두 컬럼으로 나눔
- `pycountry` 설치 및 import 필수 (`pip install pycountry`)

In [3]:
# !pip install pycountry
import pycountry

def preprocess_text_column(df, column_name):
    """Normalise a text column of ``df`` in place.

    Every value in ``df[column_name]`` is lowercased, and then every run of
    characters other than the ASCII letters a-z / A-Z (spaces, digits,
    punctuation, accented letters, ...) is removed.

    Args:
        df: DataFrame to modify; the column is overwritten in place.
        column_name: Name of the string column to normalise.

    Returns:
        None. The result is written back to ``df[column_name]``.
    """
    lowered = df[column_name].str.lower()
    df[column_name] = lowered.str.replace(r'[^a-zA-Z]+', '', regex=True)
    

def standardize_country_name(name):
    """Map a country identifier to the ``name`` of its pycountry record.

    Args:
        name: Value passed to ``pycountry.countries.lookup``.

    Returns:
        The ``name`` attribute of the matched record, or ``name`` itself,
        unchanged, when the lookup raises ``LookupError``.
    """
    try:
        match = pycountry.countries.lookup(name)
    except LookupError:
        # No matching country: hand the input back untouched.
        return name
    else:
        return match.name

##########################################################################################
def eda_customer_country(df):
    """Replace ``customer_country`` with ``customer_city`` and ``customer_nation``.

    ``customer_country`` is split on '/', and the pieces at positions 1 and 2
    become ``customer_city`` and ``customer_nation`` respectively. Both new
    columns are passed through ``standardize_country_name`` and then
    ``preprocess_text_column``. Two custom substitutions are applied to
    ``customer_nation`` afterwards, and the source column is dropped.

    Note: ``df`` is modified in place (columns added, ``customer_country``
    removed), so calling this twice on the same frame fails on the second
    call because ``customer_country`` no longer exists.

    Args:
        df: DataFrame that contains a ``customer_country`` string column.

    Returns:
        The same ``df`` object, after modification.
    """
    # Split on '/' and keep only the pieces at positions 1 and 2.
    pieces = df['customer_country'].str.split('/', expand=True)
    df[['customer_city', 'customer_nation']] = pieces.iloc[:, 1:3]

    # Standardise each new column, then lowercase it and strip everything
    # that is not an ASCII letter. The columns are independent of each other.
    for col in ('customer_city', 'customer_nation'):
        df[col] = df[col].apply(standardize_country_name)
        preprocess_text_column(df, col)

    # Custom fixes: 'trkiye' -> 'turkey', and any value containing
    # 'unitedstates' is collapsed to exactly 'unitedstates'.
    nation = df['customer_nation'].replace({'trkiye': 'turkey'}, regex=True)
    df['customer_nation'] = nation.replace({'.*unitedstates.*': 'unitedstates'}, regex=True)

    # The original combined column is no longer needed.
    df.drop(columns=['customer_country'], inplace=True)

    return df

In [4]:
# NOTE: eda_customer_country drops `customer_country` from the frame in place,
# so re-running this cell without first reloading the CSVs raises a KeyError.
df_train = eda_customer_country(df_train)
df_test = eda_customer_country(df_test)

In [5]:
# Preview the first rows to check the new customer_city / customer_nation columns.
df_train.head()

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,customer_city,customer_nation
0,1.0,AS,0.066667,32160,End-Customer,Enterprise,,,,,...,1,0,0.003079,0.026846,corporate / office,Engineering,0,True,quezoncity,philippines
1,1.0,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,,...,1,0,0.003079,0.026846,corporate / office,Advertising,1,True,ph,philippines
2,1.0,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,,...,1,0,0.003079,0.026846,corporate / office,Construction,2,True,kolkata,india
3,1.0,AS,0.088889,4919,End-Customer,Enterprise,,,,,...,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True,bhubaneswar,india
4,1.0,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,,...,0,0,0.003079,0.026846,corporate / office,,4,True,hyderabad,india


## 2. response_corporate (담당 자사 법인명)
- 전처리하면 모델 성능이 낮아짐
- 따라서 전처리 없이 원본 값을 그대로 사용 (pass)

In [6]:
# Inspect the raw response_corporate values; this column is left unprocessed.
df_train['response_corporate']

0        LGEPH
1        LGEPH
2        LGEIL
3        LGEIL
4        LGEIL
         ...  
59294    LGEPL
59295    LGECB
59296    LGEPR
59297    LGEPR
59298    LGEKR
Name: response_corporate, Length: 59299, dtype: object