In [1]:
# 범용 라이브러리
import pandas as pd
import numpy as np
from tqdm import tqdm   # 진행상황 Progress Bar를 위한 tqdm library
import re               # Regular Expression
import pickle           # 토큰화된 단어목록의 인덱스를 저장, 불러오기 위해 사용
from sklearn.model_selection import train_test_split

# 토크나이징, 인코딩 관련 라이브러리
from eunjeon import Mecab
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 그래프 관련 라이브러리
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import platform

# 한글 폰트 설정
if (platform.system() == 'Windows'):
    plt.rc('font', family='Malgun Gothic')
else:
    plt.rc('font', family='AppleGothic')

# 음수(-)가 깨지는 현상 방지
plt.rcParams['axes.unicode_minus'] = False 


In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
# 한국표준산업분류 딕셔너리 생성

cat_dic = pd.read_excel('./data/한국표준산업분류(10차)_국문.xlsx').iloc[2:,0:6]
cat_dic.rename(columns={'개정 분류체계(제10차 기준)':'대분류', 'Unnamed: 1':'대분류명', 
                        'Unnamed: 2': '중분류', 'Unnamed: 3': '중분류명',
                        'Unnamed: 4': '소분류', 'Unnamed: 5': '소분류명'}, inplace=True)

digit_1_df = cat_dic[['대분류', '대분류명']].dropna().reset_index(drop=True) # 대분류 'A' ~ 'U'
digit_2_df = cat_dic[['중분류', '중분류명']].dropna().reset_index(drop=True) # 중분류 1 ~ 99
digit_3_df = cat_dic[['소분류', '소분류명']].dropna().reset_index(drop=True) # 소분류 11 ~ 990

digit_1_dict = {}
for i in tqdm(range(len(digit_1_df))):
    digit_1_dict[digit_1_df.iloc[i]['대분류']] = digit_1_df.iloc[i]['대분류명']

digit_2_dict = {}
for i in tqdm(range(len(digit_2_df))):
    digit_2_dict[int(digit_2_df.iloc[i]['중분류'])] = digit_2_df.iloc[i]['중분류명']

digit_3_dict = {}
for i in tqdm(range(len(digit_3_df))):
    digit_3_dict[int(digit_3_df.iloc[i]['소분류'])] = digit_3_df.iloc[i]['소분류명']

100%|██████████| 21/21 [00:00<00:00, 5392.46it/s]
100%|██████████| 77/77 [00:00<00:00, 3766.93it/s]
100%|██████████| 232/232 [00:00<00:00, 2509.01it/s]


In [4]:
train['text_obj'] = train['text_obj'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거
train['text_mthd'] = train['text_mthd'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거
train['text_deal'] = train['text_deal'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거

test['text_obj'] = test['text_obj'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')
test['text_mthd'] = test['text_mthd'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')
test['text_deal'] = test['text_deal'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')

  train['text_obj'] = train['text_obj'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거
  train['text_mthd'] = train['text_mthd'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거
  train['text_deal'] = train['text_deal'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '') # [ㄱ - ㅎ, ㅏ - ㅣ, 가 - 핳] 제외 제거
  test['text_obj'] = test['text_obj'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')
  test['text_mthd'] = test['text_mthd'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')
  test['text_deal'] = test['text_deal'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','').str.replace('^ +', '')


In [5]:
print('Train 데이터 결측치 제거 전:', len(train))
train.fillna("",inplace=True)
print('Train 데이터 결측치 제거 후:', len(train))

print('Test 데이터 결측치 제거 전:', len(test))
test.fillna("", inplace=True)
print('Test 데이터 결측치 제거 후:', len(test))

Train 데이터 결측치 제거 전: 1000000
Train 데이터 결측치 제거 후: 1000000
Test 데이터 결측치 제거 전: 100000
Test 데이터 결측치 제거 후: 100000


In [6]:
# # 불용어 사전 불러오기

# stopwords = pd.read_csv("./data/stopwords/stopwords_1.csv", encoding='CP949')
# stopwords = stopwords['stopwords']
# print(len(stopwords))

# stopwords_2 = pd.read_table("./data/stopwords/stopwords_2.txt", names=['stopwords', '품사', 'ratio'])
# stopwords_2 = stopwords_2['stopwords']
# print(len(stopwords_2))

# # 불용어 사전 병합

# stopwords = pd.concat([stopwords, stopwords_2], axis=0)
# stopwords.drop_duplicates(inplace=True)
# stopwords.reset_index(inplace=True, drop=True)
# print(len(stopwords))

# # 불용어 사전 csv로 저장, 불러오기

# stopwords.to_csv("./data/stopwords/stopwords_concat.csv", index=False) 
stopwords = pd.read_csv("./data/stopwords/stopwords_concat.csv") 
stopwords = list(stopwords)
stopwords

['stopwords']

In [7]:
# train 데이터 토크나이징

mecab_train_text_obj = []
mecab_train_text_mthd = []
mecab_train_text_deal = []

mecab = Mecab()

for sentence in tqdm(train['text_obj']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_train_text_obj.append(stopwords_removed_sentence)

for sentence in tqdm(train['text_mthd']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_train_text_mthd.append(stopwords_removed_sentence)

for sentence in tqdm(train['text_deal']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_train_text_deal.append(stopwords_removed_sentence)

100%|██████████| 1000000/1000000 [00:30<00:00, 33110.76it/s]
100%|██████████| 1000000/1000000 [00:24<00:00, 40565.81it/s]
100%|██████████| 1000000/1000000 [00:25<00:00, 39729.37it/s]


In [10]:
mecab_train_text_mthd

[['자동차', '부분', '정비'],
 ['일반인', '을', '대상', '으로'],
 ['공업', '용', '고무', '를', '가지', '고'],
 ['일반', '소비자', '에게'],
 ['보호자', '의', '위탁', '을', '받', '아'],
 ['절삭', '용접'],
 ['접객', '시설', '을', '갖추', '고'],
 ['가공', '하', '여'],
 ['일반', '공공', '행정'],
 ['고객', '의뢰', '를', '받', '아'],
 ['입', '고', '가공'],
 ['접객', '시설', '을', '갖추', '고'],
 ['일반인', '대상', '으로'],
 ['접객', '시설', '을', '갖추', '고'],
 ['일반인', '을', '대상', '으로'],
 ['학생', '및', '일반', '대사', '으로'],
 ['고객', '요청', '으로'],
 ['운송', '화물'],
 ['일반인', '에게'],
 ['일반', '고객', '을', '대상'],
 ['가루', '로', '분쇄', '하', '여', '떡', '을', '쪄서'],
 [],
 ['무도', '시설', '을', '갖추', '고'],
 ['일반인', '대상', '으로', '소매'],
 ['빙수', '아이스크림', '커피', '디저트', '제공'],
 ['일반', '소비자', '에게'],
 ['절단', '가공'],
 ['직원', '들', '대상'],
 ['일반인', '대상'],
 ['재료', '구입', '세미', '앗', '세이'],
 ['소비자', '를', '대상', '으로'],
 ['가공', '업체', '가공', '의뢰', '하', '여', '완제품', '수출'],
 ['접객', '시설', '을', '갖추', '고'],
 ['고객', '의', '의뢰', '를', '받', '아'],
 ['접객', '시설', '을', '갖추', '고'],
 ['접객', '시설', '을', '갖추', '고'],
 ['기독교', '계통', '종교', '활동'],
 ['접객', '시설', '을

In [8]:
# test 데이터 토크나이징

mecab_test_text_obj = []
mecab_test_text_mthd = []
mecab_test_text_deal = []

mecab = Mecab()

for sentence in tqdm(test['text_obj']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_test_text_obj.append(stopwords_removed_sentence)

for sentence in tqdm(test['text_mthd']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_test_text_mthd.append(stopwords_removed_sentence)

for sentence in tqdm(test['text_deal']): # 진행상황 Progress Bar를 위한 tqdm library
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in list(stopwords)] # 불용어 제거
    mecab_test_text_deal.append(stopwords_removed_sentence)

100%|██████████| 100000/100000 [00:04<00:00, 21685.03it/s]
100%|██████████| 100000/100000 [00:02<00:00, 38401.85it/s]
100%|██████████| 100000/100000 [00:02<00:00, 47631.84it/s]


In [9]:
mecab_test_text_mthd

[['고객', '의', '주문', '에', '의해'],
 ['다른', '소매업자', '에게'],
 ['신도', '을', '대상', '으로'],
 ['고객', '요구', '로'],
 ['접객', '시설', '을', '갖추', '고'],
 ['스크린', '인쇄'],
 ['접객', '시설', '가지', '고'],
 ['프레스', '가공', '하', '여', '제조'],
 ['서비스', '센터', '에서'],
 ['완성품', '입', '고', '수선'],
 ['원재료', '입', '고', '반죽'],
 ['일반인', '대상', '으로'],
 ['고객', '의', '요구', '에', '따라'],
 ['소아', '청소년', '을', '대상', '으로'],
 ['사무실', '에서'],
 ['일반', '고객', '대상'],
 ['관련', '사용', '자', '에게'],
 ['주문', '에', '의해'],
 ['일반인', '대상'],
 ['고객', '요청', '에', '의해'],
 ['도금'],
 [],
 ['고객', '에', '의뢰', '를', '받', '아'],
 ['일반인'],
 ['나무', '를', '이용', '해'],
 ['고객', '요구', '로'],
 ['일반인', '을', '대상', '으로'],
 ['고객', '의뢰', '로'],
 ['계약', '및', '중개', '수수료', '받', '고'],
 ['인터넷', '시설', '을', '갖추', '고'],
 ['어린이', '들', '을', '대상', '으로'],
 ['선', '별장', '에서'],
 ['접객', '시설', '을', '갖추', '고'],
 ['사업장', '에서'],
 ['남성', '두', '발', '서비스'],
 ['접객', '시설', '을', '갗', '추고'],
 ['일반', '소비자', '대상', '으로'],
 ['기술'],
 ['원료', '투입', '압출'],
 ['외래', '환자', '위주', '로'],
 ['일반', '고객', '을', '대상', '으로'],
 ['재단기', '사용'],
 [

In [11]:
train['text'] = mecab_train_text_obj
train['text'] = train['text'] + mecab_train_text_mthd
train['text'] = train['text'] + mecab_train_text_deal

  return op(a, b)


In [12]:
test['text'] = mecab_test_text_obj
test['text'] = test['text'] + mecab_test_text_mthd
test['text'] = test['text'] + mecab_test_text_deal

In [14]:
train.to_csv('./train_user_dictionary.csv', index=False, encoding='CP949')

test.to_csv('./test_user_dictionary.csv', index=False, encoding='CP949')