In [1]:
import os
import glob
from tqdm.notebook import tqdm
import pandas as pd
import urllib3
import json

In [2]:
# Directory holding the input workbooks, resolved against the current working
# directory. os.path.join is used instead of a hard-coded '\' separator so the
# notebook also works on non-Windows hosts.
DATA_PATH = os.path.join(os.getcwd(), 'target_data')
print(DATA_PATH)

C:\Users\Kislee\PycharmProjects\Korean_textbook\target_data


In [3]:
# Collect the input workbooks. The list is sorted so that the row order of the
# combined table is reproducible (glob's ordering depends on the file system).
# Excel's "~$..." owner/lock files, which appear while a workbook is open and
# are not readable workbooks, are skipped.
files = sorted(
    path
    for path in glob.glob(os.path.join(DATA_PATH, '*.xlsx'))
    if not os.path.basename(path).startswith('~$')
)

In [4]:
# Read every workbook listed in `files` and stack them into a single table.
if not files:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    raise ValueError("No .xlsx files to load: the 'files' list is empty")

all_dfs = [pd.read_excel(source, engine='openpyxl') for source in files]

# ignore_index=True gives the combined frame unique 0..n-1 row labels instead
# of repeating every workbook's own index, so label-based access and the
# exported index column are unambiguous.
df = pd.concat(all_dfs, ignore_index=True)

In [5]:
# Language-analysis endpoint: use exactly one of the written-language or
# spoken-language variants.
# Written-language endpoint:
#openApiURL = "http://aiopen.etri.re.kr:8000/WiseNLU"
# Spoken-language endpoint:
openApiURL = "http://aiopen.etri.re.kr:8000/WiseNLU_spoken"
# Take the key from the ETRI_ACCESS_KEY environment variable when it is set so
# the real credential never has to be saved in the notebook; otherwise fall
# back to the placeholder, which must be replaced before calling the API.
accessKey = os.environ.get("ETRI_ACCESS_KEY", "YOUR_ACCESS_KEY")
# Analysis level sent with every request ("ner" = named-entity recognition).
analysisCode = "ner"

In [6]:
# Lazily created urllib3.PoolManager shared by every call so that HTTP
# connections are reused instead of being re-opened for each request.
_NER_HTTP_POOL = None


def requestNERfromText(text):
    """Return the named entities that the language-analysis API finds in ``text``.

    Sends ``text`` to ``openApiURL`` with the module-level ``accessKey`` and
    ``analysisCode`` and collects every entry of each sentence's ``NE`` list.

    Args:
        text: The passage to analyse. Anything that is not a non-blank string
            (e.g. the NaN/None that pandas yields for an empty Excel cell) is
            treated as "nothing to analyse".

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in response order.
        The list is empty when ``text`` is blank or not a string, when the
        HTTP status is not 200, or when the response body decodes to ``None``.

    Raises:
        KeyError / TypeError: If a 200 response lacks the expected
            ``return_object -> sentence -> NE`` structure.
        Exception: Network and JSON decoding errors propagate unchanged.
    """
    global _NER_HTTP_POOL

    # Skip the round trip for empty cells: NaN would be serialised as the
    # non-standard JSON token ``NaN`` and there is nothing to analyse anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    # Build the request payload.
    requestJson = {
        "access_key": accessKey,
        "argument": {
            "text": text,
            "analysis_code": analysisCode
        }
    }

    if _NER_HTTP_POOL is None:
        _NER_HTTP_POOL = urllib3.PoolManager()

    # Call the API.
    response = _NER_HTTP_POOL.request(
        "POST",
        openApiURL,
        headers={"Content-Type": "application/json; charset=UTF-8"},
        body=json.dumps(requestJson)
    )

    # Any non-200 status is reported as "no entities".
    if response.status != 200:
        return []

    result = json.loads(response.data)
    if result is None:
        return []

    # Collect (surface text, entity type) for every entity of every sentence.
    return [
        (item['text'], item['type'])
        for sent in result['return_object']['sentence']
        for item in sent['NE']
    ]

In [7]:
def isName(tag):
    """Return True when ``tag`` is the personal-name entity tag 'PS_NAME'."""
    return tag == 'PS_NAME'

def isCountry(tag):
    """Return True when ``tag`` is the country entity tag 'LCP_COUNTRY'."""
    return tag == 'LCP_COUNTRY'

def isTribe(tag):
    """Return True when ``tag`` is the ethnic-group entity tag 'CV_TRIBE'."""
    return tag == 'CV_TRIBE'

In [8]:
# Accumulators for the columns that will be added to the table:
#   name_columns    - foreign personal names
#   country_columns - country names
#   tribe_columns   - ethnic-group names
name_columns, country_columns, tribe_columns = [], [], []

In [9]:
# Name of the DataFrame column whose text is sent for entity recognition.
attribute = '지문'

# Country-tagged words that are excluded from the country column.
stop_country = [
    '한국', '대한민국', '북한', '조선', '고구려', '북', '고려',
    '고조선', '남한', '삼국', '신라', '한', '남북한', '한국대',
    '남', '한중일', '한국적', '청', '미', '대〜한민국', '남북',
]

In [10]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of every row (which would make the lists longer than df).
name_columns = []
country_columns = []
tribe_columns = []

for cell in tqdm(df[attribute]):
    try:
        ner_info = requestNERfromText(cell)
    except Exception as exc:
        # Best effort: a failed request leaves this row without entities.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working, and the cause is printed instead of being hidden.
        print('NER 정보 수집 실패', repr(exc))
        ner_info = []

    names_in_cell = []
    country_in_cell = []
    tribe_in_cell = []
    for word, tag in ner_info:
        if isName(tag):
            names_in_cell.append(word)
        elif isCountry(tag):
            if word not in stop_country:  # drop excluded countries
                country_in_cell.append(word)
        elif isTribe(tag):
            tribe_in_cell.append(word)

    # Names and ethnic groups are de-duplicated; sorted() makes the order
    # deterministic (plain set iteration order varies between runs).
    name_columns.append(sorted(set(names_in_cell)))
    country_in_cell.sort()  # countries are sorted ...
    country_columns.append(country_in_cell)  # ... but listed with duplicates kept
    tribe_columns.append(sorted(set(tribe_in_cell)))

  0%|          | 0/475 [00:00<?, ?it/s]

In [11]:
# Map country-name variants onto their canonical name, then sort again: the
# lists were sorted before this renaming, and a renamed entry (e.g. '남프랑스'
# becoming '프랑스') would otherwise be left out of order.
# Slice assignment updates each list in place, so country_columns keeps
# referring to the same list objects.
for countries in country_columns:
    countries[:] = sorted(
        name.replace('나우루 공화국', '나우루').replace('남프랑스', '프랑스')
        for name in countries
    )

In [12]:
# Collapse each row's list of entities into a single '/'-separated string.
name_sum = list(map('/'.join, name_columns))
country_sum = list(map('/'.join, country_columns))
tribe_sum = list(map('/'.join, tribe_columns))

In [13]:
# Attach the joined entity strings to the table, one new column per entity kind.
for column_label, column_values in (
    ('외국인인명', name_sum),
    ('국가명', country_sum),
    ('민족명', tribe_sum),
):
    df[column_label] = column_values

In [15]:
# Export the annotated table to an Excel workbook. The path is relative, so the
# file is created in the current working directory; the row index is written
# as well because index=False is not passed.
df.to_excel('result_211015.xlsx')

In [16]:
# Count how many times each country occurs across all rows (duplicates within
# a row are counted individually).
country_dic = {}
for row_countries in country_columns:
    for country in row_countries:
        if country in country_dic:
            country_dic[country] += 1
        else:
            country_dic[country] = 1

# (count, country) pairs, highest count first; ties fall back to the country
# name in descending order because whole tuples are compared.
country_freq = sorted(
    [(count, country) for country, count in country_dic.items()],
    reverse=True,
)
print("상위빈도 국가", country_freq)

상위빈도 국가 [(49, '중국'), (48, '미국'), (43, '일본'), (39, '나우루'), (16, '영국'), (11, '호주'), (11, '스페인'), (10, '독일'), (8, '인도'), (8, '그리스'), (7, '러시아'), (6, '프랑스'), (6, '노르웨이'), (5, '콜롬비아'), (5, '이탈리아'), (5, '브라질'), (5, '몽골'), (4, '태국'), (3, '이란'), (3, '몰디브'), (3, '모나코'), (2, '필리핀'), (2, '파라과이'), (2, '투발루'), (2, '코스타리카'), (2, '케냐'), (2, '캐나다'), (2, '자메이카'), (2, '이집트'), (2, '예멘'), (2, '에티오피아'), (2, '스웨덴'), (2, '부탄'), (2, '베트남'), (2, '라오스'), (1, '파나마'), (1, '탄자니아'), (1, '칠레'), (1, '인도네시아'), (1, '스위스'), (1, '스리랑카'), (1, '뉴질랜드'), (1, '나이지리아'), (1, '과테말라')]


In [17]:
#코드 종료