In [129]:
import json
import requests
import pandas as pd
import csv
import os
from kiwipiepy import Kiwi
import re
from collections import Counter

In [96]:
def get_lstrm(page):
    configfile = 'law_config.json'
    law_category = '법률용어'

    with open(configfile, 'r', encoding='utf-8') as f:
        config = json.load(f)

    law_config = config[law_category]

    law_meta = law_config[law_category]

    url = law_meta['url']

    params = {
        'OC' : law_meta['OC'], # 사용자 이메일의 ID(g4c@korea.kr일경우 OC값=g4c) (필수)
        'type' : law_meta['type'], # 출력 형태 HTML/XML/JSON (필수)
        'page' : page, # 검색 결과 페이지 (default=1)
        'display' : 100, # 검색된 결과 개수 (default=20 max=100)
    }

    response = requests.get(url, params=params)
    data = response.json()

    lstrm = data['LsTrmSearch']['lstrm']

    return lstrm

In [99]:
# CSV 헤더 파일 생성

file_name = 'law.csv'

if not os.path.isfile(file_name): # law.csv 파일이 존재하지 않으면
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(get_lstrm(1)[0].keys())

In [None]:
# # CSV 파일에 내용 채우기

# page = 1
# while True:
#     try:
#         lstrm = get_lstrm(page)
#         df = pd.DataFrame.from_dict(lstrm)

#         df.to_csv('law.csv', mode='a', index=False, header=False)
#     except:
#         break

#     page += 1

In [101]:
salary_df = pd.read_csv('law.csv')

law_word = salary_df['법령용어명']

freq_dict = dict()
kiwi = Kiwi()

for i in range(len(law_word)):
    token_list = re.split(r'[,ㆍ:()「」 ]', law_word[i])
    # ['1959년', '12월', '31일', '이전에', '퇴직한', '군인의', '퇴직급여금지급에관한', '특별법']

    for token in token_list:
        if token != '':
            kiwi_token_list = kiwi.tokenize(token)

            for kiwi_token in kiwi_token_list:
                if kiwi_token.tag in ['NNG', 'NNP']:
                    word = kiwi_token.form

                    if word in freq_dict.keys():
                        freq_dict[word] += 1
                    else:
                        freq_dict[word] = 1

In [103]:
sorted(freq_dict.items(), key = lambda x : x[1], reverse = True)

[('대통령령', 6411),
 ('령', 2868),
 ('기관', 2043),
 ('관리', 1737),
 ('시설', 1736),
 ('사업', 1715),
 ('정보', 1635),
 ('경우', 1573),
 ('기준', 1186),
 ('사항', 1061),
 ('안전', 891),
 ('시스템', 873),
 ('기술', 861),
 ('위원회', 853),
 ('산업', 851),
 ('금융', 779),
 ('환경', 758),
 ('교육', 754),
 ('업무', 744),
 ('해양', 733),
 ('법', 716),
 ('지역', 691),
 ('지원', 676),
 ('평가', 673),
 ('사유', 668),
 ('개발', 661),
 ('연구', 654),
 ('국토', 625),
 ('기획', 619),
 ('관련', 600),
 ('해당', 579),
 ('고시', 576),
 ('조사', 574),
 ('부령', 574),
 ('자료', 560),
 ('기업', 558),
 ('법률', 551),
 ('교통부', 547),
 ('단체', 524),
 ('사업자', 516),
 ('행정', 507),
 ('운영', 503),
 ('밖', 493),
 ('폐지', 489),
 ('재정부', 488),
 ('검사', 462),
 ('보호', 459),
 ('복지', 457),
 ('국가', 455),
 ('기간', 451),
 ('시험', 442),
 ('보건', 440),
 ('전문', 436),
 ('이상', 429),
 ('계획', 420),
 ('계약', 418),
 ('부', 416),
 ('장치', 414),
 ('장관', 403),
 ('규모', 399),
 ('수산부', 399),
 ('투자', 395),
 ('농림', 392),
 ('변경', 391),
 ('법인', 389),
 ('항공', 379),
 ('주택', 379),
 ('금액', 378),
 ('관계', 377),
 ('장비', 376),
 ('서비스

In [128]:
kiwi.tokenize('080착신과금사업자')

[Token(form='080', tag='SN', start=0, len=3),
 Token(form='착', tag='NNG', start=3, len=1),
 Token(form='신', tag='NNG', start=4, len=1),
 Token(form='과', tag='JC', start=5, len=1),
 Token(form='금', tag='NNG', start=6, len=1),
 Token(form='사업자', tag='NNG', start=7, len=3)]

In [None]:
kiwi = Kiwi()

law_text = pd.read_csv('law.csv')['법령용어명']

kiwi_token_list = kiwi.tokenize(law_text[4])

noun_tokens = [token.form for token in kiwi_token_list if token.tag.startswith('NN')]

noun_tokens

# 연속된 명사 조합
for i in range(len(noun_tokens)):
    for j in range(i+1, )

['정', '지방식', '비상', '정지', '장치']

In [None]:
kiwi = Kiwi()

def extract_noun_phrases(text):
    tokens = kiwi.tokenize(text)
    # 명사만 추출
    noun_tokens = [token.form for token in tokens if token.tag.startswith('NN')]
    # 연속된 명사 조합 (길이 1~4까지)
    result = []
    for i in range(len(noun_tokens)):
        for j in range(i+1, min(i+5, len(noun_tokens)+1)):
            phrase = ''.join(noun_tokens[i:j])
            result.append(phrase)
    return result

rows = pd.read_csv('law.csv')['법령용어명']

# 전체 키워드 후보 추출
all_phrases = []
for line in rows:
    all_phrases.extend(extract_noun_phrases(line))

# 빈도수 집계
counter = Counter(all_phrases)
common_phrases = [(k, v) for k, v in counter.items() if v >= 2]  # 등장횟수 2 이상만

# 결과 정리
df = pd.DataFrame(common_phrases, columns=["keyword", "frequency"])
df = df.sort_values("frequency", ascending=False)
df.to_csv("의미있는_키워드_후보.csv", index=False, encoding="utf-8-sig")

In [1]:
import pandas as pd
from kiwipiepy import Kiwi
from collections import defaultdict

# 엑셀 파일 경로
file_path = "작업.xlsx"

# 데이터 불러오기
df = pd.read_excel(file_path)
kiwi = Kiwi()

original_terms = df['법령용어명'].astype(str).tolist()
keyword_dict = defaultdict(lambda: 0)

output_rows = []
stopwords = {'이상', '자', '등', '조건', '내용', '법률', '대상', '경우', '관련', '조치', '사유', '기준', '정의', '요건', '의무', '방법', '사항'}
existing_terms = set(original_terms)

for term in original_terms:
    # 기본적으로는 사용불가 (0)
    output_rows.append({'법령용어명': term, '사용가능': 0})
    
    tokens = kiwi.tokenize(term)
    
    # 연속된 명사를 복합어로 추출
    compound = []
    for token in tokens:
        if token.tag == 'NNG' and token.form not in stopwords and len(token.form) >= 2:
            compound.append(token.form)
        else:
            if compound:
                joined = ''.join(compound)
                if joined not in existing_terms:
                    keyword_dict[joined] = 1
                    existing_terms.add(joined)
                compound = []
    if compound:
        joined = ''.join(compound)
        if joined not in existing_terms:
            keyword_dict[joined] = 1
            existing_terms.add(joined)

# 키워드 추가
for keyword, value in keyword_dict.items():
    output_rows.append({'법령용어명': keyword, '사용가능': value})

# 결과 저장
result_df = pd.DataFrame(output_rows)
result_df.to_excel("법령용어_키워드_분석결과.xlsx", index=False)