In [7]:
import pandas as pd
import ast
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects so the parse below shows a progress bar.
tqdm.pandas()
df = pd.read_csv('bond_docs.csv')
# Each 'tagged' cell is read as text; ast.literal_eval turns it back into a Python literal.
df = df.assign(tagged=df['tagged'].progress_apply(ast.literal_eval))

100%|██████████| 5367/5367 [00:47<00:00, 113.36it/s]


In [1]:
import pandas as pd
import ast
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects so the parse below shows a progress bar.
tqdm.pandas()
df = pd.read_csv('economy_docs.csv')
# Each 'tagged' cell is read as text; ast.literal_eval turns it back into a Python literal.
df = df.assign(tagged=df['tagged'].progress_apply(ast.literal_eval))

100%|██████████| 7962/7962 [01:10<00:00, 112.30it/s]


In [8]:
# Version that drops the 'tagged' column from the saved CSV
from collections import Counter
from tqdm import tqdm

def generate_ngrams(token_list, max_n=5):
    """Count every contiguous n-gram of sizes 1..max_n in a token sequence.

    Args:
        token_list: Iterable of indexable tokens; only element ``[0]`` of each
            token is used (presumably (word, tag) pairs -- confirm against the
            tagger output).
        max_n: Largest n-gram size to count, inclusive.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of the tokens' first
        elements) to the number of times it occurs.
    """
    surface_forms = [entry[0] for entry in token_list]
    counts = Counter()
    for size in range(1, max_n + 1):
        # Zipping `size` progressively shifted views of the list yields each
        # window exactly once, left to right; zip stops at the shortest view,
        # so no partial windows are produced.
        shifted_views = (surface_forms[offset:] for offset in range(size))
        counts.update(zip(*shifted_views))
    return counts

def sorted_ngrams(token_list, max_n=5):
    """Return the n-gram counts of ``token_list`` as a string, most frequent first.

    Args:
        token_list: Token sequence passed straight through to ``generate_ngrams``.
        max_n: Largest n-gram size to count, inclusive.

    Returns:
        ``str()`` of a list of ``(ngram_tuple, count)`` pairs ordered by
        descending count.
    """
    # most_common() with no argument returns every item ordered by count,
    # descending; items with equal counts keep the Counter's insertion order.
    ranked = generate_ngrams(token_list, max_n).most_common()
    return str(ranked)

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Add each row's n-gram result as the 'ngram' column.
df['ngram'] = df['tagged'].progress_apply(
    lambda tagged_tokens: sorted_ngrams(tagged_tokens, max_n=5)
)
# Write only the 'date' and 'ngram' columns; 'tagged' is left out of this file.
df.to_csv('ngram_bond_results.csv', columns=['date', 'ngram'], index=False)

100%|██████████| 5367/5367 [00:19<00:00, 281.57it/s]


In [None]:
# Version that keeps the 'tagged' column in the saved CSV
from collections import Counter
from tqdm import tqdm

def generate_ngrams(token_list, max_n=5):
    """Count every contiguous n-gram of sizes 1..max_n in a token sequence.

    Args:
        token_list: Iterable of indexable tokens; only element ``[0]`` of each
            token is used (presumably (word, tag) pairs -- confirm against the
            tagger output).
        max_n: Largest n-gram size to count, inclusive.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of the tokens' first
        elements) to the number of times it occurs.
    """
    surface_forms = [entry[0] for entry in token_list]
    counts = Counter()
    for size in range(1, max_n + 1):
        # Zipping `size` progressively shifted views of the list yields each
        # window exactly once, left to right; zip stops at the shortest view,
        # so no partial windows are produced.
        shifted_views = (surface_forms[offset:] for offset in range(size))
        counts.update(zip(*shifted_views))
    return counts

def sorted_ngrams(token_list, max_n=5):
    """Return the n-gram counts of ``token_list`` as a string, most frequent first.

    Args:
        token_list: Token sequence passed straight through to ``generate_ngrams``.
        max_n: Largest n-gram size to count, inclusive.

    Returns:
        ``str()`` of a list of ``(ngram_tuple, count)`` pairs ordered by
        descending count.
    """
    # most_common() with no argument returns every item ordered by count,
    # descending; items with equal counts keep the Counter's insertion order.
    ranked = generate_ngrams(token_list, max_n).most_common()
    return str(ranked)

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Add each row's n-gram result as the 'ngram' column.
df['ngram'] = df['tagged'].progress_apply(
    lambda tagged_tokens: sorted_ngrams(tagged_tokens, max_n=5)
)

# Write every column of the frame, without the index.
df.to_csv(path_or_buf='ngram_economy_results.csv', index=False)

100%|██████████| 7962/7962 [00:29<00:00, 271.03it/s]


In [9]:
# Show the serialized n-gram list stored for the row labelled 0 in the saved bond file.
pd.read_csv('ngram_bond_results.csv').loc[0, 'ngram']

"[(('금리',), 13), (('투자',), 10), (('회사',), 10), (('크레딧',), 7), (('롯데그룹',), 6), (('롯데케미칼',), 6), (('제공',), 6), (('신한',), 6), (('증권',), 6), (('신한', '투자'), 6), (('투자', '증권'), 6), (('신한', '투자', '증권'), 6), (('유동성',), 5), (('유효',), 5), (('회사채',), 5), (('축',), 5), (('상기',), 5), (('상기', '회사'), 5), (('추가',), 4), (('인하',), 4), (('금통위',), 4), (('사채',), 4), (('관련',), 4), (('은행',), 4), (('제시',), 4), (('판단',), 4), (('의견',), 4), (('국내',), 3), (('기대',), 3), (('우려',), 3), (('약세',), 3), (('거래',), 3), (('예금',), 3), (('대비',), 3), (('공사',), 3), (('주식',), 3), (('조사',), 3), (('분석',), 3), (('국내', '크레딧'), 3), (('롯데그룹', '유동성'), 3), (('금리', '축'), 3), (('조사', '분석'), 3), (('중심',), 2), (('이슈',), 2), (('일단락',), 2), (('시장',), 2), (('부각',), 2), (('하락',), 2), (('국채',), 2), (('했으며',), 2), (('속',), 2), (('신용스프레드',), 2), (('유지',), 2), (('스프레드',), 2), (('폭',), 2), (('확대',), 2), (('반',), 2), (('발생',), 2), (('채권',), 2), (('강세',), 2), (('재무비율',), 2), (('내용',), 2), (('사채권자',), 2), (('집회',), 2), (('소집',), 2), (('언론',), 2), (('보증

In [None]:
# Show the serialized n-gram list stored for the row labelled 0 in the saved economy file.
pd.read_csv('ngram_economy_results.csv').loc[0, 'ngram']

"[(('미국',), 59), (('대신증권',), 32), (('제조업',), 23), (('경기',), 18), (('지수',), 18), (('PMI',), 18), (('한국',), 14), (('소비',), 14), (('유로존',), 14), (('중국',), 13), (('PCEPI',), 13), (('지속',), 12), (('수출',), 11), (('상승',), 11), (('생산',), 11), (('상승률',), 11), (('일본',), 11), (('SP',), 10), (('독일',), 9), (('PMI', '미국'), 9), (('대신증권', '대신증권'), 9), (('서비스업',), 8), (('수요',), 8), (('증가',), 8), (('신규',), 8), (('스프레드',), 8), (('제조업', '경기'), 8), (('감소',), 7), (('했으나',), 7), (('기업',), 7), (('고용',), 7), (('파악',), 7), (('국내',), 7), (('기준',), 7), (('금리',), 7), (('물가',), 7), (('개인',), 7), (('주요국',), 7), (('기대',), 6), (('임금',), 6), (('둔화',), 6), (('확대',), 6), (('양호',), 6), (('흐름',), 6), (('우려',), 6), (('종합',), 6), (('서비스',), 6), (('CB',), 6), (('소비자',), 6), (('소득',), 6), (('측',), 6), (('비교',), 6), (('요인',), 6), (('판매',), 6), (('회의',), 6), (('임금', '상승률'), 6), (('제조업', 'PMI'), 6), (('ISM',), 5), (('크',), 5), (('증가율',), 5), (('판단',), 5), (('가운데',), 5), (('매크로',), 5), (('실업률',), 5), (('관련',), 5), (('압력',), 5), ((