In [1]:
import pdfplumber
import glob
from konlpy.tag import Komoran
import re
from collections import Counter
import os
import pandas as pd

In [2]:
# Collect every manifesto PDF under .\open.
# The raw string keeps the backslashes literal: in a normal string '\o' and '\*'
# are invalid escape sequences and trigger a warning on recent Python versions.
# sorted() makes the processing order independent of the OS directory order.
files = sorted(glob.glob(r'.\open\*'))
files

['.\\open\\기호_01 공약.pdf',
 '.\\open\\기호_02 공약.pdf',
 '.\\open\\기호_03 공약.pdf',
 '.\\open\\기호_04 공약.pdf',
 '.\\open\\기호_05 공약.pdf',
 '.\\open\\기호_06 공약.pdf',
 '.\\open\\기호_07 공약.pdf',
 '.\\open\\기호_08 공약.pdf',
 '.\\open\\기호_09 공약.pdf',
 '.\\open\\기호_10 공약.pdf',
 '.\\open\\기호_11 공약.pdf',
 '.\\open\\기호_12 공약.pdf',
 '.\\open\\기호_13 공약.pdf',
 '.\\open\\기호_14 공약.pdf']

In [3]:
# path = '.\open\기호_5 공약.pdf'
# pdf = pdfplumber.open(path)

In [4]:
# Shared Komoran tagger instance, created with a user dictionary file.
# NOTE(review): the file name is spelled 'uesr_dic.txt' — confirm it matches the file on disk.
komoran = Komoran(userdic='.\\uesr_dic.txt')

def tokenization(pdf):
    """Extract the nouns found on every page of an opened PDF.

    Args:
        pdf: An object exposing ``pages``; every page must provide
            ``extract_text()`` (e.g. the object returned by ``pdfplumber.open``).

    Returns:
        A list with one entry per page. Each entry is the value returned by
        ``komoran.nouns`` for that page's text, or an empty list when the page
        has no extractable text.
    """
    nouns_per_page = []
    for page in pdf.pages:
        # extract_text() may give None or '' for a page without a text layer;
        # fall back to '' so the .replace() chain below cannot fail on None.
        raw_text = page.extract_text() or ''
        # Line breaks and tabs are removed (not replaced by spaces), so text
        # wrapped across lines is glued back together before tagging.
        page_text = raw_text.replace('\n', '').replace('\t', '').replace('\r', '')
        if not page_text.strip():
            # Nothing to analyse: keep the one-entry-per-page shape without
            # calling the tagger on blank input.
            nouns_per_page.append([])
            continue
        nouns_per_page.append(komoran.nouns(page_text))
    return nouns_per_page

# pdf_text = tokenization(pdf)
# pdf_text

In [5]:
def preprocessing(text):
    """Strip unwanted characters from ``text`` and split it into morphemes.

    Only spaces, ASCII letters, Hangul syllables (가-힣) and '+' are kept;
    every other character (digits, punctuation, control characters) is removed
    before the text is handed to ``komoran.morphs``.

    Args:
        text: A string, or a list/tuple of tokens. A list/tuple is joined with
            single spaces. Joining (instead of cleaning the list's ``repr``)
            avoids leaking escape-sequence letters such as the "xa" of
            ``'\\xa0'`` into the result. Any other object is converted with
            ``str()``.

    Returns:
        A list of the morphemes returned by ``komoran.morphs``; an empty list
        when nothing is left after cleaning.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(item) for item in text)
    else:
        text = str(text)

    cleaned = re.sub('[^ A-Za-z가-힣+]+', '', text)
    if not cleaned.strip():
        # Avoid calling the tagger on blank input.
        return []

    return list(komoran.morphs(cleaned))

In [6]:
# Per-page token lists of every PDF, accumulated across all files.
all_tokens = []
# One workbook collects a sheet per PDF; it is finalized in a later cell.
writer = pd.ExcelWriter('word_counts.xlsx', engine='xlsxwriter')

for pdf_path in files:
    # The context manager closes the PDF's file handle as soon as the pages
    # have been tokenized, instead of leaving one open handle per file.
    with pdfplumber.open(pdf_path) as pdf:
        pdf_text = tokenization(pdf)

    # Clean and re-tokenize each page's noun list.
    clean_text = [preprocessing(page_nouns) for page_nouns in pdf_text]
    all_tokens.extend(clean_text)

    # Count tokens over all pages of this PDF in a single linear pass
    # (sum(clean_text, []) re-copies the growing list for every page).
    counts = Counter(token for page_tokens in clean_text for token in page_tokens)
    # most_common() sorts by descending count and keeps first-seen order on ties.
    sorted_counts = counts.most_common()

    sorted_counts_df = pd.DataFrame(sorted_counts)

    # Use the file name as sheet name; basename does not depend on how many
    # directory levels the path has.
    # NOTE(review): Excel sheet names are limited to 31 characters — confirm file names stay below that.
    file_name = os.path.basename(pdf_path)

    sorted_counts_df.to_excel(writer, sheet_name=file_name, index=False, header=False)

    print(f'{file_name} completed')

all_tokens

기호_01 공약.pdf completed
기호_02 공약.pdf completed
기호_03 공약.pdf completed
기호_04 공약.pdf completed
기호_05 공약.pdf completed
기호_06 공약.pdf completed
기호_07 공약.pdf completed
기호_08 공약.pdf completed
기호_09 공약.pdf completed
기호_10 공약.pdf completed
기호_11 공약.pdf completed
기호_12 공약.pdf completed
기호_13 공약.pdf completed
기호_14 공약.pdf completed


[['기호',
  '공약',
  '순위',
  '코로나',
  '팬데믹',
  '완전',
  '극복',
  '피해',
  '소상공인',
  '완전',
  '지원',
  '목표',
  '코로나',
  '팬데믹',
  '완전',
  '극복',
  '피해',
  '소상공인',
  '피해',
  '완전',
  '극복',
  '이행',
  '방법',
  '코로나',
  '팬데믹',
  '완전',
  '극복',
  '대응',
  '강화',
  '오미크론',
  '등',
  '변이종',
  '확산',
  '대응',
  '총력',
  '체제',
  '강화',
  '백신',
  '치료제',
  '확보',
  '의료',
  '보건',
  '체제',
  '구축',
  '재정',
  '투입',
  '공공',
  '병원',
  '확보',
  '감염병',
  '대응',
  '역량',
  '강화',
  '국내',
  '개발',
  '백신',
  '치료제',
  '주권',
  '확보',
  '필수',
  '의약품',
  '공공',
  '생산',
  '체계',
  '구축',
  '국산',
  '코로나',
  '백신',
  '치료제',
  '개발',
  '끝',
  '지원',
  '필수',
  '예방 접종',
  '의약품',
  '자급',
  '실현',
  '국가',
  '지원',
  '체제',
  '구축',
  '코로나',
  '백신',
  '치료제',
  '개발',
  '지원',
  '등',
  '바이오',
  '산업',
  '국제',
  '경쟁력',
  '제고',
  '코로나',
  '피해',
  '소상공인',
  '보상',
  '매출',
  '회복',
  '지원',
  '코로나',
  '발생',
  '시점',
  '완전',
  '극복',
  '시점',
  '피해',
  '보상',
  '지원',
  '한국',
  '형',
  '제도',
  '도입',
  '정비',
  '피해',
  '지원',
  '추진',
  '소상공인',
  '자영업',
  '매출',
  '회복',
  '지원',
  

In [7]:
# Flatten the per-page token lists in one linear pass; sum(all_tokens, [])
# builds a new, longer list for every page and is therefore quadratic.
all_tokens_reshaped = [token for page_tokens in all_tokens for token in page_tokens]
counts = Counter(all_tokens_reshaped)
# (token, count) pairs by descending count; ties keep first-seen order.
sorted_counts = counts.most_common()

sorted_counts

[('등', 385),
 ('년', 303),
 ('이행', 246),
 ('확대', 231),
 ('국가', 215),
 ('지원', 212),
 ('공공', 196),
 ('사회', 180),
 ('도입', 179),
 ('보장', 178),
 ('강화', 176),
 ('기호', 168),
 ('주택', 167),
 ('청년', 162),
 ('재원', 159),
 ('폐지', 159),
 ('추진', 155),
 ('공약', 151),
 ('제도', 151),
 ('법', 150),
 ('기간', 145),
 ('순위', 141),
 ('국민', 140),
 ('방법', 135),
 ('조달', 134),
 ('방안', 134),
 ('수', 132),
 ('목표', 128),
 ('정책', 122),
 ('제정', 115),
 ('교육', 115),
 ('전환', 114),
 ('원', 114),
 ('구축', 103),
 ('정부', 102),
 ('산업', 101),
 ('마련', 101),
 ('기업', 101),
 ('실현', 99),
 ('지역', 99),
 ('경제', 98),
 ('예산', 98),
 ('일자리', 96),
 ('정치', 90),
 ('개정', 87),
 ('확보', 83),
 ('고용', 81),
 ('체계', 79),
 ('것', 79),
 ('금지', 78),
 ('공급', 77),
 ('위원회', 76),
 ('돌봄', 76),
 ('서비스', 75),
 ('재정', 73),
 ('통합', 73),
 ('시행', 71),
 ('지급', 71),
 ('미래', 70),
 ('대학', 70),
 ('중심', 69),
 ('의료', 68),
 ('개혁', 68),
 ('주거', 67),
 ('노동', 67),
 ('소득', 66),
 ('이상', 66),
 ('에너지', 63),
 ('개선', 63),
 ('공정', 62),
 ('필요', 62),
 ('개발', 61),
 ('플랫폼', 61),
 ('디지털', 61),


In [8]:
# Tabulate the overall (token, count) pairs; the bare name on the last line displays the frame.
sorted_counts_df = pd.DataFrame(sorted_counts)
sorted_counts_df

Unnamed: 0,0,1
0,등,385
1,년,303
2,이행,246
3,확대,231
4,국가,215
...,...,...
4031,비판,1
4032,보니,1
4033,약방문,1
4034,번,1


In [9]:
# Add the combined counts to the workbook as a sheet named 'all', without index or header row.
sorted_counts_df.to_excel(writer, sheet_name='all', index=False, header=False)

In [10]:
# Finalize the workbook on disk. ExcelWriter.save() is deprecated and no longer
# exists in pandas 2.x; close() writes the file on old and new versions alike.
writer.close()

In [12]:
# Gather the whitespace-separated words of every file under ./stopwords.
all_stopwords = []

for file in glob.glob('./stopwords/*'):
    # The with-block closes the file itself. The extra f.close() that followed
    # the loop was redundant and raised NameError when no file matched.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the stopword files are stored in that encoding.
    with open(file, 'r') as f:
        # extend() keeps the list flat, so no quadratic sum(..., []) is needed.
        all_stopwords.extend(f.read().split())

all_stopwords

['등',
 '년',
 '공약',
 '순위',
 '기호',
 '목표',
 '재원',
 '조달',
 '방법',
 '기간',
 '이행',
 '개',
 '대',
 '수',
 'ㄴ',
 '호',
 '원',
 '목',
 '표',
 '강',
 '차',
 '전',
 '공',
 '분',
 '아',
 '명',
 '직',
 '지',
 '임',
 'ㅁ',
 '업',
 '살',
 '도',
 '영',
 '다',
 '육',
 '자',
 '건',
 '후',
 '간',
 '주',
 '원',
 '기호',
 '치',
 '민',
 '신',
 '위',
 '형',
 '입',
 '열',
 '내',
 '나',
 '속',
 '종',
 '기',
 '축',
 '곳',
 '가',
 '덕',
 '그리',
 '산다',
 '존',
 '관',
 '경',
 '보',
 '가칭',
 '실',
 '노',
 '사',
 '환초',
 '회',
 '과',
 '여대',
 '비',
 '하',
 '인',
 '상군',
 '투',
 'G',
 '청',
 '반시',
 '망',
 '기호',
 '공약',
 '순위',
 '이행',
 '방법',
 '목표',
 '재원',
 '조달',
 '기간',
 '년',
 '호',
 '등',
 '수',
 '후',
 '임',
 '간',
 '실',
 '인',
 '월',
 'ㅁ',
 '것',
 '시',
 '맞추',
 '하',
 '장',
 '중',
 'ㄴ',
 '주',
 '기',
 '차',
 '전',
 '원',
 '의',
 '국',
 '민',
 '비',
 '형',
 '축',
 '년차',
 '서',
 '성',
 '강',
 '건',
 '이어',
 '새',
 '만',
 '단',
 '숙',
 '적',
 '내',
 '동',
 '강',
 '학',
 '열',
 '특',
 '동안',
 '증',
 '이',
 '회',
 '명',
 '세로',
 '공',
 '조',
 '양자',
 '자간',
 '바',
 '망',
 '유',
 '도',
 '기호',
 '안',
 '기호',
 '공약',
 '순위',
 '목표',
 '이행',
 '방법',
 '재원',

In [13]:
# Collapse the stopword list into a set so repeated words appear once.
all_stopwords_set = set(all_stopwords)
all_stopwords_set

{'G',
 'ㄴ',
 'ㄹ',
 'ㅁ',
 '가',
 '가량',
 '가지',
 '가칭',
 '간',
 '감',
 '강',
 '개',
 '개월',
 '거',
 '건',
 '것',
 '경',
 '고',
 '곳',
 '공',
 '공공',
 '공약',
 '과',
 '관',
 '국',
 '권',
 '그동안',
 '그리',
 '그릴',
 '그중',
 '극',
 '금',
 '급',
 '기',
 '기간',
 '기호',
 '나',
 '나가야',
 '날',
 '남은',
 '내',
 '녀',
 '년',
 '년대',
 '년차',
 '노',
 '농',
 '다',
 '단',
 '담',
 '닷',
 '당',
 '대',
 '대부분',
 '덕',
 '덕분',
 '데',
 '도',
 '독',
 '동',
 '동안',
 '드',
 '등',
 '등국',
 '등양',
 '때',
 '때문',
 '량',
 '로',
 '리',
 '리다',
 '마다',
 '만',
 '만천하',
 '만큼',
 '망',
 '망라',
 '맞추',
 '맡',
 '매',
 '면',
 '명',
 '모',
 '목',
 '목표',
 '문',
 '물',
 '미',
 '민',
 '바',
 '밖',
 '반',
 '반시',
 '발',
 '방',
 '방법',
 '배',
 '배로',
 '번',
 '범',
 '보',
 '봉',
 '분',
 '불',
 '비',
 '뿐',
 '사',
 '사이',
 '산다',
 '살',
 '상',
 '상군',
 '새',
 '서',
 '석',
 '설',
 '성',
 '세로',
 '소',
 '속',
 '송',
 '수',
 '숙',
 '순위',
 '시',
 '식',
 '신',
 '실',
 '심',
 '아',
 '아서',
 '안',
 '안팎',
 '액',
 '양',
 '양자',
 '어',
 '업',
 '여',
 '여대',
 '여원',
 '역',
 '열',
 '영',
 '영화로',
 '오',
 '오랫동안',
 '외',
 '용',
 '용탈',
 '울',
 '원',
 '원부',
 '월',
 '위',
 '유',
 '육',
 '으',

In [14]:
# Write the de-duplicated stopwords, one per line.
# Sorting makes the file content reproducible (set iteration order is arbitrary).
# The with-block closes the file, so no explicit f.close() is needed.
# NOTE(review): the file is written with the platform default encoding — confirm
# that is what the code reading ./stopwords/* expects.
with open('./stopwords/stopwords_all.txt', 'w') as f:
    for word in sorted(all_stopwords_set):
        f.write(word+'\n')