In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
import io
import csv
import os
import re
#시드값 고정
np.random.seed(42)
%matplotlib inline

#KoNLPy 공식 문서 : https://konlpy.org/ko/latest/
#설치되었는지 확인 : pip install konlpy
try:
  from konlpy.tag import Hannanum, Kkma, Komoran, Okt
except:
  !pip install konlpy
  from konlpy.tag import Hannanum, Kkma, Komoran, Okt

#LDA : 빈도수 기반 CountVectorizer 사용
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
#설치되었는지 확인 : pip install gensim
import gensim
#설치되었는지 확인 : pip install pyLDAvis
try:
  import pyLDAvis.gensim_models
except:
  !pip install pyLDAvis
  import pyLDAvis.gensim_models
try:
  import PyPDF2
except:
  !pip install pypdf2
  import PyPDF2

try:
  from tika import parser
except:
  !pip install tika 
  from tika import parser

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

  from collections import Iterable


In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

mpl.rc('font', family='NanumBarunGothic')
sns.set(font="NanumBarunGothic", 
        rc={"axes.unicode_minus":False},
        style='darkgrid')

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20170925-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 10 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/var/cache/fontconfig: cleaning cache directory
/root/.cache/fontconfig: not cleaning non-existent

In [None]:
# 그림 저장
image_path = '/content/drive/MyDrive/LIS3813/images/'

def save_fig(fig_name, tight_layout=True, fig_extension="png", resolution=300):
    #해상도 dpi=300
    path = image_path + fig_name + "." + fig_extension
    print("그림 저장: ", fig_name)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pdf_path = "/content/drive/MyDrive/LIS3821/datasets/사회모범규준(2010.12).pdf" 
parsed = parser.from_file(pdf_path)

txt = open('/content/drive/MyDrive/LIS3821/datasets/social_criteria.txt', 'w', encoding = 'utf-8')
# output.txt에 pdf파일 내용을 write
print(parsed['content'], file = txt)
txt.close()

In [None]:
pdf_path = '/content/drive/MyDrive/LIS3821/datasets/기업지배구조 모범규준(2차개정,2016.08).pdf'
parsed = parser.from_file(pdf_path)

txt = open('/content/drive/MyDrive/LIS3821/datasets/governance_criteria.txt', 'w', encoding = 'utf-8')
# output.txt에 pdf파일 내용을 write
print(parsed['content'], file = txt)
txt.close()

In [None]:
pdf_path = '/content/drive/MyDrive/LIS3821/datasets/환경모범규준(2010.12).pdf'
parsed = parser.from_file(pdf_path)

txt = open('/content/drive/MyDrive/LIS3821/datasets/environment_criteria.txt', 'w', encoding = 'utf-8')
# output.txt에 pdf파일 내용을 write
print(parsed['content'], file = txt)
txt.close()

# 사회

In [None]:
file_path = '/content/drive/MyDrive/LIS3821/datasets/social_criteria.txt'

with open(file_path) as f:
    lines = f.read().splitlines()

In [None]:
text = []
for i in lines:
  if i != '':
    text.append(i)

In [None]:
text[20:50]

['4. 소비자와의 소통 ···················································································· 22',
 'Ⅳ. 지역사회 ',
 '1. 지역사회 참여 및 공헌 ········································································· 23',
 '2. 지역경제발전 ·························································································· 24',
 '3. 지역사회와의 소통 ················································································· 26',
 '前 文',
 '2',
 '[모범규준의 제정 배경]',
 '지속가능경영에 대한 사회적 인식 고조와 기업의 글로벌화 및 다양한 이해관계자의 등장, 기후변화 등으로 ',
 '기업의 사회적 책임에 대한 관심이 증대되고 있다. 특히 국제화된 자본시장에 있어 기업의 사회적 책임 ',
 '이행은 투자자들의 투자의사결정에 매우 중요한 요소로 등장하고 있다. 따라서 기업의 사회책임은 더 이상 ',
 '사회적 요구에 대한 수동적 대응의 문제가 아니며, 기업의 자금조달과 투자에 중대한 영향을 미치는 ',
 '요소로 대두되고 있다. ',
 '더욱이 ISO 26000이나 SRI(사회책임투자) 지수들이 기업의 생산과 투자활동에 있어 기업들에게 직접적인 ',
 '영향을 줄 가능성이 매우 높다. ',
 '이제 기업은 사회적 책임을 기업의 이미지 제고나 홍보 측면에서만 접근할 것이 아니라 기업가치 제고를 ',
 '위한 필수불가결한 요소로 인식하여 기업 경영에 적극 활용해야 할 것이다.',
 '반면에 사회적 책임이 과연 무엇인가에 대한 일관된 논의나 방향 제시는 충분히 이루어지지 않고 있어 ',
 '기업의 입장에서 과연 어떻게 사회적 책임을 실

In [None]:
corpus = []

for i in text:
  i = re.sub(r'[^\sA-Za-z가-힣]',"",i) #한글만 가져오기 공백 \s
  corpus.append(i)

social_corpus = []
for i in corpus:
  i = i.strip()
  if i != '':
    social_corpus.append(i)

In [None]:
stopwords=['와의', '대해', '내용', '고려', '주어', '최소한', '규정', '기준', '관련', '사항', '경우', '이상', '이하',
           '초과', '최소', '최대', '대비', '참고', '순위도', '기존', '모범', '관계', '고조', '요소','문제', '영향',
           '더욱이', '대두', '직접', '얼마나', '지침', '지향', '지양', '우리나라', '비롯', '도록', '수준', '국가', '모두',
           '국내', '보고', '최선', '부족', '다음', '이유', '간주', '저해', '우려', '또한', '거나', '규준', '위해', '기반', '따라서',
           '대한', '방법', '모든', '우선', '통해', '도비', '조건', '포함', '상황', '프로그램', '기업', '각종', '활동', '서비스',
           '자신', '주요', '다른', '해당', '주의', '로서', '로써', '로부터', '기타', '여부', '다만', '개별', '개인', '중요', '특정', '우리', '이외', '사실', '가장', '제일', '방향', '활용', 
           ]

In [None]:
okt = Okt()

def get_nounslist(corpus, stopwords=stopwords):
    nouns_list = []

    for c in tqdm(corpus):
        #명사 추출
        nouns = okt.nouns(c)
        data = []
        for noun in nouns:
          #2음절 이상
          if (len(noun) >= 2) and (noun not in stopwords):
            data.append(noun)
          if len(data) != 0:
            nouns_list.append(data)

    return nouns_list

In [None]:
social_nouns_nested = get_nounslist(social_corpus)

100%|██████████| 775/775 [00:01<00:00, 537.93it/s]


In [None]:
print(social_nouns_nested[:20])
print(f'문서 개수: {len(social_nouns_nested)}')

[['사회'], ['사회'], ['사회'], ['근로자'], ['고용', '근로'], ['고용', '근로'], ['고용', '근로'], ['고용', '근로'], ['노사'], ['노사'], ['직장', '보건', '안전'], ['직장', '보건', '안전'], ['직장', '보건', '안전'], ['직장', '보건', '안전'], ['직장', '보건', '안전'], ['인력', '개발', '지원'], ['인력', '개발', '지원'], ['인력', '개발', '지원'], ['인력', '개발', '지원'], ['직장', '기본권']]
문서 개수: 4692


In [None]:
df_social_nouns = pd.DataFrame({'nouns' : social_nouns_nested})
df_social_nouns['nouns'] = [" ".join(x) for x in df_social_nouns['nouns'].values]

In [None]:
df_social_nouns

Unnamed: 0,nouns
0,사회
1,사회
2,사회
3,근로자
4,고용 근로
...,...
4687,협의 참여 접근 방식 취하
4688,협의 참여 접근 방식 취하
4689,협의 참여 접근 방식 취하
4690,협의 참여 접근 방식 취하


In [None]:
social_nouns = [y for x in social_nouns_nested for y in x]

In [None]:
from collections import Counter
counter = Counter(social_nouns)
counter.most_common(20)

[('사회', 1237),
 ('지역', 1079),
 ('근로자', 872),
 ('제품', 740),
 ('소비자', 676),
 ('제공', 632),
 ('협력', 606),
 ('거래', 523),
 ('정보', 433),
 ('행위', 379),
 ('교육', 361),
 ('책임', 353),
 ('고용', 349),
 ('지원', 326),
 ('보건', 310),
 ('노동', 277),
 ('안전', 255),
 ('투자', 250),
 ('사용', 222),
 ('보호', 218)]

In [None]:
tfidf = TfidfVectorizer(max_df = 0.95, max_features=1000)
tfidf_matrix = tfidf.fit_transform(df_social_nouns['nouns'])

In [None]:
tfidf.vocabulary_
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
df_social_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns = sorted(tfidf.vocabulary_))
df_social_tfidf.head()

Unnamed: 0,가격,가능,가능성,가로,가입,가족,가족부,가치,간섭,간접,...,환불,회사,회수,획득,효과,효율,훈련,훼손,휴가,휴일
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
social_keyword_250_df = pd.DataFrame(df_social_tfidf.sum().sort_values(ascending=False)[:250])
social_keyword_250 = social_keyword_250_df.index.tolist()
len(social_keyword_250)

250

In [None]:
path = '/content/drive/MyDrive/LIS3821/datasets/social_keyword_250.txt'

with open(path,'w',encoding='utf-8') as f:
    for keyword in social_keyword_250:
        f.write(keyword+'\n')

f.close()

# 환경

In [None]:
file_path = '/content/drive/MyDrive/LIS3821/datasets/environment_criteria.txt'

with open(file_path) as f:
    lines = f.read().splitlines()

In [None]:
text = []
for i in lines:
  if i != '':
    text.append(i)

In [None]:
corpus = []

for i in text:
  i = re.sub(r'[^\s가-힣]',"",i) #한글만 가져오기 공백 \s
  corpus.append(i)

e_corpus = []
for i in corpus:
  i = i.strip()
  if i != '':
    e_corpus.append(i)

In [None]:
e_nouns_nested = get_nounslist(e_corpus)
print(e_nouns_nested[:20])
print(f'문서 개수: {len(e_nouns_nested)}')

100%|██████████| 669/669 [00:01<00:00, 337.74it/s]

[['환경'], ['환경'], ['환경'], ['환경', '경영', '계획'], ['환경', '경영', '계획'], ['환경', '경영', '계획'], ['최고경영자', '실천', '의지'], ['최고경영자', '실천', '의지'], ['최고경영자', '실천', '의지'], ['환경', '경영', '전략', '방침'], ['환경', '경영', '전략', '방침'], ['환경', '경영', '전략', '방침'], ['환경', '경영', '전략', '방침'], ['환경', '목표', '계획', '수립'], ['환경', '목표', '계획', '수립'], ['환경', '목표', '계획', '수립'], ['환경', '목표', '계획', '수립'], ['환경', '목표', '계획', '수립'], ['환경', '조직', '문화'], ['환경', '조직', '문화']]
문서 개수: 5732





In [None]:
df_e_nouns = pd.DataFrame({'nouns' : e_nouns_nested})
df_e_nouns['nouns'] = [" ".join(x) for x in df_e_nouns['nouns'].values]

df_e_nouns

Unnamed: 0,nouns
0,환경
1,환경
2,환경
3,환경 경영 계획
4,환경 경영 계획
...,...
5727,접근 용이 강제 비용 발생
5728,접근 용이 강제 비용 발생
5729,접근 용이 강제 비용 발생
5730,접근 용이 강제 비용 발생


In [None]:
e_nouns = [y for x in e_nouns_nested for y in x]

In [None]:
from collections import Counter
counter = Counter(e_nouns)
counter.most_common(20)

[('환경', 4954),
 ('경영', 1843),
 ('환경성', 883),
 ('정보', 744),
 ('제품', 687),
 ('관리', 672),
 ('평가', 574),
 ('생산', 536),
 ('이해관계자', 508),
 ('수립', 455),
 ('내부', 455),
 ('구축', 426),
 ('시스템', 423),
 ('감사', 419),
 ('성과', 405),
 ('전략', 380),
 ('개선', 380),
 ('실행', 377),
 ('구성원', 374),
 ('목표', 372)]

In [None]:
tfidf = TfidfVectorizer(max_df = 0.95, max_features=1000)
tfidf_matrix = tfidf.fit_transform(df_e_nouns['nouns'])

In [None]:
df_e_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns = sorted(tfidf.vocabulary_))
df_e_tfidf.head()

Unnamed: 0,가격,가능,가능성,가동,가스,가지,가치,각하,간접,감사,...,회계,회수,회의,획득,효과,효율,효익,훈련,흐름,흡수
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
e_keyword_250_df = pd.DataFrame(df_e_tfidf.sum().sort_values(ascending=False)[:250])
e_keyword_250 = e_keyword_250_df.index.tolist()
len(e_keyword_250)

250

In [None]:
path = '/content/drive/MyDrive/LIS3821/datasets/environment_keyword_250.txt'

with open(path,'w',encoding='utf-8') as f:
    for keyword in e_keyword_250:
        f.write(keyword+'\n')
        
f.close()

# 지배구조

In [None]:
file_path = '/content/drive/MyDrive/LIS3821/datasets/governance_criteria.txt'

with open(file_path) as f:
    lines = f.read().splitlines()

In [None]:
text = []
for i in lines:
  if i != '':
    text.append(i)

In [None]:
corpus = []

for i in text:
  i = re.sub(r'[^\s가-힣]',"",i) #한글만 가져오기 공백 \s
  corpus.append(i)

In [None]:
g_corpus = []
for i in corpus:
  i = i.strip()
  if i != '':
    g_corpus.append(i)

In [None]:
g_nouns_nested = get_nounslist(g_corpus)
print(g_nouns_nested[:20])
print(f'문서 개수: {len(g_nouns_nested)}')

100%|██████████| 1052/1052 [00:02<00:00, 364.07it/s]

[['제정', '배경'], ['제정', '배경'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['경제', '가치', '창조', '주체', '경쟁력', '경쟁력', '결정'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력'], ['핵심', '요체', '세계', '각국', '자국', '효율', '경쟁력']]
문서 개수: 7543





In [None]:
df_g_nouns = pd.DataFrame({'nouns' : g_nouns_nested})
df_g_nouns['nouns'] = [" ".join(x) for x in df_g_nouns['nouns'].values]

df_g_nouns

Unnamed: 0,nouns
0,제정 배경
1,제정 배경
2,경제 가치 창조 주체 경쟁력 경쟁력 결정
3,경제 가치 창조 주체 경쟁력 경쟁력 결정
4,경제 가치 창조 주체 경쟁력 경쟁력 결정
...,...
7538,개선 독립성 확보 방안 마련 적극 실행
7539,개선 독립성 확보 방안 마련 적극 실행
7540,개선 독립성 확보 방안 마련 적극 실행
7541,개선 독립성 확보 방안 마련 적극 실행


In [None]:
g_nouns = [y for x in g_nouns_nested for y in x]

In [None]:
counter = Counter(g_nouns)
counter.most_common(20)

[('감사', 2392),
 ('위원회', 1348),
 ('이사회', 1186),
 ('주주', 1148),
 ('경영', 898),
 ('이사', 799),
 ('사외이사', 752),
 ('외부', 679),
 ('정보', 678),
 ('수행', 544),
 ('공시', 538),
 ('행사', 465),
 ('책임', 410),
 ('지배구조', 399),
 ('운영', 351),
 ('영진', 348),
 ('평가', 345),
 ('직무', 337),
 ('선임', 326),
 ('지배', 308)]

In [None]:
tfidf = TfidfVectorizer(max_df = 0.95, max_features=1000)
tfidf_matrix = tfidf.fit_transform(df_g_nouns['nouns'])

In [None]:
df_g_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns = sorted(tfidf.vocabulary_))
df_g_tfidf.head()

Unnamed: 0,가격,가능성,가액,가입,가정,가족,가지,가치,각각,각계,...,효과,효력,효율,후보,후보자,후의,훼손,휴가,흐름,희생
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35139,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35139,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35139,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
g_keyword_250_df = pd.DataFrame(df_g_tfidf.sum().sort_values(ascending=False)[:250])
g_keyword_250 = g_keyword_250_df.index.tolist()
len(g_keyword_250)

250

In [None]:
path = '/content/drive/MyDrive/LIS3821/datasets/governance_keyword_250.txt'

with open(path,'w',encoding='utf-8') as f:
    for keyword in g_keyword_250:
        f.write(keyword+'\n')
        
f.close()