# 데이터 불러오기

## 라이브러리

In [18]:
# gb_module.py

# 주요 패키지 불러오기

import pandas as pd
import numpy as np
import csv

from tqdm import tqdm
import time

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from adjustText import adjust_text

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import gb_module

# Font settings so that Korean text can be displayed in figures.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',  # use the Malgun Gothic font
    'axes.unicode_minus': False,  # keep the minus sign rendering correctly
})

In [19]:
# Make figures render more sharply.
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.dpi' : '100'})
# IPython magic (not plain Python): selects the 'retina' format for inline figures.
%config InlineBackend.figure_format = 'retina'

## 데이터 불러오기
- gb_df.txt (period 정보 반영)

In [20]:
# Load the tab-separated table from result/gb_df.txt and preview its first three rows.
gb_df = pd.read_csv('result/gb_df.txt', sep='\t')
gb_df.head(3)

Unnamed: 0,sent_id,sent_raw,sent_split,r_no,title,writer,w_new,ho_no,year,month,grid_1,period
0,1,創刊辭,창간 辭,1,創刊辭,-,uk01,1,1920,6,01q,p1
1,2,强者도 부르짖고 弱者도 부르짖으며 優者도 부르짖고 劣者도 부르짖도다,강자 약자 優者 劣者,1,創刊辭,-,uk01,1,1920,6,01q,p1
2,3,東西南北 四海八方이 다같이 소리中에 묻혀 있도다,동서 남북 사해 팔방 소리,1,創刊辭,-,uk01,1,1920,6,01q,p1


# 시기별 단어 간 관계 행렬 (1기)

## 단어 유사도 행렬
- 사용자 패키지 사용 + 가중 연결망 정보 생성용 입력값

In [23]:
# Keep only the rows whose `period` column equals "p1".
p1_gbdf = gb_df.query('period == "p1"')
# Display the (rows, columns) shape of the subset.
p1_gbdf.shape

(15240, 12)

In [24]:
# Build the document-term matrix (DTM) for period 1 from the 'sent_split' column.
# NOTE(review): the trailing 50 is presumably the number of terms to keep (the
# downstream tool reports a 50 X 50 matrix) — confirm in gb_module.get_dtm.
stopword = ['문제', '금일', '관계'] # words to exclude
p1_dtm = gb_module.get_dtm(p1_gbdf, 'sent_split', stopword, 50)
p1_dtm

Unnamed: 0,사회,朝鮮,생활,사람,민족,自己,主義,사상,세계,문화,...,현상,현재,일본,국가,인생,過去,농업,발달,지식,방법
0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15235,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,9.005388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15236,3.307914,0.0,0.0,0.0,0.000000,0.0,12.821707,0.000000,4.307385,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15237,0.000000,0.0,0.0,0.0,3.789061,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15238,3.307914,0.0,0.0,0.0,0.000000,0.0,0.000000,4.215012,0.000000,9.005388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Compute the term-to-term similarity matrix for period 1 and save it as a
# tab-separated file.
# Transpose the DTM so that terms become the rows before computing similarity.
p1_tdm = p1_dtm.T
p1_term_cossim = gb_module.get_cossim(p1_tdm)
p1_term_cossim.to_csv('result/p1_term_cossim.txt', encoding='utf8', sep='\t')

## 단어 유사도 행렬 변형
- 간선 리스트(edge list) 형태로 변형

In [34]:
def matrix_to_edge(df):
    """Convert a term-by-term similarity matrix into an edge list.

    Every cell of ``df`` becomes one row keyed by a two-level index of
    (column label, row label); the value is stored in a single column named
    ``'cossim'``. Rows that pair a label with itself are dropped, because the
    edge list is prepared as input for the external ``nc`` program.

    Args:
        df: DataFrame holding the pairwise similarity matrix.

    Returns:
        DataFrame with one ``'cossim'`` column, containing only the rows
        whose two index labels differ, in the original unstacked order.
    """
    edges = df.unstack().to_frame()
    edges.columns = ['cossim']

    # Compare the two index levels element-wise instead of building a Python
    # list of tuples and looking each one up by label.
    first_label = edges.index.get_level_values(0)
    second_label = edges.index.get_level_values(1)
    return edges[first_label != second_label]

In [35]:
# Convert the period-1 term similarity matrix into edge-list form and save it
# as a tab-separated file without a header row.
p1_term_cossim_edged = matrix_to_edge(p1_term_cossim)
p1_term_cossim_edged.to_csv('result/p1_term_cossim_edged.txt', encoding='utf8', header=False, sep='\t')

# 시기별 가중 연결망 정보 (1기)

In [40]:
# Run the external analysis scripts (they live outside this notebook).
# Each script prompts for the period number; enter "1" for p1.

%run wnet.0.4.1_hs_1.py
%run nc.0.4_hs_1.py

---------------------------------------------------------
WNET (Weighted Network analysis)
 - PFNet, PNNC, and Weighted Network Centralities (v.0.4,1)
				by J. Y. Lee
---------------------------------------------------------



처리할 시기의 번호를 입력하세요:  1



... 50 X 50 size matrix is entered

'result/p1_zPFNet-term_cossim.txt' is successfully generated.
'result/p1_zPNNC-term_cossim.txt' is successfully generated.
'result/p1_zWCENT-term_cossim.txt' is successfully generated.
------------------------------------------------------
nc - Neighbor Centrality Calculation Program (v.0.3)
				by J. Y. Lee
------------------------------------------------------



처리할 시기의 번호를 입력하세요:  1


'result\p1_zzNC_2.0-term_cossim_edged.txt' is successfully generated.


# 시기별 단어 간 관계 및 연결망 정보(2기, 3기)

In [64]:
periods = ['p2', 'p3']
prd_df = {}  # per-period subsets of gb_df, keyed as "<period>_gbdf"

for i in periods:
    # Store this period's rows in the dictionary and report their shape.
    prd_df[f"{i}_gbdf"] = gb_df.query('period == @i')
    print(prd_df[f"{i}_gbdf"].shape)

    # Build the document-term matrix (DTM) for this period.
    stopword = ['문제', '금일', '관계']  # words to exclude
    prd_dtm = gb_module.get_dtm(prd_df[f"{i}_gbdf"], 'sent_split', stopword, 50)

    # Compute the term-to-term similarity matrix and save it.
    # The DTM is transposed so that terms become the rows.
    prd_tdm = prd_dtm.T
    prd_term_cossim = gb_module.get_cossim(prd_tdm)
    prd_term_cossim.to_csv(f'result/{i}_term_cossim.txt', encoding='utf8', sep='\t')

    # Convert the similarity matrix into edge-list form and save it without a header row.
    prd_term_cossim_edged = matrix_to_edge(prd_term_cossim)
    prd_term_cossim_edged.to_csv(f'result/{i}_term_cossim_edged.txt', encoding='utf8', header=False, sep='\t')

(14522, 12)
(4268, 12)


In [63]:
# Total number of rows across the three period subsets (p1, p2, p3).
sum(len(subset) for subset in (p1_gbdf, prd_df['p2_gbdf'], prd_df['p3_gbdf']))

34030

In [66]:
# Run the external analysis scripts (they live outside this notebook).
# Each script prompts for the period number; enter "2" for p2 and "3" for p3.
# NOTE(review): one run appears to process a single period, so this cell
# presumably has to be executed once per period — confirm against the scripts.

%run wnet.0.4.1_hs_1.py
%run nc.0.4_hs_1.py

---------------------------------------------------------
WNET (Weighted Network analysis)
 - PFNet, PNNC, and Weighted Network Centralities (v.0.4,1)
				by J. Y. Lee
---------------------------------------------------------



처리할 시기의 번호를 입력하세요:  3



... 50 X 50 size matrix is entered

'result/p3_zPFNet-term_cossim.txt' is successfully generated.
'result/p3_zPNNC-term_cossim.txt' is successfully generated.
'result/p3_zWCENT-term_cossim.txt' is successfully generated.
------------------------------------------------------
nc - Neighbor Centrality Calculation Program (v.0.3)
				by J. Y. Lee
------------------------------------------------------



처리할 시기의 번호를 입력하세요:  3


'result\p3_zzNC_2.0-term_cossim_edged.txt' is successfully generated.


# 연결망 작성용 파일 생성(p1,p2,p3)

In [68]:
period_numbers = [1, 2, 3]

for period_number in period_numbers:
    # Step 2: read this period's zPFNet link file and convert the link
    # weights into grades.
    lk_grade = pd.read_csv(
        f'result/p{period_number}_zPFNet-term_cossim.txt',
        sep='\t',
        encoding='utf8',
    )
    lk_grade.columns = ['node1', 'node2', 'weight']
    # NOTE(review): the return value is not used here; presumably the helper
    # works in place or only reports — confirm in gb_module.
    gb_module.remove_same_pairwords(lk_grade)
    lk_graded = gb_module.transfer_to_grade(lk_grade, 'weight', 5)

    # Step 3: read the node group and centrality files and combine them.
    gp_info = pd.read_csv(
        f'result/p{period_number}_zPNNC-term_cossim.txt',
        encoding='utf8',
        sep='\t',
    )
    glob_ctrl = pd.read_csv(
        f'result/p{period_number}_zWCENT-term_cossim.txt',
        encoding='utf8',
        sep='\t',
    )
    loca_ctrl = pd.read_csv(
        f'result/p{period_number}_zzNC_2.0-term_cossim_edged.txt',
        encoding='utf8',
        sep='\t',
    )
    gp_ctrl = gb_module.get_gp_ctrl(gp_info, glob_ctrl, loca_ctrl)
    node_attr_df, groups_df, group_vertices_df = gb_module.generate_node_attributes(
        lk_graded, gp_ctrl, 'weight', 5
    )

    # Save the results: one workbook per period, one sheet per table.
    with pd.ExcelWriter(f'result/p{period_number}_results.xlsx') as writer:
        for sheet_name, frame in (
            ('Links', lk_graded),
            ('Node_Attributes', node_attr_df),
            ('Groups', groups_df),
            ('Group_Vertices', group_vertices_df),
        ):
            frame.to_excel(writer, sheet_name=sheet_name)

No duplicate rows found.
No duplicate rows found.
No duplicate rows found.


# End of Notebook