# 데이터 불러오기

## 라이브러리

In [1]:
import pandas as pd
import numpy as np
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 데이터 불러오기

In [2]:
# Load the three Excel inputs used by the rest of the notebook.
sent = pd.read_excel('data/1_gb_sent_1.xlsx')
ho = pd.read_excel('data/3_ho_grid_1.xlsx')

# `r` is loaded without its 'ho_no' column.
# NOTE(review): presumably this avoids a duplicated 'ho_no' column when `r`
# is later merged with `sent` -- confirm against the workbook contents.
r = pd.read_excel('data/2_r_ho_1.xlsx')
r = r.drop(columns='ho_no')

In [4]:
def combine_to_basic_df(sent, r, ho):
    """Combine the Gaebyeok tables into the base DataFrame used for analysis.

    ``sent`` is inner-joined to ``r`` on ``sent.r_no == r.r_id`` and the
    result is inner-joined to ``ho`` on ``ho_no == ho.ho_id``, so rows
    without a match in either lookup table are dropped.

    Args:
        sent: DataFrame providing at least ``r_no`` for the first join.
        r: DataFrame keyed by ``r_id``.
        ho: DataFrame keyed by ``ho_id``.

    Returns:
        The joined frame restricted to the eleven analysis columns listed
        in ``keep_cols``, in that order.
    """
    keep_cols = [
        'sent_id', 'sent_raw', 'sent_split',
        'r_no', 'title', 'writer', 'w_new',
        'ho_no', 'year', 'month', 'grid_1',
    ]

    joined = sent.merge(r, left_on='r_no', right_on='r_id', how='inner')
    joined = joined.merge(ho, left_on='ho_no', right_on='ho_id', how='inner')
    return joined[keep_cols]

In [5]:
# Build the base DataFrame by joining the three loaded tables
gb_df = combine_to_basic_df(sent, r, ho)
# Print (rows, columns) as a quick sanity check of the join result
print(gb_df.shape)
# Bare expression: shows the frame as the notebook cell output
gb_df

(34030, 11)


Unnamed: 0,sent_id,sent_raw,sent_split,r_no,title,writer,w_new,ho_no,year,month,grid_1
0,1,創刊辭,창간 辭,1,創刊辭,-,uk01,1,1920,6,01q
1,2,强者도 부르짖고 弱者도 부르짖으며 優者도 부르짖고 劣者도 부르짖도다,강자 약자 優者 劣者,1,創刊辭,-,uk01,1,1920,6,01q
2,3,東西南北 四海八方이 다같이 소리中에 묻혀 있도다,동서 남북 사해 팔방 소리,1,創刊辭,-,uk01,1,1920,6,01q
3,4,霹靂이냐 地震이냐 神籟이냐 魔哭이냐 우리는 아즉 이 소리의 正邪를 判斷할 수 없도다,소리 판단,1,創刊辭,-,uk01,1,1920,6,01q
4,5,左右間 多數가 渴仰하고 多數가 要求하는 人民의 소리임은 明白하도다,좌우 間 다수 갈앙 다수 요구 인민 소리 명백,1,創刊辭,-,uk01,1,1920,6,01q
...,...,...,...,...,...,...,...,...,...,...,...
34025,34026,어떠케 되엿던지 만약 미국과 영국간에 전쟁이 生起인다면 영국의 또미니온인 캐나다는 ...,미국 영국 전쟁 生起 영국 캐나다 미국 양식 힘 영국,334,유로빠와 아메리카(一) 금년 봄에 모쓰크바 엑쓰페리멘탈 劇場에서 한 「트로츠끼」의 講演.,쇠뫼 譯,김철산,72,1926,8,24q
34026,34027,이 비밀은 여러분과 나 외에 또 政客 셋이 알고 잇음니다,비밀 정객,334,유로빠와 아메리카(一) 금년 봄에 모쓰크바 엑쓰페리멘탈 劇場에서 한 「트로츠끼」의 講演.,쇠뫼 譯,김철산,72,1926,8,24q
34027,34028,그것은 미국과 영국과 캐나다임니다,미국 영국 캐나다,334,유로빠와 아메리카(一) 금년 봄에 모쓰크바 엑쓰페리멘탈 劇場에서 한 「트로츠끼」의 講演.,쇠뫼 譯,김철산,72,1926,8,24q
34028,34029,미국의 물질적 권력은 대개 이러함니다,미국 물질 권력,334,유로빠와 아메리카(一) 금년 봄에 모쓰크바 엑쓰페리멘탈 劇場에서 한 「트로츠끼」의 講演.,쇠뫼 譯,김철산,72,1926,8,24q


# 특성, 특성벡터
- sent 데이터프레임을 대상으로 진행

## 문서-단어 행렬(dtm) 산출 함수

In [6]:
def get_dtm(df, col_name, stopw, rank_n):
    """Return the TF-IDF document-term columns of the top-ranked terms.

    A ``TfidfVectorizer`` without row normalisation (``norm=None``) is fitted
    on ``df[col_name]``. Terms are ranked by their TF-IDF weight summed over
    all documents, and only the ``rank_n`` highest-ranked columns are kept.

    The ranking is computed on the sparse matrix and only the selected
    columns are converted to a dense array, so memory use scales with
    ``rank_n`` rather than with the full vocabulary size.

    Args:
        df: DataFrame holding one document per row.
        col_name: Name of the column containing the text to vectorize.
        stopw: Stop words passed to ``TfidfVectorizer(stop_words=...)``.
        rank_n: Number of top-ranked terms (columns) to keep.

    Returns:
        DataFrame indexed like ``df`` with one column per selected term,
        ordered from the highest to the lowest summed TF-IDF weight.
    """
    # Vectorize every term; with scikit-learn's default token pattern only
    # tokens of two or more characters are kept.
    tv = TfidfVectorizer(stop_words=stopw, norm=None)
    dtm = tv.fit_transform(df[col_name])
    vocab = tv.get_feature_names_out()

    # Per-term totals taken directly from the sparse matrix (no densifying).
    col_totals = np.asarray(dtm.sum(axis=0)).ravel()

    # Column positions of the rank_n largest totals, highest first.
    ranked = pd.Series(col_totals).sort_values(ascending=False)
    top_cols = ranked.index[:rank_n].to_numpy()

    # Densify only the selected columns; column order follows the ranking.
    feature_df = pd.DataFrame(
        dtm[:, top_cols].toarray(),
        columns=vocab[top_cols],
        index=df.index,
    )
    return feature_df

## 특성 및 특성벡터
- tfidf 고빈도 50위 단어

In [10]:
# Words to exclude from the vocabulary
stopword_3 = ['문제', '금일', '관계']

# Run the function on the sentence table, keeping the 50 top-ranked terms
dtm50_df = get_dtm(
    df=sent,
    col_name='sent_split',
    stopw=stopword_3,
    rank_n=50,
)
dtm50_df

Unnamed: 0,사회,朝鮮,主義,사람,생활,민족,계급,운동,사상,自己,...,생산,意識,문명,발달,개조,역사,생명,세력,목적,현대
0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34025,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34026,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34027,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34028,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Column totals: summed TF-IDF weight of each selected term over all rows
dtm50_df.sum()

사회    15827.172075
朝鮮    14199.233664
主義    12674.308180
사람    12671.241649
생활    11580.715072
민족     9910.092593
계급     9816.194654
운동     8654.394675
사상     8051.875884
自己     6871.976023
경제     6478.327521
세계     6409.178086
노동     6320.916148
인류     5445.257318
시대     5373.451062
정치     5324.171992
자본     5231.891352
일본     5005.844742
정신     4811.027361
문화     4768.675500
민중     4748.929060
자유     4674.698179
필요     4565.964933
단체     4542.969431
자연     4435.627779
국가     4419.284448
개인     4345.515142
소작     4289.453831
제도     4207.552746
일반     4199.175122
도덕     4155.286878
종교     3985.073282
의미     3966.282610
교육     3941.552793
理想     3922.170708
인간     3905.554257
조직     3847.144943
혁명     3759.686263
현상     3690.235847
중국     3655.497693
생산     3620.314701
意識     3556.246022
문명     3396.093742
발달     3393.454508
개조     3259.842289
역사     3140.154939
생명     3129.882563
세력     3123.447753
목적     3092.028435
현대     3090.535795
dtype: float64

In [14]:
# Save as a .txt file (tab-separated), keeping the row index as the first column
dtm50_df.to_csv('result/gb_030_dtm50_df.txt', sep='\t', index=True)

# End of Notebook