In [34]:
import os
import sys
from glob import glob
from collections import Counter
from konlpy.tag import Mecab
import pandas as pd 
import numpy as np
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.font_manager as fm
plt.style.use('ggplot')
plt.rc('font', family='Malgun Gothic')
mpl.rcParams['axes.unicode_minus'] = False


sys.path.insert(0, '../')
from utils import load_pickle
from load_data import load_data
from config import Config

In [8]:
# Label mapping shipped with the dataset, plus its value -> key inverse.
# NOTE(review): load_pickle presumably unpickles this file - only point it at trusted input.
relations = load_pickle('../input/data/label_type.pkl')
inverse_relations = {value: key for key, value in relations.items()}

# Tagger instance used for the POS tagging of sentences.
tagger = Mecab()

In [3]:
# Load the training split selected by Config.Train.
train_raw = load_data(Config.Train)

# Row count of the training frame; used later as a denominator.
TRAIN_SIZE = len(train_raw)
print(train_raw.shape)

(9000, 4)


In [4]:
# Entities that occur in both the entity_01 and the entity_02 column.
entities_first = set(train_raw['entity_01'])
entities_second = set(train_raw['entity_02'])
n_shared_entities = len(entities_first.intersection(entities_second))

print(f"Uniqueness for Entity 1: {train_raw['entity_01'].nunique()}")
print(f"Uniqueness for Entity 2: {train_raw['entity_02'].nunique()}")
print(f"Intersection between Entities: {n_shared_entities}")
# NOTE(review): the rate divides by the number of rows (TRAIN_SIZE), not by the
# number of distinct entities - confirm this is the intended denominator.
print(f"Intersection Rate: {n_shared_entities / TRAIN_SIZE: .4f}")

Uniqueness for Entity 1: 5089
Uniqueness for Entity 2: 4474
Intersection between Entities: 694
Intersection Rate:  0.0771


In [25]:
ENTITY1 = '더불어민주당'
ENTITY2 = '문재인'


def select_entity_pair(df, first, second):
    """Return the rows of ``df`` for one direction of an entity pair.

    Args:
        df: frame with 'entity_01', 'entity_02' and 'label' columns.
        first: value required in the 'entity_01' column.
        second: value required in the 'entity_02' column.

    Returns:
        A new frame with a fresh 0..n-1 index whose 'label' values have been
        translated through ``inverse_relations``.

    Raises:
        KeyError: if a label is not a key of ``inverse_relations``.
    """
    mask = (df['entity_01'] == first) & (df['entity_02'] == second)
    subset = df[mask].reset_index(drop=True)
    # Replace the column as a whole rather than writing through .loc[:, 'label']:
    # the translated labels may have a different dtype than the stored ones
    # (presumably integer codes -> names; TODO confirm), and an in-place write
    # would have to cast.
    subset['label'] = subset['label'].apply(lambda x: inverse_relations[x])
    return subset


# Direction ENTITY1 -> ENTITY2 (entity_01 is ENTITY1, entity_02 is ENTITY2)
entity_left = select_entity_pair(train_raw, ENTITY1, ENTITY2)

# Direction ENTITY2 -> ENTITY1 (entity_01 is ENTITY2, entity_02 is ENTITY1)
entity_right = select_entity_pair(train_raw, ENTITY2, ENTITY1)

In [28]:
# POS tags kept for keyword extraction, grouped by word class.
nouns = [
    'NNG',
    'NNP',
    'NNB',
    'NNBC',
    'NR',
    'NP',
]
verb = ['VV']
adjective = ['VA']

# Combined whitelist: nouns first, then verbs, then adjectives.
FILTER = [*nouns, *verb, *adjective]


In [29]:
def filter_tags(pos: list, tags=None):
    """Keep the token text of every tagged token whose tag is whitelisted.

    Args:
        pos: sequence of token tuples; the first item of each tuple is taken
            as the token text and the last item as its tag (presumably the
            (surface, tag) pairs returned by ``tagger.pos`` - verify).
        tags: optional container of tags to keep. When omitted, the
            notebook-level ``FILTER`` list is used.

    Returns:
        list: the first item of every kept tuple, in input order.
    """
    # Resolve the default at call time so a later change to FILTER is honoured.
    allowed = FILTER if tags is None else tags
    return [token[0] for token in pos if token[-1] in allowed]


# Tag every sentence and keep only the whitelisted tokens, for both directions
# (left first, then right).
for pair_df in (entity_left, entity_right):
    pair_df['tagged'] = pair_df['sentence'].apply(lambda x: filter_tags(tagger.pos(x)))

In [31]:
def flatten(l: list):
    """Flatten ``l`` by exactly one level.

    Items that are lists contribute their elements; every other item is kept
    as-is. Lists nested deeper than one level are not unpacked further.

    Returns:
        list: a new list; ``l`` itself is not modified.
    """
    return [
        item
        for element in l
        for item in (element if isinstance(element, list) else (element,))
    ]

In [51]:
# '문재인 -> 더불어민주당' 관계에서의 주요 소잿거리
keywords_right = pd.Series(flatten(entity_right['tagged'].tolist())).unique().tolist()

# '더불어민주당 -> 문재인' 관계에서의 주요 소잿거리
keywords_left = pd.Series(flatten(entity_left['tagged'].tolist())).unique().tolist()

print(len(keywords_right), len(keywords_left))

416 318


In [50]:
# Number of keywords shared by both directions (shown as the cell result).
len(set(keywords_right) & set(keywords_left))

187

In [78]:
# Token frequencies do not depend on top_k, so they are counted once up front
# instead of being recomputed on every iteration.
# Keyword frequencies in the ENTITY2 -> ENTITY1 direction (entity_right)
keywords_right_cnt = pd.Series(flatten(entity_right['tagged'].tolist())).value_counts()
# Keyword frequencies in the ENTITY1 -> ENTITY2 direction (entity_left)
keywords_left_cnt = pd.Series(flatten(entity_left['tagged'].tolist())).value_counts()

right_total = keywords_right_cnt.sum()
left_total = keywords_left_cnt.sum()

for top_k in [10, 20, 30, 40, 50, 100, 200, 300]:
    # value_counts() orders by descending frequency, so the first top_k index
    # entries are the top_k most frequent keywords.
    keywords_right = keywords_right_cnt.index[:top_k].tolist()
    keywords_left = keywords_left_cnt.index[:top_k].tolist()

    # Share of the top-k slots occupied by keywords common to both directions.
    print(f'Top {top_k} 중복도', len(set(keywords_right).intersection(set(keywords_left))) / top_k)
    # Fraction of all token occurrences covered by the top-k keywords.
    print(f'Right Rate: {keywords_right_cnt.iloc[:top_k].sum() / right_total}')
    print(f'Left Rate: {keywords_left_cnt.iloc[:top_k].sum() / left_total}')

Top 10 중복도 0.8
Right Rate: 0.2557213930348259
Left Rate: 0.2888198757763975
Top 20 중복도 0.8
Right Rate: 0.34626865671641793
Left Rate: 0.386128364389234
Top 30 중복도 0.7
Right Rate: 0.4079601990049751
Left Rate: 0.45445134575569357
Top 40 중복도 0.7
Right Rate: 0.454726368159204
Left Rate: 0.5093167701863354
Top 50 중복도 0.66
Right Rate: 0.49154228855721394
Left Rate: 0.5559006211180124
Top 100 중복도 0.55
Right Rate: 0.6268656716417911
Left Rate: 0.7287784679089027
Top 200 중복도 0.525
Right Rate: 0.7850746268656716
Left Rate: 0.8778467908902692
Top 300 중복도 0.49333333333333335
Right Rate: 0.8845771144278607
Left Rate: 0.9813664596273292
