In [7]:
from Mylib import *
from os import *

# Run timestamp, formatted as yymmdd;HHMM. It names this run's output folder
# under ./result/ further down.
now = datetime.now()
tstmp = f"{now:%y%m%d;%H%M}"

# =============================================================================
# 1. Data Loading
# =============================================================================
# Load the pickled token table.
# NOTE: unpickling can execute arbitrary code; only read pickles you trust.
noun_tokened = pd.read_pickle('./result/HWC_noun_token.pkl')

# Row counts per meeting, per party and per speaker (quick sanity check).
for group_col in ('MEETING_NAME', 'PARTY', 'SPEAKER'):
    print(noun_tokened.groupby([group_col]).size())
# all_tokened = pd.read_pickle('./result/HWC_all_token.pkl')

### 2. Data cleansing
# Manually set the party recorded for this one speaker.
is_target_speaker = noun_tokened['SPEAKER'] == '박인숙 위원'
noun_tokened.loc[is_target_speaker, 'PARTY'] = '자유한국당'

# Domain Knowledge based Filtering: keep the words whose `code` column is 1.
Coding = pd.read_csv('./data/단어집.csv')
DKF = Coding.loc[Coding['code'] == 1, '단어']

# =============================================================================
# 2. Custom criteria
# =============================================================================
# Extra words to exclude, grouped by why they carry no information here.
addf = (
    ['국민']                                # boilerplate word
    + ['위원장', '위원', '의원', '장관']    # titles / positions
    + ['질의']                              # meeting-procedure term
    + ['보건', '복지']                      # the Health and Welfare Committee itself
)

# Party filter for this run; the commented lines are alternative selections.
정당 = ['무소속']
#정당 = ['바른정당','자유한국당'] # conservative
#정당 = ['국민의당','더불어민주당','무소속'] # progressive
#회차 = [343, 350] # before the change of government
#회차 = [352, 364] # after the change of government

#발언자 = ['강석진','천정배'] # needs a name search!

# Data Preparation
# Extend the domain-knowledge word list with the custom words in `addf`.
# pd.concat is used because Series.append was removed in pandas 2.0.
DKF = pd.concat([DKF, pd.Series(addf)])

# Build the exclusion set once rather than once per row.
excluded_words = set(DKF)

# Drop excluded words and de-duplicate each row's tokens. dict.fromkeys keeps
# first-occurrence order, so the result is reproducible across kernels; a plain
# set difference yields the same members but in arbitrary, hash-dependent order.
# NOTE: de-duplication means each word counts at most once per row downstream.
noun_tokened['token'] = [
    [tok for tok in dict.fromkeys(d) if tok not in excluded_words]
    for d in noun_tokened['token']
]

#fdata = noun_tokened # All data

# Subset of rows for the selected parties (prpr_data comes from Mylib).
fdata = prpr_data(noun_tokened).by_party(정당).get_data()
# fdata = prpr_data(noun_tokened).by_meeting(회차).get_data()

tokens = fdata['token'].copy()

print('ALL', noun_tokened.shape)
print('fdata', fdata.shape)
# exist_ok=True: the timestamp has minute resolution, so re-running this cell
# within the same minute must reuse the folder instead of raising FileExistsError.
makedirs('result/'+tstmp, exist_ok=True)

MEETING_NAME
제343회(2016.06.07-2016.07.06)    3100
제344회(2016.07.19-2016.07.27)     517
제345회(2016.08.16-2016.08.31)     750
제346회(2016.09.01-2016.12.09)     853
제347회(2016.12.12-2016.12.31)    1178
제348회(2017.01.09-2017.01.20)     128
제349회(2017.02.01-2017.03.02)    2157
제350회(2017.03.03-2017.04.01)     159
제352회(2017.07.04-2017.07.22)    3035
제353회(2017.08.18-2017.08.31)    1003
제354회(2017.09.01-2017.12.09)    1384
제355회(2017.12.11-2017.12.29)     394
제356회(2018.01.30-2018.02.28)    1221
제362회(2018.07.13-2018.07.26)    1882
제363회(2018.08.16-2018.08.31)    1287
제364회(2018.09.01-2018.12.09)    1253
dtype: int64
PARTY
국민의당      1209
더불어민주당    5615
무소속        656
바른정당       184
자유한국당     2824
dtype: int64
SPEAKER
강석진 위원                 242
건강보험심사평가원개발상임이사 황의동      4
건강보험심사평가원상임감사 서정숙        2
건강보험심사평가원장 김승택          91
건강보험심사평가원장 손명세          68
                      ... 
천정배 위원                 282
천정배 의원                   1
최교일 의원                   1
최도자 위원                 428
최도자 의원    

In [None]:
# =============================================================================
# 3. Word Cloud
# =============================================================================
# Flatten every row's token list into one sequence of words.
# Read from fdata['token'] rather than `tokens`: a later step in this cell
# overwrites `tokens` with space-joined strings, so on a re-run iterating over
# `tokens` would count single characters instead of words. On a fresh run the
# two hold the same lists.
wordsvec = pd.Series([word for doc in fdata['token'] for word in doc])
#print(wordsvec.loc[0:5])

# Word -> number of occurrences across the selected rows.
cnt = Counter(wordsvec.tolist())

wcld = WordCloud(width=600, height=600, background_color='white',
                 font_path='./NanumBarunGothic.ttf')
wcld = wcld.generate_from_frequencies(cnt)
#wcld.to_image()
wcld.to_file('./result/'+tstmp+'/WC.png')


# =============================================================================
# 4. Corpus Normalization & TF-IDF
# =============================================================================
gcol()
# One space-joined string per row. Derived from fdata['token'] (not from the
# current value of `tokens`) so re-running this cell does not join an
# already-joined string character by character. Series.map also avoids
# label-based per-row assignment, which misbehaves on duplicate index labels.
tokens = fdata['token'].map(' '.join)
ncorpus = tokens[~tokens.isin(addf)].tolist() # Normalized corpus.
#print('Normalized corpus example : %s' %ncorpus[1:3])

# TF-IDF
# NOTE(review): the vectorizer's default token_pattern only keeps tokens of two
# or more word characters, so single-character nouns are dropped here —
# confirm this is intended.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(ncorpus)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
TFIDF = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
TFIDF.to_csv('./result/'+tstmp+'/TFIDF.csv')
#TFIDF.head(10) # TF-IDF score for each vocab term
#TFIDF.shape # TF-IDF score for each vocab term

# =============================================================================
# 5. LDA
# =============================================================================
# Fit one model per topic count in [a, b] and record the 2nd and 3rd items of
# doLDA's return value as perplexity and coherence respectively.
a, b = 1, 50
topic_grid = range(a, b + 1)
perpl, coher = [], []
for top in tqdm(topic_grid):
    _model, perp_k, coh_k = doLDA(fdata['token'], top, 'ALL', False)
    perpl.append(perp_k)
    coher.append(coh_k)
    # Release the fitted model before the next iteration.
    del _model, perp_k, coh_k

# Tuning Process
# NOTE(review): the perplexity panel reports the argmin as optimal; whether
# lower is better depends on what doLDA returns — confirm in Mylib.
best_by_perpl = topic_grid[np.argmin(perpl)]
best_by_coher = topic_grid[np.argmax(coher)]

fig, (ax_perpl, ax_coher) = plt.subplots(2, 1, figsize=(10, 14))

ax_perpl.grid(True)
ax_perpl.plot(topic_grid, perpl, '-.', label='ALL')
ax_perpl.set_title(f'Perplexities, Opt. ntopics={best_by_perpl}')
ax_perpl.set_ylabel('log perplexities')

ax_coher.grid(True)
ax_coher.plot(topic_grid, coher, '-.', label='ALL')
ax_coher.set_title(f'Coherences, Opt. ntopics={best_by_coher}')
ax_coher.set_xlabel('n Topics')
ax_coher.set_ylabel('Coherence')

fig.savefig('./result/'+tstmp+'/perpcoh.png')

# Final results with pyLDAvis
# Alternative kept for reference: choose the topic count by minimum perplexity.
#flda = doLDA(fdata['token'], range(a,b+1)[np.argmin(perpl)],
#             './result/'+tstmp, True)
# Topic count with the highest coherence from the tuning loop.
optcoh = range(a,b+1)[np.argmax(coher)]
out_dir = f'./result/{tstmp}'
flda = doLDA(fdata['token'], optcoh, out_dir, True)

# Top 10 words per topic, exported as a two-column table.
fldatops = pd.DataFrame(flda[0].print_topics(num_words=10),
                        columns=['Topic','Representation'])
fldatops.to_excel(f'{out_dir}/fldatops.xlsx')

# NOTE(review): presumably doLDA writes this pickle when its last argument is
# True — confirm in Mylib.
path = f'{out_dir}/{optcoh}_pyLDAvis.pickle'
# NOTE: this variable shares its name with the pyLDAvis package.
pyLDAvis = pd.read_pickle(path)
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
pyLDAvis

# =============================================================================
# 6. Networks
# =============================================================================

# Gathering keywords: unique term ids across the top terms of every topic,
# then mapped back to their words through the model's id2word mapping.
ftopkw = np.unique([j[0] for
                    i in range(flda[0].num_topics) for
                    j in flda[0].get_topic_terms(i)])
ftopkw = [flda[0].id2word[i] for i in ftopkw]

# Gephi (co-occurrence)
gcol()
# One space-joined string per row, restricted to the topic keywords
# (intersection comes from Mylib).
norm_corp = [' '.join(intersection(doc, ftopkw))
             for doc in tqdm(fdata['token'])] # Normalized corpus.

print('Normalized corpus example : %s' %norm_corp[1:3])

# NOTE(review): the default token_pattern drops single-character tokens, so
# such keywords will not appear as nodes — confirm this is intended.
cv = CountVectorizer(ngram_range=(1,1))
X = cv.fit_transform(norm_corp)

# Term-by-term co-occurrence counts. `@` is an explicit matrix product, so this
# stays correct whether X is a sparse matrix or a sparse array.
Xc = (X.T @ X)
# Zero the diagonal: a term paired with itself is not an edge in the network.
Xc.setdiag(0)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    names = list(cv.get_feature_names_out())
else:
    names = cv.get_feature_names()
df = pd.DataFrame(data = Xc.toarray(), columns = names, index = names)

df.to_csv('./result/'+tstmp+'/coocurr_for_gephi.csv', sep = ',', encoding = 'utf-8')

Collecting 1 ...
Unreachable objects : 4836
Remaining Garbage : []
Collecting 2 ...
Unreachable objects : 0
Remaining Garbage : []


100%|█████████████████████████████████████████████████████████████████████████████| 656/656 [00:00<00:00, 32821.55it/s]
  6%|████▉                                                                              | 3/50 [00:15<03:52,  4.95s/it]

In [31]:
# pd.DataFrame(pyLDAvis[1]).to_csv('./result/'+tstmp+'/pyLDAvis.csv', encoding = 'utf-8')