### (1) Initialize and set up the environment

In [93]:
## Configs:
# Draft workbook that receives the expanded NN keywords (written by the export cell).
FN_NN = 'tmp/keywords_nn_draft.xlsx'
# Draft workbook that receives the expanded ESG keywords (written by the export cell).
FN_ESG = 'tmp/keywords_esg_draft.xlsx'
# Path of the bz2-compressed embedding file passed to w2v.load_model().
FN_model = 'models/merge_sgns_bigram_char300.txt.bz2'

In [87]:
# Import original keywords and helper functions
import pandas as pd
from news_classifier import config
from news_classifier.w2vhelper import w2v

In [88]:
# Load model
# Import the helper under an alias so that binding the instance to `w2v` below
# never shadows the callable it was created from. With the original
# `w2v = w2v()`, re-running this cell called the instance instead and failed.
from news_classifier.w2vhelper import w2v as W2VHelper

w2v = W2VHelper()
w2v.load_model(FN_model)

models/merge_sgns_bigram_char300.txt.bz2 loaded


In [90]:
# Simple test: query the loaded model for the terms most similar to '詐騙'
# and display the (term, similarity score) pairs it returns.
w2v.most_similar('詐騙')

[('詐騙案', 0.7540429830551147),
 ('行騙', 0.7037684917449951),
 ('詐騙犯', 0.67093825340271),
 ('欺詐', 0.6673640012741089),
 ('詐騙者', 0.6636461019515991)]

### (2) Expand keywords and export to Excel files.

In [91]:
# Expand keywords: run the seed lists from config through w2v.extend_keywords().
# The call prints an "OOV <keyword>" line for some seeds (see the output below) --
# presumably those the model has no vector for; confirm in w2vhelper.
nndata = w2v.extend_keywords(config.NN_KEYWORDS)
esgdata = w2v.extend_keywords(config.ESG_KEYWORDS)

OOV 人口贩卖
OOV 工安事故
OOV 不如预期
OOV 不实财务披露
OOV 内幕交易
OOV 公款出国
OOV 公款出境
OOV 公款购买
OOV 以公谋私
OOV 以检谋私
OOV 市场泡沫
OOV 市场禁入
OOV 地下钱庄
OOV 作空
OOV 利益交换
OOV 肥手指
OOV 怠忽职守
OOV 美化财务报表
OOV 美化财报
OOV 负面展望
OOV 军火贩运
OOV 倒货
OOV 气爆
OOV 财报不实
OOV 财报难产
OOV 追究党纪责任
OOV 假交易
OOV 勒令停工
OOV 帐务不实
OOV 推诿扯皮
OOV 被抓
OOV 被诉
OOV 贩卖武器
OOV 无法表示意见
OOV 无预警
OOV 诈贷
OOV 买官卖官
OOV 越权审批
OOV 黑箱作业
OOV 经济泡沫
OOV 经营倒闭
OOV 资金链危机
OOV 资金链困难
OOV 资金链紧张
OOV 资金链断裂
OOV 运毒
OOV 幕后黑手
OOV 弊案
OOV 撤销公开
OOV 网路犯罪
OOV 银行紧缩
OOV 暴力威胁
OOV 调降信用评等
OOV 调降信评
OOV 卖毒
OOV 操弄股价
OOV 钱色交易
OOV 检调约谈
OOV 谣传危机
OOV 藏毒
OOV 骗贷
OOV 权色交易
OOV 权权交易
OOV 全额交割
OOV 踩雷
OOV 不法组织
OOV 空污
OOV 高碳排
OOV 气爆
OOV 过劳致死
OOV 职灾
OOV 职业安全
OOV 剥削劳工
OOV 撤照
OOV 裁罚
OOV 财报不实
OOV 操纵股价
OOV 非常规交易
OOV 假交易


In [95]:
# Write each expanded keyword table to its draft workbook, then report how many
# seed keywords went in and how many rows came out.
dfnn = pd.DataFrame(data=nndata)
dfnn.to_excel(FN_NN, index=False)
nn_report = "NN Extended: {} ({} => {})".format(FN_NN, len(config.NN_KEYWORDS), len(dfnn.index))
print(nn_report)

dfesg = pd.DataFrame(data=esgdata)
dfesg.to_excel(FN_ESG, index=False)
esg_report = "ESG Extended: {} ({} => {})".format(FN_ESG, len(config.ESG_KEYWORDS), len(dfesg.index))
print(esg_report)

NN Extended: tmp/keywords_nn_draft.xlsx (332 => 820)
ESG Extended: tmp/keywords_esg_draft.xlsx (57 => 159)


### (3) Manual Review and Summary
 - 請Review上述生成之Excel 檔案 (Selected欄位)，欄位說明如下
   - Keyword: NN/ESG 關鍵字
   - Seed: 
     - 原始種子關鍵字 = 1
     - 新擴增關鍵字 = 0
   - Selected = 此欄位需人工複核
     - 加入新關鍵字清單 = 1
     - 不使用此關鍵字 = 0
   - Vector: word2vec 向量 (如無向量，則表示模型未包含此關鍵詞)

In [96]:
# Releases/202207
# Keyword workbooks for the 202207 release; the cells below read them with
# pd.read_excel() for the review statistics and the projector export.
nn_reviewed  = 'Releases/202207/keywords_nn_v202207.xlsx'
esg_reviewed = 'Releases/202207/keywords_esg_v202207.xlsx'

In [97]:
# Summarise the manual review of the NN keyword workbook.
df = pd.read_excel(nn_reviewed)
seed_mask = df['Seed'] == 1
expand_mask = df['Seed'] == 0
selected_mask = df['Selected'] == 1

cntSeed = int(seed_mask.sum())
cntExpand = int(expand_mask.sum())
cntSelected = int(selected_mask.sum())
# Count accepted expansions directly. `cntSelected - cntSeed` is only correct
# when every seed keyword is still marked Selected == 1.
reviewed = int((expand_mask & selected_mask).sum())
# Guard the ratios so a sheet without expanded (or seed) rows reports 0.00%
# instead of raising ZeroDivisionError.
pctReviewed = reviewed / cntExpand * 100 if cntExpand else 0.0
pctSelected = cntSelected / cntSeed * 100 if cntSeed else 0.0

print("# NN 關鍵字擴增統計:")
print("#   - 原種子關鍵字", cntSeed)
print("#   - 新擴充關鍵字", cntExpand)
print("#   - 人工複核新增 {} ({:.2f}%)".format(reviewed, pctReviewed))
print("#--------------------------------")
print("# => 最終擴增結果: {} ({:.2f}%)".format(cntSelected, pctSelected))

# NN 關鍵字擴增統計:
#   - 原種子關鍵字 332
#   - 新擴充關鍵字 488
#   - 人工複核新增 296 (60.66%)
#--------------------------------
# => 最終擴增結果: 628 (189.16%)


In [98]:
# Summarise the manual review of the ESG keyword workbook.
df = pd.read_excel(esg_reviewed)
seed_mask = df['Seed'] == 1
expand_mask = df['Seed'] == 0
selected_mask = df['Selected'] == 1

cntSeed = int(seed_mask.sum())
cntExpand = int(expand_mask.sum())
cntSelected = int(selected_mask.sum())
# Count accepted expansions directly. `cntSelected - cntSeed` is only correct
# when every seed keyword is still marked Selected == 1.
reviewed = int((expand_mask & selected_mask).sum())
# Guard the ratios so a sheet without expanded (or seed) rows reports 0.00%
# instead of raising ZeroDivisionError.
pctReviewed = reviewed / cntExpand * 100 if cntExpand else 0.0
pctSelected = cntSelected / cntSeed * 100 if cntSeed else 0.0

print("# ESG 關鍵字擴增統計:")
print("#   - 原種子關鍵字", cntSeed)
print("#   - 新擴充關鍵字", cntExpand)
print("#   - 人工複核新增 {} ({:.2f}%)".format(reviewed, pctReviewed))
print("#--------------------------------")
print("# => 最終擴增結果: {} ({:.2f}%)".format(cntSelected, pctSelected))

# ESG 關鍵字擴增統計:
#   - 原種子關鍵字 57
#   - 新擴充關鍵字 102
#   - 人工複核新增 44 (43.14%)
#--------------------------------
# => 最終擴增結果: 101 (177.19%)


### (4) Visualization via the TensorFlow Embedding Projector
 - https://projector.tensorflow.org/ (Upload manually)
 - Online demo: 
   - https://projector.tensorflow.org/?config=%20https://wmyaoyao.bot.nu:8443/demos/config_nn.json
 

In [99]:
# Helper functions
def format_df_vector(vvv):
    """Serialise the components of a vector as one tab-separated line.

    Args:
        vvv: Iterable of values (for example a list or array of floats). Each
            element is converted with ``str``.

    Returns:
        The components joined by tab characters and terminated by a newline.
        An empty input yields a line containing only the newline.
    """
    # str.join builds the line in one pass. Unlike indexing vvv[0], it also
    # accepts empty inputs, generators and Series that have no label 0.
    return '\t'.join(str(v) for v in vvv) + '\n'
    
def format_excel_vec(vvv):
    """Convert a vector stored as text in an Excel cell into one TSV line.

    The cell text looks like a printed array and may span several lines, e.g.
    ``[ 0.179107 -0.156642 -0.696843 -0.795377  1.155509 ...``.

    Args:
        vvv: String form of the vector, optionally wrapped in square brackets.

    Returns:
        The whitespace-separated components joined by tab characters and
        terminated by a newline.
    """
    # Only the brackets are removed. Line breaks are left for split(), which
    # treats them as separators; deleting '\n' outright would fuse two numbers
    # that are separated by nothing but a line break.
    components = vvv.replace('[', '').replace(']', '').split()
    return '\t'.join(components) + '\n'

def export_to_tfboard(df, prefix):
    """Write the vector and metadata TSV files for the Embedding Projector.

    Two files are created: ``<prefix>_vector.tsv`` with one tab-separated
    vector per row, and ``<prefix>_meta.tsv`` with a ``Keyword``/``Seed``
    header followed by one row per keyword in the same order.

    Args:
        df: DataFrame with ``Vector`` (string form accepted by
            ``format_excel_vec``), ``Keyword`` and ``Seed`` columns.
        prefix: Path prefix (directory plus file stem) for the two output files.
    """
    vectorFN = "{}_vector.tsv".format(prefix)
    metaFN = "{}_meta.tsv".format(prefix)

    # `with` closes both files even if a row fails to format. UTF-8 is set
    # explicitly because the keywords are Chinese and the platform default
    # encoding may not be able to represent them.
    with open(vectorFN, 'w', encoding='utf-8') as fvec, \
            open(metaFN, 'w', encoding='utf-8') as fmeta:
        fmeta.write("Keyword\tSeed\n")

        for _, row in df.iterrows():
            fvec.write(format_excel_vec(row['Vector']))
            fmeta.write("{}\t{}\n".format(row['Keyword'], row['Seed']))

    print(" File exported: ", vectorFN, metaFN)
    print(" Please load the two files into tensorboard for visualization.")

In [101]:
# Export the reviewed NN keywords that have a vector and were selected.
dfnn = pd.read_excel(nn_reviewed)
# Parenthesise the comparison: `&` binds tighter than `==`, so the bare form
# evaluated `(notnull & Selected) == 1` instead of combining two boolean masks.
m = dfnn['Vector'].notnull() & (dfnn['Selected'] == 1)
export_to_tfboard(dfnn.loc[m],  'Releases/202207/TF_nn')


# Same export for the reviewed ESG keywords.
dfesg = pd.read_excel(esg_reviewed)
m = dfesg['Vector'].notnull() & (dfesg['Selected'] == 1)
export_to_tfboard(dfesg.loc[m],  'Releases/202207//TF_esg')


 File exported:  Releases/202207/TF_nn_vector.tsv Releases/202207/TF_nn_meta.tsv
 Please load the two files into tensorboard for visualization.
 File exported:  Releases/202207//TF_esg_vector.tsv Releases/202207//TF_esg_meta.tsv
 Please load the two files into tensorboard for visualization.
