In [6]:
import numpy as np

## Read NTUSD

In [1]:
# Load the NTUSD negative / positive word lists into lookup dicts (word -> 1).
# The lexicons contain Chinese text, so the encoding is given explicitly rather
# than relying on the platform default.
# NOTE(review): UTF-8 is assumed, matching the other data files opened in this
# notebook -- confirm for the NTUSD files.
neg_dict = {}
with open("chinese_sentiment/dict/ntusd-negative.txt", encoding='utf-8') as f:
    for line in f:
        neg_dict[line.strip()] = 1
# A blank line in the file would otherwise register '' as a sentiment word.
neg_dict.pop('', None)

post_dict = {}
with open("chinese_sentiment/dict/ntusd-positive.txt", encoding='utf-8') as f:
    for line in f:
        post_dict[line.strip()] = 1
# Same clean-up as for the negative list, so both dicts are built consistently.
post_dict.pop('', None)

print('number of postive word: ' + str(len(post_dict)))
print('number of negative word: ' + str(len(neg_dict)))

number of postive word: 2647
number of negative word: 7740


## Read ANTUSD

In [2]:
# Build the ANTUSD lexicon: word (first CSV column) -> sentiment score
# (second CSV column, parsed as float). Any further columns are ignored.
ANTUSD = {}
with open("chinese_sentiment/ANTUSD/opinion_word_utf8.csv", encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split(',')
        # Skip blank or truncated rows instead of failing with IndexError.
        if len(fields) < 2:
            continue
        ANTUSD[fields[0]] = float(fields[1])

In [3]:
# Spot-check: look up the score stored for a single lexicon entry.
ANTUSD['低吼']

-0.55

In [4]:
# Number of distinct words loaded into the ANTUSD lexicon.
len(ANTUSD)

27221

## Load Data

In [7]:
def read_label_comment(path='data/semi_comment.csv'):
    """Read labelled comments from a ``_+_``-separated text file.

    Each usable line has the form ``<comment>_+_<label>`` where ``<label>`` is
    exactly ``1`` or ``-1``. Lines with any other label, with no separator, or
    with more than one separator are skipped.

    Args:
        path: Location of the labelled file. Defaults to the notebook's
            original hard-coded path, so existing calls keep working.

    Returns:
        A tuple ``(comment, label)`` where ``comment`` is a list of the comment
        strings and ``label`` is a NumPy array of the matching integer labels.
    """
    comment = []
    label = []
    # The context manager closes the file; the original left the handle open.
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            # Strip the newline before comparing so that a final line without a
            # trailing '\n' is not silently dropped.
            parts = line.rstrip('\n').split('_+_')
            if len(parts) == 2 and parts[1] in ('1', '-1'):
                comment.append(parts[0])
                label.append(int(parts[1]))

    return comment, np.array(label)
# Evaluation data: the comment strings and their 1 / -1 labels as a NumPy array.
test_comment, test_label = read_label_comment()

## ANTUSD approach

In [8]:
def ANTUSD_approach(list_, lexicon=None):
    """Classify sentences by summing the lexicon scores of their tokens.

    Args:
        list_: Iterable of sentences. Each sentence is a string whose tokens
            are separated by single spaces.
        lexicon: Optional mapping of token -> numeric score. When omitted, the
            module-level ``ANTUSD`` dict is used, as in the original code.

    Returns:
        A tuple ``(sentiment_list, post_token, neg_token)`` of three lists
        aligned with ``list_``:
          * ``sentiment_list``: 1 if the summed score is >= 0, otherwise -1.
            A sentence with no lexicon match has score 0 and is labelled 1.
          * ``post_token``: per sentence, the matched tokens with score >= 0.
          * ``neg_token``: per sentence, the matched tokens with score < 0.
    """
    if lexicon is None:
        lexicon = ANTUSD

    ANTUSD_sentiment_list = []
    ANTUSD_post_token = []
    ANTUSD_neg_token = []
    for sentence in list_:
        score = 0
        temp_post = []
        temp_neg = []
        for token in sentence.split(' '):
            if token not in lexicon:
                continue
            # Look the score up once and reuse it for both the sum and the split.
            token_score = lexicon[token]
            score = score + token_score
            if token_score >= 0:
                temp_post.append(token)
            else:
                temp_neg.append(token)
        ANTUSD_post_token.append(temp_post)
        ANTUSD_neg_token.append(temp_neg)
        ANTUSD_sentiment_list.append(1 if score >= 0 else -1)
    return ANTUSD_sentiment_list, ANTUSD_post_token, ANTUSD_neg_token

## NTUSD approach

In [9]:
def NTUSD_approach(list_):
    """Classify sentences by counting NTUSD positive and negative tokens.

    Uses the module-level ``post_dict`` and ``neg_dict`` word lists.

    Args:
        list_: Iterable of sentences. Each sentence is a string whose tokens
            are separated by single spaces.

    Returns:
        A tuple ``(sentiment_list, post_token, neg_token)`` of three lists
        aligned with ``list_``:
          * ``sentiment_list``: 1 if the sentence has at least as many positive
            as negative matches (ties and no matches count as 1), otherwise -1.
          * ``post_token``: per sentence, the tokens found in ``post_dict``.
          * ``neg_token``: per sentence, the tokens found in ``neg_dict``.
    """
    sentiment_list = []
    post_token = []
    neg_token = []
    for sentence in list_:
        temp_post = []
        temp_neg = []
        for token in sentence.split(' '):
            # A token present in both lists is counted as positive only.
            if token in post_dict:
                temp_post.append(token)
            elif token in neg_dict:
                temp_neg.append(token)
        post_token.append(temp_post)
        neg_token.append(temp_neg)
        # Each match adds exactly one to its score, so the list lengths are the scores.
        sentiment_list.append(1 if len(temp_post) >= len(temp_neg) else -1)
    return sentiment_list, post_token, neg_token


# Same module-level result names as before, so later cells are unaffected.
NTUSD_sentiment_list, NTUSD_post_token, NTUSD_neg_token = NTUSD_approach(test_comment)

## Show Testing result

In [10]:
# Score both lexicon approaches against the labels of the evaluation set.
from sklearn.metrics import accuracy_score

ANTUSD_sentiment_list, ANTUSD_post_token, ANTUSD_neg_token = ANTUSD_approach(test_comment)
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the printed numbers are unchanged; the order now follows the API contract.
print("ANTUSD accuracy:",accuracy_score(test_label, ANTUSD_sentiment_list))
print("NTUSD accuracy:",accuracy_score(test_label, NTUSD_sentiment_list))

ANTUSD accuracy: 0.6835443037974683
NTUSD accuracy: 0.6784810126582278


In [84]:
# Print the first three evaluation comments with their gold and predicted
# labels, followed by the tokens each lexicon matched (ANTUSD first, then NTUSD).
for idx in range(3):
    print('comment: ')
    # Remove the spaces between tokens before showing the comment text.
    print(''.join(test_comment[idx].split(' ')))
    print()
    for caption, labels in (
        ('Gold sentiment: ', test_label),
        ('NTUSD sentiment: ', NTUSD_sentiment_list),
        ('ANTUSD sentiment: ', ANTUSD_sentiment_list),
    ):
        print(caption + str(labels[idx]))
    print()
    print('post_token: ')
    print(ANTUSD_post_token[idx], NTUSD_post_token[idx], sep='\n')
    print()
    print('neg_token: ')
    print(ANTUSD_neg_token[idx], NTUSD_neg_token[idx], sep='\n')
    print('-----------------------------------------------------------------------------------')

comment: 
全「台」稱讚齊鼓掌，峽「灣」風雲起濤浪，匯「總」首善發光芒，一「統」江山震四方，南「柯」有夢志飛揚，棄「文」從政展擔當，儒「哲」挺身振家邦，美「好」典範垂飄香。.柯P：出來選2020的總統啦。

Gold sentiment: 1
NTUSD sentiment: 1
ANTUSD sentiment: 1

post_token: 
['稱讚', '鼓掌', '發光', '江山', '擔當', '挺身', '典範']
['稱讚', '鼓掌']

neg_token: 
[]
[]
-----------------------------------------------------------------------------------
comment: 
全「台」稱讚齊鼓掌，峽「灣」風雲起濤浪，匯「總」首善發光芒，一「統」江山震四方，南「柯」有夢志飛揚，棄「文」從政展擔當，儒「哲」挺身振家邦，美「好」典範垂飄香。.柯P：出來選2020的總統啦。

Gold sentiment: 1
NTUSD sentiment: 1
ANTUSD sentiment: 1

post_token: 
['稱讚', '鼓掌', '發光', '江山', '擔當', '挺身', '典範']
['稱讚', '鼓掌']

neg_token: 
[]
[]
-----------------------------------------------------------------------------------
comment: 
一堆政客只會出張嘴在那邊看世大運搞砸，好像不是他們黨執政的縣市，辦啥活動都只會靠北。

Gold sentiment: -1
NTUSD sentiment: -1
ANTUSD sentiment: -1

post_token: 
['好像']
[]

neg_token: 
['政客', '搞砸', '不是', '靠北']
['不是', '靠北']
-----------------------------------------------------------------------------------


## kp and yao comments prediction

In [21]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles arbitrary objects -- only load these
# files if they come from a trusted source.
(
    tf_kp_posts,
    tf_kp_posts_feature_names,
    kp_sum_posts_clean_seg,
    tf_kp_comments,
    tf_kp_comments_feature_names,
    kp_sum_comments_clean_seg,
) = joblib.load("result/tf_idf_kp_all.pkl")
(
    tf_yao_posts,
    tf_yao_posts_feature_names,
    yao_sum_posts_clean_seg,
    tf_yao_comments,
    tf_yao_comments_feature_names,
    yao_sum_comments_clean_seg,
) = joblib.load("result/tf_idf_yao_all.pkl")


In [22]:
# Report the sizes of the post / comment collections for both pages.
# NOTE(review): kp_sum_posts, kp_posts_time, kp_sum_comments, kp_comments_time,
# kp_comments_to_post_index and their yao_* counterparts are not defined in the
# cells visible here -- presumably loaded by a cell that is not shown; confirm
# before relying on Restart & Run All.
kp_sizes = (
    ('# of kp posts: ', kp_sum_posts),
    ('# of kp posts time: ', kp_posts_time),
    ('# of kp comments: ', kp_sum_comments),
    ('# of kp comments time: ', kp_comments_time),
    ('# of kp comments to index: ', kp_comments_to_post_index),
)
yao_sizes = (
    ('# of yao posts: ', yao_sum_posts),
    ('# of yao posts time: ', yao_posts_time),
    ('# of yao comments: ', yao_sum_comments),
    ('# of yao comments time: ', yao_comments_time),
    ('# of yao comments to index: ', yao_comments_to_post_index),
)
for caption, collection in kp_sizes:
    print(caption + str(len(collection)))
print()
for caption, collection in yao_sizes:
    print(caption + str(len(collection)))

# of kp posts: 1526
# of kp posts time: 1526
# of kp comments: 74705
# of kp comments time: 74705
# of kp comments to index: 74705

# of yao posts: 977
# of yao posts time: 977
# of yao comments: 15497
# of yao comments time: 15497
# of yao comments to index: 15497


In [23]:
# Run the ANTUSD classifier over the kp and yao comment collections; each call
# returns the per-comment labels plus the matched positive and negative tokens.
kp_comments_label, ANTUSD_post_token_kp,  ANTUSD_neg_token_kp= ANTUSD_approach(kp_sum_comments_clean_seg)
yao_comments_label, ANTUSD_post_token_yao,  ANTUSD_neg_token_yao= ANTUSD_approach(yao_sum_comments_clean_seg)



In [24]:
# Show one kp post, then the comments at indices 1-4 with their predictions.
idx = 0
print('kp post: ' + str(idx))
print(kp_sum_posts[idx])
print()
print('kp posts time: ')
print(kp_posts_time[idx])
print()

# (caption, per-comment sequence) pairs, printed in this order for every comment.
comment_fields = (
    ('kp comments: ', kp_sum_comments),
    ('kp comments time: ', kp_comments_time),
    # Records which post each comment belongs to.
    ('kp comments to post: ', kp_comments_to_post_index),
    ('kp sentiments labels: ', kp_comments_label),
    ('kp postive token: ', ANTUSD_post_token_kp),
    ('kp negative token: ', ANTUSD_neg_token_kp),
)
for comments_idx in range(1, 5):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    for caption, values in comment_fields:
        print(caption)
        print(values[comments_idx])
        print()
print('-------------')

kp post: 0
還在當醫師的時候，我寫了第一本書「白色的力量」，講了很多對當時社會的批判，以及從醫多年累積的生死觀，算是我行醫二十幾年下來的回顧之作。這本書，等等凌晨0點在全台各大電子書店就會開始預購，除了宣揚理念，順便賺點版稅貼補家用，大家就多多捧場一下。第二、第三本則是跟選舉有關，一本在選前談從政的心境，一本是選後回顧團隊的SOP。---「光榮城市」新書預購資訊（22日0時正式預購）http://pcse.pw/7WLYS「光榮城市」簽書會活動資訊http://pcse.pw/79QBY新書首發讀者見面會時間：6/30（六）PM 6:00地點：台北捷運中山站第二廣場（R7光盒旁）。現在，我每天七點半準時上工，不知不覺也進入第四年任期，反省改進是我每日的功課，我每天都會想很多事情如果重來一遍，怎麼做會比較好，想著想著，乾脆就記錄下來，除了市政上的理念，當然還有很多當市長之後學到的寶貴經驗。不包括醫學書籍的話，「光榮城市」是我寫的第四本書。不只是寫，我還自己做投影片解說，講了5個小時，出版社乾脆直接錄下來燒錄成兩片DVD，幕僚都笑說大家睡不著的時候可以拿來看。

kp posts time: 
2018-06-21T12:39:10+0000

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kp comments: 
恭喜你變成大作家，不讓你專美於前，我也要開始寫作，等你出完這本書，明年換我出，呵呵呵，咱們倆接力賺稿費，支付不足的選舉經費和我們官司的錢。

kp comments time: 
2018-06-21T12:53:45+0000

kp comments to post: 
0

kp sentiments labels: 
1

kp postive token: 
['恭喜', '變成', '作家', '專美於前', '開始', '呵呵', '支付']

kp negative token: 
['不足']

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kp comments: 
柯P您好，如果之後您有製作影片的需求，但是缺少人力和經費的話，到選舉結束前我都願意免費協助您製作各種動畫、剪輯和簡單的文案。本人在業界雖然不是什麼大導演，但也有8年的相關經驗，雖然沒有錢，但我願意出這

In [26]:
# For kp and then yao: number of comments labelled positive (1), followed by
# the total number of labelled comments.
for predicted_labels in (kp_comments_label, yao_comments_label):
    positive_count = sum(1 for label in predicted_labels if label == 1)
    print(positive_count)
    print(len(predicted_labels))

60584
74705
13028
15497
