In [1]:
import pandas as pd
import os
import re
import jieba
import numpy as np
# user_dict.txt is generated further down in this notebook (from the emoji
# statistics), so it may not exist yet on a fresh run. Load it only when it is
# present instead of failing at import time.
if os.path.exists('user_dict.txt'):
    jieba.load_userdict('user_dict.txt')
import collections

from snownlp import SnowNLP
import random

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\26519\AppData\Local\Temp\jieba.cache
Loading model cost 0.374 seconds.
Prefix dict has been built successfully.


In [2]:
# Load the pickled 2017 posts, transpose the resulting frame and move the
# index into a regular column.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
data = pd.DataFrame(pd.read_pickle('matched_2017.pkl')).T.reset_index()

# The first 10 characters of 'time' are presumably a YYYY-MM-DD date; keep them
# as the integer YYYYMMDD.
data['date'] = data['time'].map(lambda stamp: int(stamp[:10].replace('-', '')))
# Cast the stock code and the like counter to integers.
data = data.astype({'stock': int, 'like count': int})

In [3]:
def clean(s):
    """Turn the HTML body of a post into plain text.

    Steps, in order: emoticon <img> tags become '礐' followed by the tag's
    title text, all remaining tags are dropped, line breaks are removed, URLs
    are removed and surrounding whitespace is stripped.

    Args:
        s: raw post body (str).

    Returns:
        The cleaned text (str).
    """
    # Replace emoticon images with the marker '礐' plus the emoticon name.
    p = '<img src="http://gbfek.dfcfw.com/face/emot[^>]+?title="(?P<emoj>[^"]+?)"/>'
    s = re.sub(p, lambda x: '礐' + x.group('emoj'), s)

    p = r'<[^<]+?>'  # strip the remaining tags
    s = re.sub(p, '', s)

    s = s.replace('\n', '').replace('\r', '')  # drop line breaks

    # Strip URLs. The scheme is an optional literal 'http' or 'https'. A
    # character class such as '[http|https]*' must not be used here: it would
    # also swallow any run of the characters h, t, p, s and '|' that happens
    # to stand directly before '://'.
    p = r'(?:https?)?://[a-zA-Z0-9.?/&=:_-]*'
    s = re.sub(p, '', s)
    return s.strip()

def clean_title(s):
    """Turn a raw post title into plain text.

    Steps, in order: every stretch from an underscore up to the next '<' is
    removed (the stock information), tags are dropped, bracketed emoticons
    such as '[大笑]' become '礐大笑', and surrounding whitespace is stripped.

    Args:
        s: raw title (str).

    Returns:
        The cleaned title (str).
    """
    # Drop the stock information: an underscore and everything up to the next '<'.
    p = '_[^<]*'
    s = re.sub(p, '', s)

    p = r'<[^<]+?>'  # strip tags
    s = re.sub(p, '', s)

    # Raw string: '\[' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    p = r'\[(?P<emoj>[^\]]+?)\]'
    s = re.sub(p, lambda x: '礐' + x.group('emoj'), s)
    return s.strip()
    

In [4]:
# emoji stats
# Collect every bracketed emoticon token such as "[大笑]" from the raw titles.
emoji_tokens = [tok for title in data['title'] for tok in re.findall(r'\[[^\]]+?\]', title)]
# All tokens concatenated ("[a][b]..."). join() is linear and yields '' when
# nothing matches, whereas Series.sum() over strings re-concatenates on every
# step and returns the integer 0 for an empty column.
emojis = ''.join(emoji_tokens)
# (name, count) pairs, most frequent first; an empty list when no emoticon
# occurs (slicing and splitting the joined string would produce a bogus '' entry).
emoji_dict = collections.Counter(tok[1:-1] for tok in emoji_tokens).most_common()
emoji_dict[:20]

[('大笑', 16234),
 ('鼓掌', 8109),
 ('胜利', 6478),
 ('哭', 4905),
 ('献花', 4573),
 ('微笑', 4035),
 ('不赞', 3761),
 ('赞', 3669),
 ('拜神', 3589),
 ('牛', 2937),
 ('不屑', 2549),
 ('大便', 2293),
 ('滴汗', 2135),
 ('想一下', 2040),
 ('傲', 1734),
 ('亏大了', 1548),
 ('加油', 1459),
 ('买入', 1441),
 ('为什么', 1401),
 ('俏皮', 1136)]

In [5]:
# Write every emoticon seen more than 10 times to the jieba user dictionary,
# one "礐<name> <frequency>" entry per line.
with open('user_dict.txt', 'w', encoding='utf8') as f:
    for word, freq in emoji_dict:
        if freq > 10:
            # The count is multiplied by 100 before being written as the frequency.
            f.write('礐' + word + ' ' + str(freq * 100) + '\n')

# Reload the dictionary that was just written; otherwise jieba keeps whatever
# version of the file existed (if any) when it was first loaded.
jieba.load_userdict('user_dict.txt')

In [6]:
# Normalise titles and bodies with their respective cleaning functions.
for column, cleaner in (('title', clean_title), ('contents', clean)):
    data[column] = data[column].map(cleaner)

In [7]:
# Spot-check: cleaned body of the row labelled 0.
data.loc[0,'contents']

'平安银行为什么这么叼，有大神给小弟解答一下吗礐微笑'

In [8]:
# Spot-check: tokenise the same body with jieba.
jieba.lcut(data.loc[0,'contents'])

['平安',
 '银行',
 '为什么',
 '这么',
 '叼',
 '，',
 '有',
 '大神',
 '给',
 '小弟',
 '解答',
 '一下',
 '吗',
 '礐微笑']

In [11]:
def get_sentiment(x):
    """Score a text with SnowNLP's sentiment classifier.

    Args:
        x: text to score.

    Returns:
        The value of ``SnowNLP(x).sentiments``, or NaN when scoring raises.
    """
    try:
        return SnowNLP(x).sentiments
    except Exception:
        # Best effort: a text that cannot be scored is marked as missing.
        # `except Exception` (rather than a bare except) lets Ctrl-C / kernel
        # interrupts through, and np.nan is used because the np.NAN alias was
        # removed in NumPy 2.0.
        return np.nan

# Score every cleaned post body; bodies that cannot be scored get NaN.
data['sentiment'] = data['contents'].map(get_sentiment)

In [12]:
# Spot-check one post: print its sentiment score, then display its text.
idx = 6
print(SnowNLP(data.loc[idx,'contents']).sentiments)
data.loc[idx,'contents']

0.8789840911955085


'朝不保夕，人命危浅！大势已去！逢高抛之，以待来日。保住本金，留得青山。古人云：识时务者为俊杰！礐想一下礐想一下礐爱心'

### 迭代方式打标签

In [36]:
from snownlp import sentiment
def retrain(path=r'D:\python38\Lib\site-packages\snownlp\sentiment\tp.marshal'):
    """Retrain SnowNLP's sentiment model from the corpus files and reload it.

    Trains on neg.txt / pos.txt in the SnowNLP package directory, saves the
    result and loads it back.

    Args:
        path: file name the trained model is saved to and then loaded from.
            The default is the location this function always used.
    """
    sentiment.train(r'D:\python38\Lib\site-packages\snownlp\sentiment\neg.txt',r'D:\python38\Lib\site-packages\snownlp\sentiment\pos.txt')
    # Use the `path` argument for both steps so that a caller-supplied location
    # is honoured instead of being silently ignored.
    sentiment.save(path)
    sentiment.load(path)


def add_pos(df, path=r'D:\python38\Lib\site-packages\snownlp\sentiment\pos.txt'):
    """Append the posts in `df` to the positive corpus and to the labelled sample.

    Args:
        df: DataFrame with a 'contents' column of strings; may be empty.
        path: corpus file to append to (UTF-8, one post per line). Defaults to
            SnowNLP's pos.txt at the location this function always used.

    Side effects:
        Rebinds the module-level `sample` to the old `sample` plus the
        'contents' of `df`, with `flag` set to 1.
    """
    global sample
    with open(path, 'a', encoding='utf8') as f:
        # writelines copes with an empty frame (Series.sum() would return the
        # integer 0 there and make write() raise) and avoids building one huge
        # string by repeated concatenation.
        f.writelines(text + '\n' for text in df['contents'])
    tp = df[['contents']].copy()
    tp['flag'] = 1
    sample = pd.concat([sample, tp])

def add_neg(df, path=r'D:\python38\Lib\site-packages\snownlp\sentiment\neg.txt'):
    """Append the posts in `df` to the negative corpus and to the labelled sample.

    Args:
        df: DataFrame with a 'contents' column of strings; may be empty.
        path: corpus file to append to (UTF-8, one post per line). Defaults to
            SnowNLP's neg.txt at the location this function always used.

    Side effects:
        Rebinds the module-level `sample` to the old `sample` plus the
        'contents' of `df`, with `flag` set to -1.
    """
    global sample
    with open(path, 'a', encoding='utf8') as f:
        # writelines copes with an empty frame (Series.sum() would return the
        # integer 0 there and make write() raise) and avoids building one huge
        # string by repeated concatenation.
        f.writelines(text + '\n' for text in df['contents'])
    tp = df[['contents']].copy()
    tp['flag'] = -1
    sample = pd.concat([sample, tp])


# Iterate on the corpus
def update_model(data):
    """Run one self-training round on `data`.

    Re-scores every row (writing the result into data['sentiment']), appends
    confidently scored, liked posts to the positive / negative corpora via
    add_pos / add_neg, and retrains the model.

    Args:
        data: DataFrame with 'contents' and 'like count' columns; its
            'sentiment' column is (over)written in place.
    """
    data['sentiment'] = data['contents'].apply(get_sentiment)
    score = data['sentiment']
    likes = data['like count']

    # Positive candidates: score strictly between 0.9 and 0.98, more than 6 likes.
    is_good = score.gt(0.9) & likes.gt(6) & score.lt(0.98)
    add_pos(data[is_good])

    # Negative candidates: score strictly between 0.001 and 0.1, more than 5 likes.
    is_bad = score.lt(0.1) & likes.gt(5) & score.gt(0.001)
    add_neg(data[is_bad])

    retrain()


def reset():
    """Rebuild the positive / negative corpora from their backups and retrain.

    neg.txt is overwritten with 'neg - 副本.txt' followed by badw.txt, and
    pos.txt with 'pos - 副本.txt' followed by goodw.txt. Afterwards the model
    is retrained, saved and reloaded through retrain().
    """
    corpus_dir = r'D:\python38\Lib\site-packages\snownlp\sentiment'
    # (file to overwrite, files whose contents are concatenated into it)
    recipes = (
        (corpus_dir + r'\neg.txt', (corpus_dir + r'\neg - 副本.txt', 'badw.txt')),
        (corpus_dir + r'\pos.txt', (corpus_dir + r'\pos - 副本.txt', 'goodw.txt')),
    )
    for target, sources in recipes:
        with open(target, 'w', encoding='utf8') as f:
            for source in sources:
                with open(source, 'r', encoding='utf8') as f1:
                    text = f1.read()
                f.write(text)
                # Keep the corpus line-separated: without a final newline the
                # last line of this file would fuse with whatever is written
                # (or later appended) next.
                if text and not text.endswith('\n'):
                    f.write('\n')

    # Same train / save / load sequence as before, via the shared helper.
    retrain()

In [37]:
# Bootstrap the labelling with stock-price information
reset()
sample = pd.DataFrame()


# Raw string: the backslashes in the Windows path must not be read as escape
# sequences (invalid escapes trigger a warning on current Python versions).
ret = pd.read_csv(r'D:\databank\CSMAR\TRD_Dalyr_merge.csv')
ret = ret[(ret['date']>=20170000) & (ret['date']<=20180000)].copy()


# Stock-days whose 'Dretwd' value is above +5% / below -5% enter the screening
goodnews = ret[(ret['Dretwd']>0.05)].copy()[['stkcd','date']]
badnews = ret[(ret['Dretwd']<-0.05)].copy()[['stkcd','date']]
goodnews.columns = ['stock','date']
badnews.columns = ['stock','date']



goodnews = pd.merge(data,goodnews,on=['stock','date'],how='inner')
goodnews['sentiment'] = goodnews['contents'].apply(get_sentiment)
# Posts with a fairly high score and a certain number of likes join the positive corpus
tp = goodnews[(goodnews['sentiment']>0.6) & (goodnews['like count']>5) & (goodnews['sentiment']<0.8)].copy()
add_pos(tp)
# Posts with a very high score and more likes join the positive corpus as well.
# NOTE(review): the original comment says their importance is boosted, but each
# row is appended exactly once here - confirm whether extra weighting was intended.
tp = goodnews[(goodnews['sentiment']>0.8) & (goodnews['like count']>8)].copy()
add_pos(tp)



badnews = pd.merge(data,badnews,on=['stock','date'],how='inner')
badnews['sentiment'] = badnews['contents'].apply(get_sentiment)


# Default scores are less reliable for negative posts and users like them more
# often, so a strict score cut-off is combined with a lower like threshold.
# (The score condition used to be listed twice; once is equivalent.)
tp = badnews[(badnews['sentiment']<0.1) & (badnews['like count']>3)].copy()
add_neg(tp)
# Milder negative posts join the negative corpus too
tp = badnews[(badnews['sentiment']<0.4) & (badnews['like count']>1) & (badnews['sentiment']>0.1)].copy()
add_neg(tp)

retrain()

In [38]:
# Run 20 self-training rounds. Before each round, record how many labelled
# rows exist per flag (groupby sorts the flags, so -1 comes before 1).
pncount = {}
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output.
for round_no in range(20):
    pncount[round_no] = sample.groupby(['flag']).size().tolist()
    print(pncount[round_no])

    update_model(data)

[870, 1379]
[1481, 2284]
[2040, 3226]
[2573, 3855]
[3093, 4416]
[3571, 5028]
[4066, 5623]
[4536, 6222]
[5000, 6772]
[5447, 7334]
[5937, 7902]
[6384, 8408]
[6793, 8884]
[7183, 9303]
[7561, 9717]
[7924, 10094]
[8286, 10506]
[8646, 10853]
[8987, 11241]
[9349, 11578]


In [39]:
# Persist the labelled sample so it can be reloaded in a later session.
sample.to_pickle('marked_sample.pkl')

In [1]:
import pandas as pd

# Reload the labelled sample and keep one row per distinct text; when a text
# occurs several times, its last occurrence is the one that is kept.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
sample = pd.read_pickle('marked_sample.pkl').drop_duplicates(subset=['contents'], keep='last')

In [7]:
# Persist the de-duplicated sample under a separate file name.
sample.to_pickle('tagged_sample.pkl')

In [40]:
# Score the labelled texts with the currently loaded model (NaN when scoring fails).
sample['sentiment'] = sample['contents'].map(get_sentiment)

In [41]:
# Inspect a 30-row window that starts 300 rows before the end of the sample
# (after dropping fully duplicated rows, keeping the last occurrence).
idx = -300
sample.drop_duplicates(keep='last').iloc[idx:idx+30]

Unnamed: 0,contents,flag,sentiment
60658,7.05可以买了！放心买，今天不会跌破7元！,-1,0.004893
61569,这一天涨一天跌的，小妖精我是折腾不起，走了,-1,0.001118
61658,谁知道葫芦里卖啥药呢。,-1,0.018765
65018,[大便][大便],-1,0.005977
65854,因为微网教育频道推荐了，正常都有20－50%涨幅。立贴为证。,-1,0.001115
72468,京东方柔性屏(四),-1,0.032923
72487,京东方柔性屏(五),-1,0.053095
72592,京东方，下午如果还不放量上攻，坚决清仓！礐怒,-1,0.001795
74232,明天拦腰一斩？,-1,0.002003
75633,尾盘不跌5个点以上。将退出股市,-1,0.013696


In [43]:
# Score a single ad-hoc sentence with the currently loaded model.
get_sentiment('人人挂一単，缔造泛微超级板王')

0.9994957668448866