# 新浪微博用户画像之中文文本处理

---
**记录程序时间**

---

In [1]:
import datetime
# Capture the wall-clock start time; the final cell subtracts it from the end
# time to report how long the whole notebook took.
startTime = datetime.datetime.now()

## 1. 导入库

In [2]:
import pandas as pd
import re
import jieba
import jieba.analyse

## 2. 导入数据

In [3]:
# Load the dataset from the current working directory. Later cells read its
# 'text', 'gender' and 'user_id' columns.
# NOTE(review): no encoding is passed, so pandas' default is used - confirm it
# matches how full.csv was written.
full = pd.read_csv('full.csv')

In [4]:
# Preview the raw 'text' column before any cleaning is applied.
full['text'].head()

0    “雅思口语评分标准”共5讲，5个视频，由新东方谢绍东老师主讲。课程主要内容包括“雅思口语评分...
1    MD，人类早晚要被吃货给毁了！//@编剧肖言: 还能再狠点儿吗，这群臭SB不怕报应就吃吧//...
2    大家帮忙投下C组冯湲http://t.cn/zjxrUmM我参与了@小银星艺术团 发起的投票...
3    等变潇洒哥了的时候我们也包个场@ACE大卫 @吉o0snake 这货今天喝大了打球头疼算怎么...
4    哈哈，太好玩了一万年，这世上没人比她好 看看大家怎么说>>http://t.cn/zW4VG...
Name: text, dtype: object

## 3. 中文文本处理
> 3.1 用中文正则表达式， 获取文本中的中文

> 3.2 过滤中文停用词

> 3.3 获取情感词词典并进行性别评分

> 3.4 导出数据

#### 获取文本中的中文

In [5]:
# Matches every character OUTSIDE the CJK range U+4E00-U+9FA5. Despite the
# name, the pattern is used to strip non-Chinese characters, not to match
# Chinese ones.
get_chinese = re.compile(r'[^\u4e00-\u9fa5]')


def _chinese_tokens(text):
    """Strip all non-Chinese characters from *text* and segment the rest with jieba.

    Returns the list produced by ``jieba.lcut``. A missing value (the float NaN
    that ``read_csv`` produces for an empty field) yields an empty list instead
    of raising ``TypeError``. Any other non-string value still raises, so
    re-running this cell on already-tokenised rows fails loudly rather than
    silently wiping the column.
    """
    if isinstance(text, float) and pd.isna(text):
        return []
    # A single substitution is equivalent to the former
    # split / remove('') / join sequence, without repeatedly rescanning the
    # list in `while '' in ...: remove('')`.
    return jieba.lcut(get_chinese.sub('', text))


# Replace each raw string in 'text' with its list of Chinese tokens.
full['text'] = full['text'].map(_chinese_tokens)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.237 seconds.
Prefix dict has been built succesfully.


In [6]:
# Preview the 'text' column again: each row is now a list of jieba tokens.
full['text'].head()

0    [雅思, 口语, 评分标准, 共, 讲个, 视频, 由, 新东方, 谢绍东, 老师, 主讲,...
1    [人类, 早晚, 要, 被, 吃货, 给, 毁, 了, 编剧, 肖言, 还, 能, 再, 狠...
2    [大家, 帮忙, 投下, 组冯, 湲, 我, 参与, 了, 小, 银星, 艺术团, 发起, ...
3    [等, 变, 潇洒, 哥, 了, 的, 时候, 我们, 也, 包个场, 大卫, 吉, 这货,...
4    [哈哈, 太, 好玩, 了, 一万年, 这, 世上, 没人, 比, 她, 好, 看看, 大家...
Name: text, dtype: object

In [7]:
# Location of the stop-word file (one stop word per line).
filepath = './stop_word.txt'

# Stop-word sets already loaded by delete_stopwords(), keyed by file path.
# Re-running this cell resets the cache, so an edited file is picked up again.
_stopword_cache = {}


def stopwordslist(filepath, encoding=None):
    """Read a stop-word file and return its lines as a list of stripped strings.

    Parameters
    ----------
    filepath : str
        Path of a text file containing one stop word per line.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default, exactly as before.

    Returns
    -------
    list of str
        One entry per line, with surrounding whitespace removed.
    """
    with open(filepath, 'r', encoding=encoding) as f:
        return [line.strip() for line in f]


def delete_stopwords(sentence, stopwords=None):
    """Join the words of *sentence* that are not stop words.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to filter.
    stopwords : container of str, optional
        Words to drop. When omitted, the file named by the module-level
        ``filepath`` is loaded once and cached, instead of being re-read and
        re-parsed on every call.

    Returns
    -------
    str
        Each surviving word followed by one space (so a non-empty result ends
        with a trailing space); ``''`` when nothing survives.
    """
    if stopwords is None:
        if filepath not in _stopword_cache:
            # frozenset gives constant-time membership tests; the original
            # scanned a list for every token.
            _stopword_cache[filepath] = frozenset(stopwordslist(filepath))
        stopwords = _stopword_cache[filepath]
    return ''.join(word + ' ' for word in sentence if word not in stopwords)

#### 过滤停用词

In [8]:
# Replace each row's token list with a space-separated string of the tokens
# that survive stop-word filtering.
for row in range(len(full)):
    tokens = full.at[row, 'text']
    full.at[row, 'text'] = delete_stopwords(tokens)

#### 提取 TF-IDF 权重前10的关键词

In [9]:
# For every row keep the ten highest-weighted keywords as (word, weight) pairs.
full['text'] = full['text'].map(
    lambda doc: jieba.analyse.extract_tags(doc, topK=10, withWeight=True)
)

In [10]:
# Inspect one row's keywords: a list of (word, weight) tuples.
full['text'][2]

[('快盘', 0.14738754455630138),
 ('南京', 0.10462033698857534),
 ('音乐台', 0.10285794485671232),
 ('叶帆', 0.09825836303753424),
 ('金山', 0.06910369486553425),
 ('空间', 0.057839605246027395),
 ('光棍节', 0.05427752248520548),
 ('按钮', 0.05251062760789041),
 ('收藏', 0.05178300238527397),
 ('易经', 0.04920929712958905)]

#### 性别数据分组

In [11]:
# Split the rows by gender. reset_index() renumbers each subset from 0 and
# keeps the old row labels in a new 'index' column, which a later cell deletes.
full_m = full.loc[full['gender'].eq('m')].reset_index()
full_f = full.loc[full['gender'].eq('f')].reset_index()

In [12]:
# Number of rows taken from each gender to build the word dictionary.
genRatingNmber = 200

# The first `genRatingNmber` male rows followed by the first `genRatingNmber`
# female rows are used to build the dictionary ...
rating_parts = [full_m.head(genRatingNmber), full_f.head(genRatingNmber)]
full_genRating = pd.concat(rating_parts, axis=0, ignore_index=True)

# ... and all remaining rows (male first, then female) are the ones scored.
rated_parts = [full_m.iloc[genRatingNmber:], full_f.iloc[genRatingNmber:]]
full_genRated = pd.concat(rated_parts, axis=0, ignore_index=True)

# Discard the 'index' column that reset_index() left in both subsets.
for frame in (full_genRating, full_genRated):
    del frame['index']

#### 获取情感词典

In [13]:
# Accumulate one signed total per keyword: weights from 'm' rows are added and
# weights from 'f' rows are subtracted. Rows with any other gender value
# contribute nothing.
textRating_dict = {}
gender_sign = {'m': 1, 'f': -1}
for gender, keywords in zip(full_genRating['gender'], full_genRating['text']):
    sign = gender_sign.get(gender)
    if sign is None:
        continue
    for word, weight in keywords:
        signed_weight = sign * weight
        if word in textRating_dict:
            textRating_dict[word] += signed_weight
        else:
            textRating_dict[word] = signed_weight

In [14]:
# List the (word, total) pairs with the largest accumulated total first.
sorted(textRating_dict.items(), key=lambda item: item[1], reverse=True)

[('毒蘑菇', 2.835193846100569),
 ('环境中工作', 1.278086931022222),
 ('售楼处', 1.1961314928888889),
 ('英伦', 1.1629656052777777),
 ('英国', 1.1169710251977778),
 ('游戏', 0.9832848427111186),
 ('无耻', 0.9725759083511112),
 ('推荐', 0.9460936422557839),
 ('好久', 0.9442222522333333),
 ('城市', 0.9042802072536326),
 ('十足', 0.8955595549555555),
 ('伊堂', 0.7983150252353923),
 ('工商大学', 0.7749969421984494),
 ('感受', 0.7659980472377778),
 ('留灯', 0.748038754526767),
 ('寒夜', 0.704666856789803),
 ('玩微', 0.6992975410586107),
 ('九江', 0.6927354951234835),
 ('婚庆', 0.6882039648358932),
 ('中国', 0.6418029401770983),
 ('庐山', 0.6068278631205223),
 ('地址', 0.6013690927801849),
 ('顽徒', 0.5894805654990011),
 ('第届', 0.5891747881085386),
 ('冰心', 0.5858698506403183),
 ('不错', 0.5733176703422846),
 ('睡觉', 0.5664829923393826),
 ('博文', 0.5488725990978665),
 ('转载', 0.5463867287934334),
 ('咕咕', 0.5425389351288674),
 ('青蜂', 0.5313230001288889),
 ('侠大卖', 0.5313230001288889),
 ('不赖', 0.5309925641501635),
 ('漫画', 0.5285521981208104),
 ('芸临', 0.

In [15]:
# Display the full dictionary. 'm' rows add keyword weights and 'f' rows
# subtract them, so the sign of a total shows which group contributed more.
textRating_dict

{'英语': 0.18091282303196698,
 '考试': 0.03365515208032184,
 '考研': 0.15002191003873844,
 '老师': 0.20488252345216584,
 '学习': 0.12678941303460622,
 '根杰': 0.056848432787339975,
 '四六级': 0.049953242588575746,
 '名师': 0.049440042794409673,
 '讲堂': 0.048241195853342814,
 '雅思': -0.0012777772054222772,
 '戏剧': 0.10365144175163624,
 '郑飞': 0.06777847546717314,
 '华语': 0.06335958411529652,
 '次品': 0.045971009366866995,
 '三人风': 0.039311515770960426,
 '钓鱼岛': 0.06363040135272546,
 '云雨': 0.037863492002993535,
 '小鬼子': 0.036593103408697134,
 '中国': 0.6418029401770983,
 '梧桐': 0.0809161895219633,
 '快盘': 0.14738754455630138,
 '南京': -0.017440271330865738,
 '音乐台': 0.10285794485671232,
 '叶帆': 0.09825836303753424,
 '金山': 0.06910369486553425,
 '空间': 0.1637821303659179,
 '光棍节': 0.05427752248520548,
 '按钮': 0.05251062760789041,
 '收藏': 0.19176749897450662,
 '易经': 0.04920929712958905,
 '我何': 0.1032678087140752,
 '胡志星': 0.07593221228976117,
 '妞妞': 0.07136251874756099,
 '友善': 0.061216957411902945,
 '好看': 0.05017834097771849,
 '大

#### 情感评分

In [16]:
def _rating_for(keywords):
    """Sum the textRating_dict totals of the keywords found in the dictionary.

    *keywords* is a row's list of (word, weight) pairs. Only the word is used;
    the row's own weight is ignored, as in the original loop. Words absent
    from the dictionary contribute nothing.
    """
    return sum(
        textRating_dict[word]
        for word, _weight in keywords
        if word in textRating_dict
    )


# Assign the whole column at once instead of letting the first row-wise `.at`
# write create it implicitly; this also creates the column when the frame has
# no rows. float64 matches the dtype the row-wise writes ended up with.
full_genRated['textRating'] = pd.Series(
    [_rating_for(keywords) for keywords in full_genRated['text']],
    index=full_genRated.index,
    dtype='float64',
)

#### 整理并导出数据

In [21]:
# Export every column, including 'text': the preview cell below selects it.
# A `full_genRated.drop(['text'], axis=1)` result used to be assigned here and
# was overwritten on the very next line, so it never had any effect; that dead
# assignment has been removed.
full_end = full_genRated

In [23]:
# Preview the scored rows: user id, extracted keywords and the computed score.
full_end[['user_id', 'text', 'textRating']].head()

Unnamed: 0,user_id,text,textRating
0,2091184385,"[(奥特曼, 0.6950338826), (菜头, 0.6401032681650001)...",-1.002012
1,2091185220,"[(无锡, 0.3049203041812383), (电影, 0.257321430433...",0.074631
2,2091189351,"[(澜坊, 0.22805279373002008), (汤臣, 0.17467006865...",-1.428712
3,2091194150,"[(好看, 1.6880508554555556), (短款, 0.672717676816...",-1.947148
4,2091194191,"[(我爱你, 0.644575728569066), (老婆, 0.489221679348...",-0.076988


In [19]:
# Write the scored frame to CSV as UTF-8. index=False is not passed, so the
# row index is written as an extra leading column.
full_end.to_csv('full_end.csv', encoding = 'utf-8')

---
**记录程序时间**

---

In [20]:
# Report the total wall-clock runtime as a timedelta measured from startTime,
# which is set in the first code cell.
endTime = datetime.datetime.now()
print(endTime - startTime)

0:27:41.102009
