# 新浪微博用户画像之数据预处理

---
** 记录程序时间 **

---

In [1]:
import datetime
startTime = datetime.datetime.now()

## 1. 导入库

In [2]:
import pandas as pd

## 2. 导入数据

In [3]:
user_info = pd.read_csv("E:\data_analysis\Graduation design\data\weibo_users.csv")
weibo_info_reader = pd.read_csv("E:\data_analysis\Graduation design\data\weibodata.csv", iterator=True, \
                            names = ['user_id', 'reposts_count', 'comment_count', 'source', 'created_at', 'text'])
weibo_info = weibo_info_reader.get_chunk(10000)

In [20]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2611090 entries, 0 to 2611089
Data columns (total 7 columns):
index          int64
verified       bool
name           object
gender         object
location       object
user_id        int64
description    object
dtypes: bool(1), int64(2), object(4)
memory usage: 122.0+ MB


## 3. 数据预处理
> 3.1 微博数据时间数值转换

> 3.2 微博数据按用户id分组聚合

> 3.3 用户数据去重

> 3.4 用户数据地域数值转换

> 3.5 微博数据和用户数据合并

> 3.6 导出数据

In [5]:
# 创建时间转换函数
def time_transform(df):
    for row in range(df.shape[0]):
        hour = pd.to_datetime(df['created_at'][row]).hour
        if 0 <= hour <= 8:
            df.at[row, 'timeRating'] = 9
        elif 9 <= hour <= 13:
            df.at[row, 'timeRating'] = 1
        elif 14 <= hour <= 18:
            df.at[row, 'timeRating'] = 2
        elif 19 <= hour <= 23:
            df.at[row, 'timeRating'] = 5
    return df

# 分组聚合，再重组
def group_text(df):
    df = df.groupby('user_id').sum()
    df = df.reset_index()
    return df
def group_Rcc(df):
    df = df.groupby('user_id').mean()
    df = df.reset_index()
    return df
def combine_RccText(df):
    return pd.merge(group_Rcc(df), group_text(df[['user_id', 'text']]), on='user_id')

#### 微博时间数据转换与分组聚合

In [6]:
weibo_Prepared = combine_RccText(time_transform(weibo_info))
for i in range(3):
    weibo_info = weibo_info_reader.get_chunk(100000).reset_index()
    temp = combine_RccText( time_transform(weibo_info) )
    weibo_Prepared = pd.concat([temp, weibo_Prepared])
weibo_Prepared = combine_RccText(weibo_Prepared)

In [7]:
location_Dict = {'河北': 1, '山西': 2, '内蒙': 3, '黑龙': 4, '吉林': 5, '辽宁': 6, '陕西': 7, '甘肃': 8,\
                 '青海': 9, '新疆': 10, '维吾': 11, '宁夏': 12, '山东': 13, '河南': 14, '江苏': 15, '浙江': 16,\
                 '安徽': 17, '江西': 18, '福建': 19, '台湾': 20, '湖北': 21, '湖南': 22, '广东': 23, '广西': 24,\
                 '海南': 25, '四川': 26, '云南': 27, '贵州': 28, '西藏': 29, '重庆': 30, '天津': 31, '上海':32,\
                 '北京': 33, '香港': 34, '澳门': 35, '海外': 36, '其他': 37}

In [8]:
weibo_Prepared.head()

Unnamed: 0,user_id,comment_count,index,reposts_count,timeRating,text
0,1427583973,0.324443,18203.922747,0.324443,3.503905,“雅思口语评分标准”共5讲，5个视频，由新东方谢绍东老师主讲。课程主要内容包括“雅思口语评分...
1,1427587605,1.333277,16720.354136,1.333277,3.504647,MD，人类早晚要被吃货给毁了！//@编剧肖言: 还能再狠点儿吗，这群臭SB不怕报应就吃吧//...
2,1427589977,0.091503,,0.091503,3.457516,大家帮忙投下C组冯湲http://t.cn/zjxrUmM我参与了@小银星艺术团 发起的投票...
3,1427590831,0.302523,60721.087617,0.302523,3.564042,等变潇洒哥了的时候我们也包个场@ACE大卫 @吉o0snake 这货今天喝大了打球头疼算怎么...
4,1427591573,0.130364,97834.287879,0.130364,4.935154,哈哈，太好玩了一万年，这世上没人比她好 看看大家怎么说>>http://t.cn/zW4VG...


#### 用户数据去重与用户地域数值转换

In [9]:
# 用户数据去重
user_info = user_info.drop_duplicates(['user_id']).reset_index()
# 两表合并
full = pd.merge(weibo_Prepared, user_info[['user_id', 'location', 'gender']], on = 'user_id')
# 地域评分
full.loc[:, 'location'] = list(map(lambda x: location_Dict[x[0:2]], full.loc[: , 'location']))

In [10]:
user_info.head()

Unnamed: 0,index,verified,name,gender,location,user_id,description
0,0,False,小神万里,m,湖北 武汉,44528425,农民
1,1,False,咯咯spy,m,江苏 扬州,44550011,
2,2,False,魔魅小妖,m,辽宁 大连,44566544,
3,3,False,moyan919,m,宁夏 石嘴山,44596787,
4,4,False,其林,m,甘肃 陇南,44608380,


#### 用户数据和微博数据的合并

In [11]:
full.rename(columns={'location': 'locaRating'}, inplace = True)
full = full[['user_id', 'comment_count', 'reposts_count', 'timeRating', 'locaRating', 'text', 'gender']]

#处理空text
full = full.dropna().reset_index()
del full['index']

In [12]:
full.head()

Unnamed: 0,user_id,comment_count,reposts_count,timeRating,locaRating,text,gender
0,1427583973,0.324443,0.324443,3.503905,33,“雅思口语评分标准”共5讲，5个视频，由新东方谢绍东老师主讲。课程主要内容包括“雅思口语评分...,m
1,1427587605,1.333277,1.333277,3.504647,33,MD，人类早晚要被吃货给毁了！//@编剧肖言: 还能再狠点儿吗，这群臭SB不怕报应就吃吧//...,m
2,1427589977,0.091503,0.091503,3.457516,15,大家帮忙投下C组冯湲http://t.cn/zjxrUmM我参与了@小银星艺术团 发起的投票...,m
3,1427590831,0.302523,0.302523,3.564042,32,等变潇洒哥了的时候我们也包个场@ACE大卫 @吉o0snake 这货今天喝大了打球头疼算怎么...,m
4,1427591573,0.130364,0.130364,4.935154,22,哈哈，太好玩了一万年，这世上没人比她好 看看大家怎么说>>http://t.cn/zW4VG...,m


#### 地域数值的one-hot编码

In [13]:
loc_Df = pd.DataFrame()
loc_Df = pd.get_dummies(full['locaRating'], prefix = 'local')

In [14]:
temp = full.drop(['locaRating'], axis = 1)
fullPrepared = pd.concat([temp, loc_Df], axis = 1)

In [15]:
fullPrepared.head()

Unnamed: 0,user_id,comment_count,reposts_count,timeRating,text,gender,local_1,local_2,local_3,local_4,...,local_27,local_28,local_29,local_30,local_31,local_32,local_33,local_34,local_36,local_37
0,1427583973,0.324443,0.324443,3.503905,“雅思口语评分标准”共5讲，5个视频，由新东方谢绍东老师主讲。课程主要内容包括“雅思口语评分...,m,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1427587605,1.333277,1.333277,3.504647,MD，人类早晚要被吃货给毁了！//@编剧肖言: 还能再狠点儿吗，这群臭SB不怕报应就吃吧//...,m,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1427589977,0.091503,0.091503,3.457516,大家帮忙投下C组冯湲http://t.cn/zjxrUmM我参与了@小银星艺术团 发起的投票...,m,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1427590831,0.302523,0.302523,3.564042,等变潇洒哥了的时候我们也包个场@ACE大卫 @吉o0snake 这货今天喝大了打球头疼算怎么...,m,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1427591573,0.130364,0.130364,4.935154,哈哈，太好玩了一万年，这世上没人比她好 看看大家怎么说>>http://t.cn/zW4VG...,m,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 导出为.csv文件

In [16]:
fullPrepared.to_csv( 'full.csv' , index = False , encoding = 'utf-8')

---
** 记录程序时间 **

---

In [17]:
endTime = datetime.datetime.now()
print(endTime - startTime)

0:02:51.263796
