In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Plot theme configuration: apply seaborn's default theme with the
# 'notebook' plotting context in a single call.
sns.set_theme(context='notebook')

# 数据获取
数据集包含5个字段，分别表示用户id、商品id、用户行为类型、商品类别以及时间信息。充分理解这些字段的含义是数据分析的基础，下面列出各字段的主要信息：

| 字段          | 字段说明                     | 提取说明                             |
|---------------|-----------------------------|-------------------------------------|
| `user_id`     | 用户标识                     | 抽样和字段脱敏                       |
| `item_id`     | 商品标识                     | 字段脱敏                             |
| `behavior_type` | 用户对商品的行为类型         | 包括浏览、收藏、加购物车、购买，对应取值分别是1、2、3、4 |
| `item_category` | 商品分类标识                 | 字段脱敏                             |
| `time`        | 行为时间                     | 精确到小时级别                       |

In [2]:
# Load the user-behaviour log from the zipped CSV and preview the first 20 rows.
dataset_path = "../dataset/alibaba_eshop_user_action.zip"
data_user = pd.read_csv(dataset_path)
data_user.head(20)

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,98047837,232431562,1,4245,2014-12-06 02
1,97726136,383583590,1,5894,2014-12-09 20
2,98607707,64749712,1,2883,2014-12-18 11
3,98662432,320593836,1,6562,2014-12-06 10
4,98145908,290208520,1,13926,2014-12-16 21
5,93784494,337869048,1,3979,2014-12-03 20
6,94832743,105749725,1,9559,2014-12-13 20
7,95290487,76866650,1,10875,2014-11-27 16
8,96610296,161166643,1,3064,2014-12-11 23
9,100684618,21751142,3,2158,2014-12-05 23


In [3]:
# Inspect the scale of the dataset: total row count plus the number of
# distinct users, items and item categories.
print('整体数据的大小为: ', len(data_user))

# Series.nunique() counts the distinct values inside pandas, instead of first
# copying every value of the column into a Python set just to take its length.
print('数据集中用户数量是: ', data_user['user_id'].nunique())
print('数据集中商品数量是: ', data_user['item_id'].nunique())
print('数据集中商品类别数量是: ', data_user['item_category'].nunique())

整体数据的大小为:  12256906
数据集中用户数量是:  10000
数据集中商品数量是:  2876947
数据集中商品类别数量是:  8916


In [4]:
# Check for missing data: count the null entries in each column.
missing_per_column = data_user.isnull().sum()
missing_per_column

user_id          0
item_id          0
behavior_type    0
item_category    0
time             0
dtype: int64

In [5]:
# Split the "YYYY-MM-DD HH" time string into a day part (date) and an hour part (hour).
# One .str.split call splits each timestamp once, replacing the two per-row
# Python lambdas that each split the same string again. Both new columns are
# still plain strings at this point.
time_parts = data_user['time'].str.split(' ', expand=True)
data_user['date'] = time_parts[0]  # text before the first space
data_user['hour'] = time_parts[1]  # text between the first and second space
del time_parts  # drop the temporary frame; its columns now live in data_user
data_user.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time,date,hour
0,98047837,232431562,1,4245,2014-12-06 02,2014-12-06,2
1,97726136,383583590,1,5894,2014-12-09 20,2014-12-09,20
2,98607707,64749712,1,2883,2014-12-18 11,2014-12-18,11
3,98662432,320593836,1,6562,2014-12-06 10,2014-12-06,10
4,98145908,290208520,1,13926,2014-12-16 21,2014-12-16,21


In [6]:
# Inspect the dtype of every column (the date/hour columns produced by the
# string split above are still stored as strings at this point).
data_user.dtypes

user_id           int64
item_id           int64
behavior_type     int64
item_category     int64
time             object
date             object
hour             object
dtype: object

In [None]:
# Data type conversion: store the identifier columns with the generic object
# dtype, parse the day strings into datetimes and turn the hour strings into
# 64-bit integers.
for id_column in ('user_id', 'item_id', 'item_category'):
    data_user[id_column] = data_user[id_column].astype('object')
data_user['date'] = pd.to_datetime(data_user['date'])
data_user['hour'] = data_user['hour'].astype('int64')

In [None]:
# Re-inspect the column dtypes to confirm the conversions in the previous cell took effect.
data_user.dtypes