## 导入模块

In [1]:

import time  # 时间库
import numpy as np  # numpy库
import pandas as pd  # pandas库
import pymysql  # mysql连接库
from pyecharts.charts import Bar3D # 3D柱形图


## 读取数据

In [2]:

sheet_names = ['2015','2016','2017','2018','会员等级']
sheet_datas = [pd.read_excel('data/sales.xlsx',sheet_name=i) for i in sheet_names]


In [3]:
for each_name,each_data in zip(sheet_names,sheet_datas):    
    print('[data summary for ============={}===============]'.format(each_name))
    print('Overview:','\n',each_data.head(4))# 展示数据前4条
    print('DESC:','\n',each_data.describe())# 数据描述性信息
    print('NA records',each_data.isnull().any(axis=1).sum()) # 缺失值记录数    
    print('Dtypes',each_data.dtypes) # 数据类型


Overview: 
           会员ID         订单号       提交日期    订单金额
0  15278002468  3000304681 2015-01-01   499.0
1  39236378972  3000305791 2015-01-01  2588.0
2  38722039578  3000641787 2015-01-01   498.0
3  11049640063  3000798913 2015-01-01  1572.0
DESC: 
                会员ID           订单号           订单金额
count  3.077400e+04  3.077400e+04   30774.000000
mean   2.918779e+10  4.020414e+09     960.991161
std    1.385333e+10  2.630510e+08    2068.107231
min    2.670000e+02  3.000305e+09       0.500000
25%    1.944122e+10  3.885510e+09      59.000000
50%    3.746545e+10  4.117491e+09     139.000000
75%    3.923593e+10  4.234882e+09     899.000000
max    3.954613e+10  4.282025e+09  111750.000000
NA records 0
Dtypes 会员ID             int64
订单号              int64
提交日期    datetime64[ns]
订单金额           float64
dtype: object
Overview: 
           会员ID         订单号       提交日期    订单金额
0  39288120141  4282025766 2016-01-01    76.0
1  39293812118  4282037929 2016-01-01  7599.0
2  27596340905  4282038740 2016-0

In [4]:
# 去除缺失值和异常值
for ind,each_data in enumerate(sheet_datas[:-1]):    
    sheet_datas[ind] = each_data.dropna()# 丢弃缺失值记录
    sheet_datas[ind] = each_data[each_data['订单金额'] > 1]# 丢弃订单金额<=1的记录
    sheet_datas[ind]['max_year_date'] = each_data['提交日期'].max() # 增加一列最大日期值



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## 汇总所有数据

In [5]:
data_merge = pd.concat(sheet_datas[:-1],axis=0)
# 获取各自年份数据
data_merge['date_interval'] = data_merge['max_year_date']-data_merge['提交日期']
data_merge['year'] = data_merge['提交日期'].dt.year
# 转换日期间隔为数字
data_merge['date_interval'] = data_merge['date_interval'].apply(lambda x: x.days)
data_merge.head()


Unnamed: 0,会员ID,订单号,提交日期,订单金额,max_year_date,date_interval,year
0,15278002468,3000304681,2015-01-01,499.0,2015-12-31,364,2015
1,39236378972,3000305791,2015-01-01,2588.0,2015-12-31,364,2015
2,38722039578,3000641787,2015-01-01,498.0,2015-12-31,364,2015
3,11049640063,3000798913,2015-01-01,1572.0,2015-12-31,364,2015
4,35038752292,3000821546,2015-01-01,10.1,2015-12-31,364,2015


## 按会员ID做聚合

In [7]:
rfm_gb = data_merge.groupby(['year','会员ID'],as_index=False).agg(
    {'date_interval': 'min',  # 计算最近一次订单时间
     '提交日期': 'count', # 计算订单频率
     '订单金额': 'sum'})  # 计算订单总金额
# 重命名列名
rfm_gb.columns =  ['year','会员ID','r','f','m']
rfm_gb.head()



Unnamed: 0,year,会员ID,r,f,m
0,2015,267,197,2,105.0
1,2015,282,251,1,29.7
2,2015,283,340,1,5398.0
3,2015,343,300,1,118.0
4,2015,525,37,3,213.0


In [8]:
# 查看数据分布
desc_pd = rfm_gb.iloc[:,2:].describe().T
desc_pd


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
r,148591.0,165.524043,101.988472,0.0,79.0,156.0,255.0,365.0
f,148591.0,1.365002,2.626953,1.0,1.0,1.0,1.0,130.0
m,148591.0,1323.741329,3753.906883,1.5,69.0,189.0,1199.0,206251.8


In [9]:
# 定义区间边界
r_bins = [-1,79,255,365] # 注意起始边界小于最小值
f_bins = [0,2,5,130] 
m_bins = [0,69,1199,206252]


In [10]:
# RFM分箱得分
rfm_gb['r_score'] = pd.cut(rfm_gb['r'], r_bins, labels=[i for i in range(len(r_bins)-1,0,-1)])  # 计算R得分
rfm_gb['f_score'] = pd.cut(rfm_gb['f'], f_bins, labels=[i+1 for i in range(len(f_bins)-1)])  # 计算F得分
rfm_gb['m_score'] = pd.cut(rfm_gb['m'], m_bins, labels=[i+1 for i in range(len(m_bins)-1)])  # 计算M得分


In [11]:
rfm_gb['r_score'] = rfm_gb['r_score'].astype(np.str)
rfm_gb['f_score'] = rfm_gb['f_score'].astype(np.str)
rfm_gb['m_score'] = rfm_gb['m_score'].astype(np.str)
rfm_gb['rfm_group'] = rfm_gb['r_score'].str.cat(rfm_gb['f_score']).str.cat(rfm_gb['m_score'])


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [None]:
# rfm_gb.to_excel('sales_rfm_score1.xlsx')  # 保存数据为Excel


## RFM图形展示

In [12]:
display_data = rfm_gb.groupby(['rfm_group','year'],as_index=False)['会员ID'].count()
display_data.columns = ['rfm_group','year','number']
display_data['rfm_group'] = display_data['rfm_group'].astype(np.int32)
display_data.head()


Unnamed: 0,rfm_group,year,number
0,111,2015,2180
1,111,2016,1498
2,111,2017,3169
3,111,2018,2271
4,112,2015,3811


In [14]:
# 显示图形
from pyecharts.commons.utils import JsCode
from pyecharts import options as opts


range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
               '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
range_max = int(display_data['number'].max())
c = (
    Bar3D()#设置了一个3D柱形图对象
    .add(
        "",#标题
        [d.tolist() for d in display_data.values],#数据
        xaxis3d_opts=opts.Axis3DOpts( type_="category",name='分组名称'),#x轴数据类型，名称
        yaxis3d_opts=opts.Axis3DOpts( type_="category",name='年份'),#y轴数据类型，名称
        zaxis3d_opts=opts.Axis3DOpts(type_="value",name='会员数量'),#z轴数据类型，名称
    )
    .set_global_opts(#设置颜色，及不同取值对应的颜色
        visualmap_opts=opts.VisualMapOpts(max_=range_max,range_color=range_color),
        title_opts=opts.TitleOpts(title="RFM分组结果"),#设置标题
    )
)
c.render_notebook() #在notebook中显示

  super().__init__(init_opts=init_opts)
