- 导入库

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

- 导入数据

In [2]:
# Read the five worksheets (repayments, recharges, investments, large-amount
# user list, verification records) while the workbook is open.
with pd.ExcelFile(r'C:\百度云同步盘\小鸡理财\数据报告\月报\2月\经典转存管.xlsx') as xlsx:
    df_hk, df_cz, df_tz, df_md, df_rz = (
        pd.read_excel(xlsx, sheet_name=sheet)
        for sheet in ('回款', '充值', '投资', '大额名单', '认证')
    )

1.把所有表格内身份证里的字母都转换成大写

In [4]:
# Upper-case the ID column of every table so that IDs written with a
# lower-case letter compare equal across tables.
for sheet_df in (df_hk, df_cz, df_tz, df_md, df_rz):
    sheet_df['身份证'] = sheet_df['身份证'].str.upper()

2.数据清洗，去除有空值的行

In [11]:
# Remove, in place, every row that has a missing value in any column.
for sheet_df in (df_hk, df_cz, df_tz, df_md, df_rz):
    sheet_df.dropna(inplace=True)

3.统计经典版回款的所有用户，并分为大额和非大额

In [12]:
# Distinct repayment users: keep the first occurrence of each ID.
df_yh = df_hk.loc[~df_hk['身份证'].duplicated(), '身份证']

总人数

In [13]:
# Number of distinct repayment users, shown as a 1-tuple.
(len(df_yh),)

(4871,)

大额用户

In [14]:
# IDs on the large-amount user list.
df_dae_yh = df_md.loc[:, '身份证']

非大额用户

In [22]:
# Repayment users that are not on the large-amount list.
df_pt_yh = df_yh.loc[lambda ids: ~ids.isin(df_dae_yh)]

In [23]:
# Number of ordinary (non-large-amount) users, shown as a 1-tuple.
(len(df_pt_yh),)

(4566,)

4.统计所有用户、大额用户、普通用户开通存管的比例

In [61]:
# All repayment users: how many appear (True) / do not appear (False)
# in the verification table.
df_yh.isin(df_rz.loc[:, '身份证']).value_counts(normalize=False)

True     4744
False     127
Name: 身份证, dtype: int64

In [62]:
# Large-amount list: how many IDs appear (True) / do not appear (False)
# in the verification table.
df_dae_yh.isin(df_rz.loc[:, '身份证']).value_counts(normalize=False)

True     297
False      9
Name: 身份证, dtype: int64

In [31]:
# Ordinary users: share found (True) / not found (False) in the verification table.
df_pt_yh.isin(df_rz.loc[:, '身份证']).value_counts(normalize=True)

True     0.973938
False    0.026062
Name: 身份证, dtype: float64

4-1.排除用户中年龄小于18岁的

In [55]:
def is18(x, today=None):
    """Return True when the holder of ID number ``x`` is at least 18 years old.

    Parameters
    ----------
    x : str
        Identity-card number. Characters ``x[6:14]`` are read as the birth
        date in ``YYYYMMDD`` form.
    today : date-like, optional
        Reference date for the age check; anything ``pd.Timestamp`` accepts.
        Defaults to the current local time.

    Returns
    -------
    bool
        False if the holder is younger than 18 on the reference date,
        True otherwise.

    Raises
    ------
    ValueError
        If ``x[6:14]`` is not a valid ``YYYYMMDD`` date.
    """
    birth_date = pd.to_datetime(x[6:14], format='%Y%m%d')
    # pd.datetime no longer exists in current pandas; pd.Timestamp is the
    # supported way to get "now".
    ref = pd.Timestamp.now() if today is None else pd.Timestamp(today)

    # Count completed birthdays instead of dividing a timedelta by an
    # "average year": that division is rejected by current pandas and is
    # inexact around the 18th birthday.
    had_birthday = (ref.month, ref.day) >= (birth_date.month, birth_date.day)
    age = ref.year - birth_date.year - (0 if had_birthday else 1)
    return age >= 18

In [60]:
# Count repayment users aged 18 or over (True) and under 18 (False).
df_yh.map(is18).value_counts()

True     4869
False       2
Name: 身份证, dtype: int64

5.统计开通存管用户在存管投资比例

- 整理数据：保留到日期，筛选出大额用户

- 保留日期的方法

仅保留日期
df['just_date'] = df['dates'].dt.date

保留日期并将时间归零（结果仍为 datetime 类型）
df['normalised_date'] = df['dates'].dt.normalize()

In [None]:
# Keep only the calendar date of each repayment, as the recharge and
# investment cells do. Without the truncation, rows carrying a time of day
# would not match the daily keys used in the later merges.
df_hk['时间'] = pd.to_datetime(df_hk['发放时间']).dt.date
df_hk['身份证'] = df_hk['身份证'].str.upper()
df_md['身份证'] = df_md['身份证'].str.upper()

# Restrict to users on the large-amount list and drop the raw timestamp column.
df_hk_ok = df_hk.loc[df_hk['身份证'].isin(df_md['身份证'])].drop(columns='发放时间')
df_hk_ok.head()

In [None]:
# Normalise ID case and keep only the calendar date of each successful recharge.
df_cz['身份证'] = df_cz['身份证'].str.upper()
df_cz['时间'] = pd.to_datetime(df_cz['成功时间']).dt.date

# Restrict to users on the large-amount list and drop the raw timestamp column.
df_cz_ok = df_cz.loc[df_cz['身份证'].isin(df_md['身份证'])].drop(columns='成功时间')
df_cz_ok.head()

In [None]:
# Normalise ID case and keep only the calendar date of each investment.
df_tz['身份证'] = df_tz['身份证'].str.upper()
df_tz['时间'] = pd.to_datetime(df_tz['投资时间']).dt.date

# Restrict to users on the large-amount list and drop the raw timestamp column.
df_tz_ok = df_tz.loc[df_tz['身份证'].isin(df_md['身份证'])].drop(columns='投资时间')
df_tz_ok.head()

- 合并回款、充值、投资

In [None]:
# Sum each table per (day, ID) pair, keeping the two keys as ordinary columns.
df_hk_gp, df_cz_gp, df_tz_gp = (
    filtered.groupby(['时间', '身份证'], as_index=False).sum()
    for filtered in (df_hk_ok, df_cz_ok, df_tz_ok)
)

In [None]:
# Convert the day key of every grouped table to pandas datetimes.
for grouped in (df_hk_gp, df_cz_gp, df_tz_gp):
    grouped['时间'] = pd.to_datetime(grouped['时间'])

In [None]:
# Outer-join the three daily tables so that a (day, ID) pair present in any
# one of them is kept.
gp_temp = df_hk_gp.merge(df_cz_gp, how='outer', on=['时间', '身份证'])
gp = gp_temp.merge(df_tz_gp, how='outer', on=['时间', '身份证'])
gp.info()

- ！扩充每个用户的数据到20180301-20180731的每一天

In [None]:
# One row per calendar day of the reporting window (2018-03-01 .. 2018-07-31).
time = pd.DataFrame({'时间': pd.date_range(start='20180301', end='20180731', freq='D')})

# Cross every day with every large-amount user. IDs are de-duplicated first:
# a repeated ID would produce repeated (day, ID) rows, and the later merge
# would then count that user's amounts more than once.
index = pd.MultiIndex.from_product(
    [time['时间'], df_md['身份证'].drop_duplicates()],
    names=['时间', '身份证'],
)

# Turn the index levels into the two ordinary columns '时间' and '身份证'.
df = index.to_frame(index=False)
df.info()
df.head()

In [None]:
# Attach the daily amounts to the full day x user grid; pairs without any
# activity keep their row with missing amounts.
gp_tm = df.merge(gp, how='left', on=['时间', '身份证'])
gp_tm

- 按每个用户计算累加值

In [None]:
# Missing values become 0.
gp_tm = gp_tm.fillna(value=0)

In [None]:
# Running totals per user, accumulated in gp_tm's existing row order.
# GroupBy.cumsum returns a Series on gp_tm's own index, so it can be assigned
# straight back. The previous apply(lambda x: x.cumsum()) form returns a
# group-keyed index on current pandas and cannot be assigned to the frame.
gp_tm['累计还款'] = gp_tm.groupby('身份证')['还款金额'].cumsum()
gp_tm['累计充值'] = gp_tm.groupby('身份证')['充值金额'].cumsum()
gp_tm['累计投资'] = gp_tm.groupby('身份证')['账户资金'].cumsum()

In [None]:
# Preview the first five rows.
gp_tm.iloc[:5]

In [None]:
# Spot-check the rows of one user.
# NOTE(review): the literal below is an ID number hard-coded in the notebook;
# consider redacting it and clearing this output before sharing.
gp_tm.loc[gp_tm['身份证'].eq('130406198605270317')]

- 累加值封顶：累计充值不超过累计还款，累计投资不超过封顶后的累计充值（还款 ≥ 充值 ≥ 投资）

In [None]:
# Cap cumulative recharge at cumulative repayment: wherever repayment is the
# smaller value, take it instead.
gp_tm['累计充值_ok'] = gp_tm['累计充值'].mask(gp_tm['累计还款'] < gp_tm['累计充值'], gp_tm['累计还款'])
# Cap cumulative investment at the capped recharge in the same way.
gp_tm['累计投资_ok'] = gp_tm['累计投资'].mask(gp_tm['累计充值_ok'] < gp_tm['累计投资'], gp_tm['累计充值_ok'])

- 按日汇总数据

In [None]:
# Daily totals across all users. The columns are selected with a list:
# tuple-style selection on a GroupBy (['a', 'b'] without the inner brackets)
# is no longer accepted by current pandas.
res = gp_tm.groupby('时间')[['累计还款', '累计投资_ok']].sum()
res

- 导出结果

In [None]:
# Write the daily totals to an Excel workbook in the current working directory.
res.to_excel(excel_writer='大额用户经典版回款新充投资比.xlsx')