### Reference:
* https://mp.weixin.qq.com/s?__biz=MzA3NDc3ODg1MQ==&mid=2653658662&idx=1&sn=2f1ae5d82a479539cab9280753742d7c&chksm=84a5afaab3d226bc0120d3be009b83c7bb1406c30e6ab7c8aea90e17c787479a803cae4fe085&scene=21#wechat_redirect

In [1]:
import pandas as pd
import numpy as np

In [2]:
login_data = pd.read_csv('./data./login_data.csv', usecols=[1,2], names=['uid', 'ts'], header=None, parse_dates=['ts'])
login_data.head(3)

Unnamed: 0,uid,ts
0,466,2017-01-07 18:24:07
1,466,2017-01-07 18:24:55
2,458,2017-01-07 18:25:18


In [3]:
login_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   uid     3003 non-null   object        
 1   ts      3003 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 47.0+ KB


In [4]:
login_data['day'] = login_data['ts'].dt.date
login_data.head(3)

Unnamed: 0,uid,ts,day
0,466,2017-01-07 18:24:07,2017-01-07
1,466,2017-01-07 18:24:55,2017-01-07
2,458,2017-01-07 18:25:18,2017-01-07


## aggregate calculate unique count uid per day (daily active user)

In [5]:
uid_count = login_data.groupby('day').aggregate({'uid': lambda x: x.nunique()})
uid_count.reset_index(inplace=True)
uid_count.head()

Unnamed: 0,day,uid
0,2017-01-07,3
1,2017-01-08,3
2,2017-01-09,9
3,2017-01-10,9
4,2017-01-11,8


## Retention rate after N days 
* 需要求day1 活跃的 users # -> dayN 活跃的 users # 

In [6]:
N = 1

In [7]:
login_data['dt_ts'] = login_data['day']
del login_data['ts']
data_1 = login_data.copy()

data_1['dt_ts_N_days_before'] = data_1['dt_ts'] + pd.Timedelta(days=-N)
del data_1['dt_ts']
data_1.head(3)

Unnamed: 0,uid,day,dt_ts_N_days_before
0,466,2017-01-07,2017-01-06
1,466,2017-01-07,2017-01-06
2,458,2017-01-07,2017-01-06


#### 对当前在活跃的 user， 看看N天前是不是也在活跃

In [8]:
merge_1 = pd.merge(login_data, data_1, left_on=['uid', 'dt_ts'], right_on=['uid', 'dt_ts_N_days_before'], how='left')

# merge_1[merge_1.uid.isin(['366', '592'])].head(20)
# merge_1.head(20)

In [9]:
merge_1[merge_1.uid.isin(['458', '592'])].head(10)

Unnamed: 0,uid,day_x,dt_ts,day_y,dt_ts_N_days_before
2,458,2017-01-07,2017-01-07,,
3,458,2017-01-07,2017-01-07,,
4,592,2017-01-07,2017-01-07,2017-01-08,2017-01-07
5,592,2017-01-07,2017-01-07,2017-01-08,2017-01-07
7,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08
8,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08
9,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08
10,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08
11,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08
12,592,2017-01-08,2017-01-08,2017-01-09,2017-01-08


### 上面可以看到对于 `592`,  01-07 and 01-08 都 active. But `458` 就只有 01-07， 01-08 就没有; day_x 是当前的， day_y 是 N天以后的

In [10]:
# 计算第一天活跃的用户数
init_user = merge_1.groupby('day_x').aggregate({'uid': lambda x: x.nunique()})
init_user.reset_index(inplace=True)
init_user.head()

Unnamed: 0,day_x,uid
0,2017-01-07,3
1,2017-01-08,3
2,2017-01-09,9
3,2017-01-10,9
4,2017-01-11,8


In [11]:
# 计算次日活跃的用户数
one_day_remain_user = merge_1[merge_1['day_y'].notnull()].groupby('day_x').aggregate({'uid': lambda x: x.nunique()})
one_day_remain_user.reset_index(inplace=True)
one_day_remain_user.head()

Unnamed: 0,day_x,uid
0,2017-01-07,1
1,2017-01-08,2
2,2017-01-09,5
3,2017-01-10,4
4,2017-01-11,6


In [12]:
# 合并前面两步的结果，计算 retention
merge_one_day = pd.merge(init_user, one_day_remain_user, on=['day_x'])
merge_one_day['one_remain_rate'] = merge_one_day['uid_y'] / merge_one_day['uid_x']
merge_one_day['one_remain_rate'] = merge_one_day['one_remain_rate'].apply(lambda x: format(x, '.2%'))
merge_one_day.head(5)

Unnamed: 0,day_x,uid_x,uid_y,one_remain_rate
0,2017-01-07,3,1,33.33%
1,2017-01-08,3,2,66.67%
2,2017-01-09,9,5,55.56%
3,2017-01-10,9,4,44.44%
4,2017-01-11,8,6,75.00%


## N  days retention

In [13]:
# 1.计算日期差，为后续做准备
merge_all = pd.merge(login_data, login_data, on=['uid'], how='left')
merge_all['diff'] = (merge_all['dt_ts_y'] - merge_all['dt_ts_x']).map(lambda x: x.days)#使用map取得具体数字
merge_all.head()

Unnamed: 0,uid,day_x,dt_ts_x,day_y,dt_ts_y,diff
0,466,2017-01-07,2017-01-07,2017-01-07,2017-01-07,0
1,466,2017-01-07,2017-01-07,2017-01-07,2017-01-07,0
2,466,2017-01-07,2017-01-07,2017-01-10,2017-01-10,3
3,466,2017-01-07,2017-01-07,2017-01-16,2017-01-16,9
4,466,2017-01-07,2017-01-07,2017-01-17,2017-01-17,10


In [14]:
# 2.计算第n天的留存人数，n=0,1,6,13。需要先进行筛选再进行计数，仍然使用nunique
diff_0 = merge_all[merge_all['diff'] == 0].groupby('day_x')['uid'].nunique()
diff_1 = merge_all[merge_all['diff'] == 1].groupby('day_x')['uid'].nunique()
diff_6 = merge_all[merge_all['diff'] == 6].groupby('day_x')['uid'].nunique()
diff_13 = merge_all[merge_all['diff'] == 13].groupby('day_x')['uid'].nunique()
diff_0 = diff_0.reset_index()#groupby计数后得到的是series格式，reset得到dataframe
diff_1 = diff_1.reset_index()
diff_6 = diff_6.reset_index()
diff_13 = diff_13.reset_index()

In [15]:
# 3.对多个dataframe进行一次合并
retention = pd.merge(pd.merge(pd.merge(diff_0, diff_1, on=['day_x'], how='left'), diff_6, 
                                 on=['day_x'], how='left'), diff_13, on=['day_x'], how='left')
retention.head()

Unnamed: 0,day_x,uid_x,uid_y,uid_x.1,uid_y.1
0,2017-01-07,3,1.0,2.0,2.0
1,2017-01-08,3,2.0,1.0,1.0
2,2017-01-09,9,5.0,3.0,
3,2017-01-10,9,4.0,5.0,4.0
4,2017-01-11,8,6.0,5.0,1.0


In [16]:
# 4.对结果重命名，并用0填充na值
retention.columns=['day', 'init', 'one_day_remain', 'seven_day_remain', 'fifteen_day_remain']#后来发现英文写错了，将就看，懒得改了
retention.fillna(0, inplace=True)
retention.head(10)

Unnamed: 0,day,init,one_day_remain,seven_day_remain,fifteen_day_remain
0,2017-01-07,3,1.0,2.0,2.0
1,2017-01-08,3,2.0,1.0,1.0
2,2017-01-09,9,5.0,3.0,0.0
3,2017-01-10,9,4.0,5.0,4.0
4,2017-01-11,8,6.0,5.0,1.0
5,2017-01-12,10,5.0,7.0,3.0
6,2017-01-13,8,2.0,4.0,1.0
7,2017-01-14,4,2.0,3.0,2.0
8,2017-01-15,4,3.0,2.0,1.0
9,2017-01-16,11,5.0,1.0,0.0


### 方法二：[这个好像暂时是错的， 尴尬 !]

* 这种方法是从网上看到的，也放在这里供大家学习，文末有链接。它没有用自关联，而是对日期进行循环，计算当日的活跃用户数和n天后的活跃用户数。把n作为参数传入封装好的函数中。参考下面代码：

In [17]:
login_data = pd.read_csv('./data./login_data.csv', usecols=[1,2], names=['uid', 'ts'], header=None, parse_dates=['ts'])
login_data['day'] = login_data['ts'].dt.date
login_data['dt_ts'] = login_data['day']
del login_data['ts']

login_data.head(3)

Unnamed: 0,uid,day,dt_ts
0,466,2017-01-07,2017-01-07
1,466,2017-01-07,2017-01-07
2,458,2017-01-07,2017-01-07


In [18]:
def cal_n_day_remain(df, n):
    dates = pd.Series(login_data.dt_ts.unique()).sort_values()[:-n]#取截止到n天的日期，保证有n日留存
    users = [] #定义列表存放初始用户数
    remains = []#定义列表存放留存用户数
    for d in dates:
        user = login_data[login_data['dt_ts'] == d]['uid'].unique()#当日活跃用户
        user_n_day = login_data[login_data['dt_ts']==d + pd.Timedelta(n)]['uid'].unique()#n日后活跃用户
        remain = [x for x in user_n_day if x in user]#取交集
        users.append(len(user))
        remains.append(len(remain))
    #一次循环计算一天的n日留存 
    #循环结束后构造dataframe并返回
    remain_df = pd.DataFrame({'days': dates, 'user': users, 'remain': remains})
    return remain_df

In [19]:
one_day_remain = cal_n_day_remain(login_data, 1)
seven_day_remain = cal_n_day_remain(login_data, 6)
fifteen_day_remain = cal_n_day_remain(login_data, 13)

retention2 = pd.merge(pd.merge(one_day_remain, seven_day_remain[['days', 'remain']], on=['days'], how='left'), 
                     fifteen_day_remain[['days', 'remain']], on=['days'], how='left')
retention2.head(10)

Unnamed: 0,days,user,remain_x,remain_y,remain
0,2017-01-07,3,3,3.0,3.0
1,2017-01-08,3,3,3.0,3.0
2,2017-01-09,9,9,9.0,9.0
3,2017-01-10,9,9,9.0,9.0
4,2017-01-11,8,8,8.0,8.0
5,2017-01-12,10,10,10.0,10.0
6,2017-01-13,8,8,8.0,8.0
7,2017-01-14,4,4,4.0,4.0
8,2017-01-15,4,4,4.0,4.0
9,2017-01-16,11,11,11.0,11.0
