In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import datetime

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [7]:
from datetime import datetime
def datelist(beginDate, endDate):
    # beginDate, endDate是形如‘20160601’的字符串或datetime格式
    date_l=[datetime.strftime(x,'%Y-%m-%d') for x in list(pd.date_range(start=beginDate, end=endDate))]
    return date_l

In [20]:
%%time
M_data = pd.DataFrame(pd.read_csv("data/MemberData.csv",low_memory=False))
O_data = pd.DataFrame(pd.read_csv("data/OrderData.csv",low_memory=False))
OS_data = pd.DataFrame(pd.read_csv("data/OrderSlaveData.csv",low_memory=False))
M_data['RegisterDateTime'] = pd.to_datetime(M_data['RegisterDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 
OS_data['TradesDateTime'] = pd.to_datetime(OS_data['TradesDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 
O_data['TradesDateTime'] = pd.to_datetime(O_data['TradesDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 

Wall time: 6.08 s


In [21]:
print("MemberData資料筆數：",M_data.shape)
print("OrderData資料筆數：",O_data.shape)
print("OrderSlaveData資料筆數：",OS_data.shape)

MemberData資料筆數： (337576, 11)
OrderData資料筆數： (502402, 16)
OrderSlaveData資料筆數： (841332, 15)


In [22]:
M_data['RegisterDateTime'] = pd.to_datetime(M_data['RegisterDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 
OS_data['TradesDateTime'] = pd.to_datetime(OS_data['TradesDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 
O_data['TradesDateTime'] = pd.to_datetime(O_data['TradesDateTime'].astype(str).str[0:10], format = "%Y-%m-%d") 

In [24]:
M_data.head()

Unnamed: 0,RegisterSourceTypeDef,RegisterDateTime,Gender,Birthday,IsAppInstalled,IsEnableEmail,IsEnablePushNotification,IsEnableShortMessage,MemberCardLevel,MemberID,eland_uuid
0,iOSApp,2014-12-27,Female,1983-10-19,True,True,True,True,10,b%2Be6RCNYUwTRO%2BnSBeE4H7ySets9zgOaGkqDYItQZS...,
1,,2015-05-29,Female,1990-03-13,True,True,True,True,10,%2FL156mdTEuzDIF5ulSFL4mhJdNRGs8haMhCgz%2BSrG9...,
2,,2013-10-23,,1900-01-01,True,True,True,True,10,wj5w28khA7PlMGGXYaUHIfTxKhjae8pnB76k46OL%2Bhc%3D,
3,,2015-06-27,Female,1977-03-14,True,True,True,True,10,rvJ4K4pF5BYUVUVZnRdcsc0AMlVLX0T2warN0CyPKmU%3D,
4,,2015-06-03,Female,1983-01-15,True,True,True,True,10,Oalq5zcQazy%2B%2BnozogEXMbTItDObkJMUPzz4H%2Fn7...,


In [25]:
O_data.head()

Unnamed: 0,TradesGroupCode,TradesDateTime,ChannelType,ChannelDetail,PaymentType,ShippingType,TsCount,Qty,TotalSalesAmount,TotalPrice,TotalDiscount,TotalPromotionDiscount,TotalCouponDiscount,TotalLoyaltyDiscount,Status,MemberID
0,TG190926B00185,2019-09-26,OfficialECom,MobileWeb,JKOPay,Home,1,1,680,680,0,0,0,0,Finish,JAE7ypAI%2BmBkMAd1REpPJI3zcEQBI9WBTeg5QlsEri4%3D
1,TG190926X00450,2019-09-26,OfficialECom,iOSApp,JKOPay,FamilyPickup,4,4,2660,2760,-100,0,-100,0,Finish,UB%2F0WwXAdW5GhwWmVB036mhSMcUnbK1IA5kXp31h4XQ%3D
2,TG190928M00620,2019-09-28,OfficialECom,MobileWeb,JKOPay,FamilyPickup,1,1,1016,1280,-264,-64,-200,0,Finish,rKUVLmkXyUqqVN8zU1UB40kuhBLRok8FvQtDZTGIS00%3D
3,TG190928U00383,2019-09-28,OfficialECom,iOSApp,JKOPay,FamilyPickup,1,1,1380,1480,-100,0,-100,0,Finish,WpUUri%2BOuznMIisbG5ramILffQkqoQr8TPOj0L8lqN8%3D
4,TG190929R00721,2019-09-29,OfficialECom,MobileWeb,JKOPay,FamilyPickup,1,1,1780,1880,-100,0,-100,0,Cancel,acrdzjptXw%2FDeckXY4b5R89lYb41BC0H0bEgiTo9Zj8%3D


In [26]:
OS_data.head()

Unnamed: 0,TradesGroupCode,TradesDateTime,ChannelType,ChannelDetail,PaymentType,ShippingType,OuterProductSkuCode,ProductSkuCode,SalePageCode,Qty,TotalSalesAmount,TotalPrice,TotalDiscount,Status,MemberID
0,TG190925A00551,2019-09-25,OfficialECom,MobileWeb,JKOPay,Home,9021-1BE35,15016264.0,5074360.0,1,4040,4140,-100,Cancel,tZku5%2Bsy%2F4HUcFAJTcgByc5UWKnrXkyL7ikewfJwoE...
1,TG190925A00551,2019-09-25,OfficialECom,MobileWeb,JKOPay,Home,6182-1DB35,16417049.0,5515813.0,1,4040,4140,-100,Cancel,tZku5%2Bsy%2F4HUcFAJTcgByc5UWKnrXkyL7ikewfJwoE...
2,TG190926X00450,2019-09-26,OfficialECom,iOSApp,JKOPay,FamilyPickup,8309-3WH34,11397813.0,3904262.0,1,2660,2760,-100,Finish,UB%2F0WwXAdW5GhwWmVB036mhSMcUnbK1IA5kXp31h4XQ%3D
3,TG190926X00450,2019-09-26,OfficialECom,iOSApp,JKOPay,FamilyPickup,SSS84,16578545.0,5575777.0,1,2660,2760,-100,Finish,UB%2F0WwXAdW5GhwWmVB036mhSMcUnbK1IA5kXp31h4XQ%3D
4,TG190928U00383,2019-09-28,OfficialECom,iOSApp,JKOPay,FamilyPickup,002-3WH40,8771599.0,3136651.0,1,1380,1480,-100,Finish,WpUUri%2BOuznMIisbG5ramILffQkqoQr8TPOj0L8lqN8%3D


In [27]:
list_ = datelist('20180627','20200430')
print("共計總天數",len(list_)+1)

共計總天數 675


In [28]:
M_data['MemberID'].value_counts()
# 代表有會員id是重複的

jxYhHrEqmnI43qyBLvKlZQ%3D%3D                        16197
vCVjyMZUAth%2BoZkI3ov4kC6HErGEE7HdBFQgmZbTIkA%3D       74
sc1vUHKOmjfRYu7qt8hrWfFBIBPgYkQzs%2B5spbRpons%3D       66
qlSH7MDmiJYVKo8z7MxNaRVjgWId5R3tTFqzARGr4RQ%3D         51
GVtcqdyZ0BpNrwir6S6JtflRovLyJkMNkjqr92i9xrM%3D         49
                                                    ...  
pv6uFZCd9gZo51FCL7S4XJZ7EJjSmQMdIKFPAmZ1Oak%3D          1
r0w%2FrX2op2j0RV0b4ZH6nEBhz09Ph1jl3jZc6CJXjtw%3D        1
efl9HCNDGT1SipSlM18%2BjxalfdvUsU0lVeSAOHNuS9c%3D        1
16LbfPHuA94CZJi%2Ba0pNvLgEjkCSsZPtbzNgujhGJeY%3D        1
NjcJkTNEh2iC1JpioO%2Bvkbbh8ZaR4YZU5XwVLVwG1lk%3D        1
Name: MemberID, Length: 313668, dtype: int64

In [23]:
%%time
df = pd.DataFrame(pd.read_csv("data/Behavior/123_new/productPageView_2018-06-26.csv",low_memory=False))
df = pd.DataFrame(df, columns = ['uid','ht','pr1id'] )
for i in range(len(list_)):
    path = 'data/Behavior/123_new/productPageView_' + list_[i] + '.csv'
    df1 = pd.read_csv(path, low_memory=False)
    df1 = pd.DataFrame(df1, columns = ['uid','ht','pr1id'] )
    df = pd.concat([df, df1], axis = 0)
# print(df.shape)

Wall time: 1min 44s


In [24]:
df

Unnamed: 0,uid,ht,pr1id
0,,1529984856928,41490
1,,1530052755380,41490
2,,1530007429153,41490
3,,1530005672236,3960793
4,,1530007683619,3960793
...,...,...,...
14900,4oXbuD0UAWgi8f2o%2FzFUxDvsGDgla9VkiFdyx4f9SXY%3D,1588250697451,5898619
14901,0joCRJv2K5ypbluZEmEDyrC5wZuJ5bZ4L7tGbATY2ms%3D,1588223230535,5993550
14902,0joCRJv2K5ypbluZEmEDyrC5wZuJ5bZ4L7tGbATY2ms%3D,1588223394110,5993550
14903,0joCRJv2K5ypbluZEmEDyrC5wZuJ5bZ4L7tGbATY2ms%3D,1588207798103,5288532


In [25]:
import datetime
df.loc[:,'ht'] = [datetime.datetime.fromtimestamp(i/1000).strftime("%Y-%m-%d") for i in df['ht']]
df['ht'] = pd.to_datetime(df['ht'].astype(str).str[0:10], format = "%Y-%m-%d")
print(df.shape)

(12019168, 3)


In [26]:
df = df.dropna()
print(df.shape)

(5644067, 3)


In [27]:
df.to_csv('View_Page.csv', index = False)

In [4]:
View = pd.read_csv('View_Page.csv',low_memory=False)
View = reduce_mem_usage(View)
View = pd.DataFrame(View, columns = ['uid','ht','pr1id'] )
View.head()

Mem. usage decreased to 107.65 Mb (16.7% reduction)


Unnamed: 0,uid,ht,pr1id
0,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4440043
1,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4450808
2,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4508572
3,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,3533076
4,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4456042


In [5]:
print("商品數",len(View['pr1id'].unique())) # 商品數
print(View.shape)

商品數 3878
(5644067, 3)


In [8]:
%%time
from datetime import datetime
list_ = datelist('20180626','20190101')
print("day", len(list_))
sub = View[View['ht']<list_[-1]]
sub.head()
print(sub.shape)
sub = sub.drop_duplicates(subset=None, keep='first', inplace=False)
print(sub.shape)

day 190
(1202531, 3)
(945684, 3)
Wall time: 1.71 s


In [9]:
sub

Unnamed: 0,uid,ht,pr1id
0,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4440043
1,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4450808
2,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4508572
3,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,3533076
4,YP4rsCTcLZMhO8CXsQDoB5KonLHEM6km1ycEBT9Hxtw%3D,2018-06-26,4456042
...,...,...,...
1203337,Uaf5V%2B8kLv73Yhc2L1Jq8eG8gPhx8yYiNCUVifDXNZw%3D,2018-12-31,4841961
1203339,Uaf5V%2B8kLv73Yhc2L1Jq8eG8gPhx8yYiNCUVifDXNZw%3D,2018-12-31,4841963
1203340,Uaf5V%2B8kLv73Yhc2L1Jq8eG8gPhx8yYiNCUVifDXNZw%3D,2018-12-31,4841955
1203341,Uaf5V%2B8kLv73Yhc2L1Jq8eG8gPhx8yYiNCUVifDXNZw%3D,2018-12-31,4780430


In [10]:
list_1 = datelist('20190101','20190630')

In [11]:
%%time
view_matrix = pd.pivot_table(sub, index="uid", columns="pr1id", aggfunc='count')
view_matrix.fillna(value=0, inplace=True)
view_matrix 

Wall time: 11.6 s


Unnamed: 0_level_0,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht
pr1id,41488,41490,41560,41562,153024,153049,153055,154238,154247,154251,...,5073721,5073726,5074265,5074360,5074457,5074545,5074580,5074637,5074706,5074753
uid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
%2B%2B8NpYVoqyjmc8mROiPMlX%2F2J8JlOghaysSF3qbYCVE%3D,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2B8lPULC%2BdY%2F3MWf4kbymzr1cx%2B3Eaktr2fvQHuFH78%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BB3LdtIYEk9XGz9jG%2F5n7C740nRzI02Ig7uHKfNT90%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BH2qr3ZTZV%2BEFF8cHKhtWa4KvG4hiPRX%2BsYTjD6GnU%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BKWWfO7UJQhmQXfJHoERSh2Ha4XnWrMJRMdMhB3xIw%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzozEazXjgKfDpsvvu4jW%2BuYT9vowDYVnYBKrsyO0uo%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzpr4DLdeOxM9QJ9%2BC56UADVWTZ0IFwoZUwSrQngg1o%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzw%2Bk3PFtOtuOIu62cuXNkLNLUFDlgmXZj2XeMTwl2g%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzw9O6yOUvVbxgrvUGCnmjjacARx%2FbQ%2F12aZDz27TbE%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# view_matrix = reduce_mem_usage(view_matrix)

In [13]:
view_matrix 

Unnamed: 0_level_0,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht,ht
pr1id,41488,41490,41560,41562,153024,153049,153055,154238,154247,154251,...,5073721,5073726,5074265,5074360,5074457,5074545,5074580,5074637,5074706,5074753
uid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
%2B%2B8NpYVoqyjmc8mROiPMlX%2F2J8JlOghaysSF3qbYCVE%3D,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2B8lPULC%2BdY%2F3MWf4kbymzr1cx%2B3Eaktr2fvQHuFH78%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BB3LdtIYEk9XGz9jG%2F5n7C740nRzI02Ig7uHKfNT90%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BH2qr3ZTZV%2BEFF8cHKhtWa4KvG4hiPRX%2BsYTjD6GnU%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%2B%2BKWWfO7UJQhmQXfJHoERSh2Ha4XnWrMJRMdMhB3xIw%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzozEazXjgKfDpsvvu4jW%2BuYT9vowDYVnYBKrsyO0uo%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzpr4DLdeOxM9QJ9%2BC56UADVWTZ0IFwoZUwSrQngg1o%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzw%2Bk3PFtOtuOIu62cuXNkLNLUFDlgmXZj2XeMTwl2g%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzw9O6yOUvVbxgrvUGCnmjjacARx%2FbQ%2F12aZDz27TbE%3D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
index = view_matrix.columns
view_matrix.columns = [i[1] for i in index]
# view_matrix.columns

In [15]:
%%time

for i in list(view_matrix.columns[:50]):
    target_item = view_matrix[i]

    similarity_with_other_items = view_matrix.corrwith(target_item)  # find correlation between "Bad Boys (1995)" and other movies
    similarity_with_other_items = similarity_with_other_items.sort_values(ascending=False)
    print('-----------------------------------------')
    print("target item", i)
    if similarity_with_other_items.iloc[1] > 0.5 and similarity_with_other_items.iloc[1] < 0.99:
        print("************************************")
        print(similarity_with_other_items.index[1])
        print(similarity_with_other_items.iloc[1])
    else:
        print(similarity_with_other_items.iloc[1])

-----------------------------------------
target item 41488
0.3422347218896061
-----------------------------------------
target item 41490
0.16740014806383152
-----------------------------------------
target item 41560
0.3219090969848256
-----------------------------------------
target item 41562
0.3927356678898813
-----------------------------------------
target item 153024
0.31025130988262395
-----------------------------------------
target item 153049
0.3588258857310374
-----------------------------------------
target item 153055
0.41487272696197347
-----------------------------------------
target item 154238
0.3729996122633735
-----------------------------------------
target item 154247
0.3333341005230546
-----------------------------------------
target item 154251
0.39586309842021344
-----------------------------------------
target item 154257
0.4567715900749531
-----------------------------------------
target item 154264
0.38734444508751276
---------------------------------------

In [16]:
sub_1 = View[View['ht'] < list_1[-1]]
sub_1 = sub_1[sub_1['ht']  > list_[-1]]
sub_1 = sub_1.drop_duplicates(subset=None, keep='first', inplace=False)
sub_1

Unnamed: 0,uid,ht,pr1id
1203348,4z1Kyq8%2BklAM%2Fi282DKUKY2tOOSQdInhxvXjMUN7La...,2019-01-02,4915333
1203349,4z1Kyq8%2BklAM%2Fi282DKUKY2tOOSQdInhxvXjMUN7La...,2019-01-02,5074360
1203350,4z1Kyq8%2BklAM%2Fi282DKUKY2tOOSQdInhxvXjMUN7La...,2019-01-02,5036875
1203351,ASGZI2tyXaEYxtwVGAwoCzFafmdayPnzgLCHk7cfLCA%3D,2019-01-02,3859436
1203392,rYh169xg9FodCaPVRVT0BEV3iw5%2FqwLhtqPDQwVXxNo%3D,2019-01-02,5074753
...,...,...,...
2632884,tpgMPJ54Q6GlgkAU9UoJ8jnpy8v4qFwnIFn3Ub%2Fvqg4%3D,2019-06-29,4808019
2632885,tpgMPJ54Q6GlgkAU9UoJ8jnpy8v4qFwnIFn3Ub%2Fvqg4%3D,2019-06-29,5288446
2632886,gvZQYnA8mAwLg8J7DfEwGKf0mKhtTQ%2BSt0s%2FhOmVCc...,2019-06-29,4956160
2632890,A3X1eK35d1yKxlMwowNJJT70Lun1J2J%2BF7HkU%2FFN9v...,2019-06-29,5340796


In [17]:
target = 437906
pair = 437829
ViewA = sub[sub['pr1id'] == target]
print(ViewA)
ViewB = sub[sub['pr1id'] == pair]
print(ViewB)

                                                       uid          ht   pr1id
1813     O%2FzMUURzDFsC%2FCPgdLpHkskJffEzDimtMEpSnccgbr...  2018-06-27  437906
5221        aGRpTGFDy9MeMGCDniw5r3JNbP3Q82DAbl0AUrOsQuI%3D  2018-06-27  437906
5421     oOcAtFZ0lUeKd%2FsITwdgPdVpSXzV5K85pZ6BpHz7F%2B...  2018-06-27  437906
8781     rukQI4kz8ATq91NprRp%2B%2Bq9OIarL0D9oG4xH1Skdx2...  2018-06-27  437906
11372    F6ebxTEfVUrC82qRg%2FVTR6hVeNmV2CGv9PzblUQ%2FcA...  2018-06-28  437906
...                                                    ...         ...     ...
1161000  tP2zzMhluZa9NirV%2F%2BKi85fA4lrOxvljA27O3Ha6CF...  2018-12-25  437906
1173404  Ytf3xmfaSEbE6fYeuy3sn%2F74zVGotL%2FB01E6nY0q7R...  2018-12-27  437906
1178405  Z0lzBCQEpFznGzMfVJgW%2BX5TpHvSOTHBZ%2FP%2By%2F...  2018-12-28  437906
1198223     HedMobiJkQMabD8hFChdlLzEx95LpXZ0chxynvPyYpI%3D  2018-12-31  437906
1203270   3lDBTCGINXptUcPxoWbV8l6MhxF0zHREFo%2Fx7MsJ7vE%3D  2018-12-31  437906

[636 rows x 3 columns]
                            

In [19]:
count = 0
for i in ViewA['uid']:
    if i in list(ViewB['uid']):
        count+=1
print(ViewA.shape[0])
print(ViewB.shape[0])
print(count)
print(count/ViewA.shape[0])

636
455
253
0.3977987421383648
