In [1]:
import pandas as pd
import datetime
import os
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

In [2]:
dfm = pd.read_csv('data/dataset15.csv', sep='\t')
dfm['date'] = dfm.Time.map(lambda x:datetime.date.fromtimestamp(x))
print(len(dfm))
dfm.head(3)

45881450


Unnamed: 0,UserId,ItemId,SessionId,Time,date
0,1,181459,0,1444342000.0,2015-10-09
1,1,779078,0,1444342000.0,2015-10-09
2,1,779078,0,1444342000.0,2015-10-09


In [3]:
tar_dfm = dfm[['UserId', 'ItemId']]
tar_dfm.head(2)

Unnamed: 0,UserId,ItemId
0,1,181459
1,1,779078


In [4]:
User = sorted(tar_dfm.UserId.unique())
print(len(User), User[:3], User[-3:])

Item = sorted(tar_dfm.ItemId.unique())
print(len(Item), Item[:3], Item[-3:])

422282 [1, 2, 3] [424168, 424169, 424170]
624221 [2, 3, 4] [1113162, 1113163, 1113166]


In [5]:
User2No = dict([(User[i], i) for i in range(len(User))])
Item2No = dict([(Item[i], i) for i in range(len(Item))])

In [6]:
m = lil_matrix((len(User), len(Item)))

In [7]:
for row in tar_dfm.itertuples():
    m[User2No[row.UserId], Item2No[row.ItemId]] = 1

In [8]:
M = m.tocsr()

In [22]:
N_COMPONENTS = 64
# NMF_INIT = 'nndsvd'
NMF_INIT = 'random'
NMF_SOLVER = 'cd'  # Coordinate Descent solver.
NMF_TOL = 0.0001
NMF_MAX_ITER = 200
NMF_RANDOM_STATE = 0
NMF_ALPHA = 0.0
NMF_L1_RATIO = 0.0
NMF_SHUFFLE = True

In [23]:
model = NMF(n_components=N_COMPONENTS,
            init=NMF_INIT,
            solver=NMF_SOLVER,
            tol=NMF_TOL,
            max_iter=NMF_MAX_ITER,
            random_state=NMF_RANDOM_STATE,
            alpha=NMF_ALPHA,
            l1_ratio=NMF_L1_RATIO,
            shuffle=False)

In [24]:
W = model.fit_transform(M)

In [25]:
H = model.components_

In [26]:
W.shape

(422282, 64)

In [27]:
U2Cls = {}
for i, row in enumerate(W):
    # if i>1000:break
    #print(row, row.argmax(), row[row.argmax()])
    U2Cls[User[i]] = row.argmax()

In [28]:
Rows = []
for user, c in U2Cls.items():
    Rows.append((user,c))
u2cls_dfm = pd.DataFrame(Rows)    
u2cls_dfm.columns = ['user_id', 'class']

In [29]:
class2count = pd.DataFrame(u2cls_dfm.groupby('class').count()).reset_index()
class2count.columns = ['class', 'count']
class2count

Unnamed: 0,class,count
0,0,24440
1,1,46576
2,2,52569
3,3,10906
4,4,12013
...,...,...
59,59,4917
60,60,6908
61,61,2522
62,62,7206


In [30]:
Cls2Users = {}
for user, cls in U2Cls.items():
    Cls2Users.setdefault(cls, [])
    Cls2Users[cls].append(user)
len(Cls2Users)    

64

In [31]:
Cls2Dfm = {}
for cls, Users in Cls2Users.items():
    Cls2Dfm[cls] = dfm[dfm.UserId.isin(Cls2Users[cls])]

In [32]:
for cls, c_dfm in Cls2Dfm.items():
    print(cls, len(c_dfm))

38 767163
29 370983
2 7292080
54 520708
1 5251637
63 1445111
26 933992
47 625311
14 463547
11 220140
16 819467
42 287214
24 309607
28 370411
0 5533463
36 147535
43 430034
13 276999
52 283115
23 199463
6 316397
19 650809
56 546363
4 1198590
25 434117
5 718972
40 354396
60 595843
55 361202
58 567712
53 448579
18 134901
3 1606166
21 179199
9 251231
44 518148
37 210777
57 519726
33 256374
49 459246
8 660495
35 311471
41 381192
62 612430
31 230748
27 451523
7 374488
48 333880
39 494104
30 285617
61 391938
45 357302
50 605457
51 588052
32 260672
22 525036
59 517961
15 184284
17 184679
10 240672
20 69950
46 440860
34 334382
12 167529


In [33]:
start_date = datetime.date(2015,5,20)
end_date = datetime.date(2015,11,12)

target_start_date = start_date + datetime.timedelta(28)

AllRows = []
for cls, c_dfm in Cls2Dfm.items():
    print('---', cls, '---')
    RF2N = {}
    RF2H = {}
    for i in range((end_date - target_start_date).days + 1):
        target_date = target_start_date + datetime.timedelta(i)
        train_start_date = target_date - datetime.timedelta(28) 
        train_end_date = target_date - datetime.timedelta(1) 
        #if i>1:break
        if i>146:break
        print(i, train_start_date.strftime("%F"), '~', train_end_date.strftime("%F"), ':', target_date.strftime("%F"), flush=True)  
        x_dfm = c_dfm[(train_start_date <= c_dfm.date) & (c_dfm.date <= train_end_date)].copy()
        y_dfm = c_dfm[c_dfm.date == target_date].copy()
        x_record_num = len(x_dfm)
        y_record_num = len(y_dfm)
        #print(x_record_num, y_record_num)
        x_dfm['target_date'] = target_date
        x_dfm['day_rcen'] = (x_dfm.target_date - x_dfm.date).map(lambda x:x.days)

        U2Is = {}
        for row in y_dfm.itertuples():
            U2Is.setdefault(row.UserId, set())
            U2Is[row.UserId].add(row.ItemId)

        U2I2Info = {}
        for row in x_dfm.itertuples():
            U2I2Info.setdefault(row.UserId, {})
            U2I2Info[row.UserId].setdefault(row.ItemId, [])
            U2I2Info[row.UserId][row.ItemId].append(row.day_rcen)

        for UserId, I2Info in U2I2Info.items():
            HitSet = set(U2Is[UserId]) if UserId in U2Is else set()
            for ItemId, Info in I2Info.items():
                freq = len(Info)
                rcen = min(Info)
                RF2N.setdefault((rcen,freq), 0)
                RF2H.setdefault((rcen,freq), 0)
                RF2N[rcen,freq] += 1
                if ItemId in HitSet:
                    RF2H[rcen,freq] += 1
        N = 0
        for rf, n in RF2N.items():
            N += n
        #print('data:', N)
    RF2P = {}
    for rf, n in RF2N.items():
        RF2P[rf] = RF2H[rf] / n                        

    RFRows = []
    for rf, n in RF2N.items():
        h = RF2H[rf]
        p = RF2P[rf]
        RFRows.append((rf[0], rf[1], h, n , p))
        AllRows.append((cls, rf[0], rf[1], h, n , p))
        
    rf_dfm = pd.DataFrame(RFRows)
    rf_dfm.columns = ['rcen', 'freq', 'hit', 'N', 'prob']
    rf_dfm = rf_dfm.sort_values(['rcen', 'freq'])
    rf_dfm = rf_dfm[rf_dfm.freq <= 40]
    fname = 'output_cls/rf_cls%02d.csv' % cls
    rf_dfm.to_csv(fname, index=False)
all_dfm = pd.DataFrame(AllRows)
all_dfm.columns = ['cls', 'rcen', 'freq', 'hit', 'N', 'prob']
all_dfm = all_dfm.sort_values(['cls', 'rcen', 'freq'])
all_dfm = all_dfm[all_dfm.freq <= 40]
fname = 'output_cls/cls_rf.csv'
all_dfm.to_csv(fname, index=False)

--- 38 ---
0 2015-05-20 ~ 2015-06-16 : 2015-06-17
1 2015-05-21 ~ 2015-06-17 : 2015-06-18
2 2015-05-22 ~ 2015-06-18 : 2015-06-19
3 2015-05-23 ~ 2015-06-19 : 2015-06-20
4 2015-05-24 ~ 2015-06-20 : 2015-06-21
5 2015-05-25 ~ 2015-06-21 : 2015-06-22
6 2015-05-26 ~ 2015-06-22 : 2015-06-23
7 2015-05-27 ~ 2015-06-23 : 2015-06-24
8 2015-05-28 ~ 2015-06-24 : 2015-06-25
9 2015-05-29 ~ 2015-06-25 : 2015-06-26
10 2015-05-30 ~ 2015-06-26 : 2015-06-27
11 2015-05-31 ~ 2015-06-27 : 2015-06-28
12 2015-06-01 ~ 2015-06-28 : 2015-06-29
13 2015-06-02 ~ 2015-06-29 : 2015-06-30
14 2015-06-03 ~ 2015-06-30 : 2015-07-01
15 2015-06-04 ~ 2015-07-01 : 2015-07-02
16 2015-06-05 ~ 2015-07-02 : 2015-07-03
17 2015-06-06 ~ 2015-07-03 : 2015-07-04
18 2015-06-07 ~ 2015-07-04 : 2015-07-05
19 2015-06-08 ~ 2015-07-05 : 2015-07-06
20 2015-06-09 ~ 2015-07-06 : 2015-07-07
21 2015-06-10 ~ 2015-07-07 : 2015-07-08
22 2015-06-11 ~ 2015-07-08 : 2015-07-09
23 2015-06-12 ~ 2015-07-09 : 2015-07-10
24 2015-06-13 ~ 2015-07-10 : 2015-07-11