Copyright (c) 2020, NVIDIA CORPORATION.

Modifications copyright Intel. 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### Calculate follower and following count features and write them to Parquet
#### using dataframe groupby operations 

In [3]:
import os
import time

# Expose only GPU 0 through CUDA_VISIBLE_DEVICES (the cudf/cupy import further down is commented out).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})
# Wall-clock reference in seconds since the epoch, taken when the notebook starts.
start = time.time()

In [4]:

from datetime import datetime 
# Start of the end-to-end timer; the final cell prints datetime.now() - time_begin.
time_begin = datetime.now()

In [5]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Allow up to 500 columns and 500 rows in rendered frames / value_counts() listings.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# cudf/cupy imports are disabled; the cells below go through pandas (pd) and numpy (np).
#import cudf, cupy
#cudf.__version__

In [6]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Area under the precision-recall curve of ``pred`` against ``gt``.

    Parameters
    ----------
    gt : array-like
        Ground-truth labels, forwarded to ``precision_recall_curve``.
    pred : array-like
        Predicted scores.  The caller's object is never modified.
    nafill : bool, default True
        When true, NaN scores are replaced by the mean of the non-NaN scores
        before the curve is computed.

    Returns
    -------
    float
        ``auc(recall, precision)`` as computed by scikit-learn.
    """
    if nafill:
        # np.array() copies, so the fill below cannot leak into the caller's array
        pred = np.array(pred)
        pred[np.isnan(pred)] = np.nanmean(pred)
    prec, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, prec)

@jit
def fast_auc(y_true, y_prob):
    """ROC AUC by pair counting over the labels sorted by predicted score.

    Walks the labels in ascending ``y_prob`` order; every label adds its value
    times the running total of ``1 - label`` seen so far, and the sum is
    normalised by ``negatives * (n - negatives)``.
    Assumes 0/1 labels -- TODO confirm with callers.  Tied scores get no
    special treatment (they are taken in ``np.argsort`` order).
    """
    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    total = len(labels)
    negatives = 0
    pair_sum = 0
    for idx in range(total):
        label = labels[idx]
        negatives += (1 - label)
        pair_sum += label * negatives
    pair_sum /= (negatives * (total - negatives))
    return pair_sum

@njit
def numba_log_loss(y,x):
    """Mean negative log-likelihood of predictions ``x`` for labels ``y``.

    A label ``<= 0`` is scored with ``log(1 - x)``, any other label with
    ``log(x)``; 1e-15 is added inside the logarithm so that a prediction of
    exactly 0 or 1 does not produce ``log(0)``.
    """
    count = x.shape[0]
    total = 0.
    for k in prange(count):
        if y[k]<=0.:
            likelihood = 1-x[k]
        else:
            likelihood = x[k]
        total += np.log(likelihood + 1e-15)
    return -total / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Relative cross-entropy of ``pred`` against ``gt``, in percent.

    The log loss of ``pred`` (via ``numba_log_loss``) is compared with the
    entropy of the positive rate ``mean(gt > 0)``, i.e. the log loss of always
    predicting that rate.  The result is ``(1 - loss / baseline) * 100``.

    Parameters
    ----------
    gt : array-like
        Ground-truth labels; values ``> 0`` count as positive.
    pred : array-like
        Predicted probabilities.  The caller's object is never modified.
    nafill : bool, default True
        When true, NaN predictions are replaced by the mean of the non-NaN
        predictions before scoring.
    verbose : int or bool, default 0
        When truthy, print the two losses, their ratio and the two means.

    Returns
    -------
    float
        NaN when ``gt`` is all-positive or all-negative, because the baseline
        entropy then evaluates ``0 * log(0)``.
    """
    if nafill:
        # np.array() copies, so the fill below cannot leak into the caller's array
        pred = np.array(pred)
        pred[ np.isnan(pred) ] = np.nanmean( pred )

    cross_entropy = numba_log_loss( gt, pred  )

    yt = np.mean(gt>0)
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0


In [7]:
def save_memory( df ):
    """Downcast the columns of ``df`` in place to narrower dtypes.

    Conversions: uint8 -> int8, bool -> int8, uint32 -> int32, int64 -> int32,
    float64 -> float32.  Columns of any other dtype are left untouched.

    An integer column is only downcast when all of its values fit into the
    target type; otherwise it keeps its current dtype instead of wrapping
    around silently.  float64 -> float32 is always applied and loses precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.

    Returns
    -------
    None
    """
    # source dtype name -> narrower target dtype
    narrower = {
        'uint8': np.int8,
        'bool': np.int8,
        'uint32': np.int32,
        'int64': np.int32,
        'float64': np.float32,
    }
    # integer sources whose value range can exceed the range of their target
    range_checked = ('uint8', 'uint32', 'int64')

    # .items() pairs every column label with its dtype, which avoids the
    # positional lookup df.dtypes[i] on a label-indexed Series
    for name, dtype in df.dtypes.items():
        source = str(dtype)
        target = narrower.get(source)
        if target is None:
            continue
        column = df[name]
        if source in range_checked and len(column):
            limits = np.iinfo(target)
            # plain Python ints compare safely whatever the numpy scalar type is
            if int(column.min()) < limits.min or int(column.max()) > limits.max:
                # the cast would overflow; keep the wider dtype
                continue
        df[name] = column.astype( target )
        gc.collect()

# Load Train

In [8]:
%%time
train = pd.read_parquet( '/mnt/DP_disk3/Recsys/train-1.parquet' )
test0 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-0.parquet' )
test1 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-1.parquet' )
gc.collect()

train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 1

train.shape, test0.shape, test1.shape

CPU times: user 1min 3s, sys: 1min 8s, total: 2min 11s
Wall time: 7.16 s


((96600391, 28), (9943301, 28), (9941988, 28))

In [9]:
%%time
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
del test0,test1
gc.collect()

train.shape

CPU times: user 3min 1s, sys: 3min 21s, total: 6min 23s
Wall time: 23 s


(116485680, 28)

In [10]:
%%time
train = train.sort_values('timestamp').reset_index(drop=True) #TIME ORDER
gc.collect()

CPU times: user 4min 5s, sys: 7min 35s, total: 11min 41s
Wall time: 1min 58s


20

In [11]:
%%time
train.loc[ train.reply>0, 'reply' ] = 1
train.loc[ train.retweet>0, 'retweet' ] = 1
train.loc[ train.retweet_comment>0, 'retweet_comment' ] = 1
train.loc[ train.like>0, 'like' ] = 1

train['engage'] = 0
train.loc[ (train.reply>0)|(train.retweet>0)|(train.retweet_comment>0)|(train.like>0)  , 'engage'] = 1
gc.collect()

CPU times: user 2min 27s, sys: 2min 18s, total: 4min 45s
Wall time: 12.6 s


0

In [12]:
%%time
#train.head()

CPU times: user 3 µs, sys: 4 µs, total: 7 µs
Wall time: 15.5 µs


In [13]:
# Downcast train's columns in place with save_memory, then run a garbage collection pass.
save_memory(train)
gc.collect()

0

In [14]:
# Show the column dtypes after the downcast above.
train.dtypes

hashtags              int32
tweet_id              int32
media                  int8
links                 int32
domains               int32
tweet_type             int8
language               int8
timestamp             int32
a_user_id             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_user_id             int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
len_hashtags          int32
len_domains           int32
len_links             int32
tr                    int32
engage                int32
dtype: object

In [15]:
%%time
dt0 = train[['tr','a_user_id','a_follower_count','a_following_count','timestamp','id']].copy()
dt1 = train[['tr','b_user_id','b_follower_count','b_following_count','timestamp','id']].copy()
dt1.columns = ['tr','a_user_id','a_follower_count','a_following_count','timestamp','id']
dt1['id'] = -1
dt = pd.concat( (dt0,dt1), sort=False )
del dt0,dt1; _=gc.collect()
dt.head()

CPU times: user 1min 42s, sys: 2min 17s, total: 4min
Wall time: 13.1 s


Unnamed: 0,tr,a_user_id,a_follower_count,a_following_count,timestamp,id
0,0,270842,38676,0,1580947200,14176027
1,0,20986,682185,690,1580947200,61249597
2,0,3038,2982406,2411,1580947200,66929047
3,0,55201,634514,2549,1580947200,5817331
4,0,3698,12029492,14,1580947200,62093480


In [16]:
%%time
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('min'); _=gc.collect()

dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 9min 41s, sys: 6min 26s, total: 16min 7s
Wall time: 5min 13s


 1    154558263
 0     50318574
-1     28094523
Name: a_fc_dif_flag, dtype: int64

In [17]:
%%time
train['a_user_fer_count_delta_time'] = train.merge( dt, on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fer_count_delta_time'].value_counts()

CPU times: user 2min 10s, sys: 1min 45s, total: 3min 55s
Wall time: 2min 37s


 1    65798700
 0    38696965
-1    11990015
Name: a_user_fer_count_delta_time, dtype: int64

In [18]:
%%time
train.groupby(['tr','a_user_fer_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 6.76 s, sys: 8.48 s, total: 15.2 s
Wall time: 15.3 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.019869,0.044525,0.004036,0.208164
0,0,0.000657,0.002185,0.000168,0.007955
0,1,0.038859,0.174026,0.011115,0.708059
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [19]:
%%time
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 9min 38s, sys: 7min 20s, total: 16min 59s
Wall time: 5min 20s


 1    167975190
 0     36979505
-1     28016665
Name: a_fc_dif_flag, dtype: int64

In [20]:
%%time
train['a_user_fing_count_delta_time'] = train.merge( dt, on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fing_count_delta_time'].value_counts()

CPU times: user 2min 31s, sys: 1min 59s, total: 4min 31s
Wall time: 2min 50s


 1    82344700
 0    24481864
-1     9659116
Name: a_user_fing_count_delta_time, dtype: int64

In [21]:
%%time
train.groupby(['tr','a_user_fing_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 16.7 s, sys: 21.3 s, total: 38 s
Wall time: 15.9 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.024465,0.053724,0.004893,0.2531
0,0,0.001242,0.004111,0.000322,0.015526
0,1,0.031213,0.139823,0.00893,0.56879
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [22]:
%%time
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 9min 50s, sys: 6min 28s, total: 16min 19s
Wall time: 5min 54s


 1    142509101
 0     52374661
-1     38087598
Name: a_fc_dif_flag, dtype: int64

In [23]:
%%time
train['a_user_fering_count_delta_time'] = train.merge( dt, on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fering_count_delta_time'].value_counts()

CPU times: user 2min 22s, sys: 1min 56s, total: 4min 18s
Wall time: 2min 54s


 1    64229290
 0    38849075
-1    13407315
Name: a_user_fering_count_delta_time, dtype: int64

In [24]:
%%time
train.groupby(['tr','a_user_fering_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 18 s, sys: 27.1 s, total: 45.1 s
Wall time: 17.2 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.018088,0.040757,0.003686,0.188984
0,0,0.000445,0.00152,0.000109,0.005477
0,1,0.039823,0.178404,0.011398,0.725815
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [25]:
%%time
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_following_count'])['id'].transform('count'); _=gc.collect()
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 5min 21s, sys: 3min 6s, total: 8min 27s
Wall time: 3min 8s


 1    174152185
 0     31870211
-1     26948964
Name: a_fc_count_flag, dtype: int64

In [26]:
%%time
train['a_user_fing_count_mode'] = train.merge( dt, on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fing_count_mode'].value_counts()

CPU times: user 2min 25s, sys: 2min 11s, total: 4min 36s
Wall time: 2min 55s


 1    84891704
 0    23002162
-1     8591814
Name: a_user_fing_count_mode, dtype: int64

In [27]:
%%time
train.groupby(['tr','a_user_fing_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 18.4 s, sys: 34 s, total: 52.4 s
Wall time: 18.3 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.024937,0.051945,0.004886,0.231284
0,0,0.002899,0.009612,0.000906,0.040556
0,1,0.030109,0.134996,0.008567,0.549942
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [28]:
%%time
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count'])['id'].transform('count'); _=gc.collect()
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 5min 31s, sys: 3min 3s, total: 8min 34s
Wall time: 3min 11s


 1    160046081
 0     47030798
-1     25894481
Name: a_fc_count_flag, dtype: int64

In [29]:
%%time
train['a_user_fer_count_mode'] = train.merge( dt, on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fer_count_mode'].value_counts()

CPU times: user 2min 33s, sys: 2min 13s, total: 4min 47s
Wall time: 3min 6s


 1    68129849
 0    38565530
-1     9790301
Name: a_user_fer_count_mode, dtype: int64

In [30]:
%%time
train.groupby(['tr','a_user_fer_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 20.2 s, sys: 35.2 s, total: 55.4 s
Wall time: 17.1 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.022498,0.047876,0.004481,0.211644
0,0,0.00114,0.003956,0.000334,0.016538
0,1,0.03766,0.16837,0.010736,0.68622
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [31]:
%%time
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['id'].transform('count'); _=gc.collect()
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 5min 48s, sys: 3min 2s, total: 8min 50s
Wall time: 3min 34s


 1    148901469
 0     48249278
-1     35820613
Name: a_fc_count_flag, dtype: int64

In [32]:
%%time
train['a_user_fering_count_mode'] = train.merge( dt, on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fering_count_mode'].value_counts()

CPU times: user 2min 27s, sys: 2min 14s, total: 4min 41s
Wall time: 3min 5s


 1    66550719
 0    38794136
-1    11140825
Name: a_user_fering_count_mode, dtype: int64

In [33]:
%%time
train.groupby(['tr','a_user_fering_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 19.5 s, sys: 45 s, total: 1min 4s
Wall time: 17.1 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.020006,0.042961,0.004003,0.188413
0,0,0.001049,0.003635,0.000305,0.015398
0,1,0.038492,0.172264,0.010983,0.702084
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [34]:
# The helper table is no longer needed; drop the name and run a garbage collection pass.
del dt
_ = gc.collect()

In [35]:
%%time
train['a_count_combined'] = pd.factorize(
    (1+train['a_user_fer_count_delta_time'])*3**0 +
    (1+train['a_user_fing_count_delta_time'])*3**1 +
    (1+train['a_user_fering_count_delta_time'])*3**2 +
    (1+train['a_user_fing_count_mode'])*3**3 +
    (1+train['a_user_fer_count_mode'])*3**4 +
    (1+train['a_user_fering_count_mode'])*3**5 
)[0]
train['a_count_combined'].value_counts()

CPU times: user 27.9 s, sys: 13.4 s, total: 41.3 s
Wall time: 13.6 s


5      61241236
2      18623980
0      14443528
15      7564818
7       1910292
12      1575159
3       1081646
6       1001534
1        980180
4        751981
35       681396
38       608990
20       598396
11       578454
18       470587
9        401579
37       393211
13       338716
8        312975
28       255325
19       251898
16       226945
25       197505
48       178957
14       137615
29       129562
21        98735
31        89212
40        84427
33        71302
47        67186
49        62782
56        60543
26        58732
52        55906
10        54016
58        50091
50        40565
74        39913
30        38136
23        36482
51        34683
41        33079
44        32638
39        32311
46        31539
17        31015
53        29332
87        26198
27        24270
83        21122
68        20227
70        16047
60        16003
89        15336
42        14732
76        14702
55        13397
61        12422
45        12354
79        12027
62        10323
64      

In [36]:
%%time
train.loc[ train.tr==0 ].groupby('a_count_combined')[['reply','retweet','retweet_comment','like']].agg(['mean'])

CPU times: user 34.9 s, sys: 59 s, total: 1min 33s
Wall time: 42.7 s


Unnamed: 0_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,mean,mean,mean,mean
a_count_combined,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.000121,0.00044,3.2e-05,0.001588
1,0.004567,0.015067,0.001161,0.05444
2,0.000111,0.000348,2.7e-05,0.001281
3,0.003262,0.013309,0.000867,0.048433
4,0.000383,0.001185,7.6e-05,0.005644
5,0.039985,0.181085,0.011416,0.736225
6,0.037669,0.125738,0.012553,0.542132
7,4.5e-05,0.000156,1.1e-05,0.000611
8,0.000371,0.001082,7.7e-05,0.004442
9,0.03275,0.109221,0.009898,0.488173


In [37]:
%%time
# Preview the first rows of train, including the flag columns added above.
train.head()

CPU times: user 142 µs, sys: 126 µs, total: 268 µs
Wall time: 242 µs


Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,engage,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,a_count_combined
0,0,9120314,9,1071726,22,2,11,1580947200,270842,38676,0,0,1548203841,17464009,23,83,0,1564324477,0,0,0,0,0,14176027,0,1,1,0,0,0,1,0,1,0,0,0
1,0,1830434,5,265433,1872,2,54,1580947200,20986,682185,690,1,1231757540,16198222,119,399,0,1447636077,0,0,0,0,0,61249597,0,1,1,0,0,0,1,0,1,0,0,0
2,27371,1168463,0,0,0,2,54,1580947200,3038,2982406,2411,1,1179400159,22612297,250,1067,0,1488232436,0,0,0,0,0,66929047,2,0,0,0,0,0,0,0,1,1,1,1
3,351628,4238914,5,560619,4220,2,54,1580947200,55201,634514,2549,1,1213725718,14270974,14,352,0,1557016053,0,0,0,0,0,5817331,1,1,1,0,0,0,0,0,0,0,0,2
4,1585,733660,5,113569,429,2,54,1580947200,3698,12029492,14,1,1239741288,16481715,1,124,0,1570715989,0,0,0,0,0,62093480,1,1,1,0,0,0,1,0,1,0,0,0


In [38]:
%%time
dt = train[['id','a_count_combined','a_user_fer_count_delta_time','a_user_fing_count_delta_time','a_user_fering_count_delta_time','a_user_fing_count_mode','a_user_fer_count_mode','a_user_fering_count_mode']]
dt = dt.sort_values( 'id' )
dt = dt.reset_index( drop=True )
save_memory( dt )
dt.to_parquet( '/mnt/DP_disk3/Recsys/a_count_combined-final.parquet' )
dt.tail(50)

CPU times: user 2min 48s, sys: 1min 49s, total: 4min 37s
Wall time: 1min 14s


Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
116485630,116485630,15,-1,-1,-1,-1,-1,-1
116485631,116485631,5,1,1,1,1,1,1
116485632,116485632,0,0,1,0,1,0,0
116485633,116485633,15,-1,-1,-1,-1,-1,-1
116485634,116485634,5,1,1,1,1,1,1
116485635,116485635,5,1,1,1,1,1,1
116485636,116485636,2,0,0,0,0,0,0
116485637,116485637,5,1,1,1,1,1,1
116485638,116485638,15,-1,-1,-1,-1,-1,-1
116485639,116485639,5,1,1,1,1,1,1


In [39]:
# Report the wall-clock time elapsed since time_begin was recorded.
time_end = datetime.now()
elapsed = time_end - time_begin
print("5th notebook total process time", elapsed)

5th notebook total process time 0:51:24.979394
