Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### Calculate follower, following count and write to parquet
#### using dataframe groupby operations 

In [1]:
import os, time
# Restrict CUDA-aware libraries loaded after this point to device 0.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Wall-clock start. NOTE(review): `start` is not read again in the visible cells.
start = time.time()

In [2]:

from datetime import datetime 
# Start of the run; the last cell subtracts this from time_end to report total time.
time_begin = datetime.now()

In [3]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Let notebook output show up to 500 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#import cudf, cupy
#cudf.__version__

In [4]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Return the area under the precision-recall curve of ``pred`` against ``gt``.

    Parameters
    ----------
    gt : labels passed to ``precision_recall_curve`` as the ground truth.
    pred : array of scores. When ``nafill`` is true, its NaN entries are
        overwritten IN PLACE with the mean of the non-NaN entries, so the
        caller's array is modified.
    nafill : whether to apply the NaN replacement described above.

    Returns
    -------
    The value of ``auc(recall, precision)``.
    """
    if nafill:
        nan_mask = np.isnan(pred)
        pred[nan_mask] = np.nanmean(pred)
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

@jit
def fast_auc(y_true, y_prob):
    """Compute a ranking AUC with a single pass over the labels sorted by score.

    The labels are reordered by ascending ``y_prob``. For each label the
    running sum of ``1 - label`` seen so far is weighted by the label and
    accumulated; the total is divided by
    ``negatives * (n - negatives)``.

    Notes
    -----
    * Assumes labels are 0/1 -- TODO confirm against callers.
    * Equal scores are not treated specially; their relative order is
      whatever ``np.argsort`` returns.
    * The denominator is zero when every label is the same.
    """
    labels = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels)
    negatives_seen = 0
    pair_sum = 0
    for pos in range(total):
        label = labels[pos]
        negatives_seen += (1 - label)
        pair_sum += label * negatives_seen
    return pair_sum / (negatives_seen * (total - negatives_seen))

@njit
def numba_log_loss(y,x):
    """Return the mean negative log-likelihood of labels ``y`` under scores ``x``.

    A label ``<= 0`` contributes ``log(1 - x[i] + 1e-15)``; any other label
    contributes ``log(x[i] + 1e-15)``. The small constant keeps the argument
    of the logarithm away from zero for scores of exactly 0 or 1.

    The loop uses ``prange`` but the decorator is plain ``@njit`` (no
    ``parallel=True`` is passed here).
    """
    count = x.shape[0]
    eps = 1e-15
    total = 0.
    for k in prange(count):
        # Keep the `<= 0` test (rather than `> 0`) so a NaN label takes the same branch as before.
        likelihood = 1 - x[k] if y[k] <= 0. else x[k]
        total += np.log(likelihood + eps)
    return -total / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Return the relative cross-entropy of ``pred`` versus a constant baseline, in percent.

    The baseline predicts the fraction of positive labels (``gt > 0``) for
    every row. The result is ``(1 - model_ce / baseline_ce) * 100``.

    Parameters
    ----------
    gt : array of labels; values ``> 0`` count as positive.
    pred : array of scores. When ``nafill`` is true, its NaN entries are
        overwritten IN PLACE with the mean of the non-NaN entries.
    nafill : whether to apply the NaN replacement described above.
    verbose : when truthy, print both cross-entropies, their ratio, and the
        mean prediction next to the positive rate.
    """
    if nafill:
        missing = np.isnan(pred)
        pred[missing] = np.nanmean(pred)

    model_ce = numba_log_loss(gt, pred)

    # Cross-entropy of always predicting the observed positive rate.
    positive_rate = np.mean(gt > 0)
    baseline_ce = -(
        positive_rate * np.log(positive_rate)
        + (1 - positive_rate) * np.log(1 - positive_rate)
    )

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(model_ce, baseline_ce, model_ce/baseline_ce))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , positive_rate  ) )

    return (1.0 - model_ce/baseline_ce)*100.0


In [5]:
def save_memory( df ):
    """Recast selected column dtypes of ``df`` in place.

    Conversions applied (all other dtypes are left untouched):

    * ``uint8``   -> ``int8``
    * ``bool``    -> ``int8``
    * ``uint32``  -> ``int32``
    * ``int64``   -> ``int32``
    * ``float64`` -> ``float32``

    The casts use ``astype`` with no range check, so values that do not fit
    the target dtype are not detected here.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.

    Returns
    -------
    None
    """
    recast = {
        'uint8': np.int8,
        'bool': np.int8,
        'uint32': np.int32,
        'int64': np.int32,
        'float64': np.float32,
    }
    # Pair every column label with its own dtype. Looking dtypes up with an
    # integer key (`df.dtypes[i]`) is label-based when the column labels are
    # integers, which reads the dtype of a different column, and otherwise
    # relies on a deprecated positional fallback.
    for label, dtype in list(df.dtypes.items()):
        target = recast.get(str(dtype))
        if target is None:
            continue
        df[label] = df[label].astype( target )
        # Release the replaced column right away; frames here can be very large.
        gc.collect()

# Load Train

In [6]:
%%time
# Read one train parquet file and two test parquet files into memory.
train = pd.read_parquet( '/mnt/DP_disk3/Recsys/train-1.parquet' )
test0 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-0.parquet' )
test1 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-1.parquet' )
gc.collect()

# 'tr' records where a row came from: 0 for the train file, 1 for either test file.
train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 1

# Display the three shapes as the cell output.
train.shape, test0.shape, test1.shape

((96600391, 28), (9943301, 28), (9941988, 28))

In [7]:
%%time
# Stack train and both test frames into a single frame; from here on `train`
# holds all rows and the 'tr' column tells them apart.
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
# Drop the now-redundant test frames so their memory can be reclaimed.
del test0,test1
gc.collect()

# Display the combined shape as the cell output.
train.shape

KeyboardInterrupt: 

In [None]:
%%time
# Order all rows by timestamp and replace the index with a fresh 0..n-1 range;
# later cells assign merge results back to `train` and rely on that index.
train = train.sort_values('timestamp').reset_index(drop=True) #TIME ORDER
gc.collect()

In [None]:
%%time
# Binarise the four engagement targets: every positive value becomes 1.
for target in ('reply', 'retweet', 'retweet_comment', 'like'):
    train.loc[ train[target] > 0, target ] = 1

# 'engage' is 1 when at least one of the four targets is positive, 0 otherwise.
engaged = train['reply'] > 0
for target in ('retweet', 'retweet_comment', 'like'):
    engaged = engaged | (train[target] > 0)

train['engage'] = 0
train.loc[ engaged, 'engage' ] = 1
gc.collect()

In [None]:
%%time
#train.head()

In [None]:
# Recast the column dtypes of `train` in place (see save_memory), then collect garbage.
save_memory(train)
gc.collect()

In [None]:
# Display the column dtypes after the recast.
train.dtypes

In [None]:
%%time
# Build `dt`: one long table of (user, follower count, following count,
# timestamp) observations, taken from both the a_* and the b_* columns.
a_side = ['tr','a_user_id','a_follower_count','a_following_count','timestamp','id']
b_side = ['tr','b_user_id','b_follower_count','b_following_count','timestamp','id']

a_rows = train[a_side].copy()
b_rows = train[b_side].copy()
# Give the b_* observations the a_* column names so both halves stack cleanly.
b_rows.columns = a_side
# Rows taken from the b_* columns get id -1; only the a_* half keeps the real
# row id that later cells use to join results back onto `train`.
b_rows['id'] = -1

# Note: the index is not reset, so each index value appears twice in `dt`.
dt = pd.concat( [a_rows, b_rows], sort=False )
del a_rows, b_rows
_ = gc.collect()
dt.head()

In [None]:
%%time
# For every (tr, user, follower_count) group: latest and earliest timestamp at
# which that follower count was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('min'); _=gc.collect()

# Span of time over which this follower count was observed.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such span among all follower counts of the same (tr, user).
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# Flag: 1 where the row's follower count has the longest span for its user, 0 otherwise ...
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# ... then overridden with -1 where the span is zero (max and min timestamp coincide).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

In [None]:
%%time
# Attach the follower-count span flag to each train row by joining on 'id'.
# Only the join key and the flag column take part in the merge: joining the
# full frames would copy every column of `train` (and suffix the ones that
# also exist in `dt`) just to read a single column back. The resulting column
# is the same.
train['a_user_fer_count_delta_time'] = train[['id']].merge( dt[['id','a_fc_dif_flag']], on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

# The flag now lives on `train`; remove it from `dt` so the name can be reused.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fer_count_delta_time'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fer_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# For every (tr, user, following_count) group: latest and earliest timestamp at
# which that following count was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

# Span of time over which this following count was observed.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such span among all following counts of the same (tr, user).
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# Flag: 1 where the row's following count has the longest span for its user, 0 otherwise ...
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# ... then overridden with -1 where the span is zero (max and min timestamp coincide).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

In [None]:
%%time
# Attach the following-count span flag to each train row by joining on 'id'.
# Only the join key and the flag column take part in the merge: joining the
# full frames would copy every column of `train` (and suffix the ones that
# also exist in `dt`) just to read a single column back. The resulting column
# is the same.
train['a_user_fing_count_delta_time'] = train[['id']].merge( dt[['id','a_fc_dif_flag']], on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

# The flag now lives on `train`; remove it from `dt` so the name can be reused.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fing_count_delta_time'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fing_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# For every (tr, user, follower_count, following_count) group: latest and
# earliest timestamp at which that pair of counts was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

# Span of time over which this pair of counts was observed.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such span among all count pairs of the same (tr, user).
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# Flag: 1 where the row's count pair has the longest span for its user, 0 otherwise ...
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# ... then overridden with -1 where the span is zero (max and min timestamp coincide).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

In [None]:
%%time
# Attach the (follower, following) span flag to each train row by joining on
# 'id'. Only the join key and the flag column take part in the merge: joining
# the full frames would copy every column of `train` (and suffix the ones that
# also exist in `dt`) just to read a single column back. The resulting column
# is the same.
train['a_user_fering_count_delta_time'] = train[['id']].merge( dt[['id','a_fc_dif_flag']], on='id', how='left' )['a_fc_dif_flag'] ; _=gc.collect()

# The flag now lives on `train`; remove it from `dt` so the name can be reused.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fering_count_delta_time'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fering_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# Number of dt rows (non-null 'id') sharing the same (tr, user, following_count).
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_following_count'])['id'].transform('count'); _=gc.collect()
# Largest such row count among all following counts of the same (tr, user).
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# Flag: 1 where the row's following count is (one of) the most frequent for its user, 0 otherwise ...
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# ... then overridden with -1 where that following count occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

In [None]:
%%time
# Attach the following-count frequency flag to each train row by joining on
# 'id'. Only the join key and the flag column take part in the merge: joining
# the full frames would copy every column of `train` (and suffix the ones that
# also exist in `dt`) just to read a single column back. The resulting column
# is the same.
train['a_user_fing_count_mode'] = train[['id']].merge( dt[['id','a_fc_count_flag']], on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
# The flag now lives on `train`; remove it from `dt` so the name can be reused.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fing_count_mode'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fing_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# Number of dt rows (non-null 'id') sharing the same (tr, user, follower_count).
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count'])['id'].transform('count'); _=gc.collect()
# Largest such row count among all follower counts of the same (tr, user).
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# Flag: 1 where the row's follower count is (one of) the most frequent for its user, 0 otherwise ...
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# ... then overridden with -1 where that follower count occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

In [None]:
%%time
# Attach the follower-count frequency flag to each train row by joining on
# 'id'. Only the join key and the flag column take part in the merge: joining
# the full frames would copy every column of `train` (and suffix the ones that
# also exist in `dt`) just to read a single column back. The resulting column
# is the same.
train['a_user_fer_count_mode'] = train[['id']].merge( dt[['id','a_fc_count_flag']], on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
# The flag now lives on `train`; remove it from `dt` so the name can be reused.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fer_count_mode'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fer_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# Number of dt rows (non-null 'id') sharing the same
# (tr, user, follower_count, following_count).
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['id'].transform('count'); _=gc.collect()
# Largest such row count among all count pairs of the same (tr, user).
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# Flag: 1 where the row's count pair is (one of) the most frequent for its user, 0 otherwise ...
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# ... then overridden with -1 where that count pair occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the temporaries; the last expression displays the flag distribution.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

In [None]:
%%time
# Attach the (follower, following) frequency flag to each train row by joining
# on 'id'. Only the join key and the flag column take part in the merge:
# joining the full frames would copy every column of `train` (and suffix the
# ones that also exist in `dt`) just to read a single column back. The
# resulting column is the same.
train['a_user_fering_count_mode'] = train[['id']].merge( dt[['id','a_fc_count_flag']], on='id', how='left' )['a_fc_count_flag'] ; _=gc.collect()
# The flag now lives on `train`; remove it from `dt`.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fering_count_mode'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per (tr, flag value); nothing is stored.
train.groupby(['tr','a_user_fering_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
# All six flags have been copied onto `train`; release the long helper table.
del dt; _=gc.collect()

In [None]:
%%time
# Pack the six flags into one number, then relabel the distinct results with
# consecutive integer codes. Each flag is set to -1, 0 or 1 by the cells that
# build it, so `1 + flag` acts as a base-3 digit and the column at position k
# in this list is weighted by 3**k.
flag_columns = [
    'a_user_fer_count_delta_time',
    'a_user_fing_count_delta_time',
    'a_user_fering_count_delta_time',
    'a_user_fing_count_mode',
    'a_user_fer_count_mode',
    'a_user_fering_count_mode',
]
packed_flags = sum( (1+train[name])*3**power for power, name in enumerate(flag_columns) )
train['a_count_combined'] = pd.factorize( packed_flags )[0]
train['a_count_combined'].value_counts()

In [None]:
%%time
# Display the mean of each engagement target per combined code, using only
# rows with tr == 0 (rows that came from the train file); nothing is stored.
train.loc[ train.tr==0 ].groupby('a_count_combined')[['reply','retweet','retweet_comment','like']].agg(['mean'])

In [None]:
%%time
# Display the first rows of `train`, now including the new feature columns.
train.head()

In [None]:
%%time
# Export the row id together with the combined code and the six individual
# flags, ordered by id with a fresh 0..n-1 index.
export_columns = [
    'id',
    'a_count_combined',
    'a_user_fer_count_delta_time',
    'a_user_fing_count_delta_time',
    'a_user_fering_count_delta_time',
    'a_user_fing_count_mode',
    'a_user_fer_count_mode',
    'a_user_fering_count_mode',
]
dt = train[export_columns].sort_values( 'id' ).reset_index( drop=True )
# Recast dtypes in place before writing (see save_memory).
save_memory( dt )
dt.to_parquet( '/mnt/DP_disk3/Recsys/a_count_combined-final.parquet' )
dt.tail(50)

In [None]:
# End of the run; time_begin was taken in the second cell of the notebook.
time_end = datetime.now()

# Report the elapsed wall-clock time as a timedelta.
print("5th notebook total process time", time_end - time_begin)