Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import os, time
os.environ["CUDA_VISIBLE_DEVICES"]="0"
start = time.time()

In [4]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import cudf, cupy, time
cudf.__version__

startNB = time.time()

ModuleNotFoundError: No module named 'cudf'

In [5]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    if nafill:
        pred[ np.isnan(pred) ] = np.nanmean( pred )
    prec, recall, thresh = precision_recall_curve(gt, pred)
    prauc = auc(recall, prec)
    return prauc

@jit
def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc

@njit
def numba_log_loss(y,x):
    n = x.shape[0]
    ll = 0.
    for i in prange(n):
        if y[i]<=0.:
            ll += np.log(1-x[i] + 1e-15 )
        else:
            ll += np.log(x[i] + 1e-15)
    return -ll / n

def compute_rce(gt , pred, nafill=True, verbose=0):
    if nafill:
        pred[ np.isnan(pred) ] = np.nanmean( pred )
        
    cross_entropy = numba_log_loss( gt, pred  )
    
    yt = np.mean(gt>0)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    
    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )
    
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [6]:
def save_memory( df ):
    features = df.columns
    for i in range( df.shape[1] ):
        if df.dtypes[i] == 'uint8':
            df[features[i]] = df[features[i]].astype( np.int8 )
            gc.collect()
        elif df.dtypes[i] == 'bool':
            df[features[i]] = df[features[i]].astype( np.int8 )
            gc.collect()
        elif df.dtypes[i] == 'uint32':
            df[features[i]] = df[features[i]].astype( np.int32 )
            gc.collect()
        elif df.dtypes[i] == 'int64':
            df[features[i]] = df[features[i]].astype( np.int32 )
            gc.collect()
        elif df.dtypes[i] == 'float64':
            df[features[i]] = df[features[i]].astype( np.float32 )
            gc.collect()
    

# Load Train

In [7]:
%%time
train = pd.read_parquet( '/mnt/DP_disk3/Recsys/train-1.parquet' )
test0 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-0.parquet' )
test1 = pd.read_parquet( '/mnt/DP_disk3/Recsys/test-1.parquet' )
gc.collect()

train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 2

train.shape, test0.shape, test1.shape

CPU times: user 1min 31s, sys: 1min 26s, total: 2min 57s
Wall time: 36.1 s


((96600391, 28), (9943301, 28), (9941988, 28))

In [8]:
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
del test0, test1
gc.collect()
train.shape

(116485680, 28)

In [9]:
%%time
train = train.sort_values('id').reset_index(drop=True) 
gc.collect()

CPU times: user 3min 9s, sys: 5min 12s, total: 8min 21s
Wall time: 1min 25s


20

In [10]:
train.loc[ train.reply>0, 'reply' ] = 1
train.loc[ train.retweet>0, 'retweet' ] = 1
train.loc[ train.retweet_comment>0, 'retweet_comment' ] = 1
train.loc[ train.like>0, 'like' ] = 1
gc.collect()

0

In [11]:
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,139,True,1369082795,11153676,2,29,False,1568107028,False,0,0,0,1,0,1,1,1,0
1,,1,5,,,2,9,1581527100,1,43089,2553,True,1244928766,11153677,26,161,False,1513935572,False,0,0,0,0,1,0,0,0,0
2,,2,0,,,2,38,1580979604,2,60258,773,False,1306719127,11153678,4,72,False,1573996260,False,0,0,0,1,2,0,0,0,0
3,,3,9,,,1,38,1580993048,3,119321,220,False,1283447472,11153679,22,251,False,1439637842,False,0,0,0,1,3,0,0,0,0
4,,4,7,,,1,47,1581143484,4,3106126,3740,False,1234718202,11153679,22,251,False,1439637842,False,0,0,0,0,4,0,0,0,0


In [12]:
save_memory(train)
gc.collect()

0

In [13]:
train.dtypes

hashtags              object
tweet_id               int32
media                   int8
links                 object
domains               object
tweet_type              int8
language                int8
timestamp              int32
a_user_id              int32
a_follower_count       int32
a_following_count      int32
a_is_verified           int8
a_account_creation     int32
b_user_id              int32
b_follower_count       int32
b_following_count      int32
b_is_verified           int8
b_account_creation     int32
b_follows_a             int8
reply                  int32
retweet                int32
retweet_comment        int32
like                   int32
id                     int32
len_hashtags           int32
len_domains            int32
len_links              int32
tr                     int32
dtype: object

In [14]:
train['dt_day']  = pd.to_datetime( train['timestamp'] , unit='s' ).dt.day.values.astype( np.int8 )
train['dt_dow']  = pd.to_datetime( train['timestamp'] , unit='s' ).dt.dayofweek.values.astype( np.int8 )
train['dt_hour'] = pd.to_datetime( train['timestamp'] , unit='s' ).dt.hour.values.astype( np.int8 )
_=gc.collect()
train.groupby('dt_day')['id'].agg('count')

dt_day
6     13229321
7     14038276
8     14469921
9     14017137
10    14142151
11    13860437
12    12843148
13     2846965
14     2735253
15     2941958
16     2859895
17     3070378
18     2850502
19     2580338
Name: id, dtype: int64

In [15]:
dt = pd.read_parquet( '/mnt/DP_disk3/Recsys/a_count_combined-final.parquet' )
dt.head()

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,0,5,1,1,1,1,1,1
1,1,5,1,1,1,1,1,1
2,2,5,1,1,1,1,1,1
3,3,5,1,1,1,1,1,1
4,4,7,0,0,0,1,0,0


In [16]:
train['a_count_combined']             = dt['a_count_combined']
train['a_user_fer_count_delta_time']  = dt['a_user_fer_count_delta_time']
train['a_user_fing_count_delta_time'] = dt['a_user_fing_count_delta_time']
train['a_user_fering_count_delta_time']=dt['a_user_fering_count_delta_time']
train['a_user_fing_count_mode']       = dt['a_user_fing_count_mode']
train['a_user_fer_count_mode']        = dt['a_user_fer_count_mode']
train['a_user_fering_count_mode']     = dt['a_user_fering_count_mode']

train.loc[ (train.dt_day==12)|(train.dt_day==18) ,['a_count_combined','a_user_fer_count_delta_time','a_user_fing_count_delta_time','a_user_fering_count_delta_time','a_user_fing_count_mode','a_user_fer_count_mode','a_user_fering_count_mode']] = -9

del dt; _=gc.collect()
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,139,1,1369082795,11153676,2,29,0,1568107028,0,0,0,0,1,0,1,1,1,0,8,5,3,5,1,1,1,1,1,1
1,,1,5,,,2,9,1581527100,1,43089,2553,1,1244928766,11153677,26,161,0,1513935572,0,0,0,0,0,1,0,0,0,0,12,2,17,-9,-9,-9,-9,-9,-9,-9
2,,2,0,,,2,38,1580979604,2,60258,773,0,1306719127,11153678,4,72,0,1573996260,0,0,0,0,1,2,0,0,0,0,6,3,9,5,1,1,1,1,1,1
3,,3,9,,,1,38,1580993048,3,119321,220,0,1283447472,11153679,22,251,0,1439637842,0,0,0,0,1,3,0,0,0,0,6,3,12,5,1,1,1,1,1,1
4,,4,7,,,1,47,1581143484,4,3106126,3740,0,1234718202,11153679,22,251,0,1439637842,0,0,0,0,0,4,0,0,0,0,8,5,6,7,0,0,0,1,0,0


In [17]:
train.dtypes

hashtags                          object
tweet_id                           int32
media                               int8
links                             object
domains                           object
tweet_type                          int8
language                            int8
timestamp                          int32
a_user_id                          int32
a_follower_count                   int32
a_following_count                  int32
a_is_verified                       int8
a_account_creation                 int32
b_user_id                          int32
b_follower_count                   int32
b_following_count                  int32
b_is_verified                       int8
b_account_creation                 int32
b_follows_a                         int8
reply                              int32
retweet                            int32
retweet_comment                    int32
like                               int32
id                                 int32
len_hashtags    

In [18]:
dt = pd.read_parquet( '/mnt/DP_disk3/Recsys/text-processings-1.parquet' )
dt.sort_values('id', inplace=True)
dt.head()

Unnamed: 0,id,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,166,25,0,0,0,0,0,0,11,0,0,0
1,1,1,86,10,1,1,1,1,1,1,8,530438,0,530438
2,2,0,128,19,2,2,2,2,2,2,17,0,0,0
3,3,0,90,10,3,3,3,3,3,3,5,0,0,186133
4,4,3,90,9,4,4,4,4,4,4,4,490823,30934,264460


In [19]:
train['count_ats']     = dt['count_ats']
train['count_char']    = dt['count_char']
train['count_words']   = dt['count_words']
train['tw_hash']       = dt['tw_hash']
train['tw_freq_hash']  = dt['tw_freq_hash']
train['tw_first_word'] = dt['tw_first_word']
train['tw_second_word']= dt['tw_second_word']
train['tw_last_word']  = dt['tw_last_word']
train['tw_llast_word'] = dt['tw_llast_word']
train['tw_len']        = dt['tw_len']
train['tw_hash0']      = dt['tw_hash0']
train['tw_hash1']      = dt['tw_hash1']
train['tw_rt_uhash']   = dt['tw_rt_uhash']
del dt
gc.collect()

0

In [20]:
train.tail()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
116485675,7CC582B5E93DAB72F49B1D55DDCBAE5C\t94544EA41F16...,56173234,5,,,2,11,1581719195,1714410,1472,2681,0,1295918678,2110198,717,464,0,1501554925,1,0,0,0,0,116485675,6,0,0,2,14,4,22,5,1,1,1,1,1,1,0,300,95,39968664,35988883,1927464,1301236,751,217,32,0,0,0
116485676,1F6D05BCF068D59C3CDC1F935C6DB93D,56173235,0,,,2,11,1581665518,2496877,4236,4119,0,1524226898,2110198,717,464,0,1501554925,1,0,0,0,0,116485676,1,0,0,2,14,4,7,5,1,1,1,1,1,1,0,161,42,39968665,35988884,6547145,523022,1549,94,15,0,0,0
116485677,,56173236,0,,,2,11,1581602829,7596256,1943,2881,0,1468355247,2110198,717,464,0,1501554925,1,0,0,0,0,116485677,0,0,0,2,13,3,14,15,-1,-1,-1,-1,-1,-1,0,189,52,39968666,35988885,3756137,725675,3727,1566,23,0,0,0
116485678,DE848668D8312219F01B8561959BBC0B\tBB6FBD3AFF96...,56173237,0,,,2,11,1581722172,304940,4884,4428,0,1373131817,2110198,717,464,0,1501554925,1,0,0,0,0,116485678,4,0,0,2,14,4,23,5,1,1,1,1,1,1,0,84,28,39968667,35988886,3050788,743251,2823,6344,16,0,0,0
116485679,,56173238,0,,,2,11,1581813247,174224,309,285,0,1575377853,2110198,717,464,0,1501554925,1,0,0,0,0,116485679,0,0,0,2,16,6,0,5,1,1,1,1,1,1,1,28,3,39968668,31,31,29,31,29,1,2246012,0,2246012


In [21]:
train.groupby('tr')['id'].agg('count')

tr
0    96600391
1     9943301
2     9941988
Name: id, dtype: int64

In [22]:
def MultiTE_gpu( tra, col, tar, L=1, smooth_method=0  ):
    np.random.seed(L)

    cols = col+[tar]
    gf = cudf.from_pandas(tra[cols])
    mn = gf[tar].mean().astype('float32')
    
    predtrain = np.zeros( tra.shape[0] )
    
    for fold in [7,8,9,10,11,12]:
        px = np.where( tra.dt_day <fold )[0]
        py = np.where( tra.dt_day==fold )[0]
        mn = gf[tar].iloc[px].mean().astype('float32')
        if smooth_method==0:
            te = gf.iloc[px].groupby(col)[tar].agg(['mean','count'])
            te['smooth']  = (te['mean']*te['count'])
            te['smooth'] += (mn*L)
            te['smooth'] /= (te['count']+L)
            te = te.drop( ['mean','count'] )
        elif smooth_method==1:
            te = gf.iloc[px].groupby(col)[tar].agg(['sum','count'])
            te['smooth'] = (te['sum']+L) / (te['count']+1)
            te = te.drop( ['sum','count'] )
        gf2 = gf.iloc[py].copy()
        gf2 = gf2.set_index( col )
        gf2['id'] = cupy.arange( gf2.shape[0] )
        gf2 = gf2.join( te, how='left' )
        gf2 = gf2.sort_values( 'id' )
        del te, gf2['id']
        predtrain[py] = gf2.smooth.fillna(-999).to_array()
        del gf2

    px = np.where( tra.dt_day <13 )[0]
    py = np.where( tra.dt_day>=13 )[0]
    mn = gf[tar].iloc[px].mean().astype('float32')
    if smooth_method==0:
        te = gf.iloc[px].groupby(col)[tar].agg(['mean','count'])
        te['smooth']  = (te['mean']*te['count'])
        te['smooth'] += (mn*L)
        te['smooth'] /= (te['count']+L)
        te = te.drop( ['mean','count'] )
    elif smooth_method==1:
        te = gf.iloc[px].groupby(col)[tar].agg(['sum','count'])
        te['smooth'] = (te['sum']+L) / (te['count']+1)
        te = te.drop( ['sum','count'] )
    gf2 = gf.iloc[py].copy()
    gf2 = gf2.set_index( col )
    gf2['id'] = cupy.arange( gf2.shape[0] )
    gf2 = gf2.join( te, how='left' )
    gf2 = gf2.sort_values( 'id' )
    del te, gf2['id']
    predtrain[py] = gf2.smooth.fillna(-999).to_array()            
    del gf2

    px = np.where( (tra.dt_day>=7)&(tra.dt_day<=11) )[0]
    py = np.where( tra.dt_day==6 )[0]
    mn = gf[tar].iloc[px].mean().astype('float32')
    if smooth_method==0:
        te = gf.iloc[px].groupby(col)[tar].agg(['mean','count'])
        te['smooth']  = (te['mean']*te['count'])
        te['smooth'] += (mn*L)
        te['smooth'] /= (te['count']+L)
        te = te.drop( ['mean','count'] )
    elif smooth_method==1:
        te = gf.iloc[px].groupby(col)[tar].agg(['sum','count'])
        te['smooth'] = (te['sum']+L) / (te['count']+1)
        te = te.drop( ['sum','count'] )
    gf2 = gf.iloc[py].copy()
    gf2 = gf2.set_index( col )
    gf2['id'] = cupy.arange( gf2.shape[0] )
    gf2 = gf2.join( te, how='left' )
    gf2 = gf2.sort_values( 'id' )
    del te, gf2['id']
    predtrain[py] = gf2.smooth.fillna(-999).to_array()            
    del gf2
    
    _ = gc.collect()
    predtrain[predtrain <= -999 ] = np.nan
    return predtrain.astype(np.float32)

In [23]:
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,139,1,1369082795,11153676,2,29,0,1568107028,0,0,0,0,1,0,1,1,1,0,8,5,3,5,1,1,1,1,1,1,0,166,25,0,0,0,0,0,0,11,0,0,0
1,,1,5,,,2,9,1581527100,1,43089,2553,1,1244928766,11153677,26,161,0,1513935572,0,0,0,0,0,1,0,0,0,0,12,2,17,-9,-9,-9,-9,-9,-9,-9,1,86,10,1,1,1,1,1,1,8,530438,0,530438
2,,2,0,,,2,38,1580979604,2,60258,773,0,1306719127,11153678,4,72,0,1573996260,0,0,0,0,1,2,0,0,0,0,6,3,9,5,1,1,1,1,1,1,0,128,19,2,2,2,2,2,2,17,0,0,0
3,,3,9,,,1,38,1580993048,3,119321,220,0,1283447472,11153679,22,251,0,1439637842,0,0,0,0,1,3,0,0,0,0,6,3,12,5,1,1,1,1,1,1,0,90,10,3,3,3,3,3,3,5,0,0,186133
4,,4,7,,,1,47,1581143484,4,3106126,3740,0,1234718202,11153679,22,251,0,1439637842,0,0,0,0,0,4,0,0,0,0,8,5,6,7,0,0,0,1,0,0,3,90,9,4,4,4,4,4,4,4,490823,30934,264460


In [24]:
%%time
for t in ['reply']:
    for c in [
        ['b_user_id','tweet_type','language'],
        ['tw_first_word','tweet_type','language'],
        ['tw_last_word','tweet_type','language'],
        ['tw_hash0','tweet_type','language'],
        ['tw_hash1','tweet_type','language'],
        ['tw_rt_uhash','tweet_type','language'],
        
        ['a_user_id'],
        ['b_user_id'],
        ['tw_hash'],
        ['tw_freq_hash'],
        
        ['media','tweet_type','language','a_is_verified','b_is_verified','b_follows_a'],
        ['a_count_combined','tweet_type','language'],
        ['a_user_fer_count_delta_time','media','language'],
        ['a_user_fing_count_delta_time','media','language'],
        ['a_user_fering_count_delta_time','tweet_type','language'],
        ['a_user_fing_count_mode','media','language'],
        ['a_user_fer_count_mode','media','language'],
        ['a_user_fering_count_mode','tweet_type','language'],
        
        ['domains','media','tweet_type','language'],
        ['links','media','tweet_type','language'],
        ['hashtags','media','tweet_type','language'],
        ]:
        fname = 'TE_'+'_'.join(c)+'_'+t
        print( fname )
        train[fname] = MultiTE_gpu( train, c, t, 20, 0 )

TE_b_user_id_tweet_type_language_reply


NameError: name 'cudf' is not defined

In [25]:
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,139,1,1369082795,11153676,2,29,0,1568107028,0,0,0,0,1,0,1,1,1,0,8,5,3,5,1,1,1,1,1,1,0,166,25,0,0,0,0,0,0,11,0,0,0
1,,1,5,,,2,9,1581527100,1,43089,2553,1,1244928766,11153677,26,161,0,1513935572,0,0,0,0,0,1,0,0,0,0,12,2,17,-9,-9,-9,-9,-9,-9,-9,1,86,10,1,1,1,1,1,1,8,530438,0,530438
2,,2,0,,,2,38,1580979604,2,60258,773,0,1306719127,11153678,4,72,0,1573996260,0,0,0,0,1,2,0,0,0,0,6,3,9,5,1,1,1,1,1,1,0,128,19,2,2,2,2,2,2,17,0,0,0
3,,3,9,,,1,38,1580993048,3,119321,220,0,1283447472,11153679,22,251,0,1439637842,0,0,0,0,1,3,0,0,0,0,6,3,12,5,1,1,1,1,1,1,0,90,10,3,3,3,3,3,3,5,0,0,186133
4,,4,7,,,1,47,1581143484,4,3106126,3740,0,1234718202,11153679,22,251,0,1439637842,0,0,0,0,0,4,0,0,0,0,8,5,6,7,0,0,0,1,0,0,3,90,9,4,4,4,4,4,4,4,490823,30934,264460


In [26]:
train.to_parquet( '/mnt/DP_disk3/Recsys/data/train-final-te-reply-1.parquet' )
gc.collect()

8

In [27]:
print('Elapsed Time is %f minutes'%((time.time()-startNB)/60))

NameError: name 'startNB' is not defined