Copyright (c) 2020, NVIDIA CORPORATION.

Modifications copyright Intel. 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### 1. Load Pickle data
### 2. Add three new columns holding the number of hashtags, domains, and links in each row
### 3. Calculate the most frequent tokens
### 4. Split data and write to parquet 

In [36]:
import matplotlib.pyplot as plt
%matplotlib inline  

import pandas as pd
import numpy as np
import gc
import time
from datetime import datetime 

In [37]:
# Start of the whole-notebook timer; the final cell subtracts this from datetime.now().
time_begin = datetime.now()

In [38]:
%%time
#Reload the output of previous notebook

time_read_begin = datetime.now()

df = pd.read_pickle('/mnt/DP_disk3/Recsys/tmp.pkl')
print( df.shape )
print("load pickle time", datetime.now() - time_read_begin)


(116485680, 24)
load pickle time 0:00:33.311744
CPU times: user 18.8 s, sys: 14.4 s, total: 33.3 s
Wall time: 33.3 s


In [39]:
%%time
df.head()

CPU times: user 1.29 ms, sys: 31 µs, total: 1.32 ms
Wall time: 1.01 ms


Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,...,2,29,False,1568107028,False,0,0,0,1581156487,0
1,,1,5,,,2,9,1581527100,1,43089,...,26,161,False,1513935572,False,0,0,0,0,1
2,,2,0,,,2,38,1580979604,2,60258,...,4,72,False,1573996260,False,0,0,0,1580992068,2
3,,3,9,,,1,38,1580993048,3,119321,...,22,251,False,1439637842,False,0,0,0,1580993386,3
4,,4,7,,,1,47,1581143484,4,3106126,...,22,251,False,1439637842,False,0,0,0,0,4


In [40]:
%%time

df['len_hashtags'] = df['hashtags'].apply(lambda x: str(x).count('\t')+1 if not(pd.isnull(x)) else 0)
df['len_domains']  = df['domains'].apply(lambda x: str(x).count('\t')+1 if not(pd.isnull(x)) else 0)
df['len_links']    = df['links'].apply(lambda x: str(x).count('\t')+1 if not(pd.isnull(x)) else 0)

CPU times: user 3min 26s, sys: 13.3 s, total: 3min 39s
Wall time: 3min 15s


In [41]:
%%time
## Extract the raw domains column as a NumPy object array, with missing values replaced by ''.
## This copy is the input for counting how often each domain token occurs
## (dict keyed by domain value, holding its count).
var = df['domains'].fillna('').values.copy()
gc.collect()
var[:10]

CPU times: user 47 s, sys: 5.74 s, total: 52.7 s
Wall time: 34.4 s


array(['DBB05735DF475AB5DABEE791299AE3D1', '', '', '', '', '',
       '215887F9F98736B07C823045CA9FA2EB',
       '3330516EAADFD093A1C7B9DA83172DEB', '', ''], dtype=object)

In [42]:
%%time

PD = {}
null = var[1]
PD[null] = [0,0]
count = 1
for vs in var:
    if vs != null:  #null if var[0]
        for v in vs.split('\t'):
            if v not in PD:
                PD[v] = [count,1]
                count +=1
            else:
                x = PD[v]
                x[1] += 1
                PD[v] = x
    else:
        x = PD[null]
        x[1] += 1
        PD[null] = x
        
len(PD),list(PD.items())[:10]

CPU times: user 32.8 s, sys: 579 ms, total: 33.4 s
Wall time: 33.5 s


(373477,
 [('', [0, 97114874]),
  ('DBB05735DF475AB5DABEE791299AE3D1', [1, 5279]),
  ('215887F9F98736B07C823045CA9FA2EB', [2, 924]),
  ('3330516EAADFD093A1C7B9DA83172DEB', [3, 5150]),
  ('8A67C9075AF148F8C410E56DC9E6BA2C', [4, 10532]),
  ('66102A1F11120AAAF8326BDE7FB49876', [5, 187]),
  ('9BCB16BA9CDF7220A9637D36E604531D', [6, 122]),
  ('E91CDEC8DC7ABF30592FA024616FF970', [7, 1166115]),
  ('FECA6F2E8244F2294BD2CE957C0602A9', [8, 780567]),
  ('C2D098DE579714E1F2F0A59AB8D05856', [9, 1969])])

In [43]:
%%time
vari = []
for vs in var:
    if vs != null:
        li=[]
        lf=[]
        for v in vs.split('\t'):
            if v!='':
                li.append(PD[v][0])
                lf.append(-PD[v][1])
        vari.append( list(np.array(li)[np.argsort(lf)].astype(np.int32) ) )
    else:
        vari.append( [0] )
del PD
gc.collect()

len(vari), vari[:10]

CPU times: user 5min 31s, sys: 37.4 s, total: 6min 8s
Wall time: 6min 3s


(116485680, [[1], [0], [0], [0], [0], [0], [2], [3], [0], [0]])

In [44]:
%%time
#Get only the most frequent domain
df['domains'] = np.array( [v[0] for v in vari ] ).astype( np.int32 )
gc.collect()
del vari, var
gc.collect()

CPU times: user 1min 45s, sys: 14.1 s, total: 1min 59s
Wall time: 1min 48s


0

In [45]:
%%time
var = df['links'].fillna('').values.copy()
gc.collect()

PD = {}
null = var[1]
PD[null] = [0,0]
count = 1
for vs in var:
    if vs != null: ## null = var[0] 
        for v in vs.split('\t'):
            if v not in PD:
                PD[v] = [count,1]
                count +=1
            else:
                x = PD[v]
                x[1] += 1
                PD[v] = x
    else:
        x = PD[null]
        x[1] += 1
        PD[null] = x
        
len(PD),list(PD.items())[:10]

CPU times: user 1min 28s, sys: 6.92 s, total: 1min 35s
Wall time: 1min 25s


(4881839,
 [('', [0, 97114874]),
  ('81DE6DAEA33235BD3AEE98A381D79172', [1, 109]),
  ('172DBB18AD23E0AA09074F0C5DBA23D6', [2, 37]),
  ('5E671E810416A48B15C2C03D66306322', [3, 320]),
  ('1A248236A438DB019A5547DCD815B1AE', [4, 11]),
  ('3693010C9DBD9C2792C356996071DCCE', [5, 10]),
  ('2FB67F53711999B357E29FAAC146D21A', [6, 10]),
  ('0E99BCC9C116F5F20AF0D877F9AB1ED3', [7, 18]),
  ('8EF4D77D9BA41B5C2068EFCA3DD712A7', [8, 4]),
  ('BD29AD49CA0C68B3723F825CBF372090', [9, 840])])

In [46]:

%%time
vari = []
for vs in var:
    if vs != null:
        li=[]
        lf=[]
        for v in vs.split('\t'):
            if v!='':
                li.append(PD[v][0])
                lf.append(-PD[v][1])
        vari.append( list(np.array(li)[np.argsort(lf)].astype(np.int32) ) )
    else:
        vari.append( [0] )
del PD
gc.collect()

len(vari), vari[:10]

CPU times: user 5min 43s, sys: 40.5 s, total: 6min 24s
Wall time: 6min 18s


(116485680, [[1], [0], [0], [0], [0], [0], [2], [3], [0], [0]])

In [47]:
%%time
#Get only the most frequent link
df['links'] = np.array( [v[0] for v in vari ] ).astype( np.int32 )
gc.collect()
del vari, var
gc.collect()

CPU times: user 1min 45s, sys: 15.2 s, total: 2min
Wall time: 1min 49s


0

In [48]:
%%time
var = df['hashtags'].fillna('').values.copy()
gc.collect()

PD = {}
null = var[1]
PD[null] = [0,0]
count = 1
for vs in var:
    if vs != null:
        for v in vs.split('\t'):
            if v not in PD:
                PD[v] = [count,1]
                count +=1
            else:
                x = PD[v]
                x[1] += 1
                PD[v] = x
    else:
        x = PD[null]
        x[1] += 1
        PD[null] = x
        
len(PD),list(PD.items())[:10]

CPU times: user 1min 44s, sys: 7.05 s, total: 1min 51s
Wall time: 1min 41s


(2679918,
 [('', [0, 90421527]),
  ('7406650BAE78F56FBD8068FC460A6A1E', [1, 10653]),
  ('40A23C9DE38F5B42FDABD7DE6B73AC6E', [2, 2995]),
  ('053074B6C39ADD256B984F5498E431EC', [3, 17789]),
  ('46A5CAAF8FE1D26E72A54A96285C8CAC', [4, 20611]),
  ('ACB82C998C7761067E25AF4DCACDB8B8', [5, 257]),
  ('9887C2F9C8FFECE3524054D91E871F84', [6, 14912]),
  ('8E215CEA6326981F3C8EBEE1316B1933', [7, 1629]),
  ('68A9F5981A5B15BD3B6F99414772DCC5', [8, 25]),
  ('010D7EF7F04CCA48BB42394AEAB41F9D', [9, 5])])

In [49]:
# Raw hashtag values of the last 10 rows (tab-separated strings or NaN).
df['hashtags'].tail(10)

9941978                                                  NaN
9941979                                                  NaN
9941980    F2AA2F668B2C6FF79297EB03A516166A\t503DAE37FD19...
9941981                                                  NaN
9941982                     04227C4C2E4C1A14E13EF0C7B3894AB1
9941983    7CC582B5E93DAB72F49B1D55DDCBAE5C\t94544EA41F16...
9941984                     1F6D05BCF068D59C3CDC1F935C6DB93D
9941985                                                  NaN
9941986    DE848668D8312219F01B8561959BBC0B\tBB6FBD3AFF96...
9941987                                                  NaN
Name: hashtags, dtype: object

In [50]:
%%time
vari = []
for vs in var:
    if vs != null:
        li=[]
        lf=[]
        for v in vs.split('\t'):
            if v!='':
                li.append(PD[v][0])
                lf.append(-PD[v][1])
        vari.append( list(np.array(li)[np.argsort(lf)].astype(np.int32) ) )
    else:
        vari.append( [0] )
del PD
gc.collect()

len(vari), vari[:10]

CPU times: user 7min 5s, sys: 36.7 s, total: 7min 41s
Wall time: 7min 41s


(116485680, [[1], [0], [0], [0], [0], [2], [4, 3], [0], [0], [5]])

In [51]:
%%time
#Get only the most frequent hashtag
df['hashtags'] = np.array( [v[0] for v in vari ] ).astype( np.int32 )
gc.collect()
del vari, var
gc.collect()

CPU times: user 1min 46s, sys: 15 s, total: 2min 1s
Wall time: 1min 50s


0

In [52]:
# Inspect the last 40 rows now that hashtags, links and domains hold integer ids.
df.tail(n=40)

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links
9941948,0,52040555,0,0,0,2,11,1581643126,10775284,167,...,1547111555,True,0,0,0,0,116485640,0,0,0
9941949,0,56173222,0,0,0,1,3,1581637768,10064166,631,...,1404593435,True,0,0,0,0,116485641,0,0,0
9941950,0,44914857,0,0,0,2,54,1582065700,2438,237687,...,1572711116,False,0,0,0,0,116485642,0,0,0
9941951,0,51351335,5,0,0,1,54,1582138292,2438,237687,...,1572711116,False,0,0,0,0,116485643,0,0,0
9941952,0,56173223,0,0,0,1,25,1582030933,360779,52657,...,1503162103,True,0,0,0,0,116485644,0,0,0
9941953,0,56005880,0,0,0,0,54,1581958736,123129,17734,...,1333741456,False,0,0,0,0,116485645,0,0,0
9941954,0,56173224,0,0,0,1,63,1581699032,3137762,6369,...,1333741456,True,0,0,0,0,116485646,0,0,0
9941955,88862,44554117,0,0,0,2,13,1582061969,77459,1190067,...,1574628907,False,0,0,0,0,116485647,1,0,0
9941956,0,50255514,0,0,0,0,11,1581662864,4588,62578,...,1365176017,False,0,0,0,0,116485648,0,0,0
9941957,0,44144508,5,0,0,2,54,1581628573,203233,198527,...,1443317092,False,0,0,0,0,116485649,0,0,0


In [53]:
df.dtypes

hashtags               int32
tweet_id              uint32
media                  uint8
links                  int32
domains                int32
tweet_type             uint8
language               uint8
timestamp             uint32
a_user_id             uint32
a_follower_count      uint32
a_following_count     uint32
a_is_verified           bool
a_account_creation    uint32
b_user_id             uint32
b_follower_count      uint32
b_following_count     uint32
b_is_verified           bool
b_account_creation    uint32
b_follows_a             bool
reply                 uint32
retweet               uint32
retweet_comment       uint32
like                  uint32
id                    uint32
len_hashtags           int64
len_domains            int64
len_links              int64
dtype: object

In [54]:
# Old split boundaries, computed for an earlier version of the data; superseded by the next cell.
#train = df.iloc[ :121386431 ].copy()
#test0 = df.iloc[ 121386431:(121386431+12434735) ].copy()
#test1 = df.iloc[ (121386431+12434735): ].copy()

# train.shape, test0.shape, test1.shape

In [55]:
%%time
## train, test, val data shape 

train = df.iloc[ :96600391 ].copy()
test0 = df.iloc[ 96600391:(96600391+9943301 ) ].copy()
test1 = df.iloc[ (96600391+9943301 ): ].copy()

train.shape, test0.shape, test1.shape

CPU times: user 2min 16s, sys: 28.1 s, total: 2min 44s
Wall time: 4.64 s


((96600391, 27), (9943301, 27), (9941988, 27))

In [56]:
%%time
train.to_parquet( '/mnt/DP_disk3/Recsys/train-1.parquet' )
test0.to_parquet( '/mnt/DP_disk3/Recsys/test-0.parquet' )
test1.to_parquet( '/mnt/DP_disk3/Recsys/test-1.parquet' )
gc.collect()

CPU times: user 1min 58s, sys: 12.9 s, total: 2min 11s
Wall time: 1min 40s


0

In [57]:
# Report the wall-clock time elapsed since time_begin was recorded.
time_end = datetime.now()
total_elapsed = time_end - time_begin

print("2nd notebook total process time", total_elapsed)

2nd notebook total process time 0:35:22.268928
