Copyright (c) 2020, NVIDIA CORPORATION.

Modifications copyright Intel. 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### 1. Process tweets 
### 2. Calculate words
### 3. Extract text features

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 
import glob
import gc
import os.path

import hashlib

In [2]:
import time 
from datetime import datetime 
# Wall-clock start of the notebook; the final cell subtracts this from the
# end time to report the total processing time.
time_begin = datetime.now()

In [3]:
def save_memory( df ):
    """Downcast the numeric columns of ``df`` in place and print the resulting dtypes.

    Conversions applied, column by column:
        uint8, bool   -> int8
        uint32, int64 -> int32
        float64       -> float32
    Columns of any other dtype are left untouched.

    The casts are unchecked ``astype`` calls: the caller is responsible for
    making sure the values fit into the narrower type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    None
        The dtypes after conversion are printed, nothing is returned.
    """
    downcast = {
        'uint8':   np.int8,
        'bool':    np.int8,
        'uint32':  np.int32,
        'int64':   np.int32,
        'float64': np.float32,
    }
    # Iterate over (label, dtype) pairs instead of indexing ``df.dtypes`` with
    # an integer: ``df.dtypes`` is indexed by column label, and positional
    # integer lookup on such a Series is deprecated in recent pandas releases.
    # The pairs are materialised first because the loop reassigns columns.
    for column, dtype in list( df.dtypes.items() ):
        target = downcast.get( str( dtype ) )
        if target is not None:
            df[column] = df[column].astype( target )
    # One collection after all conversions is enough to release the
    # replaced column buffers.
    gc.collect()
    print( df.dtypes )

In [4]:
def extract_hash(text, split_text='@', no=0):
    """Return the MD5 hex digest of the handle that follows the (no+1)-th '@' in ``text``.

    The text is lower-cased and split on '@'. The segment after the
    (no+1)-th '@' is split on single spaces and its first token, with one
    trailing punctuation mark removed by ``clean_text``, starts the handle.
    While the following token is a lone '_', that '_' and the token after it
    are appended as well, so a handle tokenised as ``foo _ bar`` is rebuilt
    as ``foo_bar``.

    Parameters
    ----------
    text : str
        Tweet text to scan.
    split_text : str, optional
        Accepted for call-site compatibility but NOT used: the text is always
        split on '@'.
        NOTE(review): honouring this argument would change the values
        produced for existing callers that pass 'RT @', so it is left as is.
    no : int, optional
        Zero-based index of the '@' occurrence to extract.

    Returns
    -------
    str
        32-character hexadecimal MD5 digest of the handle. When ``text`` has
        fewer than ``no + 1`` '@' characters the handle is empty and the
        digest of the empty string is returned.
    """
    handle = ''
    segments = text.lower().split('@')
    if len(segments) > (no + 1):
        tokens = segments[no + 1].split(' ')
        handle += clean_text(tokens[0])
        # Requiring a token AFTER the '_' avoids the IndexError that the
        # unguarded tokens[2] access raised when '_' was the last token.
        while len(tokens) > 2 and tokens[1] == '_':
            handle += clean_text(tokens[1]) + clean_text(tokens[2])
            tokens = tokens[2:]
    return hashlib.md5(handle.encode('utf-8')).hexdigest()

In [5]:
def clean_text(text):
    """Drop a single trailing punctuation character from ``text``.

    Only inputs longer than one character are changed, and only when their
    last character is one of ``! ? : ; . ,``; anything else is returned
    unchanged.
    """
    trailing_marks = ('!', '?', ':', ';', '.', ',')
    if len(text) <= 1 or text[-1] not in trailing_marks:
        return text
    return text[:-1]

In [None]:
%%time
# Load the training table and the two test tables from Parquet files given
# by relative path. Later cells read the 'tweet' and 'id' columns of each.
train = pd.read_parquet( 'train-tweet-1.parquet' )
test0 = pd.read_parquet( 'test0-tweet-1.parquet' )
test1 = pd.read_parquet( 'test1-tweet-1.parquet' )
# Displayed result: (rows, columns) of the three frames.
train.shape, test0.shape, test1.shape

In [7]:
# Optional: uncomment to keep only the first 100000 rows of each frame
# (e.g. for a quick trial run of the cells below).
#train = train.head(100000)
#test0 = test0.head(100000)
#test1 = test1.head(100000)

In [8]:
%%time

WORDS = {}
DF = []
for tweet in train['tweet'].unique():
    words = tweet.split(' ')
    for w in words:
        if w not in WORDS:
            WORDS[w] = 1
        else:
            WORDS[w]+= 1
gc.collect()

for tweet in test0['tweet'].unique():
    words = tweet.split(' ')
    for w in words:
        if w not in WORDS:
            WORDS[w] = 1
        else:
            WORDS[w]+= 1
gc.collect()
for tweet in test1['tweet'].unique():
    words = tweet.split(' ')
    for w in words:
        if w not in WORDS:
            WORDS[w] = 1
        else:
            WORDS[w]+= 1
gc.collect()
                
len(WORDS)

CPU times: user 10min 57s, sys: 46.9 s, total: 11min 44s
Wall time: 11min 44s


52481711

In [9]:
%%time
count=0
for w in WORDS:
    WORDS[w] = [ WORDS[w], count ]
    count+=1
gc.collect()

CPU times: user 1min 53s, sys: 10.2 s, total: 2min 3s
Wall time: 2min 3s


0

In [10]:
# Spot check of one vocabulary entry; after the previous cell each value is
# [count, index]. Raises KeyError if 'marvel' never occurred in the data.
WORDS['marvel']

[1653, 70225]

In [11]:
def freq_encode_words( vs, words=None, min_freq=2, max_freq=100000 ):
    """Encode a tweet as the ids of its retained words, rarest word first.

    ``vs`` is split on single spaces. Tokens that are empty or one of
    ``[ ] . ! @ _ #`` are ignored. Every other token is looked up in the
    vocabulary and kept only if its frequency ``f`` satisfies
    ``min_freq < f < max_freq`` (both bounds exclusive). The ids of the kept
    tokens are ordered by ascending frequency and joined with single spaces.

    Parameters
    ----------
    vs : str
        Space-separated tweet text.
    words : dict, optional
        Mapping ``token -> [frequency, id]``. Defaults to the notebook-level
        ``WORDS`` dictionary.
    min_freq : int, optional
        Exclusive lower bound on the frequency of kept tokens (default 2).
    max_freq : int, optional
        Exclusive upper bound on the frequency of kept tokens (default 100000).

    Returns
    -------
    str
        Space-joined ids, or the empty string when no token is kept.

    Raises
    ------
    KeyError
        If a non-ignored token is missing from the vocabulary.
    """
    if words is None:
        words = WORDS
    ignored = ('', '[', ']', '.', '!', '@', '_', '#')
    ids = []
    freqs = []
    for v in vs.split(' '):
        if v in ignored:
            continue
        f, i = words[v]
        if min_freq < f < max_freq:
            ids.append( str(i) )
            freqs.append( f )
    if not ids:
        return ''
    # Same default argsort as before, so the order of equally frequent
    # words is unchanged.
    order = np.argsort( freqs )
    return ' '.join( np.array(ids)[order] )
    
#freq_encode_words( train['tweet'].values[191019] )

In [12]:
def ret_word( x, rw=0 ):
    """Pick one space-separated token of ``x`` by position.

    Supported positions are 0 (first), 1 (second), -1 (last) and -2 (second
    to last). The string '-1' is returned when ``rw`` is any other value or
    when ``x`` has too few tokens for the requested position.
    """
    tokens = x.split(' ')
    # (position, minimum number of tokens needed to serve it)
    supported = ((0, 1), (1, 2), (-1, 1), (-2, 2))
    for position, needed in supported:
        if rw == position:
            return tokens[position] if len(tokens) >= needed else '-1'
    return '-1'

In [17]:
%%time

DF = []
train['tweet_nortsign'] = train['tweet'].str.replace('\[CLS\] RT @', '')
train['count_words']    = train['tweet'].str.count(' ')
train['count_char']     = train['tweet'].apply(lambda x: len(x))
train['count_ats']      = train['tweet_nortsign'].str.count('@')
train['hash0']          = train['tweet_nortsign'].apply(lambda x: extract_hash(x))
train['hash1']          = train['tweet_nortsign'].apply(lambda x: extract_hash(x, no=1))
train['tw_uhash']       = train['tweet'].apply(lambda x: extract_hash(x, split_text='RT @', no=0))
train['tw_hash']        = train['tweet'].apply(lambda x: hash(x)%1000000000 )

train['tweet']          = train['tweet'].apply(lambda x: freq_encode_words(x) )
train['tw_freq_hash']   = train['tweet'].apply(lambda x: hash(x)%1000000000 )
train['tw_first_word']  = train['tweet'].apply(lambda x: ret_word(x,0) )
train['tw_second_word'] = train['tweet'].apply(lambda x: ret_word(x,1) )
train['tw_last_word']   = train['tweet'].apply(lambda x: ret_word(x,-1) )
train['tw_llast_word']  = train['tweet'].apply(lambda x: ret_word(x,-2) )
train['tw_len']         = train['tweet'].apply(lambda x: len(x.split(' ')) )



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
CPU times: user 1h 14min 36s, sys: 6min 34s, total: 1h 21min 11s
Wall time: 1h 18min 41s


In [18]:
%%time 
# Keep only 'id' and the derived feature columns (the raw 'tweet' and
# 'tweet_nortsign' text columns are not selected), then drop the full frame
# to free memory.
DF.append( train[['id','count_ats', 'count_char', 'count_words', 'hash0', 'hash1', 'tw_uhash','tw_hash','tw_freq_hash','tw_first_word','tw_second_word','tw_last_word','tw_llast_word','tw_len']] )
del train
gc.collect()
    

CPU times: user 2min 37s, sys: 1min 56s, total: 4min 34s
Wall time: 3min 27s


61

In [19]:
%%time 
test0['tweet_nortsign'] = test0['tweet'].str.replace('\[CLS\] RT @', '')
test0['count_words']    = test0['tweet'].str.count(' ')
test0['count_char']     = test0['tweet'].apply(lambda x: len(x))
test0['count_ats']      = test0['tweet_nortsign'].str.count('@')
test0['hash0']          = test0['tweet_nortsign'].apply(lambda x: extract_hash(x))
test0['hash1']          = test0['tweet_nortsign'].apply(lambda x: extract_hash(x, no=1))
test0['tw_uhash']       = test0['tweet'].apply(lambda x: extract_hash(x, split_text='RT @', no=0))
test0['tw_hash']        = test0['tweet'].apply(lambda x: hash(x)%1000000000 )

test0['tweet']          = test0['tweet'].apply(lambda x: freq_encode_words(x) )
test0['tw_freq_hash']   = test0['tweet'].apply(lambda x: hash(x)%1000000000 )
test0['tw_first_word']  = test0['tweet'].apply(lambda x: ret_word(x,0) )
test0['tw_second_word'] = test0['tweet'].apply(lambda x: ret_word(x,1) )
test0['tw_last_word']   = test0['tweet'].apply(lambda x: ret_word(x,-1) )
test0['tw_llast_word']  = test0['tweet'].apply(lambda x: ret_word(x,-2) )
test0['tw_len']         = test0['tweet'].apply(lambda x: len(x.split(' ')) )




CPU times: user 8min 38s, sys: 34.8 s, total: 9min 12s
Wall time: 7min 53s


In [20]:
%%time
# Keep only 'id' and the derived feature columns (the raw 'tweet' and
# 'tweet_nortsign' text columns are not selected), then drop the full frame
# to free memory.
DF.append( test0[['id','count_ats', 'count_char', 'count_words', 'hash0', 'hash1', 'tw_uhash','tw_hash','tw_freq_hash','tw_first_word','tw_second_word','tw_last_word','tw_llast_word','tw_len']] )
del test0
gc.collect()



CPU times: user 31.8 s, sys: 11 s, total: 42.7 s
Wall time: 27.3 s


53

In [21]:
%%time 

test1['tweet_nortsign'] = test1['tweet'].str.replace('\[CLS\] RT @', '')
test1['count_words']    = test1['tweet'].str.count(' ')
test1['count_char']     = test1['tweet'].apply(lambda x: len(x))
test1['count_ats']      = test1['tweet_nortsign'].str.count('@')
test1['hash0']          = test1['tweet_nortsign'].apply(lambda x: extract_hash(x))
test1['hash1']          = test1['tweet_nortsign'].apply(lambda x: extract_hash(x, no=1))
test1['tw_uhash']       = test1['tweet'].apply(lambda x: extract_hash(x, split_text='RT @', no=0))
test1['tw_hash']        = test1['tweet'].apply(lambda x: hash(x)%1000000000 )

test1['tweet']          = test1['tweet'].apply(lambda x: freq_encode_words(x) )
test1['tw_freq_hash']   = test1['tweet'].apply(lambda x: hash(x)%1000000000 )
test1['tw_first_word']  = test1['tweet'].apply(lambda x: ret_word(x,0) )
test1['tw_second_word'] = test1['tweet'].apply(lambda x: ret_word(x,1) )
test1['tw_last_word']   = test1['tweet'].apply(lambda x: ret_word(x,-1) )
test1['tw_llast_word']  = test1['tweet'].apply(lambda x: ret_word(x,-2) )
test1['tw_len']         = test1['tweet'].apply(lambda x: len(x.split(' ')) )



CPU times: user 8min 53s, sys: 40 s, total: 9min 33s
Wall time: 8min 14s


In [22]:
%%time
# Keep only 'id' and the derived feature columns of the last frame, then
# drop the full frame to free memory.
DF.append( test1[['id','count_ats', 'count_char', 'count_words', 'hash0', 'hash1', 'tw_uhash','tw_hash','tw_freq_hash','tw_first_word','tw_second_word','tw_last_word','tw_llast_word','tw_len']] )
del test1
gc.collect()


# Stack the collected frames in the order they were appended
# (train, test0, test1); DF changes from a list into a single DataFrame.
DF = pd.concat( DF )
gc.collect()

# Downcast numeric columns in place (also prints the dtypes).
save_memory( DF )
# Replace the per-frame indexes with one 0..n-1 index.
# NOTE(review): the later merge cell appears to rely on this, since it
# assigns merge results back to DF by index.
DF = DF.reset_index( drop=True )
gc.collect()
#DF.to_parquet( '../input/text-processings-1.parquet' )
DF.shape

id                 int32
count_ats          int32
count_char         int32
count_words        int32
hash0             object
hash1             object
tw_uhash          object
tw_hash            int32
tw_freq_hash       int32
tw_first_word     object
tw_second_word    object
tw_last_word      object
tw_llast_word     object
tw_len             int32
dtype: object
CPU times: user 6min 30s, sys: 4min 14s, total: 10min 44s
Wall time: 5min 34s


(116485680, 14)

In [23]:
%%time
# Pool the digests of all three hash columns into one Series.
uhashes = pd.concat([DF['hash0'], DF['hash1'], DF['tw_uhash']], axis=0)
gc.collect()
# One row per distinct digest with its number of occurrences.
uhashes = uhashes.value_counts()
gc.collect()
# First reset_index moves the digest into an 'index' column; the second adds
# a positional 'level_0' column.
uhashes = uhashes.reset_index().reset_index()
gc.collect()
# Integer id per distinct digest = its row position in this table.
uhashes['uid'] = np.arange(0,uhashes.shape[0] )
print( uhashes.shape )
uhashes.head()

(5730566, 4)


Unnamed: 0,level_0,index,0,uid
0,0,d41d8cd98f00b204e9800998ecf8427e,275516656,0
1,1,9b9672de2cbe5ddd7ebd7538945a970a,852486,1
2,2,b14a7b8059d9c055954c92674ce60032,719189,2
3,3,31cf2397511c7cfce33506bef80e25b7,590643,3
4,4,ba9bf05693b9fa202d922dd43a08f281,167705,4


In [24]:
%%time
DF['tw_hash0']    = pd.merge( DF[['hash0']]  , uhashes[['index','uid']], left_on='hash0'  , right_on='index', how='left' )['uid']
gc.collect()
DF['tw_hash1']    = pd.merge( DF[['hash1']]  , uhashes[['index','uid']], left_on='hash1'  , right_on='index', how='left' )['uid']
gc.collect()
DF['tw_rt_uhash'] = pd.merge( DF[['tw_uhash']], uhashes[['index','uid']], left_on='tw_uhash', right_on='index', how='left' )['uid']
gc.collect()
DF.head(20)

Unnamed: 0,id,count_ats,count_char,count_words,hash0,hash1,tw_uhash,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,166,25,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,450182889,833072896,1,18,20,19,11,0,0,0
1,1,1,86,10,c45acebe299622b3a8a7b7a1c195f567,d41d8cd98f00b204e9800998ecf8427e,c45acebe299622b3a8a7b7a1c195f567,276082737,454222968,29,30,28,25,8,530438,0,530438
2,2,0,128,19,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,755230048,553676980,39,40,38,41,17,0,0,0
3,3,0,90,10,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,9a18866b0599c0a0c3464ec496b25941,238859478,756313760,56,54,55,52,5,0,0,186133
4,4,3,90,9,31e6a7de72799d9cd2469a064d5f82bf,f97a9296ac0c4125eb8a9792ce75e7c7,129b94409fe0e8b2cd0703e7efad27bf,952302660,425346655,59,61,62,63,4,490823,30934,264460
5,5,0,91,9,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,203865526,388525323,70,69,65,68,5,0,0,0
6,6,0,109,15,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,567542067,240862829,79,78,74,75,6,0,0,0
7,7,0,129,15,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,799713274,976866268,92,93,82,81,10,0,0,0
8,8,0,50,5,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,750552082,937800049,96,95,95,96,2,0,0,0
9,9,4,299,49,ac881112a4613c4726481dcc20d5b723,4c3e01e1604798c29daeeac3b368d923,ac881112a4613c4726481dcc20d5b723,269379878,946419797,132,122,131,105,23,88222,120360,88222


In [25]:
%%time
del DF['hash0'],DF['hash1'],DF['tw_uhash']
gc.collect()
save_memory( DF )

id                 int32
count_ats          int32
count_char         int32
count_words        int32
tw_hash            int32
tw_freq_hash       int32
tw_first_word     object
tw_second_word    object
tw_last_word      object
tw_llast_word     object
tw_len             int32
tw_hash0           int32
tw_hash1           int32
tw_rt_uhash        int32
dtype: object


In [26]:
%%time
DF['tw_hash']        = pd.factorize( DF['tw_hash'] )[0]
DF['tw_freq_hash']   = pd.factorize( DF['tw_freq_hash'] )[0]
DF['tw_first_word']  = pd.factorize( DF['tw_first_word'] )[0]
DF['tw_second_word'] = pd.factorize( DF['tw_second_word'] )[0]
DF['tw_last_word']   = pd.factorize( DF['tw_last_word'] )[0]
DF['tw_llast_word']  = pd.factorize( DF['tw_llast_word'] )[0]
gc.collect()

20

In [27]:
%%time
DF['tw_hash']        = DF['tw_hash'].astype(np.int32)
DF['tw_freq_hash']   = DF['tw_freq_hash'].astype(np.int32)
DF['tw_first_word']  = DF['tw_first_word'].astype(np.int32)
DF['tw_second_word'] = DF['tw_second_word'].astype(np.int32)
DF['tw_last_word']   = DF['tw_last_word'].astype(np.int32)
DF['tw_llast_word']  = DF['tw_llast_word'].astype(np.int32)
gc.collect()

20

In [28]:
%%time
# Preview of the final feature table.
DF.head(10)

Unnamed: 0,id,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,166,25,0,0,0,0,0,0,11,0,0,0
1,1,1,86,10,1,1,1,1,1,1,8,530438,0,530438
2,2,0,128,19,2,2,2,2,2,2,17,0,0,0
3,3,0,90,10,3,3,3,3,3,3,5,0,0,186133
4,4,3,90,9,4,4,4,4,4,4,4,490823,30934,264460
5,5,0,91,9,5,5,5,5,5,5,5,0,0,0
6,6,0,109,15,6,6,6,6,6,6,6,0,0,0
7,7,0,129,15,7,7,7,7,7,7,10,0,0,0
8,8,0,50,5,8,8,8,8,8,8,2,0,0,0
9,9,4,299,49,9,9,9,9,9,9,23,88222,120360,88222


In [29]:
%%time
# Persist the feature table as Parquet.
# NOTE(review): the destination is a hard-coded absolute path; it has to
# exist and be writable on the machine running the notebook.
DF.to_parquet( '/mnt/DP_disk3/Recsys/text-processings-1.parquet' )
gc.collect()

0

In [30]:
%%time
# Final column dtypes of the feature table.
DF.dtypes

id                int32
count_ats         int32
count_char        int32
count_words       int32
tw_hash           int32
tw_freq_hash      int32
tw_first_word     int32
tw_second_word    int32
tw_last_word      int32
tw_llast_word     int32
tw_len            int32
tw_hash0          int32
tw_hash1          int32
tw_rt_uhash       int32
dtype: object

In [31]:
# Wall-clock end of the notebook; the difference to time_begin (set in an
# earlier cell) is the total processing time.
time_end = datetime.now()

print("4th notebook total process time", time_end - time_begin)

4th notebook total process time 5:47:55.091403
