In [1]:
import pandas as pd
import numpy as np

from bpemb import BPEmb
from tqdm import tqdm

In [2]:
path = '../data/dataset/'

# Feature tables are indexed by the 'username' column; label tables keep the
# default positional index. keep_default_na=False stops pandas from turning
# empty cells (or strings such as "NA") into NaN, so text columns stay strings.
RAW_X_train, RAW_X_test = (
    pd.read_csv(path+fname, index_col='username', keep_default_na=False)
    for fname in ('RAW_X_train.csv', 'RAW_X_test.csv')
)
RAW_y_train, RAW_y_test = (
    pd.read_csv(path+fname, keep_default_na=False)
    for fname in ('RAW_y_train.csv', 'RAW_y_test.csv')
)

In [3]:
# Display the column labels of the training feature table.
RAW_X_train.columns

Index(['is_name_social_political', 'desc', 'tweets', 'n_tweet',
       'quoted_tweets', 'n_tweet_use_hashtag', 'ratio_tweets_use_hashtag',
       'n_photo', 'n_video', 'buzzer', 'text_used'],
      dtype='object')

## BPE Embedding

In [4]:
# Instantiate the BPEmb subword embedder with language code "id",
# a vocabulary size of 100000 and 300-dimensional vectors.
# NOTE(review): "id" is presumably the Indonesian model — confirm against the
# BPEmb language list.
bpe = BPEmb(lang="id", vs=100000, dim=300)

In [5]:
def avg_vec(bpe_embed):
    """Average a stack of subword vectors into a single vector.

    Parameters
    ----------
    bpe_embed : np.ndarray
        Array whose first axis enumerates the subwords; the remaining axes
        hold each subword's vector.

    Returns
    -------
    np.ndarray
        The mean over the first axis. When ``bpe_embed`` has zero rows, a
        zero vector of the remaining shape is returned instead of NaN.
    """
    n_subword = bpe_embed.shape[0]
    vec_sum = np.sum(bpe_embed, axis=0)
    if n_subword == 0:
        # Dividing by zero rows would yield an all-NaN vector (0 / 0) that
        # silently poisons the downstream feature matrix.
        return np.zeros_like(vec_sum, dtype=np.float64)
    return np.divide(vec_sum, n_subword)

In [6]:
def create_feat_vec(bpe, data):
    """Turn a sequence of texts into one averaged embedding row per text.

    Parameters
    ----------
    bpe : object
        Embedder exposing ``embed(text)``. Its ``dim`` attribute, when
        present, sets the width of the zero fallback vector; 300 is used
        otherwise.
        NOTE(review): assumes ``embed`` returns an array of shape
        (n_subwords, dim) — confirm against the BPEmb documentation.
    data : iterable of str
        Texts to embed. Falsy entries (e.g. the empty string) are not passed
        to the embedder and map to a zero vector.

    Returns
    -------
    np.ndarray
        One row per input text, built by passing each embedding to
        ``avg_vec``. For empty ``data`` an array of shape ``(0, dim)`` is
        returned so it can still be stacked column-wise.
    """
    # Follow the embedder's own dimensionality rather than a hard-coded 300,
    # so the zero fallback stays compatible if the model is changed.
    dim = getattr(bpe, 'dim', 300)

    feat_vec = []
    for txt in tqdm(data):
        bpe_embed = bpe.embed(txt) if txt else None
        # Empty text, or text for which the embedder yields no subword rows,
        # is represented by a single zero row so the average is well defined.
        if bpe_embed is None or bpe_embed.shape[0] == 0:
            bpe_embed = np.zeros((1, dim))
        feat_vec.append(avg_vec(bpe_embed))

    if not feat_vec:
        # np.array([]) would be 1-D and could not be hstack-ed with 2-D data.
        return np.empty((0, dim))
    return np.array(feat_vec)

## Feature stacking

In [7]:
def feat_stack(X, y, bpe):
    """Assemble the model-ready feature matrix and label column.

    Parameters
    ----------
    X : pd.DataFrame
        Must provide the text columns 'desc' and 'text_used' plus the numeric
        columns listed in the body.
    y : pd.DataFrame
        Must provide the 'buzzer' column.
    bpe : object
        Embedder forwarded to ``create_feat_vec``.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The features, laid out column-wise as: numeric columns, then the
        'desc' embedding, then the 'text_used' embedding; and the labels
        reshaped to a single column.
    """
    # Embed the free-text columns in a fixed order: 'desc' first, then 'text_used'.
    text_blocks = [
        create_feat_vec(bpe, X[col].tolist())
        for col in ('desc', 'text_used')
    ]

    numeric_cols = [
        'is_name_social_political',
        'n_tweet',
        'ratio_tweets_use_hashtag',
        'n_photo',
        'n_video',
    ]
    numeric_block = X[numeric_cols].to_numpy()

    # Labels as an (n, 1) column vector.
    targets = y['buzzer'].to_numpy().reshape(-1, 1)

    stacked = np.hstack((numeric_block, *text_blocks))
    return stacked, targets

In [8]:
# Build the stacked feature matrix and label column for the train and test
# splits, using the same embedder for both.
X_train_bpe, y_train_bpe = feat_stack(RAW_X_train, RAW_y_train, bpe)
X_test_bpe, y_test_bpe = feat_stack(RAW_X_test, RAW_y_test, bpe)

100%|██████████| 2369/2369 [00:00<00:00, 9565.94it/s]
100%|██████████| 2369/2369 [00:16<00:00, 139.58it/s]
100%|██████████| 593/593 [00:00<00:00, 7446.57it/s]
100%|██████████| 593/593 [00:03<00:00, 151.94it/s]


## Save dataset

In [9]:
# Map each output file name to the array that should be written to it.
data = {
    'X_train_bpe.npy': X_train_bpe,
    'X_test_bpe.npy': X_test_bpe,
    'y_train_bpe.npy': y_train_bpe,
    'y_test_bpe.npy': y_test_bpe,
}

# Write every array in NumPy's .npy format under the dataset directory.
for fname in data:
    with open('../data/dataset/'+fname, 'wb') as out_file:
        np.save(out_file, data[fname])