## Import packages


In [1]:
import pandas as pd
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pickle
import os

import sys

# Put the parent directory first on the module search path. The entry is
# relative, so it resolves against the kernel's working directory. This has to
# run before the two imports below, which presumably live in that parent
# directory (TODO confirm against the project layout).
sys.path.insert(0, "..")
from preprocess import VietnameseTextCleaner
from utils import dict_handler


# Apply matplotlib's "ggplot" style sheet to figures drawn after this point.
matplotlib.style.use("ggplot")

## Paths


In [2]:
# Every location below is expressed relative to the directory the kernel was
# started in, then joined onto its absolute form.
CUR_DIR = os.path.abspath(os.curdir)


def _from_cur_dir(relative_path):
    """Join ``relative_path`` onto the absolute working directory."""
    return os.path.join(CUR_DIR, relative_path)


METADATA_PATH = _from_cur_dir("../data/reintel2020/public_train.csv")
IMAGES_DIR = _from_cur_dir("../data/reintel2020/public_train_final_images/")
CACHE_PATH = _from_cur_dir("../.cache/")
VNCORE_NLP_PATH = _from_cur_dir("../vncorenlp/")
STOPWORDS_PATH = _from_cur_dir("../stop_words/vietnamese-stopwords-dash.txt")

## Init Cleaner


In [3]:
# Gather the cleaner's keyword arguments first, then construct it in one call.
cleaner_settings = {
    "stopwords_path": STOPWORDS_PATH,
    "vncorenlp_path": VNCORE_NLP_PATH,
    "cur_dir": CUR_DIR,
}
cleaner = VietnameseTextCleaner(**cleaner_settings)

## Read the CSV metadata


In [4]:
# Load the training metadata table from disk into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer=METADATA_PATH)

## Fixing timestamp errors


In [5]:
# type errors: collect the index labels of rows whose timestamp cannot be
# converted to a float. float() accepts NaN without raising, so NaN entries
# are not recorded by this scan.
errors = []
# Iterate (label, value) pairs so the recorded labels match label-based
# lookups (.loc / index.isin) regardless of the frame's index.
for row_label, raw_timestamp in df_train["timestamp_post"].items():
    try:
        float(raw_timestamp)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mark a row as malformed. A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        errors.append(row_label)

In [6]:
# missing: index labels of the rows whose timestamp is null
missing_mask = df_train["timestamp_post"].isna()
missings = df_train.index[missing_mask.to_numpy()].tolist()

In [7]:
# fill with mean: the average is taken over the rows that are neither
# malformed nor missing, then written into every row that is.
invalid_rows = errors + missings
is_valid = ~df_train.index.isin(invalid_rows)
mean = df_train.loc[is_valid, "timestamp_post"].astype(float).mean()

df_train.loc[invalid_rows, "timestamp_post"] = mean
# With every entry now numeric, the whole column can be cast to float.
df_train["timestamp_post"] = df_train["timestamp_post"].astype(float)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4372 entries, 0 to 4371
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                4372 non-null   int64  
 1   user_name         4372 non-null   object 
 2   post_message      4371 non-null   object 
 3   timestamp_post    4372 non-null   float64
 4   num_like_post     4257 non-null   object 
 5   num_comment_post  4362 non-null   object 
 6   num_share_post    3647 non-null   object 
 7   label             4372 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 273.4+ KB


## Sort by timestamp and build the dataset


In [8]:
# Order the posts by their (now fully numeric) timestamp, ascending.
df_sorted = df_train.sort_values(by="timestamp_post")

In [9]:
# Turn every row into a dict, pass it through dict_handler together with the
# shared cleaner, and collect the results in timestamp order.
records = df_sorted.to_dict("records")
dataset = []
for record in tqdm(records):
    dataset.append(dict_handler(dict_object=record, cleaner=cleaner))

  0%|          | 0/4372 [00:00<?, ?it/s]

## Save dataset as cache


In [None]:
# Create the cache directory on demand; open() fails if it does not exist.
os.makedirs(CACHE_PATH, exist_ok=True)
# The context manager closes the file even if pickle.dump raises, so a failed
# dump no longer leaks an open handle.
with open(os.path.join(CACHE_PATH, "pretrain_dataset.pkl"), "wb") as cache:
    pickle.dump(dataset, cache)