In [1]:
from typing import Dict

RAW_DATA_DIR: str = "data/cafef/raw_data"
HISTORY_LENGTH: int = 50
TITLE_LENGTH: int = 20
ABSTRACT_LENGTH: int = 50

USER_TO_INT: Dict[str, int] = {"PSEUDO_USER": 0}
POST_TO_INT: Dict[str, int] = {"PSEUDO_POST": 0}
CATEGORY_TO_INT: Dict[str, int] = {"PSEUDO_CATEGORY": 0}
WORD_TO_INT: Dict[str, int] = {"PSEUDO_WORD": 0}

# UTIL FUNCTIONS #

In [2]:
from typing import Dict


def get_int_mapping(data: object, data_to_int: Dict[object, int]) -> int:
    if data not in data_to_int:
        data_to_int[data] = len(data_to_int)
    return data_to_int[data]

# PROCESS BEHAVIOUR DF #

## Process functions ##

In [3]:
import pandas as pd
from typing import List, Dict, Tuple
from datetime import datetime
from tqdm import tqdm
import random


def process_behaviour_df(df: pd.DataFrame) -> List[Dict]:
    def _extract_history(_x: str) -> List[str]:
        if len(_x) == 0:
            return []
        _post_ids: List[str] = _x.split(" ")
        _post_ids: List[str] = _post_ids[:HISTORY_LENGTH]
        return _post_ids
    
    def _extract_positive_negative(_x: str) -> Tuple[List[str], List[str]]:
        _positive: List[str] = []
        _negative: List[str] = []
        for _post_label in _x.split(" "):
            _post_id, _label = _post_label.split("-")
            assert _label in {"1", "0"}
            if _label == "1":
                _positive.append(_post_id)
            else:
                _negative.append(_post_id)
        return _positive, _negative
    
    df.fillna(value="", inplace=True)
    df["timestamp"] = df.timestamp.apply(lambda x: datetime.strptime(x, "%m/%d/%Y %I:%M:%S %p"))
    df["history"] = df.history.apply(_extract_history)
    df["impression"] = df.impression.apply(_extract_positive_negative)
    
    result: List[Dict] = []
    progress_bar = tqdm(df.itertuples(index=False), desc="Processing behaviours data... ")
    for row in progress_bar:
        user_id: str = row.user_id
        timestamp: datetime = row.timestamp
        history: List[str] = row.history
        positive, negative = row.impression

        user_id: int = get_int_mapping(data=user_id, data_to_int=USER_TO_INT)
        history: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), history)) 
        positive: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), positive)) 
        negative: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), negative)) 

        result.append({"user_id": user_id, "timestamp": timestamp,
                       "history": history, "positive": positive, "negative": negative})
    progress_bar.close()
    return result

## Process data ##

In [4]:
import pandas as pd
from typing import List, Dict


TRAIN_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/train/behaviours.tsv", sep='\t', header=None, index_col=0)
TRAIN_BEHAVIOUR_DF.columns = ["user_id", "timestamp", "history", "impression"]
TRAIN_BEHAVIOURS: List[Dict] = process_behaviour_df(df=TRAIN_BEHAVIOUR_DF)

Processing behaviours data... : 68999it [00:02, 23510.67it/s]


In [5]:
import pandas as pd
from typing import List, Dict


DEV_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/dev/behaviours.tsv", sep='\t', header=None, index_col=0)
DEV_BEHAVIOUR_DF.columns = ["user_id", "timestamp", "history", "impression"]
DEV_BEHAVIOURS: List[Dict] = process_behaviour_df(df=DEV_BEHAVIOUR_DF)

Processing behaviours data... : 27397it [00:01, 20700.57it/s]


In [6]:
import pandas as pd
from typing import List, Dict


TEST_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/test/behaviours.tsv", sep='\t', header=None, index_col=0)
TEST_BEHAVIOUR_DF.columns = ["user_id", "timestamp", "history", "impression"]
TEST_BEHAVIOURS: List[Dict] = process_behaviour_df(df=TEST_BEHAVIOUR_DF)

Processing behaviours data... : 26948it [00:01, 20275.24it/s]


## Save data ##

In [7]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"TRAIN BEHAVIOURS SIZE: {len(TRAIN_BEHAVIOURS)};   DEV SIZE: {len(DEV_BEHAVIOURS)};    TEST SIZE: {len(TEST_BEHAVIOURS)}")

PickleWriteObjectToLocalPatient().write(x=TRAIN_BEHAVIOURS, file_name="data/cafef/train/behaviours.pkl")
PickleWriteObjectToLocalPatient().write(x=DEV_BEHAVIOURS, file_name="data/cafef/dev/behaviours.pkl")
PickleWriteObjectToLocalPatient().write(x=TEST_BEHAVIOURS, file_name="data/cafef/test/behaviours.pkl")

TRAIN BEHAVIOURS SIZE: 68999;   DEV SIZE: 27397;    TEST SIZE: 26948


True

In [8]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"TOTAL USERS: {len(USER_TO_INT)}, TOTAL POSTS: {len(POST_TO_INT)}")
PickleWriteObjectToLocalPatient().write(x=USER_TO_INT, file_name="data/cafef/object_to_int/user_to_int.pkl")
PickleWriteObjectToLocalPatient().write(x=POST_TO_INT, file_name="data/cafef/object_to_int/post_to_int.pkl")

TOTAL USERS: 66434, TOTAL POSTS: 58797


True

# PROCESS POST DF #

## Process functions ##

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
from common.utils import LocalFileHandlerUtils


LocalFileHandlerUtils.check_and_make_directory(directory="pretrained_data/vinai/phobert-base/tokenizer")
LocalFileHandlerUtils.check_and_make_directory(directory="pretrained_data/vinai/phobert-base/model")
BERT_TOKENIZER = AutoTokenizer.from_pretrained("vinai/phobert-base", cache_dir="pretrained_data/vinai/phobert-base/tokenizer")
BERT_MODEL = AutoModel.from_pretrained("vinai/phobert-base", cache_dir="pretrained_data/vinai/phobert-base/model")
DEVICE = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
BERT_MODEL = BERT_MODEL.to(DEVICE)
print(f"USING DEVICE {DEVICE}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


USING DEVICE cuda:1


In [10]:
from typing import Dict, List
from torch import Tensor
import numpy as np


def get_batch_bert_encodes(list_text: List[str], sequence_len: int) -> np.ndarray:
    tokenize_result: Dict = BERT_TOKENIZER(text=list_text, padding="max_length", max_length=sequence_len, truncation=True, return_tensors="pt")
    token_ids: Tensor = tokenize_result["input_ids"]
    attention_mask: Tensor = tokenize_result["attention_mask"]
    assert token_ids.shape == (len(list_text), sequence_len)
    assert attention_mask.shape == (len(list_text), sequence_len)
    
    with torch.no_grad():
        BERT_MODEL.eval()
        output: Tensor = BERT_MODEL(input_ids=token_ids.to(DEVICE), attention_mask=attention_mask.to(DEVICE)).pooler_output
        output: np.ndarray = output.cpu().detach().numpy()

    assert output.shape == (len(list_text), 768)
    
    return output

In [11]:
from vncorenlp import VnCoreNLP
from typing import List


VNCORE_PREPROCESSOR = VnCoreNLP(address="http://10.5.1.230", port=2811)


def get_clean_text(text: str) -> str:
    sentence_words: List[List[str]] = VNCORE_PREPROCESSOR.tokenize(text=text)
    words: List[str] = [word for sentence_word in sentence_words for word in sentence_word]
    text: str = " ".join(words)
    return text.lower()

In [12]:
def get_token_ids(text: str, sequence_len: int, is_training: bool) -> List[str]:
    result: List[int] = list(map(lambda x: 0, range(sequence_len)))
    words: List[str] = text.split(" ")
    for index, word in enumerate(words):
        if index == sequence_len:
            break
        if is_training:
            result[index] = get_int_mapping(data=word, data_to_int=WORD_TO_INT)
        elif word in WORD_TO_INT:
            result[index] = WORD_TO_INT[word]
    
    assert len(result) == sequence_len
    return result

In [13]:
from typing import Dict


def process_post_df(df: pd.DataFrame, is_training: bool) -> Dict[int, Dict]:
    result: Dict[int, Dict] = {}
    df.fillna(value="", inplace=True)
    
    progress_bar = tqdm(df.itertuples(index=False), "Processing posts... ")
    for row in progress_bar:
        post_id: str = row.post_id
        if post_id not in POST_TO_INT:
            continue
        post_id: int = POST_TO_INT[post_id]
        category: str = row.category
        subcategory: str = row.subcategory
        subcategory_name: str = row.subcategory_name
        title: str = get_clean_text(text=row.title)
        abstract: str = get_clean_text(text=row.abstract)
        
        title_token_ids: List[int] = get_token_ids(text=title, sequence_len=TITLE_LENGTH, is_training=is_training)
        abstract_token_ids: List[int] = get_token_ids(text=abstract, sequence_len=ABSTRACT_LENGTH, is_training=is_training)

        if is_training:
            category_id: int = get_int_mapping(data=category, data_to_int=CATEGORY_TO_INT)
            subcategory_id: int = get_int_mapping(data=subcategory, data_to_int=CATEGORY_TO_INT)
        else:
            category_id: int = CATEGORY_TO_INT.get(category, 0)
            subcategory_id: int = CATEGORY_TO_INT.get(subcategory, 0)

        result[post_id] = {"title": title, "abstract": abstract,
                           "title_token_ids": title_token_ids, "abstract_token_ids": abstract_token_ids,
                           "category": category, "subcategory": subcategory, "subcategory_name": subcategory_name,
                           "category_id": category_id, "subcategory_id": subcategory_id}
    progress_bar.close()
    
    
    list_info: List[Dict] = list(result.values())
    batch_size: int = 128
    progress_bar = tqdm(range(0, len(list_info), batch_size), "Updating bert encode")
    for start_index in progress_bar:
        end_index: int = min(start_index+batch_size, len(list_info))
        batch_info: List[Dict] = list_info[start_index:end_index]
        
        titles: List[str] = list(map(lambda x: x["title"], batch_info))
        abstracts: List[str] = list(map(lambda x: x["abstract"], batch_info))
        
        batch_title_bert_encode: np.ndarray = get_batch_bert_encodes(list_text=titles, sequence_len=TITLE_LENGTH)
        batch_abstract_bert_encode: np.ndarray = get_batch_bert_encodes(list_text=abstracts, sequence_len=ABSTRACT_LENGTH)
        
        for index, info in enumerate(batch_info):
            info["title_bert_encode"] = batch_title_bert_encode[index]
            info["abstract_bert_encode"] = batch_abstract_bert_encode[index]
    progress_bar.close()
    
    return result

## Process data ##

### Train ###

In [14]:
############################ Read data ###############################
pseudo_post_row: Dict = {"post_id": "PSEUDO_POST", "category": "PSEUDO_CATEGORY", "subcategory": "PSEUDO_CATEGORY", "subcategory_name": "PSEUDO_CATEGORY", "title": "", "abstract": ""}

TRAIN_POST_DF = pd.read_csv(f"{RAW_DATA_DIR}/train/news.tsv", sep='\t', header=None, index_col=0)
TRAIN_POST_DF.columns = ["post_id", "category", "subcategory", "subcategory_name", "title", "abstract"]
TRAIN_POST_DF = TRAIN_POST_DF.append(pseudo_post_row, ignore_index=True)
TRAIN_POST_DF = TRAIN_POST_DF.astype("str")
TRAIN_POST_DF.fillna(value="", inplace=True)

############################## Process data ################################
TRAIN_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=TRAIN_POST_DF, is_training=True)

  TRAIN_POST_DF = TRAIN_POST_DF.append(pseudo_post_row, ignore_index=True)
Processing posts... : 46732it [05:44, 135.65it/s]
Updating bert encode: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 365/365 [01:03<00:00,  5.74it/s]


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix


VECTORIZER = CountVectorizer(min_df=2, binary=True)
train_documents: List[str] = list(map(lambda x: x["title"] + " " + x["abstract"], TRAIN_POST_ID_TO_INFO.values()))
TRAIN_X: csr_matrix = VECTORIZER.fit_transform(train_documents)
for index, info in enumerate(TRAIN_POST_ID_TO_INFO.values()):
    info['content_bow'] = TRAIN_X[index]
WORDS_CO_OCCUR: csr_matrix = TRAIN_X.T * TRAIN_X

### Dev ###

In [16]:
from common.utils import JsonReadObjectFromLocalPatient


pseudo_post_row: Dict = {"post_id": "PSEUDO_POST", "category": "PSEUDO_CATEGORY", "subcategory": "PSEUDO_CATEGORY", "subcategory_name": "PSEUDO_CATEGORY", "title": "", "abstract": ""}

DEV_POST_DF = pd.read_csv(f"{RAW_DATA_DIR}/dev/news.tsv", sep='\t', header=None, index_col=0)
DEV_POST_DF.columns = ["post_id", "category", "subcategory", "subcategory_name", "title", "abstract"]
DEV_POST_DF = DEV_POST_DF.append(pseudo_post_row, ignore_index=True)
DEV_POST_DF = DEV_POST_DF.astype("str")
DEV_POST_DF.fillna(value="", inplace=True)

############################## Process data ################################
DEV_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=DEV_POST_DF, is_training=False)

  DEV_POST_DF = DEV_POST_DF.append(pseudo_post_row, ignore_index=True)
Processing posts... : 26405it [03:03, 143.83it/s]
Updating bert encode: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 207/207 [00:45<00:00,  4.59it/s]


In [17]:
dev_documents: List[str] = list(map(lambda x: x["title"] + " " + x["abstract"], DEV_POST_ID_TO_INFO.values()))
DEV_X: csr_matrix = VECTORIZER.transform(dev_documents)
for index, info in enumerate(DEV_POST_ID_TO_INFO.values()):
    info['content_bow'] = DEV_X[index]

### Test ###

In [18]:
from common.utils import JsonReadObjectFromLocalPatient


pseudo_post_row: Dict = {"post_id": "PSEUDO_POST", "category": "PSEUDO_CATEGORY", "subcategory": "PSEUDO_CATEGORY", "subcategory_name": "PSEUDO_CATEGORY", "title": "", "abstract": ""}

TEST_POST_DF = pd.read_csv(f"{RAW_DATA_DIR}/test/news.tsv", sep='\t', header=None, index_col=0)
TEST_POST_DF.columns = ["post_id", "category", "subcategory", "subcategory_name", "title", "abstract"]
TEST_POST_DF = TEST_POST_DF.append(pseudo_post_row, ignore_index=True)
TEST_POST_DF = TEST_POST_DF.astype("str")
TEST_POST_DF.fillna(value="", inplace=True)

############################## Process data ################################
TEST_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=TEST_POST_DF, is_training=False)

  TEST_POST_DF = TEST_POST_DF.append(pseudo_post_row, ignore_index=True)
Processing posts... : 27507it [03:21, 136.84it/s]
Updating bert encode: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 215/215 [00:34<00:00,  6.30it/s]


In [19]:
test_documents: List[str] = list(map(lambda x: x["title"] + " " + x["abstract"], TEST_POST_ID_TO_INFO.values()))
TEST_X: csr_matrix = VECTORIZER.transform(test_documents)
for index, info in enumerate(TEST_POST_ID_TO_INFO.values()):
    info['content_bow'] = TEST_X[index]

## Save data ##

In [20]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"NUMBER OF CATEGORIES: {len(CATEGORY_TO_INT)}; NUMBER OF WORDS IN TITLE + ABSTRACT: {len(WORD_TO_INT)}")
PickleWriteObjectToLocalPatient().write(x=CATEGORY_TO_INT, file_name="data/cafef/object_to_int/category_to_int.pkl")
PickleWriteObjectToLocalPatient().write(x=WORD_TO_INT, file_name="data/cafef/object_to_int/word_to_int.pkl")

NUMBER OF CATEGORIES: 20; NUMBER OF WORDS IN TITLE + ABSTRACT: 48346


True

In [21]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"NUMBER OF WORDS IN CONTENT: {len(VECTORIZER.vocabulary_)}")
PickleWriteObjectToLocalPatient().write(x=VECTORIZER, file_name="data/cafef/vectorizer/vectorizer.pkl")
PickleWriteObjectToLocalPatient().write(x=WORDS_CO_OCCUR, file_name="data/cafef/vectorizer/words_co_occur.pkl")

NUMBER OF WORDS IN CONTENT: 22091


True

In [22]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"NUMBER OF POSTS IN TRAIN SET: {len(TRAIN_POST_ID_TO_INFO)}; DEV SET: {len(DEV_POST_ID_TO_INFO)}; TEST SET: {len(TEST_POST_ID_TO_INFO)}")
PickleWriteObjectToLocalPatient().write(x=TRAIN_POST_ID_TO_INFO, file_name="data/cafef/train/post_id_to_info.pkl")
PickleWriteObjectToLocalPatient().write(x=DEV_POST_ID_TO_INFO, file_name="data/cafef/dev/post_id_to_info.pkl")
PickleWriteObjectToLocalPatient().write(x=TEST_POST_ID_TO_INFO, file_name="data/cafef/test/post_id_to_info.pkl")

NUMBER OF POSTS IN TRAIN SET: 46644; DEV SET: 26369; TEST SET: 27452


True

# READING PRETRAINED WORD2VEC #

## Reading functions ##

In [23]:
PRETRAINED_GLOVE_FILE: str = "/data/pretrained/phow2v/word2vec_vi_words_300dims.txt"


def load_word_embedding(word_to_int: Dict[str, int]) -> np.ndarray:
    vocab_size: int = len(word_to_int)
    num_match_words: int = 0
    word_embedding = np.zeros(shape=(vocab_size, 300))
    with open(PRETRAINED_GLOVE_FILE, mode="r", buffering=100000, encoding="utf-8") as file_obj:
        file_obj.readline()
        progress_bar = tqdm(file_obj.readlines(), desc="Reading word embedding data...")
        for line in progress_bar:
            try:
                parts: List[str] = line.strip().split(" ")
                word: str = parts[0]
                if word in word_to_int:
                    num_match_words += 1
                    index: int = word_to_int[word]
                    word_embedding[index] = np.array([float(v) for v in parts[1:301]])
            except Exception as ex:
                print(f"Something wrong occurs: {ex}")
    print(f"THERE ARE {num_match_words} WORDS OVER {vocab_size} WORDS HAVE PRETRAINED EMBEDDING")
    return word_embedding

## Read word embedding ##

In [24]:
TITLE_ABSTRACT_WORD_EMBEDDING = load_word_embedding(word_to_int=WORD_TO_INT)

Reading word embedding data...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1587507/1587507 [00:53<00:00, 29453.96it/s]


THERE ARE 27923 WORDS OVER 48346 WORDS HAVE PRETRAINED EMBEDDING


In [25]:
CONTENT_WORD_EMBEDDING = load_word_embedding(word_to_int=VECTORIZER.vocabulary_)

Reading word embedding data...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1587507/1587507 [00:49<00:00, 32334.94it/s]


THERE ARE 17093 WORDS OVER 22091 WORDS HAVE PRETRAINED EMBEDDING


## Save data ##

In [26]:
from common.utils import PickleWriteObjectToLocalPatient


PickleWriteObjectToLocalPatient().write(x=TITLE_ABSTRACT_WORD_EMBEDDING, file_name="data/cafef/pretrained/title_abstract_word_embedding.pkl")
PickleWriteObjectToLocalPatient().write(x=CONTENT_WORD_EMBEDDING, file_name="data/cafef/pretrained/content_word_embedding.pkl")

True