In [2]:
from typing import Dict

PRETRAINED_WORD_EMBEDDING_FILE: str = "/data1/nganv/ResearchThayLinh/data/pretrained/phow2v/word2vec_vi_words_300dims.txt"

VNCORE_ADDRESS: str = "http://10.5.1.230"
VNCORE_PORT: int = 2811

RAW_DATA_DIR: str = "data/version_2/soha/raw_data"
CLEAN_DATA_DIR: str = "data/version_3/soha/clean_data"
HISTORY_LENGTH: int = 50
TITLE_LENGTH: int = 20
ABSTRACT_LENGTH: int = 50
MIN_HISTORY_LENGTH: int = 5

USER_TO_INT: Dict[str, int] = {"PSEUDO_USER": 0}
POST_TO_INT: Dict[str, int] = {"PSEUDO_POST": 0}
CATEGORY_TO_INT: Dict[str, int] = {"PSEUDO_CATEGORY": 0}
WORD_TO_INT: Dict[str, int] = {"PSEUDO_WORD": 0}

# UTIL FUNCTIONS #

In [3]:
from typing import Dict


def get_int_mapping(data: object, data_to_int: Dict[object, int]) -> int:
    if data not in data_to_int:
        data_to_int[data] = len(data_to_int)
    return data_to_int[data]

# PROCESS DATASET DF #

## Process behaviour df functions ##

In [4]:
import pandas as pd
from typing import List, Dict, Tuple
from datetime import datetime
from tqdm import tqdm
import random


def extract_history(x: str) -> List[str]:
    if len(x) == 0:
        return []
    post_ids: List[str] = x.split(" ")
    post_ids: List[str] = post_ids[:HISTORY_LENGTH]
    return post_ids
    
    
def extract_positive_negative(x: str) -> Tuple[List[str], List[str]]:
    positive: List[str] = []
    negative: List[str] = []
    for post_label in x.split(" "):
        post_id, label = post_label.split("-")
        assert label in {"1", "0"}
        if label == "1":
            positive.append(post_id)
        else:
            negative.append(post_id)
    return positive, negative
    

def process_behaviour_df(df: pd.DataFrame) -> List[Dict]:
    df.columns = ["user_id", "timestamp", "history", "impression"]
    df = df.astype("str")
    df.fillna(value="", inplace=True)
    
    df["timestamp"] = df.timestamp.apply(lambda x: datetime.strptime(x, "%m/%d/%Y %I:%M:%S %p"))
    df["history"] = df.history.apply(extract_history)
    df["impression"] = df.impression.apply(extract_positive_negative)
    
    result: List[Dict] = []
    progress_bar = tqdm(df.itertuples(index=False), desc="Processing behaviours data... ")
    for row in progress_bar:
        user_id: str = row.user_id
        timestamp: datetime = row.timestamp
        history: List[str] = row.history
        if len(history) < MIN_HISTORY_LENGTH:
            continue
        positive, negative = row.impression

        user_id: int = get_int_mapping(data=user_id, data_to_int=USER_TO_INT)
        history: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), history)) 
        positive: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), positive)) 
        negative: List[int] = list(map(lambda x: get_int_mapping(data=x, data_to_int=POST_TO_INT), negative)) 

        result.append({"user_id": user_id, "timestamp": timestamp,
                       "history": history, "positive": positive, "negative": negative})
    progress_bar.close()
    return result

## Process post df functions ##

In [5]:
from vncorenlp import VnCoreNLP
from typing import List, Dict


VNCORE_PREPROCESSOR = VnCoreNLP(address=VNCORE_ADDRESS, port=VNCORE_PORT)


def get_clean_text(text: str) -> str:
    sentence_words: List[List[str]] = VNCORE_PREPROCESSOR.tokenize(text=text)
    words: List[str] = [word for sentence_word in sentence_words for word in sentence_word]
    text: str = " ".join(words)
    return text.lower()


def get_token_ids(text: str, sequence_len: int, is_training: bool) -> List[int]:
    result: List[int] = [0 for _ in range(sequence_len)]
    words: List[str] = text.split(" ")
    for index, word in enumerate(words):
        if index == sequence_len:
            break
        if is_training:
            result[index] = get_int_mapping(data=word, data_to_int=WORD_TO_INT)
        elif word in WORD_TO_INT:
            result[index] = WORD_TO_INT[word]
    assert len(result) == sequence_len
    return result


def process_post_df(df: pd.DataFrame, is_training: bool) -> Dict[int, Dict]:
    df.columns = ["post_id", "category", "subcategory", "subcategory_name", "title", "abstract"]
    pseudo_post_row: Dict = {
        "post_id": "PSEUDO_POST", 
        "category": "PSEUDO_CATEGORY", "subcategory": "PSEUDO_CATEGORY", 
        "subcategory_name": "PSEUDO_CATEGORY", 
        "title": "", "abstract": ""
    }
    df = df.append(pseudo_post_row, ignore_index=True)
    df = df.astype("str")
    df.fillna(value="", inplace=True)

    result: Dict[int, Dict] = {}
    progress_bar = tqdm(df.itertuples(index=False), "Processing posts... ")
    for row in progress_bar:
        post_id: str = row.post_id
        if post_id not in POST_TO_INT:
            continue
        post_id: int = POST_TO_INT[post_id]
        category: str = row.category
        subcategory: str = row.subcategory
        subcategory_name: str = row.subcategory_name
        title: str = get_clean_text(text=row.title)
        abstract: str = get_clean_text(text=row.abstract)
        
        title_token_ids: List[int] = get_token_ids(text=title, sequence_len=TITLE_LENGTH, is_training=is_training)
        abstract_token_ids: List[int] = get_token_ids(text=abstract, sequence_len=ABSTRACT_LENGTH, is_training=is_training)

        if is_training:
            category_id: int = get_int_mapping(data=category, data_to_int=CATEGORY_TO_INT)
            subcategory_id: int = get_int_mapping(data=subcategory, data_to_int=CATEGORY_TO_INT)
        else:
            category_id: int = CATEGORY_TO_INT.get(category, 0)
            subcategory_id: int = CATEGORY_TO_INT.get(subcategory, 0)

        result[post_id] = {"title": title, "abstract": abstract,
                           "title_token_ids": title_token_ids, "abstract_token_ids": abstract_token_ids,
                           "category": category, "subcategory": subcategory, "subcategory_name": subcategory_name,
                           "category_id": category_id, "subcategory_id": subcategory_id}
    progress_bar.close()
    return result

# READING WORD EMBEDDING #

In [6]:
import numpy as np


def load_word_embedding() -> np.ndarray:
    vocab_size: int = len(WORD_TO_INT)
    num_match_words: int = 0
    word_embedding = np.zeros(shape=(vocab_size, 300))
    with open(PRETRAINED_WORD_EMBEDDING_FILE, mode="r", 
              buffering=100000, encoding="utf-8") as file_obj:
        file_obj.readline()
        progress_bar = tqdm(file_obj, desc="Reading word embedding data...")
        for line in progress_bar:
            try:
                parts: List[str] = line.strip().split(" ")
                word: str = parts[0]
                if word in WORD_TO_INT:
                    num_match_words += 1
                    index: int = WORD_TO_INT[word]
                    word_embedding[index] = np.array([float(v) for v in parts[1:301]])
            except Exception as ex:
                print(f"Something wrong occurs: {ex}")
    print(f"THERE ARE {num_match_words} WORDS OVER {vocab_size} WORDS HAVE PRETRAINED EMBEDDING")
    return word_embedding

## EXECUTE PROCESS DATA ##

In [7]:
import pandas as pd
from typing import List, Dict


TRAIN_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/train/behaviours.tsv", sep='\t', header=None, index_col=0)
TRAIN_BEHAVIOURS: List[Dict] = process_behaviour_df(df=TRAIN_BEHAVIOUR_DF)

DEV_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/dev/behaviours.tsv", sep='\t', header=None, index_col=0)
DEV_BEHAVIOURS: List[Dict] = process_behaviour_df(df=DEV_BEHAVIOUR_DF)

TEST_BEHAVIOUR_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/test/behaviours.tsv", sep='\t', header=None, index_col=0)
TEST_BEHAVIOURS: List[Dict] = process_behaviour_df(df=TEST_BEHAVIOUR_DF)

Processing behaviours data... : 407740it [00:46, 8834.72it/s] 
Processing behaviours data... : 125111it [00:17, 7114.60it/s] 
Processing behaviours data... : 119599it [00:12, 9320.00it/s] 


In [8]:
import pandas as pd
from typing import List, Dict

TRAIN_POST_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/train/news.tsv", sep='\t', header=None, index_col=0)
TRAIN_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=TRAIN_POST_DF, is_training=True)

DEV_POST_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/dev/news.tsv", sep='\t', header=None, index_col=0)
DEV_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=DEV_POST_DF, is_training=False)

TEST_POST_DF: pd.DataFrame = pd.read_csv(f"{RAW_DATA_DIR}/test/news.tsv", sep='\t', header=None, index_col=0)
TEST_POST_ID_TO_INFO: Dict[int, Dict] = process_post_df(df=TEST_POST_DF, is_training=False)

  df = df.append(pseudo_post_row, ignore_index=True)
Processing posts... : 137764it [17:52, 128.47it/s]
  df = df.append(pseudo_post_row, ignore_index=True)
Processing posts... : 77365it [10:11, 126.59it/s]
  df = df.append(pseudo_post_row, ignore_index=True)
Processing posts... : 70402it [09:11, 127.55it/s]


In [9]:
WORD_EMBEDDING = load_word_embedding()

Reading word embedding data...: 1587507it [05:01, 5261.48it/s] 

THERE ARE 37364 WORDS OVER 79478 WORDS HAVE PRETRAINED EMBEDDING





## SAVE DATA ##

In [10]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"TRAIN BEHAVIOURS SIZE: {len(TRAIN_BEHAVIOURS)};   DEV SIZE: {len(DEV_BEHAVIOURS)};    TEST SIZE: {len(TEST_BEHAVIOURS)}")

PickleWriteObjectToLocalPatient().write(x=TRAIN_BEHAVIOURS, file_name=f"{CLEAN_DATA_DIR}/train/behaviours.pkl")
PickleWriteObjectToLocalPatient().write(x=DEV_BEHAVIOURS, file_name=f"{CLEAN_DATA_DIR}/dev/behaviours.pkl")
PickleWriteObjectToLocalPatient().write(x=TEST_BEHAVIOURS, file_name=f"{CLEAN_DATA_DIR}/test/behaviours.pkl")

TRAIN BEHAVIOURS SIZE: 363837;   DEV SIZE: 113207;    TEST SIZE: 108171


True

In [11]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"TOTAL USERS: {len(USER_TO_INT)}, TOTAL POSTS: {len(POST_TO_INT)}")
PickleWriteObjectToLocalPatient().write(x=USER_TO_INT, file_name=f"{CLEAN_DATA_DIR}/object_to_int/user_to_int.pkl")
PickleWriteObjectToLocalPatient().write(x=POST_TO_INT, file_name=f"{CLEAN_DATA_DIR}/object_to_int/post_to_int.pkl")
print(f"NUMBER OF CATEGORIES: {len(CATEGORY_TO_INT)}; NUMBER OF WORDS: {len(WORD_TO_INT)}")
PickleWriteObjectToLocalPatient().write(x=CATEGORY_TO_INT, file_name=f"{CLEAN_DATA_DIR}/object_to_int/category_to_int.pkl")
PickleWriteObjectToLocalPatient().write(x=WORD_TO_INT, file_name=f"{CLEAN_DATA_DIR}/object_to_int/word_to_int.pkl")

TOTAL USERS: 113282, TOTAL POSTS: 169003
NUMBER OF CATEGORIES: 55; NUMBER OF WORDS: 79478


True

In [12]:
from common.utils import PickleWriteObjectToLocalPatient


print(f"NUMBER OF POSTS IN TRAIN SET: {len(TRAIN_POST_ID_TO_INFO)}; DEV SET: {len(DEV_POST_ID_TO_INFO)}; TEST SET: {len(TEST_POST_ID_TO_INFO)}")
PickleWriteObjectToLocalPatient().write(x=TRAIN_POST_ID_TO_INFO, file_name=f"{CLEAN_DATA_DIR}/train/post_id_to_info.pkl")
PickleWriteObjectToLocalPatient().write(x=DEV_POST_ID_TO_INFO, file_name=f"{CLEAN_DATA_DIR}/dev/post_id_to_info.pkl")
PickleWriteObjectToLocalPatient().write(x=TEST_POST_ID_TO_INFO, file_name=f"{CLEAN_DATA_DIR}/test/post_id_to_info.pkl")

NUMBER OF POSTS IN TRAIN SET: 135304; DEV SET: 76335; TEST SET: 69782


True

In [13]:
from common.utils import PickleWriteObjectToLocalPatient


PickleWriteObjectToLocalPatient().write(x=WORD_EMBEDDING, file_name=f"{CLEAN_DATA_DIR}/pretrained/title_abstract_word_embedding.pkl")

True