## Paths


In [1]:
import os

# Absolute path of the directory the notebook is executed from.
CUR_DIR = os.path.abspath(os.path.curdir)
# Cache folder, addressed relative to the working directory. The path is
# deliberately left un-normalised (it keeps the ".." and the trailing slash).
CACHE_DIR = os.path.join(CUR_DIR, "../.cache/")

## Create dataloader from cache with batch size 128


In [2]:
import pickle
import sys

sys.path.insert(0, "..")

from utils import *
from vocabulary import Vocabulary
from cusdataset import TestingDataset

from torch.utils.data import DataLoader

# Number of samples the DataLoader yields per batch.
BATCH_SIZE = 128

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load cache files that were produced by this project.
# The context manager closes the file even if unpickling fails.
with open(os.path.join(CACHE_DIR, "pretrain_dataset_with_image.pkl"), "rb") as cache:
    raw_data = pickle.load(cache)


# create dataloader
# shuffle=False keeps the samples in the order they were cached.
dataset = TestingDataset(raw_data)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

## Split each batch into train/test sets and save it as a session


In [3]:
from sklearn.model_selection import train_test_split

# user lut: vocabulary for user names; "<unk>" is its only special entry and
# is registered as the default index.
lut = Vocabulary(specials=["<unk>"])
lut.set_default_idx(lut["<unk>"])

# text vocab: vocabulary for post text; has "<pad>" in addition to "<unk>",
# and "<unk>" is again the default index.
vc = Vocabulary(specials=["<pad>", "<unk>"])
vc.set_default_idx(vc["<unk>"])

# Accumulators filled while iterating over the dataloader.
# Raw (un-encoded) values are collected in plain lists ...
raw_train_text_total, test_texts_total, test_user_name_total = [], [], []

# ... while encoded / tensor values start as None and are replaced by the
# first batch's tensor, then grown batch by batch.
train_texts_total = train_images_total = train_user_name_total = None
train_metadata_total = train_labels_total = None
test_images_total = test_metadata_total = test_labels_total = None

def _append_rows(total, part):
    """Return ``part`` concatenated after ``total`` using ``torch.cat``.

    ``total`` is ``None`` until the first batch has been processed; in that
    case ``part`` itself becomes the running total. The identity check
    (``is None``) is used instead of ``!= None`` so the test never goes
    through tensor comparison operators.
    """
    return part if total is None else torch.cat([total, part])


# torch.save does not create missing parent directories, so make sure the
# per-session output folder exists before the first save.
os.makedirs(os.path.join(CACHE_DIR, "sessions"), exist_ok=True)

# tqdm wraps the dataloader itself (not the enumerate object) so that it can
# read the number of batches and display real progress.
for idx, batch in enumerate(
    tqdm(dataloader, desc="Creating sessions data: ", unit="sessions")
):
    # * Target
    labels = batch["label"]

    # Split positions inside the batch rather than the data itself, so every
    # field of the batch can be partitioned with the same train/test indices.
    # 10% of each batch goes to the test side, stratified by label.
    batch_positions = torch.arange(len(labels))
    train_idx, test_idx, train_labels, test_labels = train_test_split(
        batch_positions, labels, test_size=0.1, shuffle=True, stratify=labels
    )

    # * Text content
    posts = batch["post_message"]
    raw_train_texts = [posts[i] for i in train_idx]
    raw_train_text_total += raw_train_texts
    train_texts = handling_text(raw_train_texts, vc, train=True)
    raw_test_texts = [posts[i] for i in test_idx]
    # Test texts are accumulated raw; the combined list is encoded after the
    # loop has finished.
    test_texts_total += raw_test_texts
    test_texts = handling_text(raw_test_texts, vc, train=False)

    # * Images
    images = batch["image"]
    train_images = images[train_idx]
    test_images = images[test_idx]

    # * Username
    user_names = batch["user_name"]
    train_user_name = [user_names[i] for i in train_idx]
    train_user_name = handling_username(train_user_name, lut, train=True)
    test_user_name = [user_names[i] for i in test_idx]
    # Raw test user names are accumulated before they are encoded.
    test_user_name_total += test_user_name
    test_user_name = handling_username(test_user_name, lut, train=False)

    # * Metadata
    metadata = handling_metadata(
        num_like_post=batch["num_like_post"],
        num_comment_post=batch["num_comment_post"],
        num_share_post=batch["num_share_post"],
        raw_length=batch["raw_length"],
        timestamp_post=batch["timestamp_post"],
    )
    train_metadata = metadata[train_idx]
    test_metadata = metadata[test_idx]

    # append to total
    # text ======
    train_texts_total = _append_rows(train_texts_total, train_texts)
    # image ======
    train_images_total = _append_rows(train_images_total, train_images)
    test_images_total = _append_rows(test_images_total, test_images)
    # username ======
    train_user_name_total = _append_rows(train_user_name_total, train_user_name)
    # metadata ======
    train_metadata_total = _append_rows(train_metadata_total, train_metadata)
    test_metadata_total = _append_rows(test_metadata_total, test_metadata)
    # label ======
    train_labels_total = _append_rows(train_labels_total, train_labels)
    test_labels_total = _append_rows(test_labels_total, test_labels)

    # save session data
    train_inputs = {
        "texts": train_texts,
        "raw_texts": raw_train_texts,
        "images": train_images,
        "user_name": train_user_name,
        "metadata": train_metadata,
    }
    test_inputs = {
        "texts": test_texts,
        "raw_texts": raw_test_texts,
        "images": test_images,
        "user_name": test_user_name,
        "metadata": test_metadata,
    }

    # NOTE(review): the key "test_label" is singular while "train_labels" is
    # plural; it is kept unchanged because readers of the saved files may
    # depend on it.
    session_dataset = {
        "train_inputs": train_inputs,
        "test_inputs": test_inputs,
        "train_labels": train_labels,
        "test_label": test_labels,
    }

    # Each session file also carries the vocabulary and user lookup table in
    # their state at the time this batch was processed.
    torch.save(
        {"vocabulary": vc, "LUT": lut, "dataset": session_dataset},
        os.path.join(CACHE_DIR, f"sessions/session_{idx}.pt"),
    )

Creating sessions data: : 0sessions [00:00, ?sessions/s]

In [None]:
# Encode the raw test data gathered across all batches, using train=False as
# for the per-batch test data.
test_texts_total_1 = handling_text(test_texts_total, vc, train=False)
# User names must go through the user lookup table with handling_username,
# exactly like the per-batch user names; encoding them with the text
# vocabulary would put train and test "user_name" in different index spaces.
test_user_name_total_1 = handling_username(test_user_name_total, lut, train=False)

In [None]:
# Model inputs for the combined training split (all batches together).
train_inputs_total = dict(
    texts=train_texts_total,
    raw_texts=raw_train_text_total,
    images=train_images_total,
    user_name=train_user_name_total,
    metadata=train_metadata_total,
)

# Model inputs for the combined test split; "texts" and "user_name" hold the
# versions encoded after the loop, "raw_texts" the un-encoded strings.
test_inputs_total = dict(
    texts=test_texts_total_1,
    raw_texts=test_texts_total,
    images=test_images_total,
    user_name=test_user_name_total_1,
    metadata=test_metadata_total,
)

# Same layout as the per-session files, including the singular "test_label" key.
final_dataset = dict(
    train_inputs=train_inputs_total,
    test_inputs=test_inputs_total,
    train_labels=train_labels_total,
    test_label=test_labels_total,
)

# Persist the full dataset together with the text vocabulary and user lookup
# table in the cache directory.
torch.save(
    dict(vocabulary=vc, LUT=lut, dataset=final_dataset),
    os.path.join(CACHE_DIR, "dataset.pt"),
)