In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from PIL import Image
import os
import seaborn as sns
import torch

import re

import sys
import numpy as np
import torch.nn as nn
import torchvision.transforms as it

sys.path.insert(0, "..")
from preprocess import VietnameseTextCleaner, dict_handler
from utils import load_json

matplotlib.style.use("ggplot")

DATA_DIR = "C:\\Users\\anhbu\\Desktop\\new_odl\\data\\fevent"
CUR_DIR = os.path.abspath(os.curdir)
CACHE_DIR = "E:\\tools\\new_odl\\cache"
VNCORE_NLP_PATH = os.path.join(CUR_DIR, "../vncorenlp/")
STOPWORDS_PATH = os.path.join(
    CUR_DIR, "../stop_words/vietnamese-stopwords-dash.txt"
)
os.environ["JAVA_HOME"] = os.path.join(os.environ["CONDA_PREFIX"], "Library")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cleaner = VietnameseTextCleaner(
    stopwords_path=STOPWORDS_PATH,
    vncorenlp_path=VNCORE_NLP_PATH,
    cur_dir=CUR_DIR,
)

In [3]:
data = []
for file in os.listdir(DATA_DIR):
    small_data = load_json(DATA_DIR, file)
    if file == "CritiCat.envent.json":
        for post in small_data:
            if post["event"] == "Unknown event":
                post["event"] = "huflit"
    data.extend(small_data)

for post in data:
    if "event" not in post:
        post["event"] = "Unknown event"

In [4]:
def get_usefull_data(post):
    data = {}

    data["id"] = post["_id"]["$oid"]
    data["event"] = post["event"]
    data["label"] = post["label"]
    data["post_message"] = post["data"]["post_message"]
    data["num_like_post"] = post["data"]["num_like_post"]
    data["num_comment_post"] = post["data"]["num_comment_post"]
    data["num_share_post"] = post["data"]["num_share_post"]
    data["image"] = post["data"]["images"]

    try:
        # print(
        #     list(post["data"]["metadata"]["page_insights"].values())[
        #         "publish_time"
        #     ]
        # )
        data["user_name"] = list(
            post["data"]["metadata"]["page_insights"].values()
        )[0]["page_id"]
        data["timestamp_post"] = list(
            post["data"]["metadata"]["page_insights"].values()
        )[0]["post_context"]["publish_time"]
    except:
        # print(post["data"]["metadata"]["content_owner_id_new"])
        data["user_name"] = post["data"]["metadata"]["content_owner_id_new"]
        data["timestamp_post"] = None

    return data


clean_data = [get_usefull_data(post) for post in data]

df = pd.DataFrame(clean_data)

In [5]:
mean = df["timestamp_post"].mean()
df.timestamp_post.fillna(mean)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1468 entries, 0 to 1467
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                1468 non-null   object 
 1   event             1468 non-null   object 
 2   label             1468 non-null   int64  
 3   post_message      1468 non-null   object 
 4   num_like_post     1468 non-null   object 
 5   num_comment_post  1468 non-null   object 
 6   num_share_post    1468 non-null   object 
 7   image             1468 non-null   object 
 8   user_name         1468 non-null   object 
 9   timestamp_post    1249 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 114.8+ KB


In [6]:
def convert_numerical_string(string):
    pattern = r"\d+,*\d+[Kk]*"
    match = re.match(pattern, str(string))

    if match:
        number = match.group(0)
        mul = 1
        if number[-1].lower() == "k":
            number = number[:-1]
            mul = 1000
        number = number.replace(",", ".")
        return int(float(number) * mul)
    else:
        return 0


df.num_share_post = df.num_share_post.apply(convert_numerical_string)
df.num_like_post = df.num_like_post.apply(convert_numerical_string)
df.num_comment_post = df.num_comment_post.apply(convert_numerical_string)
df.post_message = df.post_message.apply(cleaner.clean_one)

In [7]:
df_sorted = df.sort_values("timestamp_post")

In [8]:
import requests
from PIL import Image
import torch
import numpy as np


class ImageTransform(nn.Module):
    def __init__(self):
        super(ImageTransform, self).__init__()
        self.toTensor = it.ToTensor()
        self.resize = it.Resize((256, 256), antialias=True)

    def forward(self, images):
        images = self.toTensor(images)
        images = self.resize(images)
        return images / 255


image_transform = ImageTransform()


def add_image(dataset):
    for data in tqdm(dataset):
        if len(data["image"]) != 0:
            try:
                image = Image.open(
                    requests.get(data["image"][0], stream=True).raw
                )
                temp = image_transform(
                    np.array(image.convert("RGB")).astype(np.float32)
                )
            except:
                temp = (
                    torch.randint(0, 256, (3, 256, 256), dtype=torch.float)
                    / 255
                )
        else:
            temp = torch.randint(0, 256, (3, 256, 256), dtype=torch.float) / 255

        data["image"] = temp

    return dataset


dataset = []

for name, group in df_sorted.groupby("event"):
    print(name)
    dataset.append(add_image(group.to_dict("records")))

Unknown event


100%|██████████| 474/474 [00:13<00:00, 35.35it/s]


ba-phuong-hang


100%|██████████| 101/101 [00:02<00:00, 44.80it/s]


hao-nam


100%|██████████| 241/241 [00:09<00:00, 26.60it/s]


huflit


100%|██████████| 129/129 [00:03<00:00, 39.62it/s]


nu sinh tu tu 


100%|██████████| 27/27 [00:00<00:00, 39.28it/s]


shark-binh


100%|██████████| 252/252 [00:07<00:00, 32.27it/s]


tranthanh-cgv


100%|██████████| 154/154 [00:04<00:00, 32.60it/s]


viec-lam-online


100%|██████████| 90/90 [00:02<00:00, 42.33it/s]


In [9]:
import copy

save_dir = os.path.join(CACHE_DIR, "clean_metadata_with_image/fevent")

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

for i, chunk in enumerate(dataset):
    filename = f"clean_metadata_with_image_{i+1:02}.pt"
    file_path = os.path.join(save_dir, filename)
    torch.save(copy.deepcopy(chunk), file_path)  # Save
    print(
        f"Saved {filename} with size {(os.path.getsize(file_path)/1024**2):.0f} MB"
    )

Saved clean_metadata_with_image_01.pt with size 356 MB
Saved clean_metadata_with_image_02.pt with size 76 MB
Saved clean_metadata_with_image_03.pt with size 181 MB
Saved clean_metadata_with_image_04.pt with size 97 MB
Saved clean_metadata_with_image_05.pt with size 20 MB
Saved clean_metadata_with_image_06.pt with size 189 MB
Saved clean_metadata_with_image_07.pt with size 116 MB
Saved clean_metadata_with_image_08.pt with size 68 MB


In [10]:
# def split_range_by_weeks(start_timestamp, end_timestamp):
#     start_date = datetime.datetime.fromtimestamp(start_timestamp).date()
#     end_date = datetime.datetime.fromtimestamp(end_timestamp).date()
#     week_start = start_date - datetime.timedelta(days=start_date.weekday())
#     week_end = week_start + datetime.timedelta(days=6)
#     while week_end < end_date:
#         yield (
#             int(
#                 datetime.datetime.combine(
#                     week_start, datetime.datetime.min.time()
#                 ).timestamp()
#             ),
#             int(
#                 datetime.datetime.combine(
#                     week_end, datetime.datetime.max.time()
#                 ).timestamp()
#             ),
#         )
#         week_start = week_end + datetime.timedelta(days=1)
#         week_end = week_end + datetime.timedelta(days=7)
#     yield (
#         int(
#             datetime.datetime.combine(
#                 week_start, datetime.datetime.min.time()
#             ).timestamp()
#         ),
#         int(datetime.datetime.fromtimestamp(end_timestamp).timestamp()),
#     )


# def timestamp_to_id(timestamp, ranges):
#     return int(
#         datetime.datetime.fromtimestamp(timestamp).strftime("%Y%m%d%H%M%S")
#     )


# begin = datetime.datetime(2023, 1, 1).timestamp()
# end = datetime.datetime(2023, 6, 8).timestamp()
# wrs = []
# for week_range in split_range_by_weeks(begin, end):
#     wrs.append(week_range)

# wrs = [(k, v) for k, v in enumerate(wrs)]

# df = pd.DataFrame(clean_data)

# df = df[["timestamp_post", "event"]]


# def get_week(timestamp):
#     for k, v in wrs:
#         if v[0] <= timestamp <= v[1]:
#             return k
#     return -1


# df["timestamp_post"] = df["timestamp_post"].apply(get_week)

# df = pd.get_dummies(df, columns=["event"])

# df.groupby("timestamp_post").sum().plot()