In [1]:
!git clone https://github.com/hoivd/SocialTrend

Cloning into 'SocialTrend'...
remote: Enumerating objects: 1487, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 1487 (delta 5), reused 16 (delta 3), pack-reused 1469 (from 2)[K
Receiving objects: 100% (1487/1487), 232.51 MiB | 40.51 MiB/s, done.
Resolving deltas: 100% (5/5), done.


# IMPORT TH∆Ø VI·ªÜN

In [2]:
import logging
import json
from typing import List, Dict, Optional
import re
from collections import Counter
import pandas as pd
from SocialTrend.scripts.clone_data import HFDatasetCloner
from SocialTrend.scripts.logger import _setup_logger
from datetime import datetime
import os

In [3]:
logger_level = logging.DEBUG

# UTILS

In [4]:
utils_logger = _setup_logger("Utils", logger_level)

class Utils:
    @staticmethod
    def load_json(file_path: str) -> dict:
        """ƒê·ªçc file .json v√† tr·∫£ v·ªÅ d·ªØ li·ªáu d∆∞·ªõi d·∫°ng dict ho·∫∑c list."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                utils_logger.debug(f"Load FILE JSON {file_path} th√†nh c√¥ng")
                return data
        except Exception as e:
            utils_logger.debug(f"Load FILE JSON {file_path} th·∫•t b·∫°i")
            raise Exception(e)

    @staticmethod
    def load_jsonl(file_path: str) -> List[Dict]:
        """ƒê·ªçc file .jsonl v√† tr·∫£ v·ªÅ list ch·ª©a c√°c dict (m·ªói d√≤ng l√† m·ªôt JSON).
    
        Args:
            file_path (str): ƒê∆∞·ªùng d·∫´n t·ªõi file .jsonl
            utils_logger: Logger ƒë√£ ƒë∆∞·ª£c c·∫•u h√¨nh s·∫µn (th∆∞·ªùng d√πng logging.getLogger(...))
    
        Returns:
            List[Dict]: Danh s√°ch c√°c d√≤ng JSON ƒë√£ parse th√†nh dict
        """
        data = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, start=1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            utils_logger.debug(f"L·ªói JSON ·ªü d√≤ng {line_num}: {e}")
        except FileNotFoundError:
            utils_logger.debug(f"Kh√¥ng t√¨m th·∫•y file: {file_path}")
        except Exception as e:
            utils_logger.debug(f"L·ªói khi ƒë·ªçc file '{file_path}': {e}")
        return data

    @staticmethod
    def write_csv(data: List[Dict], file_path: str, encoding: str = "utf-8", index: bool = False) -> None:
        """
        Ghi list[dict] ho·∫∑c pandas.DataFrame ra file CSV.
    
        Args:
            data (List[Dict] or pd.DataFrame): D·ªØ li·ªáu c·∫ßn ghi.
            file_path (str): ƒê∆∞·ªùng d·∫´n file CSV ƒë·∫ßu ra.
            encoding (str): Ki·ªÉu m√£ h√≥a (m·∫∑c ƒë·ªãnh: "utf-8").
            index (bool): C√≥ ghi ch·ªâ s·ªë d√≤ng (index) hay kh√¥ng (m·∫∑c ƒë·ªãnh: False).
        """
        try:
            if isinstance(data, pd.DataFrame):
                df = data
            else:
                df = pd.DataFrame(data)
    
            df.to_csv(file_path, index=index, encoding=encoding)
            utils_logger.debug(f"[write_csv] ‚úÖ ƒê√£ ghi {len(df)} d√≤ng v√†o '{file_path}'")
    
        except Exception as e:
            utils_logger.debug(f"[write_csv] ‚ùå L·ªói khi ghi CSV: {e}")

# POST ENTITY

In [5]:
entity_logger = _setup_logger("Entity", logger_level)

In [6]:
class CommentReactionsInfo:
    def __init__(self, reaction_data: Dict):
        self.total = reaction_data.get("total", 0)
        self.detail = reaction_data.get("detail", {}) or {}
        self.reaction_key_map = {
            k.lower(): k for k in self.detail.keys()
        }

    def get_total(self) -> int:
        return self.total

    def get_count(self, reaction: str) -> int:
        key = self.reaction_key_map.get(reaction.lower())
        if key is None:
            entity_logger.debug(f"[CommentReactionsInfo] '{reaction}' kh√¥ng t·ªìn t·∫°i trong reaction_key_map")
            return 0
        return self.detail.get(key, 0)

    def get_percentage(self, reaction: str) -> float:
        count = self.get_count(reaction)
        return (count / self.total) * 100 if self.total > 0 else 0.0

    def get_all_counts(self) -> Dict[str, int]:
        return dict(self.detail)

    def get_all_percentages(self) -> Dict[str, float]:
        return {
            k: round((v / self.total) * 100, 2) if self.total > 0 else 0.0
            for k, v in self.detail.items()
        }

    def most_common(self, top_n: int = 2) -> List[tuple]:
        return sorted(self.detail.items(), key=lambda x: x[1], reverse=True)[:top_n]

    def get_available_reactions(self) -> List[str]:
        return list(self.detail.keys())

In [7]:
class ParentComment:
    def __init__(self, data: Dict):
        self.text = data.get("text", "")
        self.reactions_raw = data.get("reactions", {})
        self.raw = data

        entity_logger.debug(f"[ParentComment] Init | Length: {self.length()}")

    def get_text(self) -> str:
        return self.text

    def has_link(self) -> bool:
        has = "http" in self.text
        entity_logger.debug(f"[ParentComment] has_link: {has}")
        return has

    def length(self) -> int:
        return len(self.text)

    def get_keywords(self) -> List[str]:
        words = re.findall(r'\w+', self.text.lower())
        entity_logger.debug(f"[ParentComment] {len(words)} keywords")
        return words

    def get_reactions_info(self) -> CommentReactionsInfo:
        entity_logger.debug("[ParentComment] get_reactions_info")
        return CommentReactionsInfo(self.reactions_raw)

    def get_reaction_count(self) -> int:
        info = self.get_reactions_info()
        return info.get_total()

    def to_dict(self) -> Dict:
        return {
            "text": self.text,
            "reaction_count": self.get_reaction_count(),
            "has_link": self.has_link(),
            "length": self.length(),
        }


In [8]:
class ReactionsInfo:
    def __init__(self, reaction_dict: Dict[str, int]):
        self.reactions = reaction_dict or {}
        self.total = sum(self.reactions.values())       

    def get_total(self) -> int:
        return self.total

    def get_count(self, reaction: str) -> int:
        """Tr·∫£ v·ªÅ s·ªë l∆∞·ª£ng reaction theo t√™n kh√¥ng ph√¢n bi·ªát hoa th∆∞·ªùng"""
        key = reaction
        if key is None:
            entity_logger.debug(f"[get_count] '{reaction}' kh√¥ng t·ªìn t·∫°i trong reaction_key_map")
            return 0
        return self.reactions.get(key, 0)

    def get_percentage(self, reaction: str) -> float:
        count = self.get_count(reaction)
        return (count / self.total) * 100 if self.total > 0 else 0.0

    def get_all_counts(self) -> Dict[str, int]:
        return dict(self.reactions)

    def get_all_percentages(self) -> Dict[str, float]:
        return {
            k: round((v / self.total) * 100, 2) if self.total > 0 else 0.0
            for k, v in self.reactions.items()
        }

    def most_common(self, top_n: int = 3) -> List[tuple]:
        return sorted(self.reactions.items(), key=lambda x: x[1], reverse=True)[:top_n]

    def get_available_reactions(self) -> List[str]:
        """Tr·∫£ v·ªÅ danh s√°ch c√°c lo·∫°i c·∫£m x√∫c c√≥ m·∫∑t trong post"""
        return list(self.reactions.keys())

In [9]:
class FacebookPost:
    def __init__(self, data: Dict):
        entity_logger.debug("[FacebookPost] Initializing...")
        self.data = data

    def get_creation_time(self) -> Optional[str]:
        ts = self.data.get("creation_time")
        if ts:
            try:
                dt = datetime.utcfromtimestamp(ts)
                formatted = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
                entity_logger.debug(f"[get_creation_time] Formatted time: {formatted}")
                return formatted
            except Exception as e:
                entity_logger.debug(f"[get_creation_time] Error formatting timestamp: {e}")
        return None

    def get_reactions_info(self) -> ReactionsInfo:
        reaction_data = self.data.get("reactions_detail", {})
        entity_logger.debug(f"[get_reactions_info] Raw: {reaction_data}")
        return ReactionsInfo(reaction_data)

    def get_post_text(self) -> str:
        text = self.data.get("post_content", "")
        entity_logger.debug(f"[get_post_text] Length: {len(text)}")
        return text

    def get_post_url(self) -> str:
        return self.data.get("post_url", "")

    def get_total_reactions(self) -> int:
        total = self.data.get("total_reactions", 0)
        entity_logger.debug(f"[get_total_reactions] Total: {total}")
        return total

    def get_reaction_breakdown(self) -> Dict[str, int]:
        reactions = self.data.get("reactions_detail", {})
        entity_logger.debug(f"[get_reaction_breakdown] Reactions: {reactions}")
        return reactions

    def get_comment_count(self) -> int:
        count = self.data.get("comment_count", 0)
        entity_logger.debug(f"[get_comment_count] Total: {count}")
        return count

    def get_share_count(self) -> int:
        try:
            share_count = int(self.data.get("share_count", "0"))
        except ValueError:
            share_count = 0
        entity_logger.debug(f"[get_share_count] Share count: {share_count}")
        return share_count

    def get_parent_comments(self) -> List[ParentComment]:
        raw_comments = data.get("comments", {}).get("comments", [])
        entity_logger.debug(f"[FacebookPost] Found {len(raw_comments)} parent comments.")
        self.parent_comments: List[ParentComment] = [
            ParentComment(c) for c in raw_comments
        ]
        entity_logger.debug(f"[get_parent_comments] {len(self.parent_comments)} b√¨nh lu·∫≠n cha")
        return self.parent_comments

    def get_average_comment_length(self) -> float:
        if not self.parent_comments:
            return 0.0
        lengths = [c.length() for c in self.parent_comments]
        avg = sum(lengths) / len(lengths)
        entity_logger.debug(f"[get_average_comment_length] Trung b√¨nh: {avg}")
        return avg

    def get_link_comments(self) -> List[ParentComment]:
        linked = [c for c in self.parent_comments if c.has_link()]
        entity_logger.debug(f"[get_link_comments] C√≥ {len(linked)} b√¨nh lu·∫≠n ch·ª©a link")
        return linked

    def get_top_keywords_in_comments(self, top_n: int = 10) -> List[tuple]:
        counter = Counter()
        for c in self.parent_comments:
            counter.update(c.get_keywords())
        result = counter.most_common(top_n)
        entity_logger.debug(f"[get_top_keywords_in_comments] Top {top_n}: {result}")
        return result

    def get_first_comment_with_link(self) -> Optional[ParentComment]:
        for c in self.parent_comments:
            if c.has_link():
                entity_logger.debug("[get_first_comment_with_link] T√¨m th·∫•y b√¨nh lu·∫≠n c√≥ link")
                return c
        entity_logger.debug("[get_first_comment_with_link] Kh√¥ng c√≥ b√¨nh lu·∫≠n n√†o ch·ª©a link")
        return None

    def print_basic_info(self):
        print("üìÑ TH√îNG TIN B√ÄI VI·∫æT")
        print("-" * 40)
        print(f"üïí Th·ªùi gian ƒëƒÉng   : {self.get_creation_time()}")
        print(f"üîó URL             : {self.get_post_url()}")
        print(f"üí¨ N·ªôi dung        : {self.get_post_text()[:100]}{'...' if len(self.get_post_text()) > 100 else ''}")
        print()
        print(f"üëç T·ªïng reaction   : {self.get_total_reactions()}")
        print(f"üí¨ T·ªïng comment    : {self.get_comment_count()}")
        print(f"üîÅ T·ªïng share      : {self.get_share_count()}")
        print()
        print(f"üë®‚Äçüë©‚Äçüëß‚Äçüë¶ S·ªë b√¨nh lu·∫≠n cha : {len(self.get_parent_comments())}")
        print(f"‚úèÔ∏è ƒê·ªô d√†i TB b√¨nh lu·∫≠n: {self.get_average_comment_length():.2f} k√Ω t·ª±")
        print(f"üîó B√¨nh lu·∫≠n c√≥ link  : {len(self.get_link_comments())}")
        print("-" * 40)

# PREPARE DATA 

In [10]:
prepare_logger = _setup_logger("PrepareData", logger_level)

In [11]:
class PostDataLoader:
    @staticmethod
    def get_posts_file(root_dir: str) -> List[str]:
        """
        Tr·∫£ v·ªÅ danh s√°ch t·∫•t c·∫£ file .json trong th∆∞ m·ª•c (bao g·ªìm th∆∞ m·ª•c con)
        """
        json_files = []
        prepare_logger.info(f"üìÇ T√¨m post data trong: {root_dir}")

        for root, dirs, files in os.walk(root_dir):
            for file in files:
                if file.lower().endswith('.json'):
                    full_path = os.path.join(root, file)
                    prepare_logger.debug(f"   ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: {file}")
                    json_files.append(full_path)

        return json_files

    @staticmethod
    def get_posts_data(post_files: List[str]) -> List[Dict]:
        """
        ƒê·ªçc d·ªØ li·ªáu t·ª´ danh s√°ch file JSON v√† g·ªôp l·∫°i th√†nh m·ªôt list
        """
        all_posts = []

        for file_path in post_files:
            try:
                data = Utils.load_json(file_path)
                if isinstance(data, list):
                    all_posts.extend(data)
                else:
                    all_posts.append(data)
                prepare_logger.debug(f"‚úÖ Load xong {file_path} ({len(data)} item)")
            except Exception as e:
                prepare_logger.debug(f"‚ùå L·ªói khi load {file_path}: {e}")
        
        prepare_logger.info(f"üì¶ T·ªïng s·ªë post load ƒë∆∞·ª£c: {len(all_posts)}")
        return all_posts

    def __call__(self, root_dir: str) -> List[Dict]:
        """
        Cho ph√©p g·ªçi class nh∆∞ m·ªôt h√†m ƒë·ªÉ l·∫•y to√†n b·ªô post t·ª´ th∆∞ m·ª•c
        """
        prepare_logger.info(f"üöÄ B·∫Øt ƒë·∫ßu load post data t·ª´: {root_dir}")
        post_files = self.get_posts_file(root_dir)
        posts_data = self.get_posts_data(post_files)
        return posts_data

In [12]:
class PrepareData:
    @staticmethod
    def prepare_post_comment_data(post: FacebookPost, max_comment: int = 50) -> List[str]:
        """
        Th·ª±c hi·ªán c√°c b∆∞·ªõc:
        B∆∞·ªõc 1: L·∫•y n·ªôi dung b√†i post (post content)
        B∆∞·ªõc 2: L·∫•y danh s√°ch parent comment v√† n·ªôi dung t·ª´ng comment
        B∆∞·ªõc 3: V·ªõi m·ªói comment, gh√©p n·ªôi dung post + comment th√†nh m·ªôt chu·ªói ‚Üí tr·∫£ v·ªÅ list
        """
        # B∆∞·ªõc 1: l·∫•y n·ªôi dung post
        post_text = post.get_post_text().strip()
        prepare_logger.debug(f"[B∆∞·ªõc 1] Post content: '{post_text[:80]}...' (length: {len(post_text)})")

        # B∆∞·ªõc 2: l·∫•y comment cha
        parent_comments = post.get_parent_comments()
        prepare_logger.debug(f"[B∆∞·ªõc 2] S·ªë l∆∞·ª£ng comment cha: {len(parent_comments)}")

        comment_texts = []
        n_comment = len(parent_comments)
        top_k_cmt = min(n_comment, max_comment)
        for i, c in enumerate(parent_comments[:top_k_cmt]):
            text = c.get_text().strip()
            if text:
                comment_texts.append(text)
                prepare_logger.debug(f"  - Comment #{i+1} | length: {len(text)} | preview: '{text[:60]}...'")

        # B∆∞·ªõc 3: gh√©p post + comment
        merged_texts = [f"{post_text}\n\n{comment}" for comment in comment_texts]
        prepare_logger.debug(f"[B∆∞·ªõc 3] T·ªïng s·ªë m·∫´u gh√©p: {len(merged_texts)}")

        return merged_texts

    @staticmethod
    def prepare_corpus_post_comment_df(posts: List[FacebookPost], max_comment: int = 50) -> pd.DataFrame:
        """
        Tr·∫£ v·ªÅ DataFrame g·ªìm 2 c·ªôt: 'content' v√† 'post_id'
        - 'content': vƒÉn b·∫£n sau khi merge post content + comment content
        - 'post_id': l·∫•y t·ª´ 'feedback_id' trong dict g·ªëc
        """
        prepare_logger.debug(f"[prepare_corpus_post_comment_df] X·ª≠ l√Ω {len(posts)} b√†i vi·∫øt | Max Comment: {max_comment}")
    
        records = []
    
        for i, post in enumerate(posts):
            try:
                prepare_logger.debug(f"[Post {i+1}] ƒêang x·ª≠ l√Ω post")
                post_id = i + 1
                merged_texts = PrepareData.prepare_post_comment_data(post, max_comment)
    
                for content in merged_texts:
                    records.append({
                        "post_id": post_id,
                        "content": content
                    })
    
                prepare_logger.debug(f"[Post {i+1}] Th√™m {len(merged_texts)} d√≤ng v√†o DataFrame")
    
            except Exception as e:
                prepare_logger.debug(f"[Post {i+1}] L·ªói: {e}")
    
        df = pd.DataFrame(records)
        prepare_logger.debug(f"[prepare_corpus_post_comment_df] T·ªïng s·ªë d√≤ng: {len(df)}")
        return df

    @staticmethod
    def prepare_corpus_post_df(posts: List[FacebookPost]) -> pd.DataFrame:
        """
        T·∫°o DataFrame ch·ªâ g·ªìm n·ªôi dung post (kh√¥ng c√≥ comment).
        
        Returns:
            pd.DataFrame v·ªõi 2 c·ªôt:
                - 'post_id': ID th·ª© t·ª±
                - 'content': n·ªôi dung b√†i vi·∫øt
        """
        prepare_logger.debug(f"[prepare_corpus_post_only_df] T·ªïng s·ªë b√†i vi·∫øt: {len(posts)}")
        
        records = []
        
        for i, post in enumerate(posts):
            try:
                content = post.get_post_text().strip()
                if content:
                    records.append({
                        "post_id": i + 1,
                        "content": content
                    })
                    prepare_logger.debug(f"[Post {i+1}] ‚úî Th√™m post content, length: {len(content)}")
                else:
                    prepare_logger.debug(f"[Post {i+1}] ‚ö† Post content r·ªóng")
            except Exception as e:
                prepare_logger.debug(f"[Post {i+1}] ‚ùå L·ªói: {e}")
        
        df = pd.DataFrame(records)
        prepare_logger.debug(f"[prepare_corpus_post_only_df] ‚úÖ T·ªïng s·ªë d√≤ng: {len(df)}")
        return df

# MAIN 

## Load Posts

In [13]:
datasets_dir = "./kaggle/datasets"
posts_dir = datasets_dir + "/data/json"

In [14]:
cloner = HFDatasetCloner("hoivd/SocialData")
cloner.clone_folder("data/json", local_dir="./kaggle/datasets")

üìÇ Cloning folder: data/json
  ‚¨áÔ∏è  Downloading: data/json/posts_K14vn.json


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


posts_K14vn.json:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_K14vn_2025-6-14.jsonl


posts_K14vn_2025-6-14.jsonl:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_Theanh28.json


posts_Theanh28.json:   0%|          | 0.00/6.65M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_Theanh28.jsonl
  ‚¨áÔ∏è  Downloading: data/json/posts_Theanh28_2025-6-14.jsonl


posts_Theanh28_2025-6-14.jsonl:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_beatvn.network.json


posts_beatvn.network.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_beatvn.network_2025-6-14.jsonl


posts_beatvn.network_2025-6-14.jsonl:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_congdongvnexpress.json


posts_congdongvnexpress.json:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_congdongvnexpress_2025-6-14.jsonl


posts_congdongvnexpress_2025-6-14.jsonl:   0%|          | 0.00/760k [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_yannews.json


posts_yannews.json:   0%|          | 0.00/3.83M [00:00<?, ?B/s]

  ‚¨áÔ∏è  Downloading: data/json/posts_yannews_2025-6-14.jsonl


posts_yannews_2025-6-14.jsonl:   0%|          | 0.00/2.44M [00:00<?, ?B/s]

‚úÖ ƒê√£ t·∫£i 11 file t·ª´ th∆∞ m·ª•c data/json


['kaggle/datasets/data/json/posts_K14vn.json',
 'kaggle/datasets/data/json/posts_K14vn_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_Theanh28.json',
 'kaggle/datasets/data/json/posts_Theanh28.jsonl',
 'kaggle/datasets/data/json/posts_Theanh28_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_beatvn.network.json',
 'kaggle/datasets/data/json/posts_beatvn.network_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_congdongvnexpress.json',
 'kaggle/datasets/data/json/posts_congdongvnexpress_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_yannews.json',
 'kaggle/datasets/data/json/posts_yannews_2025-6-14.jsonl']

In [15]:
logger = _setup_logger("Main", logger_level)

In [16]:
post_loader = PostDataLoader()
posts_data = post_loader(posts_dir)

<INFO-PrepareData> - üöÄ B·∫Øt ƒë·∫ßu load post data t·ª´: ./kaggle/datasets/data/json
<INFO-PrepareData> - üìÇ T√¨m post data trong: ./kaggle/datasets/data/json
<DEBUG-PrepareData> -    ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: posts_beatvn.network.json
<DEBUG-PrepareData> -    ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: posts_yannews.json
<DEBUG-PrepareData> -    ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: posts_Theanh28.json
<DEBUG-PrepareData> -    ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: posts_K14vn.json
<DEBUG-PrepareData> -    ‚îî‚îÄ‚îÄ üìÑ T√¨m th·∫•y: posts_congdongvnexpress.json
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_beatvn.network.json th√†nh c√¥ng
<DEBUG-PrepareData> - ‚úÖ Load xong ./kaggle/datasets/data/json/posts_beatvn.network.json (177 item)
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_yannews.json th√†nh c√¥ng
<DEBUG-PrepareData> - ‚úÖ Load xong ./kaggle/datasets/data/json/posts_yannews.json (163 item)
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_Theanh28.js

In [17]:
posts = [FacebookPost(p) for p in posts_data]

<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initiali

In [18]:
df = PrepareData.prepare_corpus_post_df(posts)

<DEBUG-PrepareData> - [prepare_corpus_post_only_df] T·ªïng s·ªë b√†i vi·∫øt: 855
<DEBUG-Entity> - [get_post_text] Length: 847
<DEBUG-PrepareData> - [Post 1] ‚úî Th√™m post content, length: 847
<DEBUG-Entity> - [get_post_text] Length: 650
<DEBUG-PrepareData> - [Post 2] ‚úî Th√™m post content, length: 650
<DEBUG-Entity> - [get_post_text] Length: 3007
<DEBUG-PrepareData> - [Post 3] ‚úî Th√™m post content, length: 3007
<DEBUG-Entity> - [get_post_text] Length: 777
<DEBUG-PrepareData> - [Post 4] ‚úî Th√™m post content, length: 777
<DEBUG-Entity> - [get_post_text] Length: 126
<DEBUG-PrepareData> - [Post 5] ‚úî Th√™m post content, length: 126
<DEBUG-Entity> - [get_post_text] Length: 1309
<DEBUG-PrepareData> - [Post 6] ‚úî Th√™m post content, length: 1309
<DEBUG-Entity> - [get_post_text] Length: 918
<DEBUG-PrepareData> - [Post 7] ‚úî Th√™m post content, length: 918
<DEBUG-Entity> - [get_post_text] Length: 153
<DEBUG-PrepareData> - [Post 8] ‚úî Th√™m post content, length: 153
<DEBUG-Entity> - [g

In [19]:
post_content.csv

NameError: name 'post_content' is not defined

In [None]:
data_file = "content_posts.csv"
Utils.write_csv(df, data_file)

In [15]:
ls

[0m[01;34minput[0m/  [01;34mlib[0m/  [01;34mworking[0m/
