In [1]:
!git clone https://github.com/hoivd/SocialTrend

Cloning into 'SocialTrend'...
remote: Enumerating objects: 1487, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 1487 (delta 5), reused 16 (delta 3), pack-reused 1469 (from 2)[K
Receiving objects: 100% (1487/1487), 232.51 MiB | 40.51 MiB/s, done.
Resolving deltas: 100% (5/5), done.


# IMPORT THƯ VIỆN

In [2]:
import logging
import json
from typing import List, Dict, Optional
import re
from collections import Counter
import pandas as pd
from SocialTrend.scripts.clone_data import HFDatasetCloner
from SocialTrend.scripts.logger import _setup_logger
from datetime import datetime
import os

In [3]:
logger_level = logging.DEBUG

# UTILS

In [4]:
utils_logger = _setup_logger("Utils", logger_level)

class Utils:
    @staticmethod
    def load_json(file_path: str) -> dict:
        """Đọc file .json và trả về dữ liệu dưới dạng dict hoặc list."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                utils_logger.debug(f"Load FILE JSON {file_path} thành công")
                return data
        except Exception as e:
            utils_logger.debug(f"Load FILE JSON {file_path} thất bại")
            raise Exception(e)

    @staticmethod
    def load_jsonl(file_path: str) -> List[Dict]:
        """Đọc file .jsonl và trả về list chứa các dict (mỗi dòng là một JSON).
    
        Args:
            file_path (str): Đường dẫn tới file .jsonl
            utils_logger: Logger đã được cấu hình sẵn (thường dùng logging.getLogger(...))
    
        Returns:
            List[Dict]: Danh sách các dòng JSON đã parse thành dict
        """
        data = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, start=1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            utils_logger.debug(f"Lỗi JSON ở dòng {line_num}: {e}")
        except FileNotFoundError:
            utils_logger.debug(f"Không tìm thấy file: {file_path}")
        except Exception as e:
            utils_logger.debug(f"Lỗi khi đọc file '{file_path}': {e}")
        return data

    @staticmethod
    def write_csv(data: List[Dict], file_path: str, encoding: str = "utf-8", index: bool = False) -> None:
        """
        Ghi list[dict] hoặc pandas.DataFrame ra file CSV.
    
        Args:
            data (List[Dict] or pd.DataFrame): Dữ liệu cần ghi.
            file_path (str): Đường dẫn file CSV đầu ra.
            encoding (str): Kiểu mã hóa (mặc định: "utf-8").
            index (bool): Có ghi chỉ số dòng (index) hay không (mặc định: False).
        """
        try:
            if isinstance(data, pd.DataFrame):
                df = data
            else:
                df = pd.DataFrame(data)
    
            df.to_csv(file_path, index=index, encoding=encoding)
            utils_logger.debug(f"[write_csv] ✅ Đã ghi {len(df)} dòng vào '{file_path}'")
    
        except Exception as e:
            utils_logger.debug(f"[write_csv] ❌ Lỗi khi ghi CSV: {e}")

# POST ENTITY

In [5]:
entity_logger = _setup_logger("Entity", logger_level)

In [6]:
class CommentReactionsInfo:
    def __init__(self, reaction_data: Dict):
        self.total = reaction_data.get("total", 0)
        self.detail = reaction_data.get("detail", {}) or {}
        self.reaction_key_map = {
            k.lower(): k for k in self.detail.keys()
        }

    def get_total(self) -> int:
        return self.total

    def get_count(self, reaction: str) -> int:
        key = self.reaction_key_map.get(reaction.lower())
        if key is None:
            entity_logger.debug(f"[CommentReactionsInfo] '{reaction}' không tồn tại trong reaction_key_map")
            return 0
        return self.detail.get(key, 0)

    def get_percentage(self, reaction: str) -> float:
        count = self.get_count(reaction)
        return (count / self.total) * 100 if self.total > 0 else 0.0

    def get_all_counts(self) -> Dict[str, int]:
        return dict(self.detail)

    def get_all_percentages(self) -> Dict[str, float]:
        return {
            k: round((v / self.total) * 100, 2) if self.total > 0 else 0.0
            for k, v in self.detail.items()
        }

    def most_common(self, top_n: int = 2) -> List[tuple]:
        return sorted(self.detail.items(), key=lambda x: x[1], reverse=True)[:top_n]

    def get_available_reactions(self) -> List[str]:
        return list(self.detail.keys())

In [7]:
class ParentComment:
    def __init__(self, data: Dict):
        self.text = data.get("text", "")
        self.reactions_raw = data.get("reactions", {})
        self.raw = data

        entity_logger.debug(f"[ParentComment] Init | Length: {self.length()}")

    def get_text(self) -> str:
        return self.text

    def has_link(self) -> bool:
        has = "http" in self.text
        entity_logger.debug(f"[ParentComment] has_link: {has}")
        return has

    def length(self) -> int:
        return len(self.text)

    def get_keywords(self) -> List[str]:
        words = re.findall(r'\w+', self.text.lower())
        entity_logger.debug(f"[ParentComment] {len(words)} keywords")
        return words

    def get_reactions_info(self) -> CommentReactionsInfo:
        entity_logger.debug("[ParentComment] get_reactions_info")
        return CommentReactionsInfo(self.reactions_raw)

    def get_reaction_count(self) -> int:
        info = self.get_reactions_info()
        return info.get_total()

    def to_dict(self) -> Dict:
        return {
            "text": self.text,
            "reaction_count": self.get_reaction_count(),
            "has_link": self.has_link(),
            "length": self.length(),
        }


In [8]:
class ReactionsInfo:
    def __init__(self, reaction_dict: Dict[str, int]):
        self.reactions = reaction_dict or {}
        self.total = sum(self.reactions.values())       

    def get_total(self) -> int:
        return self.total

    def get_count(self, reaction: str) -> int:
        """Trả về số lượng reaction theo tên không phân biệt hoa thường"""
        key = reaction
        if key is None:
            entity_logger.debug(f"[get_count] '{reaction}' không tồn tại trong reaction_key_map")
            return 0
        return self.reactions.get(key, 0)

    def get_percentage(self, reaction: str) -> float:
        count = self.get_count(reaction)
        return (count / self.total) * 100 if self.total > 0 else 0.0

    def get_all_counts(self) -> Dict[str, int]:
        return dict(self.reactions)

    def get_all_percentages(self) -> Dict[str, float]:
        return {
            k: round((v / self.total) * 100, 2) if self.total > 0 else 0.0
            for k, v in self.reactions.items()
        }

    def most_common(self, top_n: int = 3) -> List[tuple]:
        return sorted(self.reactions.items(), key=lambda x: x[1], reverse=True)[:top_n]

    def get_available_reactions(self) -> List[str]:
        """Trả về danh sách các loại cảm xúc có mặt trong post"""
        return list(self.reactions.keys())

In [9]:
class FacebookPost:
    def __init__(self, data: Dict):
        entity_logger.debug("[FacebookPost] Initializing...")
        self.data = data

    def get_creation_time(self) -> Optional[str]:
        ts = self.data.get("creation_time")
        if ts:
            try:
                dt = datetime.utcfromtimestamp(ts)
                formatted = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
                entity_logger.debug(f"[get_creation_time] Formatted time: {formatted}")
                return formatted
            except Exception as e:
                entity_logger.debug(f"[get_creation_time] Error formatting timestamp: {e}")
        return None

    def get_reactions_info(self) -> ReactionsInfo:
        reaction_data = self.data.get("reactions_detail", {})
        entity_logger.debug(f"[get_reactions_info] Raw: {reaction_data}")
        return ReactionsInfo(reaction_data)

    def get_post_text(self) -> str:
        text = self.data.get("post_content", "")
        entity_logger.debug(f"[get_post_text] Length: {len(text)}")
        return text

    def get_post_url(self) -> str:
        return self.data.get("post_url", "")

    def get_total_reactions(self) -> int:
        total = self.data.get("total_reactions", 0)
        entity_logger.debug(f"[get_total_reactions] Total: {total}")
        return total

    def get_reaction_breakdown(self) -> Dict[str, int]:
        reactions = self.data.get("reactions_detail", {})
        entity_logger.debug(f"[get_reaction_breakdown] Reactions: {reactions}")
        return reactions

    def get_comment_count(self) -> int:
        count = self.data.get("comment_count", 0)
        entity_logger.debug(f"[get_comment_count] Total: {count}")
        return count

    def get_share_count(self) -> int:
        try:
            share_count = int(self.data.get("share_count", "0"))
        except ValueError:
            share_count = 0
        entity_logger.debug(f"[get_share_count] Share count: {share_count}")
        return share_count

    def get_parent_comments(self) -> List[ParentComment]:
        raw_comments = data.get("comments", {}).get("comments", [])
        entity_logger.debug(f"[FacebookPost] Found {len(raw_comments)} parent comments.")
        self.parent_comments: List[ParentComment] = [
            ParentComment(c) for c in raw_comments
        ]
        entity_logger.debug(f"[get_parent_comments] {len(self.parent_comments)} bình luận cha")
        return self.parent_comments

    def get_average_comment_length(self) -> float:
        if not self.parent_comments:
            return 0.0
        lengths = [c.length() for c in self.parent_comments]
        avg = sum(lengths) / len(lengths)
        entity_logger.debug(f"[get_average_comment_length] Trung bình: {avg}")
        return avg

    def get_link_comments(self) -> List[ParentComment]:
        linked = [c for c in self.parent_comments if c.has_link()]
        entity_logger.debug(f"[get_link_comments] Có {len(linked)} bình luận chứa link")
        return linked

    def get_top_keywords_in_comments(self, top_n: int = 10) -> List[tuple]:
        counter = Counter()
        for c in self.parent_comments:
            counter.update(c.get_keywords())
        result = counter.most_common(top_n)
        entity_logger.debug(f"[get_top_keywords_in_comments] Top {top_n}: {result}")
        return result

    def get_first_comment_with_link(self) -> Optional[ParentComment]:
        for c in self.parent_comments:
            if c.has_link():
                entity_logger.debug("[get_first_comment_with_link] Tìm thấy bình luận có link")
                return c
        entity_logger.debug("[get_first_comment_with_link] Không có bình luận nào chứa link")
        return None

    def print_basic_info(self):
        print("📄 THÔNG TIN BÀI VIẾT")
        print("-" * 40)
        print(f"🕒 Thời gian đăng   : {self.get_creation_time()}")
        print(f"🔗 URL             : {self.get_post_url()}")
        print(f"💬 Nội dung        : {self.get_post_text()[:100]}{'...' if len(self.get_post_text()) > 100 else ''}")
        print()
        print(f"👍 Tổng reaction   : {self.get_total_reactions()}")
        print(f"💬 Tổng comment    : {self.get_comment_count()}")
        print(f"🔁 Tổng share      : {self.get_share_count()}")
        print()
        print(f"👨‍👩‍👧‍👦 Số bình luận cha : {len(self.get_parent_comments())}")
        print(f"✏️ Độ dài TB bình luận: {self.get_average_comment_length():.2f} ký tự")
        print(f"🔗 Bình luận có link  : {len(self.get_link_comments())}")
        print("-" * 40)

# PREPARE DATA 

In [10]:
prepare_logger = _setup_logger("PrepareData", logger_level)

In [11]:
class PostDataLoader:
    @staticmethod
    def get_posts_file(root_dir: str) -> List[str]:
        """
        Trả về danh sách tất cả file .json trong thư mục (bao gồm thư mục con)
        """
        json_files = []
        prepare_logger.info(f"📂 Tìm post data trong: {root_dir}")

        for root, dirs, files in os.walk(root_dir):
            for file in files:
                if file.lower().endswith('.json'):
                    full_path = os.path.join(root, file)
                    prepare_logger.debug(f"   └── 📄 Tìm thấy: {file}")
                    json_files.append(full_path)

        return json_files

    @staticmethod
    def get_posts_data(post_files: List[str]) -> List[Dict]:
        """
        Đọc dữ liệu từ danh sách file JSON và gộp lại thành một list
        """
        all_posts = []

        for file_path in post_files:
            try:
                data = Utils.load_json(file_path)
                if isinstance(data, list):
                    all_posts.extend(data)
                else:
                    all_posts.append(data)
                prepare_logger.debug(f"✅ Load xong {file_path} ({len(data)} item)")
            except Exception as e:
                prepare_logger.debug(f"❌ Lỗi khi load {file_path}: {e}")
        
        prepare_logger.info(f"📦 Tổng số post load được: {len(all_posts)}")
        return all_posts

    def __call__(self, root_dir: str) -> List[Dict]:
        """
        Cho phép gọi class như một hàm để lấy toàn bộ post từ thư mục
        """
        prepare_logger.info(f"🚀 Bắt đầu load post data từ: {root_dir}")
        post_files = self.get_posts_file(root_dir)
        posts_data = self.get_posts_data(post_files)
        return posts_data

In [12]:
class PrepareData:
    @staticmethod
    def prepare_post_comment_data(post: FacebookPost, max_comment: int = 50) -> List[str]:
        """
        Thực hiện các bước:
        Bước 1: Lấy nội dung bài post (post content)
        Bước 2: Lấy danh sách parent comment và nội dung từng comment
        Bước 3: Với mỗi comment, ghép nội dung post + comment thành một chuỗi → trả về list
        """
        # Bước 1: lấy nội dung post
        post_text = post.get_post_text().strip()
        prepare_logger.debug(f"[Bước 1] Post content: '{post_text[:80]}...' (length: {len(post_text)})")

        # Bước 2: lấy comment cha
        parent_comments = post.get_parent_comments()
        prepare_logger.debug(f"[Bước 2] Số lượng comment cha: {len(parent_comments)}")

        comment_texts = []
        n_comment = len(parent_comments)
        top_k_cmt = min(n_comment, max_comment)
        for i, c in enumerate(parent_comments[:top_k_cmt]):
            text = c.get_text().strip()
            if text:
                comment_texts.append(text)
                prepare_logger.debug(f"  - Comment #{i+1} | length: {len(text)} | preview: '{text[:60]}...'")

        # Bước 3: ghép post + comment
        merged_texts = [f"{post_text}\n\n{comment}" for comment in comment_texts]
        prepare_logger.debug(f"[Bước 3] Tổng số mẫu ghép: {len(merged_texts)}")

        return merged_texts

    @staticmethod
    def prepare_corpus_post_comment_df(posts: List[FacebookPost], max_comment: int = 50) -> pd.DataFrame:
        """
        Trả về DataFrame gồm 2 cột: 'content' và 'post_id'
        - 'content': văn bản sau khi merge post content + comment content
        - 'post_id': lấy từ 'feedback_id' trong dict gốc
        """
        prepare_logger.debug(f"[prepare_corpus_post_comment_df] Xử lý {len(posts)} bài viết | Max Comment: {max_comment}")
    
        records = []
    
        for i, post in enumerate(posts):
            try:
                prepare_logger.debug(f"[Post {i+1}] Đang xử lý post")
                post_id = i + 1
                merged_texts = PrepareData.prepare_post_comment_data(post, max_comment)
    
                for content in merged_texts:
                    records.append({
                        "post_id": post_id,
                        "content": content
                    })
    
                prepare_logger.debug(f"[Post {i+1}] Thêm {len(merged_texts)} dòng vào DataFrame")
    
            except Exception as e:
                prepare_logger.debug(f"[Post {i+1}] Lỗi: {e}")
    
        df = pd.DataFrame(records)
        prepare_logger.debug(f"[prepare_corpus_post_comment_df] Tổng số dòng: {len(df)}")
        return df

    @staticmethod
    def prepare_corpus_post_df(posts: List[FacebookPost]) -> pd.DataFrame:
        """
        Tạo DataFrame chỉ gồm nội dung post (không có comment).
        
        Returns:
            pd.DataFrame với 2 cột:
                - 'post_id': ID thứ tự
                - 'content': nội dung bài viết
        """
        prepare_logger.debug(f"[prepare_corpus_post_only_df] Tổng số bài viết: {len(posts)}")
        
        records = []
        
        for i, post in enumerate(posts):
            try:
                content = post.get_post_text().strip()
                if content:
                    records.append({
                        "post_id": i + 1,
                        "content": content
                    })
                    prepare_logger.debug(f"[Post {i+1}] ✔ Thêm post content, length: {len(content)}")
                else:
                    prepare_logger.debug(f"[Post {i+1}] ⚠ Post content rỗng")
            except Exception as e:
                prepare_logger.debug(f"[Post {i+1}] ❌ Lỗi: {e}")
        
        df = pd.DataFrame(records)
        prepare_logger.debug(f"[prepare_corpus_post_only_df] ✅ Tổng số dòng: {len(df)}")
        return df

# MAIN 

## Load Posts

In [13]:
datasets_dir = "./kaggle/datasets"
posts_dir = datasets_dir + "/data/json"

In [14]:
cloner = HFDatasetCloner("hoivd/SocialData")
cloner.clone_folder("data/json", local_dir="./kaggle/datasets")

📂 Cloning folder: data/json
  ⬇️  Downloading: data/json/posts_K14vn.json


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


posts_K14vn.json:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_K14vn_2025-6-14.jsonl


posts_K14vn_2025-6-14.jsonl:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_Theanh28.json


posts_Theanh28.json:   0%|          | 0.00/6.65M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_Theanh28.jsonl
  ⬇️  Downloading: data/json/posts_Theanh28_2025-6-14.jsonl


posts_Theanh28_2025-6-14.jsonl:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_beatvn.network.json


posts_beatvn.network.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_beatvn.network_2025-6-14.jsonl


posts_beatvn.network_2025-6-14.jsonl:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_congdongvnexpress.json


posts_congdongvnexpress.json:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_congdongvnexpress_2025-6-14.jsonl


posts_congdongvnexpress_2025-6-14.jsonl:   0%|          | 0.00/760k [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_yannews.json


posts_yannews.json:   0%|          | 0.00/3.83M [00:00<?, ?B/s]

  ⬇️  Downloading: data/json/posts_yannews_2025-6-14.jsonl


posts_yannews_2025-6-14.jsonl:   0%|          | 0.00/2.44M [00:00<?, ?B/s]

✅ Đã tải 11 file từ thư mục data/json


['kaggle/datasets/data/json/posts_K14vn.json',
 'kaggle/datasets/data/json/posts_K14vn_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_Theanh28.json',
 'kaggle/datasets/data/json/posts_Theanh28.jsonl',
 'kaggle/datasets/data/json/posts_Theanh28_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_beatvn.network.json',
 'kaggle/datasets/data/json/posts_beatvn.network_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_congdongvnexpress.json',
 'kaggle/datasets/data/json/posts_congdongvnexpress_2025-6-14.jsonl',
 'kaggle/datasets/data/json/posts_yannews.json',
 'kaggle/datasets/data/json/posts_yannews_2025-6-14.jsonl']

In [15]:
logger = _setup_logger("Main", logger_level)

In [16]:
post_loader = PostDataLoader()
posts_data = post_loader(posts_dir)

<INFO-PrepareData> - 🚀 Bắt đầu load post data từ: ./kaggle/datasets/data/json
<INFO-PrepareData> - 📂 Tìm post data trong: ./kaggle/datasets/data/json
<DEBUG-PrepareData> -    └── 📄 Tìm thấy: posts_beatvn.network.json
<DEBUG-PrepareData> -    └── 📄 Tìm thấy: posts_yannews.json
<DEBUG-PrepareData> -    └── 📄 Tìm thấy: posts_Theanh28.json
<DEBUG-PrepareData> -    └── 📄 Tìm thấy: posts_K14vn.json
<DEBUG-PrepareData> -    └── 📄 Tìm thấy: posts_congdongvnexpress.json
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_beatvn.network.json thành công
<DEBUG-PrepareData> - ✅ Load xong ./kaggle/datasets/data/json/posts_beatvn.network.json (177 item)
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_yannews.json thành công
<DEBUG-PrepareData> - ✅ Load xong ./kaggle/datasets/data/json/posts_yannews.json (163 item)
<DEBUG-Utils> - Load FILE JSON ./kaggle/datasets/data/json/posts_Theanh28.json thành công
<DEBUG-PrepareData> - ✅ Load xong ./kaggle/datasets/data/json/posts_

In [17]:
posts = [FacebookPost(p) for p in posts_data]

<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initializing...
<DEBUG-Entity> - [FacebookPost] Initiali

In [18]:
df = PrepareData.prepare_corpus_post_df(posts)

<DEBUG-PrepareData> - [prepare_corpus_post_only_df] Tổng số bài viết: 855
<DEBUG-Entity> - [get_post_text] Length: 847
<DEBUG-PrepareData> - [Post 1] ✔ Thêm post content, length: 847
<DEBUG-Entity> - [get_post_text] Length: 650
<DEBUG-PrepareData> - [Post 2] ✔ Thêm post content, length: 650
<DEBUG-Entity> - [get_post_text] Length: 3007
<DEBUG-PrepareData> - [Post 3] ✔ Thêm post content, length: 3007
<DEBUG-Entity> - [get_post_text] Length: 777
<DEBUG-PrepareData> - [Post 4] ✔ Thêm post content, length: 777
<DEBUG-Entity> - [get_post_text] Length: 126
<DEBUG-PrepareData> - [Post 5] ✔ Thêm post content, length: 126
<DEBUG-Entity> - [get_post_text] Length: 1309
<DEBUG-PrepareData> - [Post 6] ✔ Thêm post content, length: 1309
<DEBUG-Entity> - [get_post_text] Length: 918
<DEBUG-PrepareData> - [Post 7] ✔ Thêm post content, length: 918
<DEBUG-Entity> - [get_post_text] Length: 153
<DEBUG-PrepareData> - [Post 8] ✔ Thêm post content, length: 153
<DEBUG-Entity> - [get_post_text] Length: 811
<DEBU

In [19]:
post_content.csv

NameError: name 'post_content' is not defined

In [None]:
data_file = "content_posts.csv"
Utils.write_csv(df, data_file)

In [15]:
ls

[0m[01;34minput[0m/  [01;34mlib[0m/  [01;34mworking[0m/
