In [4]:
import os
import sys
import pandas as pd
import argparse
from datetime import datetime
import pathlib

# Setup Django environment.
# Make the parent of the current working directory importable so that the
# `website_configs` settings package and the Django apps can be resolved.
# (An os.path-based equivalent would be os.path.dirname(os.path.abspath('')).)
parent_path = pathlib.Path().absolute().parent
# Guard the insertion so that re-running this cell does not keep stacking
# duplicate entries at the front of sys.path.
if str(parent_path) not in sys.path:
    sys.path.insert(0, str(parent_path))

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'website_configs.settings')
# Important: set this environment variable to allow synchronous operations
# to run inside Jupyter's asynchronous environment. It is set together with
# the settings module so all environment configuration precedes django.setup().
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import django
django.setup()

# Now we can import Django models
from app_user_keyword_db.models import NewsData

In [8]:
# Load the pipe-delimited news export into a DataFrame.
# Alternative dataset: '../app_user_keyword/dataset/cna_news_preprocessed_12weeks.csv'
csv_file_path = '../app_user_keyword/dataset/news.csv'
df = pd.read_csv(csv_file_path, sep='|')

# Columns copied into the model exactly as read, in the order they are accessed.
# 'sentiment' and 'summary' are currently not imported.
# NOTE(review): a blank CSV cell in any of these columns would presumably
# arrive as NaN and be passed through unchanged — confirm the model tolerates it.
passthrough_columns = (
    'category',
    'title',
    'content',
    'top_key_freq',
    'tokens',
    'tokens_v2',
    'entities',
    'token_pos',
    'link',
)

# Upsert one NewsData record per CSV row, keyed on item_id.
# A failing row is reported and skipped so the remaining rows still load.
for idx, row in df.iterrows():
    try:
        # The CSV stores the date as a 'YYYY-MM-DD' string; keep only the date part.
        date_obj = datetime.strptime(row['date'], '%Y-%m-%d').date()

        news_data, created = NewsData.objects.update_or_create(
            item_id=row['item_id'],
            defaults={
                'date': date_obj,
                **{column: row[column] for column in passthrough_columns},
                # photo_link may be a real URL string, an empty string (""),
                # a pandas NaN (when the field is empty in the CSV), or None.
                # Everything except a real URL is normalised to None.
                'photo_link': (
                    None
                    if row['photo_link'] == "" or pd.isna(row['photo_link'])
                    else row['photo_link']
                ),
            },
        )

        message = (
            f"Created new NewsData object with item_id: {row['item_id']}"
            if created
            else f"Updated existing NewsData object with item_id: {row['item_id']}"
        )
        print(message)
    except Exception as e:
        # Report the row index, the error and the offending row, then continue.
        print(f"Error at row {idx}: {e}")
        print(row)

Created new NewsData object with item_id: _20250327_1
Created new NewsData object with item_id: _20250327_2
Created new NewsData object with item_id: _20250327_3
Created new NewsData object with item_id: _20250327_4
Created new NewsData object with item_id: _20250327_5
Created new NewsData object with item_id: _20250327_6
Created new NewsData object with item_id: _20250327_7
Created new NewsData object with item_id: _20250327_8
Created new NewsData object with item_id: _20250327_9
Created new NewsData object with item_id: _20250327_10
Created new NewsData object with item_id: _20250327_11
Created new NewsData object with item_id: _20250327_12
Created new NewsData object with item_id: _20250327_13
Created new NewsData object with item_id: _20250326_14
Created new NewsData object with item_id: _20250326_15
Created new NewsData object with item_id: _20250326_16
Created new NewsData object with item_id: _20250326_17
Created new NewsData object with item_id: _20250327_18
Created new NewsDat

In [9]:
# Display the `created` flag left over from the import loop: it holds the
# value returned by the most recent update_or_create call that succeeded.
created

True