In [1]:
import os
import sys
import pandas as pd
import argparse
from datetime import datetime
import pathlib

# Set up the Django environment so the project's ORM can be used from this notebook.
#
# The parent of the current working directory is put on sys.path so the project's
# packages can be imported.
# NOTE(review): assumes the notebook's working directory sits one level below the
# Django project root -- confirm if the notebook is moved.
parent_path = pathlib.Path().absolute().parent
if str(parent_path) not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.insert(0, str(parent_path))

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'website_configs.settings')
# Important: allow synchronous ORM operations inside Jupyter's asynchronous
# environment. Intended for interactive notebook use only.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# django is imported here, after the path and environment are prepared.
import django
django.setup()

# Now we can import Django models
from app_user_keyword_db.models import NewsData

In [2]:
# Load the pipe-delimited news CSV into a DataFrame and preview its first row.
# Alternative dataset: '../app_user_keyword/dataset/cna_news_preprocessed_12weeks.csv'
csv_file_path = '../app_user_keyword/dataset/yahoo_news_preprocessed.csv'
df = pd.read_csv(csv_file_path, delimiter='|')
df.head(n=1)


Unnamed: 0,item_id,date,category,media,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,world_20250327_1,2025-03-27,國際,BBC NEWS 中文,特朗普宣布美國對進口汽車徵收25%關稅,美國總統特朗普宣布對美國進口的汽車和汽車零件徵收25%的新關稅，這一舉措恐將擴大全球貿易戰。...,0.95,暫無,"[('汽車', 32), ('美國', 28), ('關稅', 19), ('特朗普', 1...","['美國', '總統', '特朗普', '宣布', '對', '美國', '進口', '的'...","['美國', '總統', '特朗普', '宣布', '美國', '進口', '汽車', '汽...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('特朗普', 'Nb'), ('...",https://tw.news.yahoo.com/%E7%89%B9%E6%9C%97%E...,https://s.yimg.com/uu/api/res/1.2/oxXwPVXR8sGJ...


In [3]:
# Upsert every CSV row into the NewsData table, using item_id as the lookup key.
#
# The photo_link column can hold any of the following:
#   - an actual URL string
#   - an empty string ("")
#   - a pandas NaN (when the field is empty in the CSV file)
#   - None
# Empty strings and NaN are stored as None.

# Columns copied from the CSV row as-is ('summary' is currently excluded).
passthrough_columns = (
    'category', 'title', 'content', 'sentiment', 'top_key_freq',
    'tokens', 'tokens_v2', 'entities', 'token_pos', 'link',
)

for idx, row in df.iterrows():
    try:
        # The date column holds 'YYYY-MM-DD' text; convert it to a date object.
        date_obj = datetime.strptime(row['date'], '%Y-%m-%d').date()
        item_id = row['item_id']

        # Assemble the field values in the same order the columns are read.
        field_values = {'date': date_obj}
        for column in passthrough_columns:
            field_values[column] = row[column]

        photo_link = row['photo_link']
        if photo_link == "" or pd.isna(photo_link):
            photo_link = None
        field_values['photo_link'] = photo_link

        # Create the record, or update it when item_id already exists.
        news_data, created = NewsData.objects.update_or_create(
            item_id=item_id,
            defaults=field_values,
        )

        if created:
            message = f"Created new NewsData object with item_id: {row['item_id']}"
        else:
            message = f"Updated existing NewsData object with item_id: {row['item_id']}"
        print(message)
    except Exception as e:
        # Best-effort import: report the failing row and continue with the next one.
        print(f"Error at row {idx}: {e}")
        print(row)

Created new NewsData object with item_id: world_20250327_1
Created new NewsData object with item_id: world_20250327_2
Created new NewsData object with item_id: world_20250327_3
Created new NewsData object with item_id: world_20250327_4
Created new NewsData object with item_id: world_20250327_5
Created new NewsData object with item_id: world_20250327_6
Created new NewsData object with item_id: world_20250327_7
Created new NewsData object with item_id: world_20250327_8
Created new NewsData object with item_id: world_20250327_9
Created new NewsData object with item_id: world_20250327_10
Created new NewsData object with item_id: politics_20250327_1
Created new NewsData object with item_id: politics_20250327_2
Created new NewsData object with item_id: politics_20250327_3
Created new NewsData object with item_id: politics_20250327_4
Created new NewsData object with item_id: politics_20250326_5
Created new NewsData object with item_id: politics_20250327_6
Created new NewsData object with item

In [4]:
# Show the `created` flag left in the namespace by the import loop. It only reflects
# the last row whose update_or_create call succeeded, and it is undefined on a fresh
# kernel until that loop has run.
created

True