# In [None]:
import json
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
import time
import random
from datetime import datetime
import psycopg2
from tqdm import tqdm

# Build the HTTP headers once: a single randomly chosen browser User-Agent
# string is picked here and sent with every request made by this script.
ua = UserAgent()
headers = {"User-Agent": ua.random}

# Cutoff date: midnight on 3 January 2025 (naive datetime, no timezone).
cutoff_date = datetime(2025, 1, 3)
# Running count of articles successfully written to the database.
total_articles_processed = 0

# Hard-coded mapping of company display name -> stock code.
# The code is interpolated into the Naver stock-news API URL and both the
# name and the code are stored with each saved article. Insertion order is
# the order in which companies are crawled.
companies = {
    "삼성전자": "005930",
    "SK하이닉스": "000660",
    "LG에너지솔루션": "373220",
    "삼성바이오로직스": "207940",
    "삼성SDI": "006400",
    "LG화학": "051910",
    "현대자동차": "005380",
    "포스코홀딩스": "005490",
    "삼성전자우": "005935",
    "기아": "000270",
    "NAVER": "035420",
    "LG전자": "066570",
    "현대모비스": "012330",
    "SK이노베이션": "096770",
    "SK": "034730",
    "LG": "003550",
    "KB금융": "105560",
    "신한지주": "055550",
    "카카오": "035720",
    "삼성물산": "028260",
    "KT&G": "033780",
    "하나금융지주": "086790",
    "포스코퓨처엠": "003670",
    "삼성생명": "032830",
    "SK텔레콤": "017670",
    "카카오뱅크": "323410",
    "현대중공업": "329180",
    "KT": "030200",
    "기업은행": "024110",
    "삼성화재": "000810",
    "S-Oil": "010950",
    "두산에너빌리티": "034020",
    "고려아연": "010130",
    "우리금융지주": "316140",
    "HMM": "011200",
    "한화에어로스페이스": "012450",
    "한국전력": "015760",
    "삼성전기": "009150",
    "HD현대": "267250",
    "한온시스템": "018880",
    "한화솔루션": "009830",
    "현대글로비스": "086280",
    "SK스퀘어": "402340",
    "셀트리온": "068270",
    "한국조선해양": "009540",
    "두산밥캣": "241560",
    # NOTE(review): 042660 looks like the code for Hanwha Ocean (formerly
    # Daewoo Shipbuilding), while 009540 above is HD Korea Shipbuilding.
    # Verify this name/code pair before relying on it.
    "HD한국조선해양": "042660",
    "맥쿼리인프라": "088980",
    "삼성증권": "016360",
    "한미사이언스": "008930",
}

# Crawl the newest Naver stock-news page for every company in `companies`
# and upsert one article per company into the PostgreSQL `news` table.
#
# Reads the module-level `companies`, `headers` and `cutoff_date`, and
# increments `total_articles_processed` for every article committed.

# Seconds to wait for each HTTP request. Without a timeout, requests.get can
# block indefinitely on a stalled connection and hang the whole crawl.
REQUEST_TIMEOUT = 10

# Upsert keyed on the unique article URL: a re-crawled article overwrites the
# stored row and refreshes created_at. Built once; it does not vary per article.
insert_query = """
    INSERT INTO news (company, stock_code, timestamp, title, press, 
                    summary, content, url, origin_url, created_at)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (url) DO UPDATE SET
        company = EXCLUDED.company,
        stock_code = EXCLUDED.stock_code,
        timestamp = EXCLUDED.timestamp,
        title = EXCLUDED.title,
        press = EXCLUDED.press,
        summary = EXCLUDED.summary,
        content = EXCLUDED.content,
        origin_url = EXCLUDED.origin_url,
        created_at = CURRENT_TIMESTAMP
"""

# Initialised up front so the cleanup in `finally` can tell whether the
# connection and cursor were ever created.
conn = None
cur = None

try:
    # Connect to PostgreSQL.
    # NOTE(review): credentials are hard-coded; consider loading them from
    # the environment or a config file.
    conn = psycopg2.connect(
        host="localhost",
        database="stockhelper",
        user="kdb",
        password="1234"
    )
    cur = conn.cursor()

    # Create the news table if it does not exist yet.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS news (
            id SERIAL PRIMARY KEY,
            company VARCHAR(100),
            stock_code VARCHAR(20),
            timestamp TIMESTAMP,
            title TEXT,
            press VARCHAR(100),
            summary TEXT,
            content TEXT,
            url TEXT UNIQUE,
            origin_url TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    conn.commit()

    for company, code in tqdm(companies.items(), desc="Companies", total=len(companies)):
        try:
            # Only the first page is requested (page size 20).
            url = f"https://m.stock.naver.com/api/news/stock/{code}?pageSize=20&page=1"

            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            # Normalise the payload to a list so the loop below is uniform.
            if not isinstance(data, list):
                data = [data] if data else []

            if not data:
                print(f"No news found for {company}")
                continue

            # Walk the entries in order and save the first usable article.
            for article in data:
                try:
                    if 'items' not in article or not article['items']:
                        continue

                    article_data = article['items'][0]

                    # Date filter.
                    # NOTE(review): this skips articles *newer* than
                    # cutoff_date, so only articles dated at or before the
                    # cutoff are stored. Confirm that direction is intended.
                    article_date = datetime.strptime(article_data['datetime'], "%Y%m%d%H%M")
                    if article_date > cutoff_date:
                        continue

                    # Fetch the article page to scrape title and body.
                    news_url = f"https://n.news.naver.com/mnews/article/{article_data['officeId']}/{article_data['articleId']}"

                    article_response = requests.get(
                        news_url, headers=headers, timeout=REQUEST_TIMEOUT
                    )
                    article_response.raise_for_status()
                    soup = BeautifulSoup(article_response.text, 'html.parser')

                    # An article page without a title element is skipped.
                    title = soup.select_one('#title_area > span')
                    if not title:
                        continue
                    title = title.text.strip()

                    press = article_data['officeName']

                    content_element = soup.select_one('#dic_area')
                    content = content_element.get_text(strip=True) if content_element else ''
                    summary = article_data.get('body', '')

                    # Fall back to the Naver URL when the page carries no
                    # link matching the original-source selector.
                    origin_element = soup.select_one('#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > a')
                    origin_url = origin_element['href'] if origin_element else news_url

                    # Store the article.
                    cur.execute(insert_query, (
                        company,
                        code,
                        article_date,
                        title,
                        press,
                        summary,
                        content,
                        news_url,
                        origin_url,
                        datetime.now()
                    ))

                    conn.commit()
                    total_articles_processed += 1
                    print(f"Saved news for {company}: {title[:50]}...")
                    # One article per company: stop after the first save.
                    break

                except psycopg2.Error as e:
                    # A failed statement leaves the transaction aborted and
                    # every later statement on this connection would fail
                    # until it is rolled back.
                    conn.rollback()
                    print(f"Error processing article for {company}: {str(e)}")
                    continue
                except Exception as e:
                    print(f"Error processing article for {company}: {str(e)}")
                    continue

            # Short random pause between companies.
            time.sleep(random.uniform(0.2, 0.4))

        except Exception as e:
            print(f"Error processing company {company}: {str(e)}")
            continue

except Exception as e:
    print(f"Major error occurred: {str(e)}")
finally:
    print("\nCrawling completed!")
    print(f"Total articles processed: {total_articles_processed}")
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()