In [None]:
import requests
from bs4 import BeautifulSoup
import openai
import google.generativeai as genai
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.utils import formataddr
import smtplib
import json
from datetime import datetime
import os
import google.generativeai as genai
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def _find_canonical_url(soup):
    """Return the canonical URL declared by the page, or None if there is none.

    Looks at ``<link rel="canonical">`` first and falls back to the
    ``og:url`` meta tag.
    """
    canonical_tag = soup.find('link', rel='canonical')
    if canonical_tag and canonical_tag.get('href'):
        return canonical_tag['href']

    # Only consult og:url when no canonical link was found. The lookup lives
    # in the same branch as its use so the variable is always bound.
    og_url_tag = soup.find("meta", property="og:url")
    if og_url_tag and og_url_tag.get("content"):
        return og_url_tag["content"]

    return None


def _extract_main_text(soup):
    """Return the text of the first non-empty main container, else all page text."""
    # Common selectors for the element that wraps the main content.
    for selector in ('article', 'div#main', 'div.content', 'div.article', 'main'):
        container = soup.select_one(selector)
        if container:
            container_text = container.get_text(separator=' ', strip=True)
            if container_text:
                return container_text
    # No suitable container: fall back to the text of the whole document.
    return soup.get_text(separator=' ', strip=True)


def _collect_image_urls(soup, base_url):
    """Return the absolute, de-duplicated image URLs in document order."""
    # A dict is used as an ordered set so the result is deterministic.
    seen = {}
    for img in soup.find_all('img'):
        src = img.get('src') or img.get('data-src') or img.get('data-original')
        if src:
            seen[urljoin(base_url, src)] = None
    return list(seen)


def scrape(url):
    """Load ``url`` in Chrome and extract its text, image URLs and canonical URL.

    The browser is started with a visible window (headless mode is not
    enabled) and is always closed again, even if loading fails.

    Args:
        url: Address of the page to load. Relative image sources are
            resolved against this value.

    Returns:
        A dict with three keys:
            "text": text of the main content container, or of the whole
                page when no container matches.
            "images": list of absolute image URLs without duplicates, in
                the order they appear in the document.
            "canonical_url": the canonical link, else the ``og:url`` value,
                else the URL the browser ended up on.

    Raises:
        Any exception raised by Selenium while starting the browser, loading
        the page, or waiting up to 60 seconds for ``<body>`` is propagated.
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--start-maximized")  # maximize the window
    chrome_options.add_argument("--disable-notifications")  # suppress notification prompts
    chrome_options.add_argument("--disable-popup-blocking")  # allow popups if the page needs them

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Grab everything needed from the live browser before it is closed.
        html = driver.page_source
        final_url = driver.current_url
    finally:
        driver.quit()

    soup = BeautifulSoup(html, 'html5lib')

    return {"text": _extract_main_text(soup),
            "images": _collect_image_urls(soup, url),
            "canonical_url": _find_canonical_url(soup) or final_url}


def summarize_content(content, images, canonical_url):
    """Ask Gemini to turn scraped content into an HTML newsletter.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable.
    No key is embedded in the source: a key committed to a notebook is
    effectively public, so a missing variable is treated as an error.

    Args:
        content: Source text to summarize. It is interpolated into the
            prompt in two places.
        images: Image URLs offered to the model; interpolated into the
            prompt using their default string formatting.
        canonical_url: URL the "Learn More"-style buttons should point to.

    Returns:
        The model's response text (requested to be a full HTML document), or
        ``"<p>Summary not available due to an error.</p>"`` when the API key
        is missing or any exception occurs while calling the model.
    """
    fallback_html = "<p>Summary not available due to an error.</p>"

    api_key = os.environ.get('GOOGLE_API_KEY')
    if not api_key:
        print("Error in summarization: GOOGLE_API_KEY environment variable not set.")
        return fallback_html

    try:
        genai.configure(api_key=api_key)

        model = genai.GenerativeModel('gemini-2.0-flash')

        # The prompt embeds the HTML template. Literal CSS braces are doubled
        # because this is an f-string.
        prompt = f"""You are a tech news summarizer.
Create concise, engaging summaries about innovation, technology trends, new popular AI products, new trending open source/Github repo.
Based your summary in this content: {content}.
Also, integrate the following relevant image URLs into the HTML template: {images}. Be sure to use the image that best represents the content (if possible use the image that is mentioned in the content).
If no image URLs are provided, use the default placeholder image from the template.
Make sure to return the summary formatted as HTML, no other text, with professional styling intended to be sent as a newsletter.
The template is provided, make sure to extract the exact information and fill in the template.
Extract all valid image URLs from the provided content. For each section in the newsletter, if there is a relevant image URL mentioned, insert it into the corresponding <img> tag. If no image is found, use a default placeholder image URL.
Make sure to add the link to the full article in the 'Learn More', 'Discover More', 'Explore More', or 'Visit More' buttons. I want to click on these buttons and be redirected to the full article.(exact article not homepage): { canonical_url}
1. EXTRACT FROM CONTENT:
   - Article title
   - Brief summary
   - Article URL/link 
   - Featured image (if available)
   - Group articles into appropriate sections (Trends/AI Products/Topics)

2. URL HANDLING:
   - Extract complete URLs from {canonical_url} the content
   - Each "Learn More" button must link directly to its article source
   - Ensure URLs include full protocol (add https:// if missing)
   - Validate URL format before using

3. IMAGE HANDLING:
   - Use relevant images from {images} if available
   - Match images with their related articles
   - Use placeholder only if no relevant image exists

Based on this content: {content}

FORMAT REQUIREMENTS:
- Return only HTML formatted newsletter
- Include all article links in "Learn More" buttons
- No placeholders in final output - all [brackets] should be replaced with real content
- Each article card must have:
  * Title from source
  * Brief summary
  * Direct link to original article
  * Relevant image or placeholder

**Template:**

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Data News Newsletter</title>
    <style>
        /* Reset Styles (Important for Email Consistency) */
        body, table, td, a {{
            -webkit-text-size-adjust: 100%;
            -ms-text-size-adjust: 100%;
        }}
        table, td {{
            mso-table-lspace: 0pt;
            mso-table-rspace: 0pt;
        }}
        img {{
            -ms-interpolation-mode: bicubic;
            border: 0;
            height: auto;
            line-height: 100%;
            outline: none;
            text-decoration: none;
            display: block;
        }}
        /* General Styles */
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f8f9fa;
            color: #495057;
            font-size: 14px;
        }}
        .container {{
            width: 100%;
            max-width: 600px;
            margin: 0 auto;
            background-color: #ffffff;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
            border-radius: 8px;
            overflow: hidden;
        }}
        .header {{
            background-color: #007bff;
            color: #ffffff;
            padding: 15px;
            text-align: center;
        }}
        h1, h2, h3 {{
            margin-top: 0;
        }}
        h1 {{
            font-size: 2rem; /* giảm từ 2.5rem */
            font-weight: 300;
            line-height: 1.2;
        }}
        h2 {{
            font-size: 1.5rem; /* giảm từ 1.75rem */
            color: #007bff;
            border-bottom: 2px solid #007bff;
            padding-bottom: 5px;
            margin-bottom: 1rem;
        }}
        h3 {{
            font-size: 1.1rem; /* giảm từ 1.25rem */
            color: #343a40;
        }}
        .section {{
            padding: 15px;
        }}
        .article {{
            margin-bottom: 1rem;
            border-bottom: 1px solid #dee2e6;
            padding-bottom: 1rem;
            display: flex;
            flex-direction: row;
        }}
        .article:last-child {{
            border-bottom: none;
        }}
        .article-image-container {{
            width: 30%;
            margin-right: 10px;
        }}
        .article-image {{
            width: 100%;
            height: auto;
            border-radius: 4px;
            display: block;
        }}
        .article-content {{
            width: 70%;
        }}
        .article-summary {{
            font-size: 0.9rem;
            color: #6c757d;
        }}
        .read-more {{
            display: inline-block;
            background-color: #007bff;
            color: #ffffff !important;
            padding: 8px 12px; /* giảm padding */
            border-radius: 4px;
            text-decoration: none;
            margin-top: 8px;
            font-weight: bold;
            text-align: center;
            font-size: 0.9rem;
        }}
        .read-more:hover {{
            background-color: #0056b3;
        }}
        .footer {{
            text-align: center;
            padding: 15px;
            background-color: #f8f9fa;
            font-size: 0.8rem;
            color: #6c757d;
        }}
        .footer a {{
            color: #007bff;
            text-decoration: none;
        }}
        @media screen and (max-width: 480px) {{
            .article {{
                flex-direction: column;
            }}
            .article-image-container, .article-content {{
                width: 100%;
                margin-right: 0;
            }}
            .article-image {{
                margin-bottom: 10px;
            }}
        }}
    </style>
</head>
<body>
    <table width="100%" border="0" cellspacing="0" cellpadding="0" bgcolor="#f8f9fa">
        <tr>
            <td align="center">
                <div class="container">
                    <div class="header">
                        <h1>Data News</h1>
                    </div>

                    <div class="section">
                        <h2>Innovation Trends</h2>
                        <!-- Trend 1 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/007bff/FFFFFF.png?text=Trend+Image+1" alt="Innovation Trend Image 1" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Innovation Trend 1 Title]</h3>
                                <p class="article-summary">[Innovation Trend 1 Summary]</p>
                                <a href="#" class="read-more">Learn More</a>
                            </div>
                        </div>
                        <!-- Trend 2 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/007bff/FFFFFF.png?text=Trend+Image+2" alt="Innovation Trend Image 2" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Innovation Trend 2 Title]</h3>
                                <p class="article-summary">[Innovation Trend 2 Summary]</p>
                                <a href="#" class="read-more">Discover More</a>
                            </div>
                        </div>
                        <!-- Trend 3 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/007bff/FFFFFF.png?text=Trend+Image+3" alt="Innovation Trend Image 3" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Innovation Trend 3 Title]</h3>
                                <p class="article-summary">[Innovation Trend 3 Summary]</p>
                                <a href="#" class="read-more">Learn More</a>
                            </div>
                        </div>
                    </div>

                    <div class="section">
                        <h2>New AI Products</h2>
                        <!-- AI Product 1 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/28a745/FFFFFF.png?text=AI+Product+1" alt="AI Product Image 1" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[New AI Product 1 Title]</h3>
                                <p class="article-summary">[New AI Product 1 Summary]</p>
                                <a href="#" class="read-more">Explore More</a>
                            </div>
                        </div>
                        <!-- AI Product 2 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/28a745/FFFFFF.png?text=AI+Product+2" alt="AI Product Image 2" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[New AI Product 2 Title]</h3>
                                <p class="article-summary">[New AI Product 2 Summary]</p>
                                <a href="#" class="read-more">Explore More</a>
                            </div>
                        </div>
                        <!-- AI Product 3 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/28a745/FFFFFF.png?text=AI+Product+3" alt="AI Product Image 3" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[New AI Product 3 Title]</h3>
                                <p class="article-summary">[New AI Product 3 Summary]</p>
                                <a href="#" class="read-more">Explore More</a>
                            </div>
                        </div>
                    </div>

                    <div class="section">
                        <h2>Related Topics</h2>
                        <!-- Topic 1 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/6c757d/FFFFFF.png?text=Topic+1" alt="Related Topic Image 1" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Topic 1 Title]</h3>
                                <p class="article-summary">[Topic 1 Summary]</p>
                                <a href="#" class="read-more">Visit More</a>
                            </div>
                        </div>
                        <!-- Topic 2 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/6c757d/FFFFFF.png?text=Topic+2" alt="Related Topic Image 2" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Topic 2 Title]</h3>
                                <p class="article-summary">[Topic 2 Summary]</p>
                                <a href="#" class="read-more">Visit More</a>
                            </div>
                        </div>
                        <!-- Topic 3 -->
                        <div class="article">
                            <div class="article-image-container">
                                <img src="https://placehold.co/200x150/6c757d/FFFFFF.png?text=Topic+3" alt="Related Topic Image 3" class="article-image">
                            </div>
                            <div class="article-content">
                                <h3>[Topic 3 Title]</h3>
                                <p class="article-summary">[Topic 3 Summary]</p>
                                <a href="#" class="read-more">Visit More</a>
                            </div>
                        </div>
                    </div>

                    <div class="footer">
                        <p>© 2024 Data News. All rights reserved.</p>
                        <p><a href="#">Unsubscribe</a> | <a href="#">View in Browser</a></p>
                    </div>
                </div>
            </td>
        </tr>
    </table>
</body>
</html>
"""

        # Generate the summary
        response = model.generate_content(prompt)

        return response.text

    except Exception as e:
        # Best-effort boundary: callers always get a string they can write out.
        print(f"Error in summarization: {e}")
        return fallback_html
    





In [157]:
# Scrape the TLDR front page and unpack the three fields the later cells use.
url = r'https://tldr.tech/'
content = scrape(url)
text, images, canonical_url = (
    content[field] for field in ('text', 'images', 'canonical_url')
)

In [158]:
# Show how many image URLs the scrape returned.
print(len(images))

89


In [159]:
# Dump the full list of scraped image URLs for inspection.
print(images)

['https://substackcdn.com/image/fetch/w_1200,h_600,c_fill,f_jpg,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8f70956b-24b6-432b-81c4-dcfa4095ead7_1024x1024.png', 'https://cdn.prod.website-files.com/659d98b7c73d0348ba2d1adb/65bce59176a00315c3510339_og.jpg', 'https://cdn.mos.cms.futurecdn.net/mFFHbRYjQzxks9trfXWefL-1200-80.jpg', 'https://www.co.dev/images/preview.png', 'https://threadreaderapp.com/images/screenshots/thread/1891738830534766710.jpg', 'https://opengraph.githubassets.com/6edb6cd62ab1020f896168f7b181ad381c3b04bbd4469c9a5b24e2eb24e03e1b/awslabs/mountpoint-s3', 'https://repository-images.githubusercontent.com/727975692/2ca69d98-751b-4066-9a98-fb7bb3ab8599', 'https://substackcdn.com/image/fetch/w_1200,h_600,c_fill,f_jpg,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F10bcfee1-044e-440d-a575-86128d889a7f_3644x2188.png', 'https://substackcdn.com/image/fe

In [160]:
# Display the extracted page text as the cell's output.
text


"Keep up with tech in 5 minutes Get the free daily email with summaries of the most interesting stories in startups 🚀, tech 📱, and programming 💻! Subscribe Join 1,250,000 readers for one daily email Feb 19 | Tech Scientists Created the Lightest and Strongest Nanomaterial Ever (3 minute read) Researchers at the University of Toronto have created a new material with the toughness of steel that weighs about as much as foam. They used AI to recognize the best configurations of nanostructures to create the tough and impossibly light material. The new nanomaterial can be mass-produced, unlike its predecessors. The lightweight nature of the material could result in more comfortable prosthetics and implants and more efficient vehicles. Subscribe Feb 19 | AI Humane's AI Pin is dead, as HP buys startup's assets for $116M (3 minute read) HP acquired most of Humane's assets for $116M, leading to the discontinuation of Humane's AI Pins. The AI Pins will lose functionality by February 28. Customers 

In [161]:
# Display the canonical URL that will be passed to the summarizer.
canonical_url

'https://tldr.tech/'

In [None]:
# Build the newsletter HTML and save it to disk.
html = summarize_content(text, images, canonical_url)
# Write as UTF-8 explicitly: the template contains non-ASCII characters
# (for example the copyright sign) and the platform default encoding may
# not be able to represent them.
with open('newest_2_tech_newsletter.html', 'w', encoding='utf-8') as f:
    f.write(html)



In [166]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.utils import formataddr

def send_innovation_newsletter(sender_email, sender_password, recipient_email, html_content,
                               smtp_host='smtp.gmail.com', smtp_port=587, timeout=30):
    """Send an HTML newsletter over SMTP using STARTTLS.

    Failures (connection, TLS, login, or send) are reported on stdout and
    are not raised, so the call is best-effort.

    Args:
        sender_email: Address used for the From header and for SMTP login.
        sender_password: Password used for SMTP login.
        recipient_email: Address used for the To header.
        html_content: HTML body of the message; sent as UTF-8.
        smtp_host: SMTP server host name. Defaults to Gmail's server.
        smtp_port: SMTP server port. Defaults to 587.
        timeout: Socket timeout in seconds for the SMTP connection.

    Returns:
        None.
    """
    # Build the message.
    msg = MIMEMultipart('alternative')
    msg['From'] = formataddr(('Innovation Newsletter', sender_email))
    msg['To'] = formataddr(('Recipient', recipient_email))
    msg['Subject'] = "Innovation Insights Newsletter"

    # Attach the HTML part with UTF-8 encoding.
    msg.attach(MIMEText(html_content, 'html', 'utf-8'))

    try:
        # The context manager closes the connection on every path and, unlike
        # a `finally: server.quit()`, cannot fail on an unbound `server` when
        # the connection itself could not be established.
        with smtplib.SMTP(smtp_host, smtp_port, timeout=timeout) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            print("Successfully connected to the SMTP server!")

            server.send_message(msg)
            print("Email sent successfully!")

    except Exception as e:
        print(f"Error sending email: {e}")

import getpass
import os

# The SMTP password must not be stored in the notebook. Read it from the
# environment, or prompt for it when the variable is not set.
# NOTE(review): the password that was previously committed here should be
# treated as leaked and revoked.
sender_email = r'mailoc121517@gmail.com'
sender_password = (os.environ.get("NEWSLETTER_SMTP_PASSWORD")
                   or getpass.getpass("SMTP app password: "))
recipient_email = r"ngocntt2@techcombank.com.vn"

# NOTE(review): this path differs from the 'newest_2_tech_newsletter.html'
# file written elsewhere in this notebook - confirm which file should be sent.
with open(r"D:\Materials_Tech\newest_tech_newsletter.html", encoding="utf-8") as f:
    html_content = f.read()

send_innovation_newsletter(sender_email, sender_password, recipient_email, html_content)

Successfully connected to the SMTP server!
Email sent successfully!
