# Library

In [1]:
from google.cloud import texttospeech
from gtts import gTTS
from datetime import datetime, timedelta
import os



def get_today():
    tomorrow = datetime.now() + timedelta(days=1)
    return tomorrow.strftime("%B %d, %Y")

today = get_today()  
print(today)

March 17, 2025


### Get content from HTML and make a short podcast script

In [2]:
from bs4 import BeautifulSoup

def create_podcast_script_from_html(html_file):
    with open(html_file, "r") as file:
        html_content = file.read()
    
    soup = BeautifulSoup(html_content, "html.parser")
    
    #Header
    main_title = soup.find('h1', class_= 'main-title').text.strip()
    sub_title = soup.find('h2', class_= 'sub-title').text.strip()
    date = soup.find('div', class_= 'date').text.strip()
    #Intro
    podcast_script = f"Welcome to {main_title}, {sub_title}. Today is {today}. This is the daily automated newsletter from DP."
    #Sections
    sections = soup.find_all('td', class_= 'section')
    
    for section in sections:
        section_title = section.find('h2', class_= 'section-title').text.strip()
        podcast_script += f" Next, we will talk about {section_title}."
        
        #Extract articles and summary
        articles = section.find_all('td', class_= 'article')
        for article in articles:
            article_title = article.find('h3', class_= 'article-title').text.strip()
            article_summary = article.find('div', class_= 'article-summary').text.strip()
            podcast_script += f"Next, we will talk about {article_title}. {article_summary}."
    #Outro
    podcast_script += "That's all for today. Thank you for listening."
    
    return podcast_script

In [3]:
newsletter_path = rf"../Newsletter_HTML/tech_newsletter_{today}.html"
podcast_script = create_podcast_script_from_html(newsletter_path)

In [4]:
podcast_script

"Welcome to Think Data, Data News. Today is March 17, 2025. This is the daily automated newsletter from DP. Next, we will talk about Newsletter Audio (English). Next, we will talk about Newsletter Audio (Vietnamese). Next, we will talk about Data Products.Next, we will talk about Product Analysis: How to Assess a Product. Assessing a product involves understanding its market fit, user experience, and potential for growth. This article provides a framework to effectively evaluate products..Next, we will talk about Concierge AI (Product). Concierge AI is a product that likely offers AI-powered assistance to users, providing personalized and efficient support. This can help users with various tasks..Next, we will talk about Screen Studio 3.0 (Product). Screen Studio 3.0 is a product designed for screen recording and editing, offering tools to create engaging video content. It helps to create high-quality recordings.. Next, we will talk about AI Trending.Next, we will talk about Multimodal

# Make an example for LLM

In [5]:
example = """
Think Data Newsletter Podcast Script - March 04, 2025

Intro (300 words)

Welcome to Think Data, your daily dose of insights on technology, AI, and data trends. Today is March 04, 2025, and we’re bringing you a curated summary of the latest advancements in data products, AI, engineering, governance, and business intelligence. Stay tuned for deep dives into today’s most exciting developments!

Section 1: Data Products (600 words)

Skyvern 2.0 – Enhancing Data Automation

Skyvern 2.0 is redefining how businesses handle data extraction and automation, offering more efficiency and intelligence in workflow integration.

Tana – Streamlining Data Organization

Tana's latest update introduces powerful tools for structured data organization, making collaboration seamless across industries.

Stella AI – AI-Powered Decision-Making

Stella AI is emerging as a leader in AI-driven data analysis, providing automated insights for smarter business strategies.

Agora API – Real-Time Developer Tools

Agora API simplifies real-time communication, particularly benefiting finance and trading platforms with seamless integrations.

Section 2: AI Trending (600 words)

Apple's AI Struggles

Reports suggest delays in Apple’s AI-powered Siri updates, pushing advancements to iOS 20 by 2027.

OpenAI GPT-4.5 'Orion'

The latest OpenAI model, GPT-4.5 'Orion,' showcases breakthroughs in natural language processing and efficiency.

DeepSeek's Financial Insights

DeepSeek reveals revenue details, shedding light on the financial side of AI research and development.

Warp’s AI-Enhanced Terminal

Warp introduces an AI-first terminal for Windows, optimizing the developer experience through intelligent command-line interactions.

Section 3: Data Engineering (500 words)

GitLab’s Data Loss Incident

GitLab’s recent data loss of 300GB highlights key lessons in backup and disaster recovery strategies.

Docker Engine v28 – Security Updates

Docker’s latest release improves container networking security by default, making cloud deployments more resilient.

Azure Key Vault & Kubernetes Integration

A new approach to securing cloud-native applications, providing four streamlined methods to access secrets within Azure Kubernetes Service.

Section 4: Data Governance & Cybersecurity (500 words)

Exposed API Keys in AI Training Data

A shocking 12,000 API keys and passwords have been discovered in open-source AI training datasets, raising security concerns.

Dreadnode – New Infosec Solution

Dreadnode launches a promising new cybersecurity tool, focusing on threat detection and risk mitigation.

Meta’s Internal Governance Crackdown

Meta has terminated 20 employees for information leaks, reinforcing the importance of internal data security policies.

Vulnerable Building Management Systems

Over 49,000 misconfigured building management systems have been found online, highlighting IoT security risks.

Section 5: Business Intelligence & Strategy (500 words)

AI’s Disruptive Impact on Business

Some companies thrive while others struggle with AI adoption, showing the uneven impact of automation on industries.

Mergers, Acquisitions, & IPO Trends

Major software IPOs and acquisitions are shaping the investment landscape, indicating where the industry is heading.

Section 6: Related Topics (GitHub Repositories) (300 words)

Notable Open-Source Projects

LLM-Data-Cleaner – A tool for pre-processing large datasets for AI training.

CyberScan-Toolkit – An open-source security scanner for enterprise data protection.

AutoPipeline-X – A machine learning workflow automation library.

K8s-Vault-Manager – A Kubernetes-based secret management solution.

Outro (300 words)

That’s a wrap for today’s Think Data podcast! We explored game-changing data products, AI trends, engineering advancements, and the evolving cybersecurity landscape. Stay informed and ahead of the curve—join us again tomorrow for more insights. Until next time, this is Think Data, signing off!
"""

### Generate detailed script for podcast

In [6]:
import os
import google.generativeai as genai

def expand_podcast_LLM(summary):
    try:
        # Get API key from environment
        api_key = os.environ.get('GOOGLE_API_KEY')
        
        if not api_key:
            api_key = 'AIzaSyB25ElYsVVI2o6y7Mfk-5uL7sApJt9sRR8' 

        genai.configure(api_key=api_key)

        # Generation configuration for precise control
        generation_config = {
            'temperature': 0.7,
            'top_k': 60,
            'top_p': 0.9,
        }

        # Using Gemini Pro for high-quality output
        model = genai.GenerativeModel(
            'gemini-2.0-flash', 
            generation_config=generation_config
        )

        # Detailed prompt for podcast script generation
        prompt = f"""
Task: Generate a professional podcast script from the following summary. Reference is {example}

Requirements:
- Length: 2500-3000 words
- Structure: Intro, Main Content, Conclusion
- Tone: Professional but engaging
- Optimized for Text-to-Speech

Source Summary:  
{summary}

Guidelines:
1. **Intro** (80 words)
   - Welcome listeners
   - Introduce the main topics
   - Provide context and why it matters

2. **Main Content** (450 words)
   - Expand key points from the summary  
   - Stricly follow the source content (must include information of 6 sections)
   - Maintain a natural flow  

3. **Conclusion** (150 words)
   - Summarize key takeaways  
   - End with an engaging note  
   - Encourage listener interaction  

Technical Considerations:
- Avoid overly complex terms  
- Maintain smooth transitions  
- Keep the pacing natural  
- Ensure clarity for text-to-speech  
- Name of Host: Innovation Lab  

Now, generate the full podcast script.
"""


        # Generate script
        response = model.generate_content(prompt)
        return response.text

    except Exception as e:
        print(f"Error generating podcast script: {e}")
        return None

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
new_podcast_script = expand_podcast_LLM(podcast_script)

# Keep regenerating if the script is too long
max_length = 6500 

attempts = 0
max_attempts = 3

while len(new_podcast_script) > max_length and attempts < max_attempts:
    print(f"Script too long ({len(new_podcast_script)} chars). Regenerating...")
    new_podcast_script = expand_podcast_LLM(podcast_script)
    attempts += 1



In [8]:
new_podcast_script

"Okay, here's a podcast script based on the provided summaries and guidelines, optimized for text-to-speech and a professional, engaging tone.\n\n**Podcast Script: Think Data - March 17, 2025**\n\n**(Intro - 80 words)**\n\n**Host (Innovation Lab):** Welcome to Think Data, your daily briefing on the cutting edge of technology, AI, and data. Today is March 17th, 2025, and we're diving into a curated selection of the day's most important developments. From AI-powered product enhancements to critical data governance challenges, we'll explore the trends shaping the future of how we work and live with data. Let's get started!\n\n**(Main Content - 450 words)**\n\n**Host (Innovation Lab):** Let's kick things off with **Data Products**. Today, we're highlighting a framework on **How to Assess a Product**. It's crucial to understand its market fit, user experience, and growth potential. Then, we have **Concierge AI**, an AI assistant designed to provide personalized and efficient support. Imagin

In [9]:
len(new_podcast_script)

5208

# Clean script 

In [10]:
import re

def clean_text_for_tts(text):
    # Loại bỏ nội dung trong ngoặc đơn và ngoặc nhọn
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\{.*?\}', '', text)
    
    # Loại bỏ markdown
    text = re.sub(r'`{1,3}.*?`{1,3}', '', text)
    text = re.sub(r'\*{1,3}', '', text)
    
    # Loại bỏ newline và tab
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    
    # Chỉ giữ lại chữ, số, dấu chấm, dấu phẩy, khoảng trắng
    text = re.sub(r'[^a-zA-Z0-9,.:\s]', '', text)
    
    text = text.replace('Host', '')
    text = text.replace('Innovation Host', '')
    text = text.replace('DP.Next', 'DP')
    # Loại bỏ khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    match = re.search(r"Welcome.*", text, re.IGNORECASE)
    if match:
        text = match.group(0)       
    else:
        text = text 
    return text




In [11]:
clean_script = clean_text_for_tts(new_podcast_script)
clean_script

'Welcome to Think Data, your daily briefing on the cutting edge of technology, AI, and data. Today is March 17th, 2025, and were diving into a curated selection of the days most important developments. From AIpowered product enhancements to critical data governance challenges, well explore the trends shaping the future of how we work and live with data. Lets get started : Lets kick things off with Data Products. Today, were highlighting a framework on How to Assess a Product. Its crucial to understand its market fit, user experience, and growth potential. Then, we have Concierge AI, an AI assistant designed to provide personalized and efficient support. Imagine having AI to help with a variety of tasks Finally, Screen Studio 3.0 is making waves in screen recording and editing, offering tools to create highquality video content. Moving into AI Trending, multimodal representation learning, or MMRL, enhances fewshot adaptation of visionlanguage models by introducing a shared representatio

In [12]:
len(clean_script)

4669

In [None]:
with open(f'../Related_json/podcast_script_{today}.txt', 'w') as file:
    file.write(clean_script)

# Vietnamese part


In [14]:
ENTITY_PRONUNCIATION = """
1. ChatGPT → Chát Gờ Pờ Tờ
2. OpenAI → Ô pân Ây Ai
3. Alibaba Cloud → A li ba ba Clâu đơ
4. Deepseek → Díp xích
5. Claude → Clâu đơ
6. Google → Gu gồ
7. Microsoft → Mai cờ rô sốp
8. Apple → Ép pồ
9. Amazon → A ma zôn
10. Facebook → Phây búc
11. Twitter → Tu ít tờ
12. Instagram → In sờ ta grăm
13. WhatsApp → Vhát sáp
14. YouTube → Giu tu bộ
15. Zoom → Dzu mờ
16. GitHub → Gít hắp
17. Dropbox → Drốp bốc
18. Slack → Xlắc
19. Spotify → Xpốt i phai
20. Netflix → Nét phlix
21. Alibaba → A li ba ba
22. Tencent → Ten xèn
23. Baidu → Bai đu
24. Samsung → Sám xung
25. Huawei → Hứa uê
26. Gmail → Gờ meo
27. iPhone → Ai phôn
28. Android → An đroi
29. Windows → Vin đồ
30. MacOS → Mắc cơ át
31. Tesla → Tét la
32. PlayStation → Plei sờ tê shần
33. Xbox → Éch bốc
34. Nintendo → Ni nen đô
35. Adobe → A đốp
36. Photoshop → Phô tô sọp
37. Illustrator → Il lú sờ trê tờ
38. Chrome → Crôm
39. Firefox → Phai e rốc
40. Safari → Sa pha ri
41. Edge → Éo ch
42. LinkedIn → Lin kờ đin
43. TikTok → Tik tók
44. Snapchat → Xnắp chát
45. Shopify → Sờ hóp ai phai
46. AI → Ây ai
47. ML → Em Eo
48. LLM → Eo EO EM
49. API → Ây pi ai
50. IoT → Ai âu ti
51. VR → Vi a
52. AR → Ây a
53. NLP → En el pi
54. GPT → Gờ Pờ Tờ
55. GPU → Gờ Pờ U
56. CPU → Cờ Pờ U
57. RAM → Rờ am em
58. SSD → Éc éc đê
59. HTTP → Eich ti ti pi
60. HTTPS → Eich ti ti pi es
61. DNS → Di en es
62. HTML → Et chi ti em el
63. CSS → Si es es
64. JSON → Jê sờn
65. XML → Ech em el
66. NASA → Nờ A Es A
67. UN → U En
68. WHO → Wờ Hờ O
69. IMF → Ai Em Ef
70. WTO → Wờ Ti O
71. UNESCO → U En Es Co
72. NATO → Nờ A Ti O
73. FIFA → Fai Fa
74. UEFA → U E Fa
75. Red Cross → Rét Crốt
76. Disney → Đin ni
77. Warner Bros → O nơ Brốt
78. Universal Studios → U ni vờ sờ nờ stu đi ô
79. Hulu → Hu lu
80. HBO → Hờ Bi O
81. CNN → Si en en
82. BBC → Bi bi si
83. Fox News → Phốc nưu
84. The New York Times → Đi nyoo yoo tờ mai
85. Visa → Vai za
86. MasterCard → Mắt sờ thẻ ca
87. PayPal → Pê pô
88. Stripe → Xtrai pờ
89. Square → Xquâ
90. Goldman Sachs → Gôl môn Sách
91. JPMorgan Chase → Jê Pi Mô gân Chê
92. Wells Fargo → Wels Fác gô
93. Citibank → Xi ti băn
94. HSBC → Hờ Es Bi Ci
95. Wikipedia → Wi ki pê đi a
96. Reddit → Rê dít
97. Quora → Quô ra
98. Medium → Mê đi ăm
99. WordPress → Wờ dpres
100. Wix → Wích
101. Canva → Can va
102. Salesforce → Sếil pho
103. Oracle → Ô ra cl
104. IBM → Ai Bi Em
"""

In [15]:
import os
from openai import OpenAI


def translate_vietnamese(text, example):
    try:
        # Get API key from environment
        api_key = os.environ.get('GOOGLE_API_KEY')
        
        if not api_key:
            api_key = 'AIzaSyB25ElYsVVI2o6y7Mfk-5uL7sApJt9sRR8' 

        genai.configure(api_key=api_key)

        # Generation configuration for precise control
        generation_config = {
            'temperature': 0.7,
            'top_k': 60,
            'top_p': 0.9,
        }

        # Using Gemini Pro for high-quality output
        model = genai.GenerativeModel(
            'gemini-2.0-flash', 
            generation_config=generation_config
        )
        
        prompt = f"""You are a professional Vietnamese translator 
        Task:  Translate the following English text into Vietnamese: {text} in a clear, accurate, and engaging manner. 
        Stricly do not generate [text] in the translated text such as: [Chuyển cảnh], [Nhạc hiệu kết thúc], [Nhạc nền], [Nhạc nền kết thúc], [Nhạc nền kết thúc],...
        The translation should maintain the original meaning and tone of the text while being optimized for text-to-speech conversion. The translated text should be suitable for a podcast script. 
        Remember to keep the original name of sections: Data Product, AI Trending, Data Engineering, Data Governance, Business Intelligence. For the others, you need to change the abbreviated words to Vietnamese accent. For example, DP to Đi Pi, AI to ây ai, PDF to pi đi ép, Google: Gu Gồ, API: ây pi ai, Github: Ghít hắp.
        Translate all the english name into Vietnamese accent. For example, Apple to Ép pờ, Microsoft to Mai cờ rô sốp, Google to Gu gồ, Facebook to Phây búc, Amazon to A ma zôn, ChatGPT to Chát Gi Pi Ti.
        You also need to change the number to Vietnamese accent. For example, 1 to một, 2 to hai, 3 to ba, etc.
        Example: Follow {example}
        """
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error translating text: {e}")
        return None
    

In [16]:
# import os
# from openai import OpenAI
# def translate_vietnamese(text):
#     try:
#         client = OpenAI(
#     # If the environment variable is not configured, replace the following line with: api_key="sk-xxx",
#     api_key=os.getenv("DASHSCOPE_API_KEY"), 
#     base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
#     )    
        
        
#         completion = client.chat.completions.create(
#         model="qwen-max",  # Sử dụng mô hình Qwen-Max
#         messages=[
#         {'role': 'system', 'content': f"""You are a professional Vietnamese translator 
#         Task:
#         Strictly do not generate any entity names in English, I want to generate all the words in Vietnamese accent.
#         Translate the following English text into Vietnamese: {text} in a clear, accurate, and engaging manner. 
#         The translation should maintain the original meaning and tone of the text while being optimized for text-to-speech conversion. The translated text should be suitable for a podcast script. 
#         Stricly do not generate [text] in the translated text such as: [Chuyển cảnh], [Nhạc hiệu kết thúc], [Nhạc nền], [Nhạc nền kết thúc], [Nhạc nền kết thúc],...
#         Translate the English entity names into Vietnamese pronounciantion. For example, Apple to Ép pờ, Microsoft to Mai cờ rô sốp, Google to Gu gồ, Facebook to Phây búc, Amazon to A ma zôn, ChatGPT to Chát Gi Pi Ti.
#         You also need to change the number to Vietnamese accent. For example, 1 to một, 2 to hai, 3 to ba, etc.
#         '1. **ChatGPT** → Chát Gờ Pờ Tờ  \n2. **OpenAI** → Ô pân Ây Ai  \n3. **Alibaba Cloud** → A li ba ba Clâu đơ  \n4. **Deepseek** → Díp xích  \n5. **Claude** → Clâu đơ  \n6. **Google** → Gu gồ  \n7. **Microsoft** → Mai cờ rô sốp  \n8. **Apple** → Ép pồ  \n9. **Amazon** → A ma zôn  \n10. **Facebook** → Phây búc  \n11. **Twitter** → Tu ít tờ  \n12. **Instagram** → In sờ ta grăm  \n13. **WhatsApp** → Vhát sáp  \n14. **YouTube** → Giu tu bộ  \n15. **Zoom** → Dzu mờ  \n16. **GitHub** → Gít húp  \n17. **Dropbox** → Drốp bốc  \n18. **Slack** → Xlắc  \n19. **Spotify** → Xpốt i phai  \n20. **Netflix** → Nét phlix  \n21. **Alibaba** → A li ba ba  \n22. **Tencent** → Ten xèn  \n23. **Baidu** → Bai đu  \n24. **Samsung** → Sám xung  \n25. **Huawei** → Hứa uê  \n26. **Gmail** → Gờ meo  \n27. **iPhone** → Ai phôn  \n28. **Android** → An đroi  \n29. **Windows** → Vin đồ  \n30. **MacOS** → Mắc cơ át  \n31. **Tesla** → Tét la  \n32. **PlayStation** → Plei sờ tê shần  \n33. **Xbox** → Éch bốc  \n34. **Nintendo** → Ni nen đô  \n35. **Adobe** → A đốp  \n36. **Photoshop** → Phô tô sọp  \n37. **Illustrator** → Il lú sờ trê tờ  \n38. **Chrome** → Crôm  \n39. **Firefox** → Phai e rốc  \n40. **Safari** → Sa pha ri  \n41. **Edge** → Éo ch  \n42. **LinkedIn** → Lin kờ đin  \n43. **TikTok** → Tik tók  \n44. **Snapchat** → Xnắp chát  \n45. **Shopify** → Sờ hóp ai phai  \n46. **AI** → Ây ai  \n47. **ML** → Em Eo  \n48. **LLM** → Eo EO EM  \n49. **API** → Ây pi ai  \n50. **IoT** → Ai âu ti  \n51. **VR** → Vi a  \n52. **AR** → Ây a  \n53. **NLP** → En el pi  \n54. **GPT** → Gờ Pờ Tờ  \n55. **GPU** → Gờ Pờ U  \n56. **CPU** → Cờ Pờ U  \n57. **RAM** → Rờ am em  \n58. **SSD** → Éc éc đê  \n59. **HTTP** → Eich ti ti pi  \n60. **HTTPS** → Eich ti ti pi es  \n61. **DNS** → Di en es  \n62. **HTML** → Et chi ti em el  \n63. **CSS** → Si es es  \n64. **JSON** → Jê sờn  \n65. **XML** → Ech em el  \n66. **NASA** → Nờ A Es A  \n67. **UN** → U En  \n68. **WHO** → Wờ Hờ O  \n69. **IMF** → Ai Em Ef  \n70. **WTO** → Wờ Ti O  \n71. **UNESCO** → U En Es Co  \n72. **NATO** → Nờ A Ti O  \n73. **FIFA** → Fai Fa  \n74. **UEFA** → U E Fa  \n75. **Red Cross** → Rét Crốt  \n76. **Disney** → Đin ni  \n77. **Warner Bros** → O nơ Brốt  \n78. **Universal Studios** → U ni vờ sờ nờ stu đi ô  \n79. **Hulu** → Hu lu  \n80. **HBO** → Hờ Bi O  \n81. **CNN** → Si en en  \n82. **BBC** → Bi bi si  \n83. **Fox News** → Phốc nưu  \n84. **The New York Times** → Đi nyoo yoo tờ mai  \n85. **Visa** → Vai za  \n86. **MasterCard** → Mắt sờ thẻ ca  \n87. **PayPal** → Pê pô  \n88. **Stripe** → Xtrai pờ  \n89. **Square** → Xquâ  \n90. **Goldman Sachs** → Gôl môn Sách  \n91. **JPMorgan Chase** → Jê Pi Mô gân Chê  \n92. **Wells Fargo** → Wels Fác gô  \n93. **Citibank** → Xi ti băn  \n94. **HSBC** → Hờ Es Bi Ci  \n95. **Wikipedia** → Wi ki pê đi a  \n96. **Reddit** → Rê dít  \n97. **Quora** → Quô ra  \n98. **Medium** → Mê đi ăm  \n99. **WordPress** → Wờ dpres  \n100. **Wix** → Wích  \n101. **Canva** → Can va  \n102. **Salesforce** → Sếil pho  \n103. **Oracle** → Ô ra cl  \n104. **IBM** → Ai Bi Em'
#         """},
#         {'role': 'user', 'content': f'Translate {text}'},
#     ]
# )
#         return completion.choices[0].message.content
#     except Exception as e:
#         print(f"Error translating text: {e}")
#         return None
    

In [17]:
vietnamese_text = translate_vietnamese(clean_script, ENTITY_PRONUNCIATION )
vietnamese_text

'Chào mừng bạn đến với Think Data, bản tóm tắt hàng ngày của bạn về công nghệ tiên tiến, ây ai và dữ liệu. Hôm nay là ngày mười bảy tháng ba năm hai nghìn không trăm hai mươi lăm, và chúng ta sẽ đi sâu vào một tuyển chọn các phát triển quan trọng nhất trong ngày. Từ các cải tiến sản phẩm do ây ai hỗ trợ đến các thách thức quản trị dữ liệu quan trọng, chúng ta sẽ khám phá các xu hướng định hình tương lai về cách chúng ta làm việc và sống với dữ liệu. Hãy bắt đầu nào: Hãy bắt đầu với Data Products (Đi Pi). Hôm nay, chúng tôi muốn làm nổi bật một khuôn khổ về Cách Đánh Giá Sản Phẩm. Điều quan trọng là phải hiểu sự phù hợp của nó với thị trường, trải nghiệm người dùng và tiềm năng tăng trưởng. Tiếp theo, chúng ta có Concierge AI (Ây ai), một trợ lý ây ai được thiết kế để cung cấp hỗ trợ cá nhân hóa và hiệu quả. Hãy tưởng tượng có ây ai giúp bạn thực hiện nhiều tác vụ khác nhau. Cuối cùng, Screen Studio ba chấm không đang tạo nên làn sóng trong việc quay và chỉnh sửa màn hình, cung cấp các 

In [18]:
def clean_text_for_tts_vietnamese(text):
    # Loại bỏ nội dung trong ngoặc đơn và ngoặc nhọn
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\{.*?\}', '', text)

    
    # Loại bỏ newline và tab
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace(r'**', '')
    text = text.replace('Im', '')
    text = text.replace('[Chuyển cảnh]', '')
    # Loại bỏ khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    match = re.search(r"Chào mừng.*", text, re.IGNORECASE)
    if match:
        text = match.group(0)       
    else:
        text = text 
    return text


In [19]:
vietnamese_text_clean = clean_text_for_tts_vietnamese(vietnamese_text)
vietnamese_text_clean

'Chào mừng bạn đến với Think Data, bản tóm tắt hàng ngày của bạn về công nghệ tiên tiến, ây ai và dữ liệu. Hôm nay là ngày mười bảy tháng ba năm hai nghìn không trăm hai mươi lăm, và chúng ta sẽ đi sâu vào một tuyển chọn các phát triển quan trọng nhất trong ngày. Từ các cải tiến sản phẩm do ây ai hỗ trợ đến các thách thức quản trị dữ liệu quan trọng, chúng ta sẽ khám phá các xu hướng định hình tương lai về cách chúng ta làm việc và sống với dữ liệu. Hãy bắt đầu nào: Hãy bắt đầu với Data Products . Hôm nay, chúng tôi muốn làm nổi bật một khuôn khổ về Cách Đánh Giá Sản Phẩm. Điều quan trọng là phải hiểu sự phù hợp của nó với thị trường, trải nghiệm người dùng và tiềm năng tăng trưởng. Tiếp theo, chúng ta có Concierge AI , một trợ lý ây ai được thiết kế để cung cấp hỗ trợ cá nhân hóa và hiệu quả. Hãy tưởng tượng có ây ai giúp bạn thực hiện nhiều tác vụ khác nhau. Cuối cùng, Screen Studio ba chấm không đang tạo nên làn sóng trong việc quay và chỉnh sửa màn hình, cung cấp các công cụ để tạo

In [20]:
len(vietnamese_text_clean)

5451

In [21]:
import json
with open(f'../Related_json/podcast_script_vietnamese_{today}.txt', 'w') as f:
    json.dump(vietnamese_text_clean, f, ensure_ascii= False, indent=4)

# Model

In [22]:
import torch
from TTS.api import TTS
model_name = "tts_models/en/ljspeech/vits"
tts = TTS(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [23]:
import io
import soundfile as sf
from pydub import AudioSegment
import os
# 1. Generate audio as numpy array (not saved to disk yet)
wav_array = tts.tts(clean_script)

# 2. Convert numpy array to WAV format in memory (BytesIO buffer)
wav_io = io.BytesIO()
sf.write(wav_io, wav_array, 22050, format='WAV')
wav_io.seek(0)  # Reset pointer to beginning of buffer

# 3. Use pydub to convert in-memory WAV to MP3 file
audio = AudioSegment.from_wav(wav_io)
mp3_path = f'../Newsletter_Audio/Coqui_2_{today}.mp3'
audio.export(mp3_path, format="mp3", bitrate="128k")

print(f"Saved MP3 file to: {mp3_path}")

 > Text splitted to sentences.
['Welcome to Think Data, your daily briefing on the everevolving world of technology, AI, and data.', 'Im your host, Innovation Lab, and today, March 4th, 2025, were diving into the latest and greatest across data products, AI trends, cuttingedge data engineering, critical data governance issues, and the shifts were seeing in business intelligence.', 'Lets get started : First up, lets explore the latest in Data Products.', 'Skyvern 2.0 is making waves with its enhanced data extraction and automation capabilities, promising smarter workflow integration.', 'Tanas new update streamlines data organization, fostering better collaboration across industries.', 'Stella AI is emerging as a key player in AIdriven data analysis, providing businesses with automated insights for more strategic decisionmaking.', 'Finally, Agora API simplifies realtime communication, especially beneficial for finance and trading platforms.', ': Shifting gears to AI Trends.', 'Reports in

In [None]:
# file_path_save = f'../Newsletter_Audio/Coqui_{today}.wav'
# tts.tts_to_file(clean_script, file_path= file_path_save)


 > Text splitted to sentences.
['Welcome to Think Data, your daily briefing on the world of data, AI, and emerging technologies.', 'Im your host, Innovation Lab.', 'Today is March 12th, 2025.', 'Were diving into the latest happenings, from AI model advancements and data engineering breakthroughs to critical data governance updates.', 'Lets get started and explore the innovations shaping our digital future.', ': Lets kick things off with Data Products.', 'Mistral AI has launched a new API that converts PDF documents into AIready Markdown files, streamlining the process of using PDFs in AI applications.', 'This simplifies the integration of PDF content into AI workflows.', 'Next, Lynx unlocks native capabilities for web applications, providing access to features previously unavailable.', 'It enhances web applications by enabling native functionalities.', 'Finally, Descript, a tool for creating and editing videos, podcasts, and other audio content, simplifies the editing process with its 

'../Newsletter_Audio/Coqui_March 12, 2025.wav'

In [None]:
# from pydub import AudioSegment
# import os
# import base64
# # Chuyển đổi sang định dạng khác
# def convert_audio(input_path, output_format, bitrate="64k"):
#     output_path = input_path.rsplit(".", 1)[0] + f".{output_format}"
#     audio = AudioSegment.from_file(input_path, format="mp3")
    
#     if output_format == "opus":
#         audio.export(output_path, format="opus", bitrate=bitrate)
#     elif output_format == "aac":
#         audio.export(output_path, format="adts", bitrate=bitrate)
#     elif output_format == "mp3":
#         audio.export(output_path, format="mp3", bitrate=bitrate)
    
    
#     else:
#         raise ValueError("Unsupported format")
    
#     print(f"Converted to {output_path}")
#     return output_path

# # Chuyển sang Opus 64kbps
# converted_file = convert_audio(rf'../Newsletter_Audio/Coqui_{today}.wav', "mp3", "64k")

Converted to ../Newsletter_Audio/Coqui_March 12, 2025.mp3


In [None]:
# import base64
# import sys
# import os
# def convert_opus_to_base64(input_file, output_file=None):
#     """
#     Convert an Opus audio file to base64 encoding.
    
#     Args:
#         input_file (str): Path to the input Opus file
#         output_file (str, optional): Path to save the base64 output.
#                                     If None, prints to stdout.
    
#     Returns:
#         str: The base64 encoded string
#     """
#     try:
#         # Check if file exists
#         if not os.path.exists(input_file):
#             raise FileNotFoundError(f"Input file not found: {input_file}")
            
#         # Check if file is empty
#         if os.path.getsize(input_file) == 0:
#             raise ValueError(f"Input file is empty: {input_file}")
        
#         # Read the binary content of the Opus file
#         with open(input_file, 'rb') as f:
#             binary_data = f.read()
        
#         # Convert to base64
#         base64_encoded = base64.b64encode(binary_data).decode('utf-8')
        
#         # Output handling
#         if output_file:
#             with open(output_file, 'w') as f:
#                 f.write(base64_encoded)
#             print(f"Base64 encoded data saved to {output_file}")
#         else:
#             return base64_encoded
            
#     except Exception as e:
#         print(f"Error: {e}", file=sys.stderr)
#         sys.exit(1)


In [None]:
# convert_opus_to_base64(r"D:\Materials_Tech\Newsletter_Audio\Coqui_March 08, 2025.opus",r"D:\Materials_Tech\Newsletter_Audio\audio_output" )

Base64 encoded data saved to D:\Materials_Tech\Newsletter_Audio\audio_output
