In [None]:
!pip install openai pydub gTTS moviepy google-api-python-client google-auth-httplib2 google-auth-oauthlib


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: pydub, gTTS
Successfully installed gTTS-2.5.4 pydub-0.25.1


In [None]:
import requests

def clean_text(raw_text,
               start_marker="PREFATORY NOTE TO FIRST EDITION.",
               end_marker="UNWIN BROTHERS, PRINTERS, CHILWORTH AND LONDON."):
    """
    Extract the body of the book from the raw downloaded text.

    The result starts at the first case-insensitive occurrence of
    ``start_marker`` (the marker itself is kept) and stops just before the
    first occurrence of ``end_marker`` that follows it (the end marker is
    dropped). Surrounding whitespace is stripped.

    Parameters:
    - raw_text: Full text of the downloaded eBook.
    - start_marker: Text at which the extract begins. If it is not found,
      the extract starts at the beginning of ``raw_text``.
    - end_marker: Text at which the extract ends. If it is not found after
      the start position, the extract runs to the end of ``raw_text``.

    Returns:
    - The extracted portion as a string.
    """
    import re  # local import: the cell defining this function only imports `requests`

    # Search the original string with IGNORECASE instead of comparing against
    # raw_text.upper(): upper() can change the string length (e.g. "ß" -> "SS"),
    # which would make the found indices point at the wrong characters.
    start_match = re.search(re.escape(start_marker), raw_text, re.IGNORECASE)
    if start_match:
        start_idx = start_match.start()
        search_from = start_match.end()
    else:
        # Start marker missing: keep everything from the beginning.
        start_idx = 0
        search_from = 0

    # Only look for the end marker after the start marker, so an earlier
    # occurrence (e.g. in front matter) cannot produce an empty extract.
    end_pattern = re.compile(re.escape(end_marker), re.IGNORECASE)
    end_match = end_pattern.search(raw_text, search_from)
    end_idx = end_match.start() if end_match else len(raw_text)

    return raw_text[start_idx:end_idx].strip()

# URL of the plain-text eBook to download
url = 'https://www.gutenberg.org/cache/epub/40121/pg40121.txt'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the pipeline on an HTTP error instead of treating
# an error page as the book text.
response = requests.get(url, timeout=60)
response.raise_for_status()
raw_text = response.text

# Clean the text
cleaned_text = clean_text(raw_text)

# Show the first 1000 characters of the cleaned text
print(cleaned_text[:1000])



PREFATORY NOTE TO FIRST EDITION.


The following pages contain a strictly personal narrative of my Travels
and Adventures in Asia and in Europe. They make no pretence whatever to
be a geographical and ethnological description of the actual Central
Asia. Upon these points recent works have greatly added to the knowledge
we possessed twenty years ago, when I performed my dangerous pilgrimage
from Budapest to Samarkand. A _resume_ of the various publications of
Russian, English, French and German travellers in this region would have
formed a separate book, but these have nothing to do with the variegated
adventures of my own career, of which I here propose to give the first
complete picture to the English reader.

ARMINIUS VAMBERY.

BUDAPEST.




INTRODUCTORY CHAPTER

TO THE BOYS OF ENGLAND.


In presenting this narrative of my adventures in Europe and in Asia to
the juvenile reader in England, I must add a few remarks which have not
been embodied in the autobi

In [None]:
import re

def split_by_roman_numerals(text):
    """
    Break ``text`` into a list of sections.

    A new section begins at every line that, once stripped, consists solely of
    Roman-numeral letters followed by a dot (e.g. "I.", "XIV."); matching is
    case-insensitive. The heading line is kept as the first line of the
    section it opens, and any text before the first heading forms its own
    section. Lines within a section are joined with "\\n".
    """
    heading_re = re.compile(r'^[IVXLCDM]+\.\s*$', re.IGNORECASE)

    # Each bucket holds the lines of one section; start with one open bucket.
    buckets = [[]]
    for row in text.splitlines():
        # A heading closes the current bucket, but only if it already has
        # content (so a heading on the very first line does not create an
        # empty leading section).
        if heading_re.match(row.strip()) and buckets[-1]:
            buckets.append([])
        buckets[-1].append(row)

    # Only the initial bucket can be empty (when the text has no lines).
    return ["\n".join(bucket) for bucket in buckets if bucket]


# Split the cleaned book text into sections and report how many were found.
sections = split_by_roman_numerals(cleaned_text)
print(f"Found {len(sections)} sections.")




Found 34 sections.


In [None]:

import os
from google.colab import drive
from openai import OpenAI

# Mount Google Drive
drive.mount('/content/drive')

# SECURITY: never hardcode the OpenAI API key in the notebook. Read it from
# the OPENAI_API_KEY environment variable and fall back to an interactive,
# non-echoing prompt. Any key that was ever saved in a notebook should be
# treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    from getpass import getpass
    api_key = getpass("Enter your OpenAI API key: ")

# Folder to save the files in Google Drive
output_folder_path = "/content/drive/My Drive/Translated_Sections"

# Create the folder if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)

def translate_to_turkish(text, index):
    """
    Translate one section to Turkish and cache the result on disk.

    The translation is stored as ``section_{index}.txt`` inside the
    module-level ``output_folder_path``. If that file already exists, the
    section is skipped and only the tail of the cached text is printed, so
    re-running the cell does not repeat API calls for finished sections.

    Parameters:
    - text: English text of the section.
    - index: Section number used to build the output file name.

    Returns:
    - None. The translation is written to disk.

    Raises:
    - RuntimeError: if the API returns no text for the section. Nothing is
      written in that case, so the section is retried on the next run.
    """
    file_name = f"section_{index}.txt"  # Example file name: section_1.txt
    file_path = os.path.join(output_folder_path, file_name)

    # Reuse the cached translation if this section was already processed.
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            translated_text = file.read()
        print("Skipped ---",index,translated_text[-100:])
        return

    print(f"Translating {index}...")

    system_prompt = (
        "You are a helpful assistant that accurately translates text from English to Turkish."
    )

    user_content = f"Please translate the following text to Turkish:\n\n{text}"

    client = OpenAI(api_key=api_key)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        model="gpt-4o",
    )

    # Read the typed response object directly rather than dumping it to a dict.
    choice = chat_completion.choices[0]
    translated_text = choice.message.content

    # Do not cache an empty result: the existence check above would treat it
    # as a finished translation forever.
    if not translated_text:
        raise RuntimeError(
            f"No translation returned for section {index} "
            f"(finish_reason={choice.finish_reason!r})"
        )

    # finish_reason "length" means the model hit its output-token limit, so
    # the saved translation is incomplete; make that visible.
    if choice.finish_reason == "length":
        print(
            f"WARNING: translation of section {index} was truncated by the "
            f"model's output limit; consider splitting the section."
        )

    # print last 100 chars
    print(translated_text[-100:])

    # Save the translation to a file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(translated_text)

    print(f"Translated and saved section {index} to {file_path}")

# Translate every section in order; sections are numbered from 1 and that
# number is passed to translate_to_turkish as the section index.
for index, section in enumerate(sections, start=1):
    translate_to_turkish(section, index)

print(f"All translations are saved in: {output_folder_path}")


In [None]:
import os
import re
from pathlib import Path
from openai import OpenAI

def split_text_into_chunks(text, max_length=4090):
    """
    Split ``text`` into chunks of at most ``max_length`` characters.

    Each chunk ends at the last period that fits inside the limit. When a
    window contains no period, the split falls back to the last whitespace in
    the window, and only as a last resort to a hard cut at ``max_length``.
    Chunks are stripped of surrounding whitespace and empty chunks are never
    returned.

    Parameters:
    - text: Text to split.
    - max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
    - List of non-empty chunk strings, in order.

    Raises:
    - ValueError: if ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []
    # Strip up front so leading whitespace cannot yield an empty first chunk.
    remaining = text.strip()

    while len(remaining) > max_length:
        window = remaining[:max_length]

        # Preferred split: just after the last period inside the window.
        cut = window.rfind(".") + 1
        if cut == 0:
            # No period: split at the last whitespace so words stay intact.
            cut = max(window.rfind(sep) for sep in (" ", "\n", "\t"))
            if cut <= 0:
                # No whitespace either: hard cut at exactly max_length so the
                # chunk never exceeds the limit.
                cut = max_length

        piece = remaining[:cut].strip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].strip()

    if remaining:  # Add any remaining text
        chunks.append(remaining)
    return chunks

def generate_mp3_for_translated_sections(api_key, folder_path):
    """
    Generate text-to-speech MP3 files for every translated section.

    1) Find all text files named 'section_{index}.txt' in folder_path and
       process them in numeric section order.
    2) Split each file's text into chunks with split_text_into_chunks.
    3) For each chunk, check if 'section_{index}_chunk_{chunk_num}.mp3' exists.
       - If it exists, skip that chunk.
       - If it does NOT exist, generate the MP3 for the chunk.

    Audio is first streamed to a temporary 'partial_...' file and renamed to
    its final name only once the download has finished, so an interrupted run
    cannot leave a truncated MP3 that later runs would skip as complete.

    Parameters:
    - api_key: OpenAI API key used to create the client.
    - folder_path: Folder containing the section text files; MP3 files are
      written to the same folder.
    """
    client = OpenAI(api_key=api_key)

    # Only consider files named exactly "section_{index}.txt".
    section_name_re = re.compile(r"section_(\d+)\.txt")

    section_files = []
    for filename in os.listdir(folder_path):
        match = section_name_re.fullmatch(filename)
        if match:
            # Keep the index string as written for building MP3 names and use
            # its integer value only for ordering.
            section_files.append((int(match.group(1)), match.group(1), filename))

    # os.listdir order is arbitrary; sort so progress output is predictable.
    section_files.sort()

    for _, section_index, filename in section_files:
        txt_file_path = os.path.join(folder_path, filename)

        # Read the text from the .txt file
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text_to_speak = f.read()

        # Split the text into chunks
        chunks = split_text_into_chunks(text_to_speak)

        for chunk_num, chunk in enumerate(chunks, start=1):
            print(f"Processing chunk {chunk_num} of section {section_index}")
            # Build the corresponding MP3 filename for the chunk
            mp3_filename = f"section_{section_index}_chunk_{chunk_num}.mp3"
            mp3_file_path = os.path.join(folder_path, mp3_filename)

            # Check if the MP3 file for this chunk already exists
            if os.path.exists(mp3_file_path):
                print(f"Skipped section {section_index}, chunk {chunk_num} (MP3 already exists).")
                continue

            print(f"Generating TTS for section {section_index}, chunk {chunk_num}...")

            # The "partial_" prefix keeps an unfinished file from matching the
            # 'section_..._chunk_....mp3' naming used for finished chunks.
            partial_file_path = os.path.join(folder_path, f"partial_{mp3_filename}")

            # Use the streaming response API; calling stream_to_file on the
            # non-streaming response is deprecated in the OpenAI SDK.
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="alloy",
                input=chunk,
            ) as response:
                response.stream_to_file(partial_file_path)

            # Publish the finished file under its final name.
            os.replace(partial_file_path, mp3_file_path)

            print(f"Created MP3: {mp3_file_path}")

# --------------------------------------------------------
# Usage Example
# --------------------------------------------------------

# Requires api_key and output_folder_path to be defined already (they are set
# in the translation cell). The MP3 files are written next to the section_*.txt files.
generate_mp3_for_translated_sections(api_key, output_folder_path)


In [None]:
def collect_mp3_files_in_order(folder_path):
    """
    Collect the per-chunk MP3 files in ``folder_path`` in playback order.

    Only files named exactly 'section_{section}_chunk_{chunk}.mp3' are
    included. They are sorted numerically by section and then by chunk, so
    'section_2' comes before 'section_10'.

    Parameters:
    - folder_path: Folder to scan.

    Returns:
    - List of full file paths in section/chunk order. The list is also printed.
    """
    mp3_files = []

    # Regex to extract section and chunk indices
    pattern = re.compile(r"section_(\d+)_chunk_(\d+)\.mp3")

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # fullmatch (not match) so names with extra trailing text, such as
        # 'section_1_chunk_1.mp3.part', are not mistaken for finished chunks.
        match = pattern.fullmatch(filename)
        if match:
            section_index = int(match.group(1))
            chunk_index = int(match.group(2))
            file_path = os.path.join(folder_path, filename)
            mp3_files.append((section_index, chunk_index, file_path))

    # Sort by section and then by chunk (numeric, not lexicographic)
    mp3_files.sort(key=lambda item: (item[0], item[1]))

    # Extract file paths in the correct order
    sorted_file_paths = [item[2] for item in mp3_files]

    print("Sorted MP3 Files:")
    print(sorted_file_paths)

    return sorted_file_paths

# Preview the ordered list of MP3 chunk files found in the output folder.
collect_mp3_files_in_order(output_folder_path)

Sorted MP3 Files:
['/content/drive/My Drive/Translated_Sections/section_1_chunk_1.mp3', '/content/drive/My Drive/Translated_Sections/section_1_chunk_2.mp3', '/content/drive/My Drive/Translated_Sections/section_2_chunk_1.mp3', '/content/drive/My Drive/Translated_Sections/section_2_chunk_2.mp3', '/content/drive/My Drive/Translated_Sections/section_2_chunk_3.mp3', '/content/drive/My Drive/Translated_Sections/section_2_chunk_4.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_1.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_2.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_3.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_4.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_5.mp3', '/content/drive/My Drive/Translated_Sections/section_3_chunk_6.mp3', '/content/drive/My Drive/Translated_Sections/section_4_chunk_1.mp3', '/content/drive/My Drive/Translated_Sections/section_4_chunk_2.mp3', '/content/drive

['/content/drive/My Drive/Translated_Sections/section_1_chunk_1.mp3',
 '/content/drive/My Drive/Translated_Sections/section_1_chunk_2.mp3',
 '/content/drive/My Drive/Translated_Sections/section_2_chunk_1.mp3',
 '/content/drive/My Drive/Translated_Sections/section_2_chunk_2.mp3',
 '/content/drive/My Drive/Translated_Sections/section_2_chunk_3.mp3',
 '/content/drive/My Drive/Translated_Sections/section_2_chunk_4.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_1.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_2.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_3.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_4.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_5.mp3',
 '/content/drive/My Drive/Translated_Sections/section_3_chunk_6.mp3',
 '/content/drive/My Drive/Translated_Sections/section_4_chunk_1.mp3',
 '/content/drive/My Drive/Translated_Sections/section_4_chunk_2.mp3',
 '/content/drive/My 

In [None]:
!pip install pydub
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
from pydub import AudioSegment
import os
import re

def combine_mp3_files(sorted_file_paths, output_file_path):
    """
    Concatenate several MP3 files into one MP3 file.

    Parameters:
    - sorted_file_paths: MP3 file paths, already in playback order.
    - output_file_path: Destination path of the merged MP3.

    Returns None. When ``sorted_file_paths`` is empty, a message is printed
    and nothing is written.
    """
    # Guard clause: nothing to merge.
    if not sorted_file_paths:
        print("No MP3 files to combine.")
        return

    print("Combining MP3 files...")

    first_path, *remaining_paths = sorted_file_paths

    # The first file seeds the result; every later file is appended to it.
    merged = AudioSegment.from_file(first_path, format="mp3")
    for path in remaining_paths:
        print(f"Adding {path}")
        merged = merged + AudioSegment.from_file(path, format="mp3")

    # Write the merged audio to disk.
    merged.export(output_file_path, format="mp3")
    print(f"Combined MP3 saved to {output_file_path}")

# --------------------------------------------------------
# Usage Example
# --------------------------------------------------------

# Combine the per-chunk MP3 files, in section/chunk order, into a single file.
sorted_mp3_files = collect_mp3_files_in_order(output_folder_path)
# Derive the destination from output_folder_path instead of repeating the
# hardcoded Drive folder, so the two cannot drift apart.
output_combined_mp3_path = os.path.join(output_folder_path, "combined_output.mp3")
combine_mp3_files(sorted_mp3_files, output_combined_mp3_path)


In [None]:
!pip install moviepy




In [None]:
from moviepy.editor import AudioFileClip, ImageClip

def create_video_with_cover(mp3_file, cover_image, output_video_path):
    """
    Create a video file with a static cover image and an MP3 audio file.

    Parameters:
    - mp3_file: Path to the MP3 file (audio).
    - cover_image: Path to the cover image (JPEG).
    - output_video_path: Path to save the output video file.

    The video is written at 1 frame per second with the libx264 video codec
    and the aac audio codec. The audio clip is closed when the function exits,
    whether or not the export succeeded.
    """
    # Load the audio file
    audio = AudioFileClip(mp3_file)
    try:
        # Load the image and set its duration to match the audio
        image = ImageClip(cover_image, duration=audio.duration)

        # Attach the audio track to the still image; the video keeps the
        # image's own resolution.
        video = image.set_audio(audio)

        # Export the video
        video.write_videofile(output_video_path, fps=1, codec="libx264", audio_codec="aac")
    finally:
        # Release the resources held by the audio clip even if the export fails.
        audio.close()

    print(f"Video file created: {output_video_path}")

# --------------------------------------------------------
# Usage Example
# --------------------------------------------------------

# Paths for the MP3, cover image, and output video. All are derived from
# output_folder_path (defined in the translation cell) instead of repeating
# the hardcoded Drive folder in each path.
mp3_path = os.path.join(output_folder_path, "combined_output.mp3")
cover_path = os.path.join(output_folder_path, "cover.jpeg")
output_video_path = os.path.join(output_folder_path, "output_video.mp4")

# Create the video
create_video_with_cover(mp3_path, cover_path, output_video_path)


t:   2%|▏         | 11784/737494 [07:09<5:43:03, 35.26it/s, now=None]

Moviepy - Building video /content/drive/My Drive/Translated_Sections/output_video.mp4.
MoviePy - Writing audio in output_videoTEMP_MPY_wvf_snd.mp4


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chunk:  67%|██████▋   | 454019/677572 [17:06<08:44, 425.89it/s, now=None][A
chunk:  67%|██████▋   | 454062/677572 [17:06<08:51, 420.17it/s, now=None][A
chunk:  67%|██████▋   | 454114/677572 [17:06<08:21, 445.60it/s, now=None][A
chunk:  67%|██████▋   | 454159/677572 [17:06<08:24, 442.77it/s, now=None][A
chunk:  67%|██████▋   | 454206/677572 [17:06<08:20, 446.22it/s, now=None][A
chunk:  67%|██████▋   | 454251/677572 [17:06<08:21, 444.96it/s, now=None][A
chunk:  67%|██████▋   | 454296/677572 [17:06<08:31, 436.63it/s, now=None][A
chunk:  67%|██████▋   | 454340/677572 [17:06<08:38, 430.45it/s, now=None][A
chunk:  67%|██████▋   | 454386/677572 [17:06<08:30, 437.41it/s, now=None][A
chunk:  67%|██████▋   | 454436/677572 [17:07<08:12, 452.61it/s, now=None][A
chunk:  67%|██████▋   | 454483/677572 [17:07<08:12, 452.91it/s, now=None][A
chunk:  67%|██████▋   | 454532/677572 [17:07<08:04, 459.98it/s, now=None][A
chunk:  67%

MoviePy - Done.
Moviepy - Writing video /content/drive/My Drive/Translated_Sections/output_video.mp4



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
t:  54%|█████▍    | 16528/30729 [15:05<11:42, 20.22it/s, now=None][A
t:  54%|█████▍    | 16531/30729 [15:05<11:41, 20.24it/s, now=None][A
t:  54%|█████▍    | 16534/30729 [15:05<10:59, 21.52it/s, now=None][A
t:  54%|█████▍    | 16537/30729 [15:05<11:56, 19.79it/s, now=None][A
t:  54%|█████▍    | 16540/30729 [15:06<12:54, 18.33it/s, now=None][A
t:  54%|█████▍    | 16542/30729 [15:06<13:15, 17.82it/s, now=None][A
t:  54%|█████▍    | 16544/30729 [15:06<15:09, 15.60it/s, now=None][A
t:  54%|█████▍    | 16547/30729 [15:06<15:09, 15.59it/s, now=None][A
t:  54%|█████▍    | 16550/30729 [15:06<13:00, 18.16it/s, now=None][A
t:  54%|█████▍    | 16552/30729 [15:06<14:22, 16.43it/s, now=None][A
t:  54%|█████▍    | 16554/30729 [15:07<14:01, 16.84it/s, now=None][A
t:  54%|█████▍    | 16556/30729 [15:07<14:30, 16.29it/s, now=None][A
t:  54%|█████▍    | 16558/30729 [15:07<14:50, 15.92it/s, now=None][A
t:  54%|█████▍    | 16560

Moviepy - Done !
Moviepy - video ready /content/drive/My Drive/Translated_Sections/output_video.mp4
Video file created: /content/drive/My Drive/Translated_Sections/output_video.mp4
