In [16]:
import os
import time
import pandas as pd
import re

# Define base directory for saving files
base_directory = r"C:\Users\krgod\Documents\Texas MSBA\Fall Semester\Advanced Machine Learning\Final Project"

# Define folder paths
subtitles_folder = os.path.join(base_directory, "subtitles_test_v3")
cleaned_subtitles_folder = os.path.join(base_directory, "cleaned_subtitles_test_v3")

# Create folders if they don't exist
os.makedirs(subtitles_folder, exist_ok=True)
os.makedirs(cleaned_subtitles_folder, exist_ok=True)

# Function to clean subtitles
def clean_subtitles(input_file, output_file):
    """
    Cleans up a subtitle file by keeping timestamps and content including [Applause] and [Laughter],
    removing unnecessary metadata, inline tags, and avoiding repeated subtitle lines across timestamps.
    Args:
    - input_file (str): Path to the input subtitle file.
    - output_file (str): Path to the output cleaned subtitle file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()

        cleaned_lines = []
        seen_lines = set()  # Track unique subtitles globally
        timestamp_pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}"

        for line in lines:
            line = line.strip()

            # Keep lines with timestamps but remove align and position metadata
            if re.match(timestamp_pattern, line):
                cleaned_timestamp = re.sub(r" align:.*$", "", line)
                cleaned_lines.append(cleaned_timestamp)
                continue

            # Keep non-empty lines, including [Applause] and [Laughter], and remove inline tags
            if line and not re.search(r"align:start|position:\d+%", line):
                # Remove inline HTML-like tags such as <c>...</c> or <00:...>
                cleaned_line = re.sub(r"<[^>]*>", "", line)

                # Add line only if it hasn't been seen before
                if cleaned_line not in seen_lines:
                    cleaned_lines.append(cleaned_line)
                    seen_lines.add(cleaned_line)

        # Write cleaned lines to the output file
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for line in cleaned_lines:
                outfile.write(line + '\n')
    except Exception as e:
        print(f"Error cleaning subtitles {input_file}: {e}")

# Load the CSV file containing YouTube links
csv_file_path = "comedy_vid_links_v2_test.csv"  # Replace with the actual path to your CSV file

try:
    # Read the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Assume the YouTube links are in the first column
    youtube_links = df.iloc[:, 0].dropna().tolist()
    
    # List to track skipped videos
    skipped_videos = []

    # Loop through each link and download subtitles
    for link in youtube_links:
        try:
            # Step 1: Try to download manual captions
            manual_subtitles_success = False
            for attempt in range(3):
                subtitles_command = f'yt-dlp --write-sub --sub-langs "en,en.*" --skip-download -o "{subtitles_folder}/%(title)s_%(id)s.manual.%(ext)s" "{link}"'
                result = os.system(subtitles_command)
                if result == 0:
                    manual_subtitles_success = True
                    break
                print(f"Manual subtitle download failed for {link}, attempt {attempt + 1}/3.")
                time.sleep(5)

            # Step 2: If manual captions fail, try auto-generated captions
            if not manual_subtitles_success:
                print(f"Manual subtitles unavailable for {link}. Attempting auto-generated subtitles.")
                auto_subtitles_success = False
                for attempt in range(3):
                    auto_subtitles_command = f'yt-dlp --write-auto-sub --sub-langs "en,en.*" --skip-download -o "{subtitles_folder}/%(title)s_%(id)s.auto.%(ext)s" "{link}"'
                    result = os.system(auto_subtitles_command)
                    if result == 0:
                        auto_subtitles_success = True
                        break
                    print(f"Auto-generated subtitle download failed for {link}, attempt {attempt + 1}/3.")
                    time.sleep(5)

                if not auto_subtitles_success:
                    print(f"Subtitle download failed after 3 attempts for {link}. Skipping.")
                    skipped_videos.append(link)
                    continue
            
            # Process subtitles (manual if available, otherwise auto-generated)
            processed = False
            for file_name in os.listdir(subtitles_folder):
                if file_name.endswith(".vtt") and link.split("v=")[-1] in file_name:
                    input_file = os.path.join(subtitles_folder, file_name)
                    output_file = os.path.join(cleaned_subtitles_folder, file_name)
                    clean_subtitles(input_file, output_file)
                    processed = True
                    break
            
            if not processed:
                print(f"No subtitles found for {link}. Skipping.")
                skipped_videos.append(link)

        except Exception as e:
            print(f"Error processing link {link}: {e}")
            skipped_videos.append(link)

    # Print summary of skipped videos
    print("Download and cleaning completed!")
    if skipped_videos:
        print("Skipped videos:")
        for video in skipped_videos:
            print(video)

except Exception as e:
    print(f"An error occurred: {e}")


No subtitles found for https://www.youtube.com/watch?v=oLhZVRphhew . Skipping.
No subtitles found for https://www.youtube.com/watch?v=8qfndbEYroE . Skipping.
No subtitles found for https://www.youtube.com/watch?v=kZZez42HWvU . Skipping.
Download and cleaning completed!
Skipped videos:
https://www.youtube.com/watch?v=oLhZVRphhew 
https://www.youtube.com/watch?v=8qfndbEYroE 
https://www.youtube.com/watch?v=kZZez42HWvU 
