In [10]:
import pandas as pd
import numpy as np
import os
import re

In [11]:
# Directory of source videos; passed to check_file_mismatches as the video directory.
video_directory = '/Volumes/BackupDrive/all_unique_videos'
# Directory of .txt transcripts; read and rewritten in place by process_text_files.
# NOTE(review): both paths are machine-specific absolute paths -- adjust before running elsewhere.
transcript_directory = '/Users/ianduke/Desktop/ACLU/siskiyou/transcripts'

# Define functions

In [12]:
def convert_timestamps_to_uniform_format(transcript):
    """Rewrite transcript timestamps as ``[MM:SS:mmm --> MM:SS:mmm]  text``.

    Two input layouts are recognised:

    * Bracketed decimal seconds on the same line as the text, e.g.
      ``[30.00 - 34.34]  This is a test``.
    * A text line followed, on the next line, by an ``H:MM:SS – H:MM:SS``
      range (en-dash separator), e.g. ``0:00:00 – 0:00:10``.

    Lines matching neither layout are kept (stripped of surrounding
    whitespace); lines that end up empty are dropped.

    Minutes are not wrapped into hours: 3700 seconds is rendered as
    ``61:40:000``.

    Args:
        transcript: Full transcript text with newline-separated lines.

    Returns:
        The reformatted transcript as a single newline-joined string.
    """
    # A text line is followed by "H:MM:SS – H:MM:SS" (en dash) in the second layout.
    range_pattern = r"\d{1,2}:\d{2}:\d{2}\s*\–\s*\d{1,2}:\d{2}:\d{2}"

    def format_timestamp(total_seconds):
        # Round to whole milliseconds BEFORE splitting into fields. Formatting
        # the float seconds after divmod lets a value such as 59.9996 round up
        # to "60.000" without carrying into the minutes ("00:60:000").
        total_ms = int(round(total_seconds * 1000))
        minutes, remainder_ms = divmod(total_ms, 60000)
        seconds, millis = divmod(remainder_ms, 1000)
        return f"{minutes:02}:{seconds:02}:{millis:03}"

    def parse_timestamp(timestamp_str):
        # Accept "H:MM:SS", "MM:SS" or plain seconds and return total seconds.
        parts = list(map(float, re.split(r'[:\s]', timestamp_str)))
        if len(parts) == 3:
            total_seconds = parts[0] * 3600 + parts[1] * 60 + parts[2]
        elif len(parts) == 2:
            total_seconds = parts[0] * 60 + parts[1]
        else:
            total_seconds = parts[0]
        return total_seconds

    def process_line(line, next_line=""):
        # Layout 1: "[12.34 - 56.78]  text" -- timestamps and text share the line.
        if re.match(r"\[\d+\.\d+ - \d+\.\d+\]", line):
            timestamps = re.findall(r"[\d.]+", line)
            start = format_timestamp(float(timestamps[0]))
            end = format_timestamp(float(timestamps[1]))
            # Keep everything after the closing bracket exactly as written.
            return f"[{start} --> {end}]{line[line.index(']')+1:]}"

        # Layout 2: the timestamps for this text line live on the next line.
        elif re.match(range_pattern, next_line):
            timestamps = re.findall(r"\d{1,2}:\d{2}:\d{2}", next_line)
            start = format_timestamp(parse_timestamp(timestamps[0]))
            end = format_timestamp(parse_timestamp(timestamps[1]))
            return f"[{start} --> {end}]  {line.strip()}"

        return line.strip()

    formatted_transcript = []
    lines = transcript.split('\n')
    skip_next_line = False

    for i, line in enumerate(lines):
        if skip_next_line:
            # This line was the timestamp range consumed by the previous text line.
            skip_next_line = False
            continue

        next_line = lines[i + 1] if i + 1 < len(lines) else ""
        if re.match(range_pattern, next_line):
            formatted_transcript.append(process_line(line, next_line))
            skip_next_line = True
        else:
            formatted_transcript.append(process_line(line))

    # Remove empty lines and join the transcript
    return '\n'.join(line for line in formatted_transcript if line.strip())

# Example usage: the sample mixes both supported input layouts
# (bracketed decimal seconds, and a text line followed by an H:MM:SS range line).
transcript = """
[30.00 - 34.34]  This is a test
[34.34 - 70.64]  transcript

This is a test transcript in another format
0:00:00 – 0:00:10

Blah blah blah
0:00:10 – 0:00:17
"""

# Convert the sample and print the result for visual inspection.
formatted_transcript = convert_timestamps_to_uniform_format(transcript)
print(formatted_transcript)


[00:30:000 --> 00:34:340]  This is a test
[00:34:340 --> 01:10:640]  transcript
[00:00:000 --> 00:10:000]  This is a test transcript in another format
[00:10:000 --> 00:17:000]  Blah blah blah


In [13]:
def remove_repetitions(transcript, min_repeats=4):
    """Collapse long runs of consecutive timestamped lines with identical text.

    A line is considered timestamped when it starts with
    ``[T - T] `` or ``[T --> T] `` where ``T`` is three colon-separated
    numbers (e.g. ``0:05:47`` or ``00:30:000``). Both separators are accepted
    so the function also works on the output of
    ``convert_timestamps_to_uniform_format``.

    When ``min_repeats`` or more consecutive timestamped lines carry exactly
    the same text after the bracket (compared verbatim, whitespace included),
    only the first line of the run is kept. Shorter runs and all
    non-timestamped lines are left untouched; a non-timestamped line always
    ends the current run.

    Args:
        transcript: Full transcript text with newline-separated lines.
        min_repeats: Minimum run length that gets collapsed. The default of 4
            keeps up to three identical lines in a row.

    Returns:
        The cleaned transcript as a single newline-joined string.
    """
    line_pattern = re.compile(r'\[(\d+:\d+:\d+) (?:-|-->) (\d+:\d+:\d+)\] (.*)')

    cleaned_lines = []
    run = []            # consecutive timestamped lines sharing the same text
    run_content = None  # text shared by the lines currently buffered in `run`

    def flush_run():
        # Emit the buffered run: just its first line when it is long enough
        # to count as a repetition, otherwise every line unchanged.
        if not run:
            return
        if len(run) >= min_repeats:
            cleaned_lines.append(run[0])
        else:
            cleaned_lines.extend(run)
        run.clear()

    for line in transcript.split('\n'):
        match = line_pattern.match(line)

        if match and run and match.group(3) == run_content:
            # Same text as the buffered run: extend it and decide later.
            run.append(line)
            continue

        # Anything else ends the current run.
        flush_run()
        if match:
            run_content = match.group(3)
            run.append(line)
        else:
            cleaned_lines.append(line)

    # Handle a run that reaches the end of the transcript.
    flush_run()

    return '\n'.join(cleaned_lines)

# Test transcript: "This is line 3" appears four times in a row, so that run is
# collapsed to its first occurrence; every other line is left untouched.
transcript = """[0:05:47 - 0:05:48]  This is line 1
[0:05:48 - 0:05:49] This is line 2
[0:05:49 - 0:05:50]  This is line 3
[0:05:50 - 0:05:51]  This is line 3
[0:05:51 - 0:05:52]  This is line 3
[0:05:52 - 0:05:53]  This is line 3
[0:05:53 - 0:05:55]  This is line 4
[0:05:55 - 0:05:57]  This is line 5
"""

# Print the cleaned transcript for visual inspection.
print(remove_repetitions(transcript))


[0:05:47 - 0:05:48]  This is line 1
[0:05:48 - 0:05:49] This is line 2
[0:05:49 - 0:05:50]  This is line 3
[0:05:53 - 0:05:55]  This is line 4
[0:05:55 - 0:05:57]  This is line 5



# Apply functions to transcript files

In [14]:
def process_text_files(directory):
    """Clean every ``.txt`` transcript directly inside ``directory`` in place.

    Each file is read as UTF-8, passed through ``remove_repetitions`` and
    written back to the same path. A file is only rewritten when its contents
    actually changed, so re-running the cell does not needlessly truncate and
    rewrite transcripts that are already clean.

    Args:
        directory: Path of the folder containing the transcript files.
            Sub-directories are not searched.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # e.g. a directory whose name happens to end in ".txt"
            continue

        # Read the contents of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            original_contents = file.read()

        # Apply the functions to the file contents
        file_contents = original_contents
        #file_contents = convert_timestamps_to_uniform_format(file_contents) # Only run if timestamps are not already in HH:MM:SS format
        file_contents = remove_repetitions(file_contents)

        # Resave the modified contents back to the original file, but only
        # when something changed: opening with 'w' truncates the file first.
        if file_contents != original_contents:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(file_contents)

In [15]:
# Run the clean-up over the transcript directory; the .txt files are modified on disk.
process_text_files(transcript_directory)

# Double check that all transcripts have been created

In [7]:
def check_file_mismatches(video_dir, txt_dir):
    """Report videos without a transcript and transcripts without a video.

    Files are paired by base name (file name without its extension). Hidden
    files (names starting with ``.``, such as ``.DS_Store`` or ``._*``
    resource forks) are ignored in both directories, and only ``.txt`` files
    are considered on the transcript side, so stray files do not show up as
    mismatches. Results are printed in sorted order.

    Args:
        video_dir: Directory containing the video files.
        txt_dir: Directory containing the ``.txt`` transcripts.
    """
    def _base_names(directory, required_ext=None):
        # Collect extension-less names of the visible regular files in `directory`,
        # optionally restricted to one extension.
        names = set()
        for entry in os.listdir(directory):
            if entry.startswith('.'):
                continue
            if not os.path.isfile(os.path.join(directory, entry)):
                continue
            base, ext = os.path.splitext(entry)
            if required_ext is not None and ext != required_ext:
                continue
            names.add(base)
        return names

    # Get the list of video files without extensions
    video_files = _base_names(video_dir)

    # Get the list of txt files without extensions
    txt_files = _base_names(txt_dir, required_ext='.txt')

    # Find video files that do not have corresponding txt files
    missing_txt_files = video_files - txt_files

    # Find txt files that do not have corresponding video files
    missing_video_files = txt_files - video_files

    # Print results (sorted so the report is stable between runs)
    if missing_txt_files:
        print("Video files missing corresponding txt files:")
        for video in sorted(missing_txt_files):
            print(f"{video}")
    else:
        print("All video files have corresponding txt files.")

    if missing_video_files:
        print("\ntxt files missing corresponding video files:")
        for txt in sorted(missing_video_files):
            print(f"{txt}")
    else:
        print("All txt files have corresponding video files.")

# Compare the video directory with the transcript directory and print any unmatched names.
check_file_mismatches(video_directory, transcript_directory)


All video files have corresponding txt files.
All txt files have corresponding video files.


# Count total files and corrupted files

In [8]:
import os

def count_txt_files(directory):
    """Count the ``.txt`` files in ``directory`` and how many are flagged as damaged.

    A file is flagged when its UTF-8 contents contain the text
    ``FILE EMPTY OR DAMAGED`` anywhere. Only entries directly inside
    ``directory`` whose name ends in ``.txt`` are examined.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A ``(total_txt_files, flagged_files)`` tuple of integers.
    """
    marker = 'FILE EMPTY OR DAMAGED'

    # Select the candidate files first, then inspect each one for the marker.
    txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]

    flagged = 0
    for name in txt_names:
        with open(os.path.join(directory, name), 'r', encoding='utf-8') as handle:
            if marker in handle.read():
                flagged += 1

    return len(txt_names), flagged


# Count the .txt transcripts and how many of them contain the 'FILE EMPTY OR DAMAGED' marker.
total_files, empty_or_damaged = count_txt_files(transcript_directory)
print(f"Total number of txt files: {total_files}")
print(f"Number of txt files containing 'FILE EMPTY OR DAMAGED': {empty_or_damaged}")


Total number of txt files: 1
Number of txt files containing 'FILE EMPTY OR DAMAGED': 0


In [9]:
def count_all_files(directory):
    """Return the number of files located directly inside ``directory``.

    Every entry for which ``os.path.isfile`` is true is counted, whatever its
    extension; sub-directories are neither counted nor descended into.

    Args:
        directory: Path of the folder to inspect.

    Returns:
        The file count as an integer.
    """
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )

# Example usage: count every file (any extension) directly inside the transcript directory.
# Note that this reuses, and overwrites, the name total_files.
total_files = count_all_files(transcript_directory)
print(f"Total number of files in the directory: {total_files}")


Total number of files in the directory: 2
