In [3]:

import json
import difflib
from openai import OpenAI

# Step 3: Extract Key Highlights using OpenAI API
def get_key_highlights(openai_api_key, punctuated_transcript):
    """
    Extract key sections of a given transcript using the OpenAI API.

    Parameters:
    - openai_api_key: API key for accessing the OpenAI API.
    - punctuated_transcript: The punctuated version of the transcript.

    Returns:
    - List of highlights (section titles) extracted from the transcript.
      An empty list is returned when the reply has no content, when the
      parsed reply is not a JSON object, or when its 'highlights' value is
      not a list.

    Raises:
    - json.JSONDecodeError: if the reply content is not valid JSON.
    """
    # Initialize the OpenAI API client
    client = OpenAI(api_key=openai_api_key)

    # Send request to OpenAI API to get key highlights
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",  # Pinned model snapshot
        # JSON mode: ask the API to reply with a single JSON object so that
        # json.loads below is not handed prose or a fenced code block. The
        # system prompt already mentions JSON, which JSON mode expects.
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": """You are supposed to read and understand the provided transcript, and identify key highlights.
                        These are section titles, so all you need to provide is section titles in chronological order. 
                        The output should be in structured JSON format with key 'highlights'."""},
            {"role": "user", "content": f"Transcript: {punctuated_transcript}. Please return the key sections of the transcript in a structured JSON format."}
        ]
    )

    # Extract the 'content' from the response; it can be None or empty
    content = response.choices[0].message.content
    if not content:
        return []

    # Parse the JSON response
    summary_json = json.loads(content)

    # Only a JSON object can carry the 'highlights' key
    if not isinstance(summary_json, dict):
        return []

    # Callers iterate over the result, so anything but a list is discarded
    highlights = summary_json.get('highlights', [])
    return highlights if isinstance(highlights, list) else []

# Step 1: Load transcription data from JSON and text files
def load_transcription_data(json_file_path, text_file_path):
    """
    Load transcription data from a JSON file and a punctuated text file.

    Both files are read as UTF-8 so that the result does not depend on the
    platform's default locale encoding.

    Parameters:
    - json_file_path: Path to the JSON transcription file.
    - text_file_path: Path to the punctuated text transcription file.

    Returns:
    - A tuple containing the JSON transcription and punctuated transcription.

    Raises:
    - OSError: if either file cannot be opened.
    - json.JSONDecodeError: if the JSON file does not contain valid JSON.
    """
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_transcription = json.load(json_file)

    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        punctuated_transcription = text_file.read()

    return json_transcription, punctuated_transcription

# Step 2: Preprocess the transcripts for comparison
def preprocess_transcripts(json_transcript, punctuated_transcript):
    """
    Turn both transcript forms into lists ready for comparison.

    Parameters:
    - json_transcript: Iterable of entries, each with 'text', 'offset' and
      'duration' keys. Offsets and durations are divided by 1000 (presumably
      milliseconds to seconds; confirm against the transcript source).
    - punctuated_transcript: Punctuated text version of the transcript.

    Returns:
    - Tuple (json_sentences, punctuated_sentences). json_sentences is a list
      of dicts with 'text', 'start_time' and 'end_time'; punctuated_sentences
      is the punctuated text split on line boundaries.
    """
    # The punctuated text is split per line, not per grammatical sentence
    text_lines = punctuated_transcript.splitlines()

    timed_entries = []
    for item in json_transcript:
        cleaned = item['text'].strip()
        begin = item['offset'] / 1000.0
        finish = (item['offset'] + item['duration']) / 1000.0
        timed_entries.append({
            'text': cleaned,
            'start_time': begin,
            'end_time': finish,
        })

    return timed_entries, text_lines

# Step 5: Find best match for a given highlight and map to timestamps
def find_best_match(summary_sentence, json_sentences, threshold=0.4):
    """
    Locate the transcript entry whose text is most similar to a highlight.

    Similarity is the case-insensitive difflib.SequenceMatcher ratio between
    the highlight and each entry's 'text'.

    Parameters:
    - summary_sentence: A sentence from the highlights.
    - json_sentences: List of sentences with timestamps from the JSON transcript.
    - threshold: Ratio that the best score must strictly exceed.

    Returns:
    - Dictionary with 'start_time', 'end_time' and 'confidence' (the ratio
      rounded to two decimals) for the best entry, or None when there are no
      entries, no entry scores above zero, or the best score does not exceed
      the threshold.
    """
    def similarity(candidate):
        return difflib.SequenceMatcher(
            None, summary_sentence.lower(), candidate['text'].lower()
        ).ratio()

    scored = [(similarity(candidate), candidate) for candidate in json_sentences]
    if not scored:
        return None

    # max() keeps the first of several equally good candidates
    top_ratio, top_entry = max(scored, key=lambda pair: pair[0])

    # A score of zero never counts as a match, whatever the threshold is
    if top_ratio > 0 and top_ratio > threshold:
        return {
            'start_time': top_entry['start_time'],
            'end_time': top_entry['end_time'],
            'confidence': round(top_ratio, 2)
        }
    return None

# Step 5 (Updated): Match highlights to timestamps from the list
def match_summary_to_timestamps_from_list(highlights_list, json_sentences):
    """
    Attach timestamps to every highlight that has a matching transcript entry.

    Parameters:
    - highlights_list: List of highlight sentences.
    - json_sentences: List of JSON sentences with timestamps.

    Returns:
    - List of dicts with 'highlight', 'start_time', 'end_time' and
      'confidence', in the order of highlights_list. Highlights for which
      find_best_match returns nothing are left out.
    """
    results = []

    for title in highlights_list:
        hit = find_best_match(title, json_sentences)
        if not hit:
            # No sufficiently similar transcript entry for this highlight
            continue

        record = {"highlight": title}
        for field in ("start_time", "end_time", "confidence"):
            record[field] = hit[field]
        results.append(record)

    return results

# Step 6: Main function to orchestrate the process
def generate_video_highlights(json_file_path, text_file_path, openai_api_key=None, highlights=None):
    """
    Run the full pipeline: load both transcripts, preprocess them, obtain
    highlights (supplied or generated) and map them to timestamps.

    Parameters:
    - json_file_path: Path to the JSON transcription file.
    - text_file_path: Path to the punctuated transcription file.
    - openai_api_key: API key for accessing OpenAI API (optional). Used only
      when no highlights are supplied.
    - highlights: Predefined list of highlights (optional). When given and
      non-empty, the OpenAI API is not called.

    Returns:
    - List of matched highlights with timestamps, or None when no highlights
      are available.
    """
    raw_entries, full_text = load_transcription_data(json_file_path, text_file_path)

    # Only the timed JSON entries are needed for matching
    timed_sentences, _ = preprocess_transcripts(raw_entries, full_text)

    # Fall back to the OpenAI API only when a key exists and nothing was supplied
    should_generate = bool(openai_api_key) and not highlights
    if should_generate:
        highlights = get_key_highlights(openai_api_key, full_text)

    if not highlights:
        return None

    return match_summary_to_timestamps_from_list(highlights, timed_sentences)



In [4]:

import os

# Define file paths and API key
json_file = 'transcript.json'  # Path to the JSON transcription file
text_file = 'transcript.txt'   # Path to the punctuated text transcription file
# Read the key from the environment instead of hard-coding it in the source.
# When OPENAI_API_KEY is unset the key is empty and the OpenAI step is skipped.
openai_api_key = os.environ.get('OPENAI_API_KEY', '')

# Step 4: Optionally get key highlights using OpenAI API
matched_highlights = generate_video_highlights(json_file, text_file, openai_api_key=openai_api_key)

# Output the matched highlights with timestamps
if matched_highlights:
    print(json.dumps(matched_highlights, indent=2))
else:
    print("No highlights available.")

No highlights available.
