In [None]:
#  Project Steps
#1) Identify your video sources: Organize them with an id and their URL so you can identify them later on.
    #Also keep in mind your hardware resources. Better to use smaller videos e.g. 1-5 mins.

#2) Transcribe your videos: Convert your videos to text. 
    #If you don't have a transcription tool, you can extract the audio from a video using open-source tools like "shotcut".
    #Then you can use OpenAI Whisper to transcribe it.

#3) Create a semantic text search back end: You will find semantic search technique 
    #taught in this course useful for this purpose.

#4) Push to GitHub and share your repository with the class in a comment/question to this assignment.



In [None]:
#0) Install moviepy
#!pip install moviepy==1.0.3

In [None]:

import moviepy
import moviepy.editor as mp

# `import moviepy.editor as mp` binds only the alias `mp`, so the bare name
# `moviepy` has to be imported explicitly before it can be referenced.
# The version string is exposed on the top-level package.
print(moviepy.__version__)

In [None]:
#1) extract audio from a video using moviepy==1.0.3


import os
import moviepy.editor as mp




def extract_audio_from_video_in_folder(
    input_folder_path,
    output_folder_path=None,
    video_extensions=(".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"),
):
    """Extract the audio track of every video in a folder and save it as MP3.

    Parameters
    ----------
    input_folder_path : str
        Folder that contains the source videos.
    output_folder_path : str, optional
        Folder that receives the ``.mp3`` files. When omitted, a folder named
        ``Audio`` next to ``input_folder_path`` is used. The folder is created
        if it does not exist yet.
    video_extensions : tuple of str, optional
        File extensions (case-insensitive) that are treated as videos; every
        other directory entry is skipped.

    Returns
    -------
    None
        Progress and errors are reported with ``print``. A failure on one
        file is reported and does not stop the remaining files.
    """
    if not os.path.isdir(input_folder_path):
        print(f"Error: Folder '{input_folder_path}' does not exist.")
        return

    if output_folder_path is None:
        # normpath drops a trailing separator so dirname yields the parent folder
        parent_folder = os.path.dirname(os.path.normpath(input_folder_path))
        output_folder_path = os.path.join(parent_folder, "Audio")
    os.makedirs(output_folder_path, exist_ok=True)

    accepted_extensions = tuple(ext.lower() for ext in video_extensions)

    for filename in os.listdir(input_folder_path):
        filepath = os.path.join(input_folder_path, filename)
        if not os.path.isfile(filepath):
            continue
        if not filename.lower().endswith(accepted_extensions):
            continue

        # The audio is encoded with libmp3lame, so give the file a matching
        # .mp3 extension instead of reusing the video's extension.
        base_name = os.path.splitext(filename)[0]
        outputfilepath = os.path.join(output_folder_path, f"{base_name}.mp3")

        video_clip = None
        audio_clip = None
        try:
            # Load the video clip and take its audio track (None when absent)
            video_clip = mp.VideoFileClip(filepath)
            audio_clip = video_clip.audio

            if audio_clip is None:
                print(f"No audio track found in the video '{filename}'.")
                continue

            audio_clip.write_audiofile(outputfilepath, codec="libmp3lame")
            print(f"Audio extracted successfully to {outputfilepath}")

        except Exception as e:
            # Report and move on so one bad file does not abort the batch
            print(f"An error occurred while processing '{filename}': {e}")

        finally:
            # Always release the clips, including after an error or a skip
            if audio_clip is not None:
                audio_clip.close()
            if video_clip is not None:
                video_clip.close()
        
# Run the extraction against the local folder that holds the source videos.
extract_audio_from_video_in_folder(input_folder_path=r"C:\Users\imeyalo\Downloads\Videos")

In [1]:
#2)Transcribe your videos: Convert your videos to text. 

from dotenv import load_dotenv
import os
import openai
import json



# Read OPENAI_API_KEY from the local .env file and hand it to the client
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Where the audio lives and where the JSON transcripts are written
input_folder = r"C:\Users\imeyalo\Downloads\Audio"
output_folder = r"C:\Users\imeyalo\Downloads\Transcripts_json"
os.makedirs(output_folder, exist_ok=True)

# Only files with one of these extensions are sent for transcription
valid_extensions = (".mp3", ".mp4", ".m4a", ".wav", ".webm")

# Walk the audio folder; anything without an accepted extension is skipped
for filename in os.listdir(input_folder):
    if not filename.lower().endswith(valid_extensions):
        continue

    base_name = os.path.splitext(filename)[0]
    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, base_name + ".json")

    print(f"Transcribing: {filename}")

    try:
        # verbose_json is requested so the response carries timestamps
        with open(input_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe(
                model="whisper-1",
                file=audio_file,
                response_format="verbose_json",
            )

        # Persist the complete response next to the other transcripts
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(transcript, f, ensure_ascii=False, indent=2)

        print(f"Saved to: {output_path}")

    except Exception as e:
        # A failing file is reported; the loop carries on with the next one
        print(f"Error transcribing {filename}: {e}")

Transcribing: 1_854877-hd_1920_1080_25fps.mp4
Saved to: C:\Users\imeyalo\Downloads\Transcripts_json\1_854877-hd_1920_1080_25fps.json
Transcribing: 2_1585618-hd_1280_720_30fps.mp4
Saved to: C:\Users\imeyalo\Downloads\Transcripts_json\2_1585618-hd_1280_720_30fps.json
Transcribing: 3_4438080-hd_1920_1080_25fps.mp4
Saved to: C:\Users\imeyalo\Downloads\Transcripts_json\3_4438080-hd_1920_1080_25fps.json
Transcribing: 4_4.mp4
Saved to: C:\Users\imeyalo\Downloads\Transcripts_json\4_4.json
Transcribing: 5_10182004-hd_3240_2160_24fps.mp4
Saved to: C:\Users\imeyalo\Downloads\Transcripts_json\5_10182004-hd_3240_2160_24fps.json


In [2]:
#3)Extract and chunk segments from each json file
#use the segments array to group them into larger text blocks. This is useful for: 
#Reducing the number of embeddings
#Improving semantic coherence in search
#Better handling of long transcripts

import os
import json

# Settings
input_folder = r"C:\Users\imeyalo\Downloads\Transcripts_json"
chunk_size = 3  # how many consecutive segments are merged into one chunk

all_chunks = []

for filename in os.listdir(input_folder):
    # Only the .json transcripts are of interest
    if not filename.endswith(".json"):
        continue

    filepath = os.path.join(input_folder, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
        segments = data.get("segments", [])

        # Step through the segment list in strides of chunk_size
        for i in range(0, len(segments), chunk_size):
            chunk_segments = segments[i:i + chunk_size]

            # Merge the stripped segment texts; the chunk spans from the
            # first segment's start to the last segment's end
            texts = [seg["text"].strip() for seg in chunk_segments]
            first_segment = chunk_segments[0]
            last_segment = chunk_segments[-1]

            all_chunks.append({
                "file": filename,
                "start": first_segment["start"],
                "end": last_segment["end"],
                "text": " ".join(texts),
            })

print(f"Extracted {len(all_chunks)} text chunks.")

# Write every chunk, with its metadata, to a single JSON file
with open(r"C:\Users\imeyalo\Downloads\chunked_transcripts.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print("Chunks saved to chunked_transcripts.json")

Extracted 5 text chunks.
Chunks saved to chunked_transcripts.json


In [2]:
#Average token per file

import os
import tiktoken

# Folder holding the .json transcript files to measure
folder_path = r"C:\Users\imeyalo\Downloads\Transcripts_json"  # Change if needed

# Tokenizer matching the chosen model (e.g., gpt-3.5-turbo, gpt-4, etc.)
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

token_counts = []

# Visit every .json file in the folder.
# NOTE: the whole file content (the raw JSON text, not just the transcript
# text inside it) is what gets tokenized.
for filename in os.listdir(folder_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
        count = len(tokenizer.encode(text))
        token_counts.append(count)
        print(f"{filename}: {count} tokens")

# Report the mean, or say so when no file matched
if not token_counts:
    print("No .json files found in the folder.")
else:
    avg_tokens = sum(token_counts) / len(token_counts)
    print(f"\n Average tokens per file: {avg_tokens:.2f}")

1_854877-hd_1920_1080_25fps.json: 190 tokens
2_1585618-hd_1280_720_30fps.json: 210 tokens
3_4438080-hd_1920_1080_25fps.json: 190 tokens
4_4.json: 224 tokens
5_10182004-hd_3240_2160_24fps.json: 190 tokens

 Average tokens per file: 200.80


In [9]:
#4)Upload Embeddings to Pinecone


import os
import json
import uuid
import openai
import pinecone
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm
from tqdm.auto import tqdm
import numpy as np


# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
# One name drives the existence check, the creation and the connection.
# Fall back to 'audio-search' (the name this notebook used when creating the
# index) if PINECONE_INDEX is not set.
pinecone_index_name = os.getenv("PINECONE_INDEX") or "audio-search"

# Initialize Pinecone
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Create the index if it doesn't exist yet, under the same name that is
# checked above and opened below.
if pinecone_index_name not in pc.list_indexes().names():
    pc.create_index(
        name=pinecone_index_name,
        dimension=1536,  # must match the length of the embedding vectors upserted into it
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(pinecone_index_name)

# Load chunked transcript file
with open(r"C:\Users\imeyalo\Downloads\chunked_transcripts.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Get embeddings from OpenAI
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` with the
    ``text-embedding-3-small`` model and returns the ``"embedding"`` entry of
    the first item in the response's ``"data"`` list.
    """
    request_args = {"input": text, "model": "text-embedding-3-small"}
    response = openai.Embedding.create(**request_args)
    first_item = response["data"][0]
    return first_item["embedding"]

# Upload chunks to Pinecone in batches
batch_size = 25
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    vectors = []
    for item in batch:
        # A blank chunk carries nothing to search for, so skip it
        if not item["text"].strip():
            print(f"Skipping empty chunk from {item['file']} at {item['start']}s")
            continue

        # Derive the ID from the chunk's identity (file + time range) instead
        # of a random UUID, so re-running this cell overwrites the same
        # vectors rather than adding duplicates to the index.
        chunk_key = f"{item['file']}#{item['start']}-{item['end']}"
        vector_id = str(uuid.uuid5(uuid.NAMESPACE_URL, chunk_key))

        embedding = get_embedding(item["text"])
        metadata = {
            "file": item["file"],
            "start": item["start"],
            "end": item["end"],
            "text": item["text"]
        }
        vectors.append((vector_id, embedding, metadata))

    # Nothing to send when every chunk of the batch was skipped
    if vectors:
        index.upsert(vectors)
    print(f"Uploaded {len(vectors)} vectors.")

print("All chunks uploaded to Pinecone.")




Uploaded 5 vectors.
All chunks uploaded to Pinecone.


In [10]:
# Query Pinecone with Semantic Search

def semantic_search(query, top_k=5):
    """Embed ``query`` and return the matches found in the Pinecone index.

    The query text is embedded with ``get_embedding`` and passed to
    ``index.query`` together with ``top_k`` and ``include_metadata=True``.
    The ``"matches"`` entry of the response is returned.
    """
    response = index.query(
        vector=get_embedding(query),
        top_k=top_k,
        include_metadata=True,
    )
    return response["matches"]

# Smoke-test the search with a sample phrase and print every hit
print("\n Test search:")
query = "Thank you"
results = semantic_search(query)

# Each hit is shown as its time range and source file, then the chunk text
for meta in (match["metadata"] for match in results):
    print(f"[{meta['start']}s - {meta['end']}s] ({meta['file']})")
    print(meta['text'])
    print("---")


 Test search:
[0.0s - 9.5s] (4_4.json)
Thank you for watching the video.
---
[0.0s - 23.5s] (2_1585618-hd_1280_720_30fps.json)
Thank you for watching!
---
[0.0s - 2.059999942779541s] (3_4438080-hd_1920_1080_25fps.json)
you
---
[0.0s - 2.059999942779541s] (5_10182004-hd_3240_2160_24fps.json)
you
---
[0.0s - 2.059999942779541s] (1_854877-hd_1920_1080_25fps.json)
you
---


In [None]:
References
1. https://www.pexels.com
2. https://www.pixabay.com
    