## Installation

In [None]:
!pip install --upgrade chromadb

## Process the subtitles (SRT) file

Each entry in the subtitles (SRT) file consists of the following fields:

1. A numeric counter identifying each sequential subtitle.
2. The time that the subtitle should appear on the screen, followed by --> and the time it should disappear.
3. The subtitle text itself, on one or more lines.
4. A blank line containing no text, indicating the end of the subtitle.


In [2]:
import re
def extract_srt_blocks(file_path):
    """Parse an SRT subtitle file into a list of subtitle records.

    Args:
        file_path: Path of the SRT file; it is read as UTF-8 text.

    Returns:
        A list of dicts, one per matched subtitle, with the keys 'index'
        (the counter as an int), 'start_time' and 'end_time'
        (``hh:mm:ss,mmm`` strings) and 'text' (the subtitle text with
        surrounding whitespace stripped).
    """
    # One block is: a counter line, a "start --> end" line, then the text,
    # matched lazily up to the first blank line or the end of the content.
    block_re = re.compile(
        r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+?)(?=\n\n|\n$|\Z)',
        re.DOTALL,  # lets the text group span several lines
    )

    with open(file_path, 'r', encoding='utf-8') as srt_file:
        raw_text = srt_file.read()

    parsed = []
    for found in block_re.finditer(raw_text):
        counter, begins, ends, body = found.groups()
        parsed.append({
            'index': int(counter),
            'start_time': begins,
            'end_time': ends,
            'text': body.strip(),
        })
    return parsed

# Parse the subtitle file (path is relative to the working directory).
file_path = 'imitation game.srt'
blocks = extract_srt_blocks(file_path)

# Preview the first four parsed subtitles, each followed by a blank line.
for block in blocks[:4]:
    print(
        f"Index: {block['index']}",
        f"Start Time: {block['start_time']}",
        f"End Time: {block['end_time']}",
        f"Text: {block['text']}",
        sep="\n",
        end="\n\n",
    )

# Sanity check: the counter of the last subtitle must equal the number of
# parsed blocks, otherwise the pattern skipped some entries.
assert blocks[-1]['index'] == len(blocks)

Index: 1
Start Time: 00:00:33,466
End Time: 00:00:35,769
Text: (Morse code transmissions)

Index: 2
Start Time: 00:00:49,448
End Time: 00:00:51,651
Text: (footsteps approaching)

Index: 3
Start Time: 00:00:51,684
End Time: 00:00:54,054
Text: ♪ ♪

Index: 4
Start Time: 00:01:04,463
End Time: 00:01:05,932
Text: (sniffs)



Aggregate the subtitle text into fixed-length time windows (chunks), so that each window can later be stored and searched as one document.

In [3]:
def srt_time_to_seconds(time_str):
    """Convert an SRT timestamp (``hh:mm:ss,mmm``) into seconds.

    Args:
        time_str: Timestamp made of four integer fields: hours, minutes,
            seconds and milliseconds.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If the string does not split into exactly four
            integer fields.
    """
    # Turn the millisecond comma into a colon so a single split yields all
    # four fields.
    fields = [int(field) for field in time_str.replace(',', ':').split(':')]
    hh, mm, ss, millis = fields
    whole_seconds = hh * 3600 + mm * 60 + ss
    return whole_seconds + millis / 1000

def seconds_to_srt_time(seconds):
    """Format a number of seconds as an SRT timestamp (``hh:mm:ss,mmm``).

    Args:
        seconds: Time in seconds, as an int or a float.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Work in whole milliseconds. Extracting the fractional part of a float
    # and truncating it drops a millisecond for values such as 1.001, whose
    # binary representation sits just below the decimal value.
    total_ms = int(round(seconds * 1000))
    hours, rem_ms = divmod(total_ms, 3600000)
    minutes, rem_ms = divmod(rem_ms, 60000)
    secs, millis = divmod(rem_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def aggregate_srt_blocks(blocks, window_size=60):
    """Group subtitle blocks into consecutive fixed-length time windows.

    Windows start at 0 seconds and are ``window_size`` seconds long. Blocks
    are consumed in list order: each one goes into the first window whose
    end time is at or after the block's start time, so ``blocks`` is
    expected to be sorted by start time. Windows that receive no subtitle
    are kept, with an empty text.

    Args:
        blocks: Sequence of dicts with 'start_time', 'end_time' and 'text'
            keys, the times given as ``hh:mm:ss,mmm`` strings.
        window_size: Window length in seconds; must be positive.

    Returns:
        A list of dicts with the keys 'id' (window counter as a string),
        'text' (every subtitle text of the window, each preceded by a
        newline), 'start_time' and 'end_time' (``hh:mm:ss,mmm`` strings).
        An empty ``blocks`` sequence yields an empty list.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        # a non-positive step would never advance the window start
        raise ValueError("window_size must be a positive number of seconds")
    if len(blocks) == 0:
        return []

    aggregated_windows = []
    total_blocks = len(blocks)
    # windows are produced until one second past the end of the last block
    last_end_time = srt_time_to_seconds(blocks[-1]['end_time'])
    curr_start_time = 0
    curr_block_index = 0
    curr_id = 0
    while curr_start_time < last_end_time + 1:
        window_end = curr_start_time + window_size
        window_text = ''
        # The bounds check comes first so that no block is looked up once
        # all of them are consumed (further windows may still be produced).
        # There is deliberately no lower bound on the start time: a block
        # starting exactly at the window start (e.g. 00:00:00,000) must be
        # consumed here, otherwise it would block every later window.
        while (curr_block_index < total_blocks
               and srt_time_to_seconds(blocks[curr_block_index]['start_time']) <= window_end):
            # each text is prefixed with a newline, including the first one
            window_text += "\n" + blocks[curr_block_index]['text']
            curr_block_index += 1
        aggregated_windows.append({
            'id': str(curr_id),
            'text': window_text,
            'start_time': seconds_to_srt_time(curr_start_time),
            'end_time': seconds_to_srt_time(window_end),
        })
        # slide to the next window
        curr_start_time = window_end
        curr_id += 1
    return aggregated_windows

# Build the retrieval chunks: group the parsed subtitles into 120-second windows.
chunks = aggregate_srt_blocks(blocks, window_size=120)

## Create a collection in Chroma

Collections are where you'll store your embeddings, documents, and any additional metadata. You can create a collection with a name.

In [4]:
import chromadb

# create a chroma client with the library's default settings
# NOTE(review): presumably an in-memory client whose data does not outlive
# the kernel — confirm against the installed chromadb version
chroma_client = chromadb.Client()

In [5]:
# Create the collection, or reuse it when it already exists, so that this
# cell can be re-run on a live kernel without an "already exists" error.
collection = chroma_client.get_or_create_collection(name="imitation_game")
# get the collection (optional): an existing collection can also be looked
# up by its name
collection = chroma_client.get_collection("imitation_game")

## Add some text documents to the collection

Chroma will store your text, and handle tokenization, embedding, and indexing automatically.

In [6]:
# Store every window as one document: its id, its aggregated text and the
# window's time range as metadata.
collection.add(
    ids=[window['id'] for window in chunks],
    documents=[window['text'] for window in chunks],
    metadatas=[
        dict(start=window['start_time'], end=window['end_time'])
        for window in chunks
    ],
)

## Query the collection

Query the collection using the `.query()` method.

In [7]:
# Ask for the single closest chunk to a paraphrased line of dialogue.
results = collection.query(
    n_results=1,
    query_texts=["people who no one can imagine does things that no one can imagine"],
)
# [0][0] selects the first hit of the first (and only) query text.
# Print its time range as "start --> end", then the matching text.
print(*(results['metadatas'][0][0][bound] for bound in ('start', 'end')), sep=' --> ')
print(results['documents'][0][0])

01:46:00,000 --> 01:48:00,000

Do you know,
this morning...
(sighs) I was
on a train
that went through a city
that wouldn't exist
if it wasn't for you.
I bought a ticket
from a man
who would likely be dead
if it wasn't for you.
I read up
on my work...
a whole field of
scientific inquiry
that only exists
because of you.
Now, if you wish you
could have been normal...
I can promise
you I do not.
The world is an
infinitely better place
precisely because
you weren't.
Do you...
do you really think that?
I think...
that sometimes
it is the people
who no one imagines
anything of
who do the things
that no one can imagine.
♪ ♪
(sighs)
(exhales)
♪ ♪


## Clean up

Delete the created collection.

In [8]:
# Remove the "imitation_game" collection that was created earlier in this notebook.
chroma_client.delete_collection("imitation_game")