In [None]:
# Model identifier; it is not referenced again in the cells visible in this notebook.
model_id = "Llama-2-7b-chat-hf"
# Alternative input (kept for reference): the folder holding the transcripts.
# input_path = "pipeline_files/2_transcribed_audio"
# Transcript JSON file that the loader cell reads.
input_path = "pipeline_files/2_transcribed_audio/gestion_de_requisitos_con_Redmine.json"
# Directory that is wiped/recreated below and later used as the Chroma persist directory.
output_path = "pipeline_files/3_vectordb"

In [None]:
import os
import shutil

# Start from an empty output directory: remove whatever a previous run left
# behind, then create the folder again.
output_dir_present = os.path.exists(output_path)
if output_dir_present:
    shutil.rmtree(output_path)
os.makedirs(output_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
import torch

# Name of the torch device to use: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## LOADER

In [None]:
import json

def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Example usage: load the transcript and preview only its first entries.
# Printing the complete transcript floods the notebook output and bloats the
# saved file, so just a short slice is shown.
json_data = read_json_file(input_path)
print(json_data[:5])


In [None]:
# Accumulated segment time, in seconds, that a chunk must reach before it is emitted.
chunk_duration = 10

def time_to_seconds(time_str):
    """Convert an ``HH:MM:SS[,fff]`` timestamp into a number of seconds.

    The fractional part is optional and may be separated by either a comma
    (SRT style) or a dot (WebVTT style). It is read as a decimal fraction of
    a second, so ``"00:00:01,5"`` and ``"00:00:01,500"`` both yield ``1.5``.

    Args:
        time_str: Timestamp string with colon-separated hours, minutes and
            seconds.

    Returns:
        The timestamp expressed in seconds, as a float.

    Raises:
        IndexError: If fewer than three colon-separated fields are present.
        ValueError: If any field is not numeric.
    """
    time_components = time_str.split(':')
    hours = int(time_components[0])
    minutes = int(time_components[1])
    # Normalise the decimal separator, then split off the (optional) fraction.
    whole_part, _, fraction_part = time_components[2].strip().replace(',', '.').partition('.')
    seconds = float(whole_part)
    # Scale by the number of digits so that non-3-digit fractions are still correct.
    fraction = int(fraction_part) / 10 ** len(fraction_part) if fraction_part else 0.0
    return hours * 3600 + minutes * 60 + seconds + fraction

# Merge consecutive transcript segments into chunks that each hold at least
# `chunk_duration` seconds of segment time.
duration_chunk, chunk_text, chunk_start, chunk_end = 0, "", "00:00:00,000", "00:00:00,000"
chunks = []
for el in json_data:
    start = time_to_seconds(el["start"])
    end = time_to_seconds(el["end"])
    duration_chunk += end - start
    chunk_text += el["text"] + " "
    if duration_chunk >= chunk_duration:
        chunk_end = el["end"]
        chunks.append({"filename": el["filename"], "start": chunk_start, "end": chunk_end, "text": chunk_text})
        # The next chunk begins where this one ended.
        chunk_start = el["end"]
        chunk_text = ""
        duration_chunk = 0
# Flush the tail: segments after the last full chunk used to be dropped, which
# lost the end of the transcript. `chunk_text` is only non-empty when at least
# one segment is pending, so `el` is guaranteed to be bound here.
if chunk_text:
    chunk_end = el["end"]
    chunks.append({"filename": el["filename"], "start": chunk_start, "end": chunk_end, "text": chunk_text})
# Preview only; printing every chunk floods the notebook output.
print(chunks[:5])
# NOTE: this rebinds `json_data` to the aggregated chunks, so re-running this
# cell without re-running the loader would aggregate the chunks a second time.
json_data = chunks

In [None]:
from langchain.docstore.document import Document

# Wrap every chunk in a LangChain Document: the chunk text is the page content,
# while the source file name and the time span travel along as metadata.
splits = [
    Document(
        page_content=el["text"],
        metadata={
            "filename": el["filename"],
            "start": el["start"],
            "end": el["end"],
        },
    )
    for el in json_data
]
splits[:5]

In [None]:
# Number of Document objects held in `splits`.
len(splits)

## VECTORS

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Embed every document with the default HuggingFace embedding model and store
# the vectors in a Chroma database persisted under `output_path`.
embedding_model = HuggingFaceEmbeddings()
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    persist_directory=output_path,
)

In [None]:
# Print the item count reported by the store's underlying collection.
# NOTE(review): `_collection` is a private attribute of the vector store object.
print(vectordb._collection.count())

In [None]:
# Smoke test: fetch the three chunks closest to a sample question and show
# where each one comes from (file and time span) followed by its text.
question = "que es la wiki?"
answer = vectordb.similarity_search(question, k=3)
for hit in answer:
    source = hit.metadata
    print(f"{{'filename': '{source['filename']}', 'start': '{source['start']}', 'end': '{source['end']}'}}")
    print(hit.page_content)

## CLASS

In [1]:
# Configuration for the class-based pipeline below. These assignments rebind
# `input_path`, `output_path` and `chunk_duration` from the exploratory cells.
# Previous single-file input, kept for reference:
# input_path = "pipeline_files/2_transcribed_audio/gestion_de_requisitos_con_Redmine.json"
# Input location; the driver cell accepts either a directory of .json files or a single file.
input_path = "pipeline_files/2_transcribed_audio_coffee"
# Previous output directory, kept for reference:
# output_path = "pipeline_files/3_vectordb"
# Directory passed to VectorDBGenerator as the vector store location.
output_path = "pipeline_files/3_vectordb_coffee"
chunk_duration = 10 # in seconds
# When True, VectorDBGenerator deletes and recreates `output_path` on construction.
reset_db = True

In [2]:
import json
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import os
import shutil
import torch


class VectorDBGenerator:
    """Turn timestamped transcript JSON files into a persisted Chroma vector store.

    Every input file is read as an iterable of segment dicts carrying the keys
    ``episode``, ``start``, ``end`` and ``content``. Consecutive segments are
    merged into chunks of at least ``chunk_duration`` seconds of segment time,
    each chunk is wrapped in a LangChain ``Document`` and the documents are
    embedded into the Chroma store persisted at ``output_path``.
    """

    def __init__(self, output_path, reset_db=True):
        """Prepare the output location and the embedding model.

        Args:
            output_path: Directory used as the Chroma persist directory.
            reset_db: When true, ``output_path`` is deleted (if present) and
                recreated empty before anything is written.
        """
        self.output_path = output_path
        if reset_db:
            self.__delete_db()
        # NOTE(review): the detected device is stored but not handed to the
        # embedding model below, which is built with its default settings.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.embeddings = HuggingFaceEmbeddings()

    def generate_vectordb(self, input_path, chunk_duration):
        """Ingest one transcript file into the vector store.

        Args:
            input_path: Path of the transcript JSON file.
            chunk_duration: Accumulated segment time, in seconds, a chunk must
                reach before it is emitted.

        Returns:
            The object returned by ``Chroma.from_documents`` for this file.
        """
        json_data = self.__read_json_file(input_path)
        chunks = self.__chunk_aggregator(json_data, chunk_duration)
        documents = self.__generate_documents(chunks)
        vectordb = self.__generate_vectors(documents)
        return vectordb

    def __read_json_file(self, file_path):
        """Return the decoded content of the UTF-8 JSON file at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data

    def __chunk_aggregator(self, data, chunk_duration):
        """Merge consecutive segments into chunks of at least ``chunk_duration`` seconds.

        Args:
            data: Iterable of segment dicts with the keys ``episode``,
                ``start``, ``end`` and ``content``.
            chunk_duration: Accumulated segment time (seconds) that closes a chunk.

        Returns:
            A list of dicts with the keys ``episode``, ``start``, ``end`` and
            ``content``. The first chunk starts at ``"00:00:00.000"`` and every
            later chunk starts at the end timestamp of the previous one. The
            segments left after the last full chunk form a final, shorter chunk.
        """
        duration_chunk, chunk_text, chunk_start = 0, "", "00:00:00.000"
        chunks = []
        last_segment = None
        for el in data:
            start = self.__time_to_seconds(el["start"])
            end = self.__time_to_seconds(el["end"])
            duration_chunk += end - start
            chunk_text += el["content"] + " "
            last_segment = el
            if duration_chunk >= chunk_duration:
                chunks.append({"episode": el["episode"], "start": chunk_start, "end": el["end"], "content": chunk_text})
                # The next chunk begins where this one ended.
                chunk_start = el["end"]
                chunk_text = ""
                duration_chunk = 0
        # Flush the tail: without this the segments following the last full
        # chunk were silently dropped, losing the end of every transcript.
        if chunk_text and last_segment is not None:
            chunks.append({"episode": last_segment["episode"], "start": chunk_start, "end": last_segment["end"], "content": chunk_text})
        return chunks

    def __time_to_seconds(self, time_str):
        """Convert an ``HH:MM:SS[.fff]`` timestamp into seconds (float).

        The fractional part is optional, may use a dot or a comma as
        separator, and is read as a decimal fraction of a second.

        Raises:
            IndexError: If fewer than three colon-separated fields are present.
            ValueError: If any field is not numeric.
        """
        time_components = time_str.split(':')
        hours = int(time_components[0])
        minutes = int(time_components[1])
        # Normalise the decimal separator, then split off the (optional) fraction.
        whole_part, _, fraction_part = time_components[2].strip().replace(',', '.').partition('.')
        seconds = float(whole_part)
        # Scale by the digit count so non-3-digit fractions are still correct.
        fraction = int(fraction_part) / 10 ** len(fraction_part) if fraction_part else 0.0
        return hours * 3600 + minutes * 60 + seconds + fraction

    def __generate_documents(self, data):
        """Wrap every chunk dict in a LangChain ``Document``.

        The chunk's ``content`` becomes the page content; its ``episode`` is
        stored under the metadata key ``filename`` next to ``start`` and ``end``.
        """
        splits = []
        for el in data:
            metadata = {
                "filename": el["episode"],
                "start": el["start"],
                "end": el["end"],
            }
            splits.append(Document(page_content=el["content"], metadata=metadata))
        return splits

    def __generate_vectors(self, documents):
        """Embed ``documents`` into the Chroma store persisted at ``self.output_path``."""
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=self.output_path
        )
        return vectordb

    def __delete_db(self):
        """Remove ``self.output_path`` if it exists, then recreate it empty."""
        if os.path.exists(self.output_path):
            shutil.rmtree(self.output_path)
        os.makedirs(self.output_path)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# generator = VectorDBGenerator(output_path, reset_db)
# if os.path.isdir(input_path):
#     for file_name in os.listdir(input_path):
#         if file_name.endswith(".json"):
#             file_path = os.path.join(input_path, file_name)
#             generator.generate_vectordb(file_path, chunk_duration)
# else:
#     generator.generate_vectordb(input_path, chunk_duration)

In [4]:
from tqdm import tqdm

# Build the vector store from either a directory of transcripts or a single file.
generator = VectorDBGenerator(output_path, reset_db)
if os.path.isdir(input_path):
    # Filter and sort up front: the progress bar then counts only the files
    # that are actually ingested, and the ingestion order no longer depends on
    # the arbitrary order returned by os.listdir.
    json_file_names = sorted(
        file_name for file_name in os.listdir(input_path) if file_name.endswith(".json")
    )
    for file_name in tqdm(json_file_names, desc="Processing files"):
        file_path = os.path.join(input_path, file_name)
        generator.generate_vectordb(file_path, chunk_duration)
else:
    generator.generate_vectordb(input_path, chunk_duration)


Processing files: 100%|██████████| 490/490 [04:39<00:00,  1.75it/s]
