# Chargement des données depuis la BDD

In [10]:
import sqlite3

# Open the SQLite database file
conn = sqlite3.connect('../src/videos_youtube.db')
try:
    cursor = conn.cursor()

    # Load the id and transcription of every video that has a transcription
    # (plain string: the query has no placeholders, so no f-string is needed)
    cursor.execute("SELECT id, transcription FROM videos WHERE transcription IS NOT NULL;")
    results = cursor.fetchall()
finally:
    # Always release the connection, even when the query fails
    conn.close()

# Chunks des transcriptions

In [11]:
def split_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]:
    """Split ``text`` into consecutive, overlapping chunks of characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by two consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        The chunks in order of appearance. Only trailing chunks can be
        shorter than ``chunk_size``. An empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``chunk_overlap``
            is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Distance between the starts of two consecutive chunks. A step of zero
    # or less would never reach the end of the text (endless loop).
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [12]:
def generate_chunk_ids(input_list: list[tuple[int, str | None]], chunk_size: int = 500, chunk_overlap: int = 50) -> list[tuple[str, str]]:
    """Split every text into chunks and pair each chunk with a formatted ID.

    An ID is the original integer ID zero-padded to at least 2 digits,
    followed by the 1-based chunk position zero-padded to at least 4 digits
    (for example '010001'). Entries whose text is ``None`` are skipped.
    """
    chunk_list = []

    for source_id, content in input_list:
        if content is not None:
            pieces = split_text(content, chunk_size, chunk_overlap)
            # Chunk positions start at 1 within each source text
            chunk_list.extend(
                (f"{source_id:02d}{position:04d}", piece)
                for position, piece in enumerate(pieces, start=1)
            )

    return chunk_list

In [13]:
# Build the (chunk ID, chunk text) pairs from the rows fetched earlier
chunks_with_ids = generate_chunk_ids(input_list=results)
# Print the pairs for a visual check
print(chunks_with_ids)



In [12]:
#Sauvegarde CSV
import csv

def save_chunks_to_csv(chunks_with_ids: list[tuple[str, str]], filename: str):
    """Write the chunks and their IDs to a UTF-8 encoded CSV file.

    The file at ``filename`` is created or overwritten. It starts with the
    header row ``Chunk ID, Chunk Text``, followed by one row per
    ``(chunk ID, chunk text)`` pair, in input order.
    """
    header = ["Chunk ID", "Chunk Text"]
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        # One row per pair
        writer.writerows(
            [identifier, content] for identifier, content in chunks_with_ids
        )

# Export the chunk/ID pairs to a CSV file in the current working directory
save_chunks_to_csv(chunks_with_ids, filename='chunks.csv')

# Embeddings des chunks

In [13]:
from mistralai import Mistral

import os

# Read the Mistral API key from the environment instead of hard-coding it:
# a key written in the notebook is exposed to anyone who can read the file.
# NOTE(review): the key that was previously embedded here should be revoked.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the MISTRAL_API_KEY environment variable before running this cell."
    )

# Initialise the Mistral client
client = Mistral(api_key=api_key)

def generate_embedding(text: str):
    """Return the embedding computed by the Mistral API for one text.

    Sends ``text`` as a one-element batch to the ``mistral-embed`` model
    through the module-level ``client`` and returns the ``embedding``
    attribute of the first item in the response data.
    """
    response = client.embeddings.create(model="mistral-embed", inputs=[text])
    # Only one input was sent, so the embedding of interest is the first item
    return response.data[0].embedding

In [8]:
import json

# Reload the embeddings from the JSON archive
with open('/Users/pierrebourbon/Desktop/archive sise camp/transcripts_embeddings.json', 'r') as f:
    chunk_embed = json.loads(f.read())

# Sanity check: display the first two embeddings
print(chunk_embed[:2])

[[-0.030181884765625, 0.037109375, 0.043853759765625, 0.022491455078125, 0.01277923583984375, 0.0179443359375, 0.00940704345703125, -0.009368896484375, -0.0079193115234375, -0.016815185546875, -0.0289459228515625, 0.042572021484375, -0.029571533203125, 0.0138092041015625, -0.059112548828125, 0.05853271484375, 0.0017900466918945312, 0.01495361328125, 0.0243377685546875, 0.0272979736328125, -0.021759033203125, -0.01209259033203125, -0.0662841796875, -0.0281829833984375, 0.0032634735107421875, -0.00801849365234375, -0.00447845458984375, -0.06390380859375, -0.027801513671875, -0.006488800048828125, -0.01087188720703125, -0.016387939453125, -0.00811767578125, -0.026885986328125, -0.0010395050048828125, 0.006534576416015625, -0.01107025146484375, -0.01629638671875, -0.004547119140625, 0.0341796875, 0.018524169921875, -0.01436614990234375, -0.0200042724609375, -0.0075225830078125, -0.0209808349609375, -0.055206298828125, 0.021697998046875, 0.007549285888671875, 0.031463623046875, -0.034240722

In [6]:
# Keep only the chunk texts, dropping the IDs
chunks = [chunk_text for (_chunk_id, chunk_text) in chunks_with_ids]

In [15]:
import time

# Compute one embedding per chunk, in the same order as `chunks`
chunk_embed = []

for chunk in chunks:
    embedding = generate_embedding(chunk)
    chunk_embed.append(embedding)
    # Pause for 2 seconds between two consecutive requests
    time.sleep(2)


In [19]:
# Save the embeddings as JSON in the current working directory
import json
with open('transcripts_embeddings.json', 'w') as f:
    f.write(json.dumps(chunk_embed))

# Création de l'index Faiss pour les transcriptions

In [7]:
import faiss
import numpy as np

# Collect the chunk IDs (strings such as "010001"), in the same order as the
# chunks; `identifier` avoids shadowing the builtin `id`
chunk_id = [identifier for identifier, _ in chunks_with_ids]

# FAISS needs int64 IDs: convert explicitly instead of relying on NumPy's
# implicit string parsing (leading zeros are dropped: "010001" -> 10001)
faiss_ids = np.array([int(identifier) for identifier in chunk_id], dtype=np.int64)

# Stack the embeddings into a float32 matrix with one row per chunk
embedding_matrix = np.array(chunk_embed, dtype=np.float32)
if embedding_matrix.ndim != 2:
    raise ValueError(
        "chunk_embed must be a non-empty list of equally sized vectors, "
        f"got an array of shape {embedding_matrix.shape}"
    )
if embedding_matrix.shape[0] != len(faiss_ids):
    # Embeddings and chunks may come from different runs; a mismatch would
    # attach vectors to the wrong chunk IDs
    raise ValueError(
        f"{embedding_matrix.shape[0]} embeddings for {len(faiss_ids)} chunk IDs"
    )

# Vector dimension, derived from the data rather than hard-coded
dimension = embedding_matrix.shape[1]

# Create the flat L2 FAISS index
index2 = faiss.IndexFlatL2(dimension)

# Wrap it so that vectors are stored under the custom chunk IDs
index_with_ids2 = faiss.IndexIDMap(index2)
index_with_ids2.add_with_ids(embedding_matrix, faiss_ids)

# Save the index to disk
faiss.write_index(index_with_ids2, "faiss_index_transcripts.bin")