<a href="https://colab.research.google.com/github/hhlearntocode/quoepWEB/blob/main/ASR_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**SPLIT AUDIO FROM VIDEO**

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install moviepy
!pip install ffmpeg-python
!pip install tqdm

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [4]:
from moviepy.editor import VideoFileClip
import os
import sys

def convert_video_to_audio_moviepy(video_file, path_destination, output_ext="wav"):
    """Extract the audio track of a video and save it with MoviePy.

    The audio file is written into ``path_destination`` and keeps the base
    name of the video, with ``output_ext`` as its extension.

    Args:
        video_file: Path of the source video.
        path_destination: Folder that receives the audio file; created if missing.
        output_ext: Extension (without the dot) of the audio file to write.

    Returns:
        None. A video without an audio track is reported and skipped.
    """
    os.makedirs(path_destination, exist_ok=True)
    name, _ = os.path.splitext(os.path.basename(video_file))
    # Create the full path for the output file
    output_path = os.path.join(path_destination, f"{name}.{output_ext}")

    clip = VideoFileClip(video_file)
    try:
        # `audio` is None when the video has no audio stream; skip it instead
        # of failing with AttributeError in the middle of a batch.
        if clip.audio is None:
            print(f"Skipped (no audio track): {video_file}")
            return
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release the underlying reader, even when writing fails.
        clip.close()

def process_videos_in_folder(input_folder, output_folder):
    """Extract the audio of every video found directly inside ``input_folder``.

    Files are recognised by extension (``.mp4``, ``.avi``, ``.mov``, ``.mkv``),
    compared case-insensitively so that e.g. ``CLIP.MP4`` is not skipped.
    Videos are processed in sorted name order.

    Args:
        input_folder: Folder that is scanned for videos (not recursive).
        output_folder: Folder that receives the audio files; created if missing.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on the file system
    for filename in sorted(os.listdir(input_folder)):
        if filename.lower().endswith(video_extensions):
            video_path = os.path.join(input_folder, filename)
            convert_video_to_audio_moviepy(video_path, output_folder)

##### INPUT_FOLDER ######
# Folder on Drive that is scanned for source videos.
video_path = '/content/drive/MyDrive/database/input'

##### OUTPUT_FOLDER #####
# Folder on Drive that receives the extracted audio files.
path_destination = '/content/drive/MyDrive/database/output'

# Extract the audio track of every video in the input folder.
process_videos_in_folder(video_path, path_destination)

MoviePy - Writing audio in /content/drive/MyDrive/database/output/C0001.wav




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/database/output/C0000.wav


                                                                      

MoviePy - Done.




**SPLIT AUDIO ACCORDING TO SCENES (SEGMENTATION)**

In [7]:
import os
import json
import ffmpeg
import glob
from tqdm import tqdm

def convert_frame_to_second(number_frame, path_file_audio):
    """Convert a frame index into a timestamp in seconds.

    The frame rate is read with ``ffmpeg.probe`` from the first stream whose
    ``codec_type`` is ``'video'``. If the file reports no such stream, the
    first stream is used.

    Args:
        number_frame: Frame index to convert.
        path_file_audio: Path of the media file whose frame rate is probed.

    Returns:
        The position of the frame in seconds, as a float.

    Raises:
        ZeroDivisionError: If the selected stream reports a frame rate whose
            numerator is 0.
    """
    info = ffmpeg.probe(path_file_audio)
    streams = info['streams']
    # Stream 0 is not guaranteed to be the video stream; an audio stream
    # reports "0/0" as its avg_frame_rate, which cannot be converted.
    stream = next((s for s in streams if s.get('codec_type') == 'video'), streams[0])
    frames, seconds = stream['avg_frame_rate'].split("/")
    return (number_frame * int(seconds)) / int(frames)


# Folder with the scene-list text files (one "<video name>.txt" per video)
FOLDER_TEXT_FILES = "/content/drive/MyDrive/database/textfile"
# Folder with the source videos
FOLDER_VIDEOS = "/content/drive/MyDrive/database/input"
# Folder where the per-scene audio clips are saved
FOLDER_AUDIOS_SPLIT = "/content/drive/MyDrive/database/result"


def create_dict_scenes(file_path):
    """Read a scene-list file into ``{file name: [[start, end], ...]}``.

    Every non-blank line must hold two whitespace-separated integers: the
    first and the last frame of a scene. Blank lines are ignored.

    Args:
        file_path: Path of the scene-list text file.

    Returns:
        A one-entry dict whose key is the base name of ``file_path`` and whose
        value is the list of ``[start, end]`` pairs in file order.

    Raises:
        ValueError: If a non-blank line does not hold exactly two integers.
    """
    scenes = []
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at the end of the file)
                continue
            start, end = map(int, fields)
            scenes.append([start, end])
    return {os.path.basename(file_path): scenes}

# Collect the scene lists of every .txt file into one lookup table,
# keyed by the text file's name.
scene_list_paths = [
    os.path.join(FOLDER_TEXT_FILES, entry)
    for entry in os.listdir(FOLDER_TEXT_FILES)
    if entry.endswith('.txt')
]

dict_scenes = {}
for scene_list_path in scene_list_paths:
    dict_scenes.update(create_dict_scenes(scene_list_path))

print(dict_scenes)

# Cut the audio of every video into one 16 kHz mono WAV file per scene.
for file_name_video in tqdm(os.listdir(FOLDER_VIDEOS)):
    path_file_name_video = os.path.join(FOLDER_VIDEOS, file_name_video)
    video_stem = os.path.splitext(file_name_video)[0]
    name_video_txt = f"{video_stem}.txt"

    # Only videos that have a scene list can be split
    if name_video_txt not in dict_scenes:
        continue

    output_folder = os.path.join(FOLDER_AUDIOS_SPLIT, video_stem)
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
        print("created folder : ", output_folder)

    # The probe result does not change between scenes of the same file, so
    # probe once per video instead of twice per scene.
    seconds_per_frame = convert_frame_to_second(1, path_file_name_video)

    # Get start frame and end frame of each scene
    for start_scene, end_scene in dict_scenes[name_video_txt]:
        # Convert to seconds
        start_time = start_scene * seconds_per_frame
        end_time = end_scene * seconds_per_frame
        stream = ffmpeg.input(path_file_name_video)

        # Path of the audio clip to write
        name_file_video_split = os.path.join(output_folder, f"{video_stem}_{start_scene}_{end_scene}.wav")

        audio = stream.audio.filter('atrim', start=start_time, end=end_time)
        # overwrite_output replaces an existing clip without prompting
        try:
            ffmpeg.output(audio, name_file_video_split, **{'ar': '16000', 'ac': '1'}).overwrite_output().run(capture_stderr=True)
        except ffmpeg.Error as e:
            # Only stderr is captured, and it arrives as bytes: decode it so the
            # ffmpeg log is readable, and name the clip that failed.
            print(f"ffmpeg failed for {name_file_video_split}")
            print(e.stderr.decode("utf-8", errors="replace") if e.stderr else e)

{'C0000.txt': [[0, 43], [44, 88], [89, 110], [111, 121], [122, 142], [143, 282], [283, 648], [650, 702], [703, 793], [794, 872], [873, 973], [974, 1107], [1108, 1185], [1186, 1257], [1258, 2132], [2133, 2519], [2520, 2609], [2610, 2710], [2711, 2797], [2798, 2874], [2875, 2951], [2952, 3018], [3019, 3083], [3084, 3157], [3158, 3230], [3231, 3295], [3296, 3388], [3389, 3914], [3915, 3975], [3976, 4421], [4422, 4514], [4515, 4520], [4521, 4802], [4803, 4917], [4918, 5079], [5080, 5187], [5188, 5284], [5285, 5387], [5388, 5441], [5442, 5490], [5491, 5544], [5545, 5592], [5593, 6434], [6435, 7811], [7812, 7889], [7890, 7968], [7969, 8058], [8059, 8135], [8136, 8219], [8220, 8280], [8281, 8343], [8344, 8422], [8423, 8508], [8509, 8602], [8603, 8705], [8706, 8771], [8772, 8895], [8896, 8931], [8932, 9050]], 'C0001.txt': [[0, 43], [44, 321], [322, 376], [377, 412], [413, 442], [443, 470], [471, 503], [504, 529], [530, 559], [560, 593], [594, 627], [628, 664], [665, 695], [696, 722], [723, 752

  0%|          | 0/2 [00:00<?, ?it/s]

created folder :  /content/drive/MyDrive/database/result/C0001


 50%|█████     | 1/2 [04:47<04:47, 287.12s/it]

created folder :  /content/drive/MyDrive/database/result/C0000


100%|██████████| 2/2 [06:22<00:00, 191.35s/it]


**INFERENCE MODEL**

In [1]:
!pip3 install transformers
!pip3 install soundfile
!pip3 install datasets
!pip3 install pyctcdecode
!pip3 install https://github.com/kpu/kenlm/archive/master.zip

Collecting pyctcdecode
  Using cached pyctcdecode-0.5.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting pygtrie<3.0,>=2.1 (from pyctcdecode)
  Using cached pygtrie-2.5.0-py3-none-any.whl.metadata (7.5 kB)
Collecting hypothesis<7,>=6.14 (from pyctcdecode)
  Using cached hypothesis-6.108.5-py3-none-any.whl.metadata (6.3 kB)
Using cached pyctcdecode-0.5.0-py2.py3-none-any.whl (39 kB)
Using cached hypothesis-6.108.5-py3-none-any.whl (465 kB)
Using cached pygtrie-2.5.0-py3-none-any.whl (25 kB)
Installing collected packages: pygtrie, hypothesis, pyctcdecode
Successfully installed hypothesis-6.108.5 pyctcdecode-0.5.0 pygtrie-2.5.0
Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip (553 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel fo

In [2]:
from huggingface_hub import hf_hub_download, hf_hub_url
import os, zipfile
import shutil
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import torch
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import IPython

# Local folder where the Hugging Face Hub downloads are cached
cache_dir = './cache/'
# Load the processor and the CTC model from the same Hub repository
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
# Download the zipped language-model file from that repository and unpack it into the cache folder
lm_file = hf_hub_download(repo_id="nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip', cache_dir=cache_dir)
with zipfile.ZipFile(lm_file, 'r') as zip_ref:
    zip_ref.extractall(cache_dir)
# Re-point lm_file at the unpacked file
# (assumes the archive holds 'vi_lm_4grams.bin' at its root — TODO confirm)
lm_file = cache_dir + 'vi_lm_4grams.bin'

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.65k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

vi_lm_4grams.bin.zip:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [3]:
# Get model

def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a beam-search CTC decoder backed by a KenLM n-gram model.

    The label list is derived from the tokenizer vocabulary, ordered by token
    id, with the last two entries dropped. The pad and unknown tokens are
    mapped to the empty string and the word delimiter to a space. A '!' label
    is appended and duplicate labels are then removed, keeping first
    occurrences. The resulting list and its length are printed.

    :param tokenizer: Tokenizer providing ``get_vocab()`` and the pad, unk and
        word-delimiter token ids
    :param ngram_lm_path: Path of the KenLM model file
    :return: A ``BeamSearchDecoderCTC`` using the labels and the language model
    """
    token_ids = tokenizer.get_vocab()
    by_id = sorted(token_ids.items(), key=lambda item: (item[1], item[0]))
    labels = [token for token, _ in by_id][:-2]

    # CTC blank, unknown token and word delimiter get their decoder forms
    for position, replacement in (
        (tokenizer.pad_token_id, ""),
        (tokenizer.unk_token_id, ""),
        (tokenizer.word_delimiter_token_id, " "),
    ):
        labels[position] = replacement

    # Append '!' and drop repeated labels, preserving first-seen order
    labels.append('!')
    unique_labels = []
    seen = set()
    for label in labels:
        if label not in seen:
            seen.add(label)
            unique_labels.append(label)

    print(len(unique_labels))
    print(unique_labels)

    # Create the alphabet, the language model and finally the decoder
    alphabet = Alphabet.build_alphabet(unique_labels)
    ngram_model = kenlm.Model(ngram_lm_path)
    return BeamSearchDecoderCTC(alphabet, language_model=LanguageModel(ngram_model))

# define function to read in sound file
def map_to_array(batch):
    """Read the sound file named by ``batch["file"]`` into the batch.

    The data returned by ``sf.read`` is stored under ``"speech"`` and the
    sampling rate under ``"sampling_rate"``; the same (mutated) batch object
    is returned.
    """
    audio, rate = sf.read(batch["file"])
    batch["speech"], batch["sampling_rate"] = audio, rate
    return batch

# Build the decoder once from the processor's tokenizer and the language-model file at lm_file;
# the inference loop below calls ngram_lm_model.decode(...) for every audio clip.
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)



110
['ẻ', '6', 'ụ', 'í', '3', 'ỹ', 'ý', 'ẩ', 'ở', 'ề', 'õ', '7', 'ê', 'ứ', 'ỏ', 'v', 'ỷ', 'a', 'l', 'ự', 'q', 'ờ', 'j', 'ố', 'à', 'ỗ', 'n', 'é', 'ủ', 'у', 'ô', 'u', 'y', 'ằ', '4', 'w', 'b', 'ệ', 'ễ', 's', 'ì', 'ầ', 'ỵ', '8', 'd', 'ể', ' ', 'r', 'ũ', 'c', 'ạ', '9', 'ế', 'ù', 'ỡ', '2', 't', 'i', 'g', '́', 'ử', '̀', 'á', '0', 'ậ', 'e', 'ộ', 'm', 'ẳ', 'ợ', 'ĩ', 'h', 'â', 'ú', 'ọ', 'ồ', 'ặ', 'f', 'ữ', 'ắ', 'ỳ', 'x', 'ó', 'ã', 'ổ', 'ị', '̣', 'z', 'ả', 'đ', 'è', 'ừ', 'ò', 'ẵ', '1', 'ơ', 'k', 'ẫ', 'p', 'ấ', 'ẽ', 'ỉ', 'ớ', 'ẹ', 'ă', 'o', 'ư', '5', '', '!']


In [6]:
from tqdm import tqdm
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Folder with the per-scene audio clips (one sub-folder per video)
FOLDER_RESULTS = "/content/drive/MyDrive/database/result"

# Folder where the transcripts are saved
FOLDER_TRANSCRIPTS = "/content/drive/MyDrive/database/transcript"

os.makedirs(FOLDER_TRANSCRIPTS, exist_ok=True)

for folder_name in tqdm(os.listdir(FOLDER_RESULTS)):
    # A loop-local name is used so the FOLDER_AUDIOS_SPLIT constant of the
    # segmentation cell is not overwritten.
    audio_folder = os.path.join(FOLDER_RESULTS, folder_name)

    if not os.path.isdir(audio_folder):
        continue  # Skip if it's not a directory

    for file_name_audio in os.listdir(audio_folder):
        if not file_name_audio.endswith('.wav'):
            continue

        path_file_name_audio = os.path.join(audio_folder, file_name_audio)
        clip_name = os.path.splitext(file_name_audio)[0]

        # Clip names look like "<video>_<start frame>_<end frame>"; split from
        # the right so the video name may have any length.
        try:
            frame_start, frame_end = map(int, clip_name.rsplit("_", 2)[-2:])
        except ValueError:
            print(f"Skipping {file_name_audio}: name is not <video>_<start>_<end>.wav")
            continue

        if frame_end == frame_start:
            continue

        transcript_folder = os.path.join(FOLDER_TRANSCRIPTS, folder_name, clip_name)
        os.makedirs(transcript_folder, exist_ok=True)

        try:
            ds = map_to_array({"file": path_file_name_audio})

            input_values = processor(
                ds["speech"],
                sampling_rate=ds["sampling_rate"],
                return_tensors="pt"
            ).input_values

            # Inference only: no autograd graph is needed
            with torch.no_grad():
                logits = model(input_values.to(device)).logits[0]
            output = ngram_lm_model.decode(logits.cpu().numpy(), beam_width=500)

            # Save the transcript as UTF-8, the encoding the later cells read it with
            transcript_file = os.path.join(transcript_folder, f"{clip_name}.txt")
            with open(transcript_file, "w", encoding="utf-8") as f:
                f.write(output)

        except Exception as e:
            # Best effort: report the failing clip and continue with the next one
            print(f"Error processing {file_name_audio}: {str(e)}")

print("Processing complete!")

  0%|          | 0/2 [00:00<?, ?it/s]

Error processing C0001_8808_8881.wav: [Errno 2] No such file or directory: '/content/drive/MyDrive/database/transcript/C0001/C0001_8808_8881/C0001_8808_8881.txt'


100%|██████████| 2/2 [01:20<00:00, 40.21s/it]

Processing complete!





In [7]:
import os

def remove_exclamation_marks(transcript_folder):
    """
    Strip every '!' from the .txt files found below the transcript folder.

    The folder is walked recursively. Each text file is read as UTF-8,
    rewritten without '!' characters, and its path is printed.

    :param transcript_folder: Path to the main transcript folder
    """
    for current_dir, _subdirs, names in os.walk(transcript_folder):
        for name in names:
            if not name.endswith('.txt'):
                continue

            target = os.path.join(current_dir, name)

            # Load the text and drop the exclamation marks in one go
            with open(target, 'r', encoding='utf-8') as source:
                cleaned = source.read().replace('!', '')

            # Overwrite the file with the cleaned text
            with open(target, 'w', encoding='utf-8') as sink:
                sink.write(cleaned)

            print(f"Processed: {target}")

# Usage: clean the transcript files in place (same folder the inference cell writes to)
FOLDER_TRANSCRIPTS = "/content/drive/MyDrive/database/transcript"
remove_exclamation_marks(FOLDER_TRANSCRIPTS)

Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_8882_8919/C0001_8882_8919.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_8920_9074/C0001_8920_9074.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9075_9130/C0001_9075_9130.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9131_9206/C0001_9131_9206.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9207_9278/C0001_9207_9278.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9279_9340/C0001_9279_9340.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9341_9367/C0001_9341_9367.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9368_9570/C0001_9368_9570.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9571_9596/C0001_9571_9596.txt
Processed: /content/drive/MyDrive/database/transcript/C0001/C0001_9597_9659/C0001_9597_9659.txt
Processed: /content/drive/MyDrive/databa

In [13]:
import os
import json
from tqdm import tqdm

def create_json_from_transcripts(transcript_folder, output_folder=None):
    """
    Create a single JSON file for each video folder (C0000, C0001, etc.) in the transcript directory.
    Each JSON file maps consecutive keys ("0", "1", ...) to the stripped text of
    the scene transcripts, ordered by the scenes' start frame. A scene folder
    without a .txt file gets no key, so keys count transcripts found, not scenes.

    :param transcript_folder: Path to the main transcript folder
    :param output_folder: Folder that receives the JSON files; created if
        missing. When omitted, the notebook-level ``json_folder`` variable is
        used, as before.
    """
    if output_folder is None:
        # Backward compatible fallback to the global defined by the calling cell
        output_folder = json_folder
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through video folders (C0000, C0001, etc.)
    for video_folder in os.listdir(transcript_folder):
        video_folder_path = os.path.join(transcript_folder, video_folder)

        if not os.path.isdir(video_folder_path):
            continue  # Skip if it's not a directory

        json_data = {}
        key = 0

        # Scene folders are named "<video>_<start>_<end>"; take the start frame
        # from the right so a video name containing '_' still sorts correctly.
        scene_folders = sorted(
            (f for f in os.listdir(video_folder_path) if os.path.isdir(os.path.join(video_folder_path, f))),
            key=lambda x: int(x.rsplit('_', 2)[-2]),
        )

        for scene_folder in tqdm(scene_folders, desc=f"Processing {video_folder}"):
            scene_folder_path = os.path.join(video_folder_path, scene_folder)

            # There should be only one txt file in each scene folder; sort so
            # the choice is deterministic if there are more.
            txt_files = sorted(f for f in os.listdir(scene_folder_path) if f.endswith('.txt'))

            if txt_files:
                txt_file_path = os.path.join(scene_folder_path, txt_files[0])

                # Read the content of the txt file
                with open(txt_file_path, 'r', encoding='utf-8') as f:
                    content = f.read().strip()

                # Add to json_data with key starting from 0
                json_data[str(key)] = content
                key += 1

        # Create JSON file for the video
        json_file_path = os.path.join(output_folder, f"{video_folder}.json")
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, ensure_ascii=False, indent=2)

        print(f"Created JSON file: {json_file_path}")

# PATH TO SAVE JSON FILE
# json_folder is read as a global by create_json_from_transcripts, so it must be defined before the call below.
json_folder = "/content/drive/MyDrive/database/json"
os.makedirs(json_folder, exist_ok=True)
# Usage: write one JSON file per video folder found in the transcript folder
FOLDER_TRANSCRIPTS = "/content/drive/MyDrive/database/transcript"
create_json_from_transcripts(FOLDER_TRANSCRIPTS)

Processing C0001: 100%|██████████| 177/177 [00:00<00:00, 227.62it/s]


Created JSON file: /content/drive/MyDrive/database/json/C0001.json


Processing C0000: 100%|██████████| 59/59 [00:00<00:00, 196.71it/s]


Created JSON file: /content/drive/MyDrive/database/json/C0000.json


**CREATE CLIPFEATURES**

In [2]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-0bsccran
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-0bsccran
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia

In [4]:
import torch
import clip
import numpy as np
import json
import glob
import os

# Load CLIP ViT-B/32 on the GPU when one is available.
# NOTE: this rebinds `device` and `model`, which the ASR cells above use for the Wav2Vec2 model;
# re-run the ASR model-loading cell before transcribing again.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Define the maximum context length
context_length = 77

# Function to split text into chunks within the context length
def split_text_into_chunks(text, max_length):
    """Split ``text`` into word-aligned chunks of at most ``max_length`` CLIP tokens.

    Words are packed greedily: a new chunk is started whenever adding the next
    word would push the running token count past ``max_length``.

    ``clip.tokenize`` returns a tensor padded to the full context length, so
    its ``shape[1]`` is the same for every input and cannot be used as a token
    count. The count is taken from the position of the end-of-text token
    instead, which carries the highest id in the row.

    Args:
        text: Text to split; words are separated on whitespace.
        max_length: Maximum number of tokens per chunk, not counting the
            start and end tokens.

    Returns:
        A list of chunk strings in reading order (empty for blank text). A
        single word that alone exceeds ``max_length`` is kept as its own
        chunk, because it cannot be split at a word boundary.
    """
    def _count_tokens(word):
        # Row layout: [start, tok_1 .. tok_n, end, 0 padding ...]; the end
        # token has the largest id, so argmax gives its index (n + 1).
        token_ids = clip.tokenize([word], truncate=True)[0]
        return max(int(token_ids.argmax()) - 1, 0)

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_tokens = _count_tokens(word)

        # Flush the running chunk before it would overflow
        if current_chunk and current_length + word_tokens > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(word)
        current_length += word_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Define the input folder and output folder paths
input_folder_path = '/content/drive/MyDrive/database/json'
output_folder_path = '/content/drive/MyDrive/database/CLIP FEATUREs'
os.makedirs(output_folder_path, exist_ok=True)

# Get a list of all JSON files in the input folder
json_files = sorted(glob.glob(os.path.join(input_folder_path, '*.json')))

for json_file_path in json_files:
    with open(json_file_path, 'r', encoding='utf-8') as fp:
        data = json.load(fp)

    descriptions = {}
    re_feats = []

    # Process each entry in the JSON data
    for idx, text in data.items():
        if not text.strip():
            continue  # Only process non-empty strings

        text_chunks = split_text_into_chunks(text, context_length - 2)  # Leave room for start and end tokens
        chunk_features = []
        for chunk in text_chunks:
            # truncate=True keeps an over-long chunk from raising inside tokenize
            clip_text = clip.tokenize([chunk], truncate=True).to(device)
            with torch.no_grad():
                text_features = model.encode_text(clip_text)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            # Keep float32 while averaging; the cast to float16 happens once at the end
            chunk_features.append(text_features.float().cpu().numpy().flatten())

        # Average the features if there are multiple chunks
        if chunk_features:
            avg_features = np.mean(chunk_features, axis=0)
            # The mean of unit vectors is not unit-length; re-normalize so every
            # stored vector has the same norm as a single-chunk feature.
            avg_norm = np.linalg.norm(avg_features)
            if avg_norm > 0:
                avg_features = avg_features / avg_norm
            re_feats.append(avg_features.astype(np.float16))

        descriptions[idx] = text_chunks

    # Define the output file names
    base_name = os.path.basename(json_file_path)
    file_name, _ = os.path.splitext(base_name)
    output_npy_path = os.path.join(output_folder_path, f'{file_name}_features.npy')

    if not re_feats:
        # An empty list would be saved as a 1-D array that the FAISS cell cannot index
        print(f"No non-empty text in {json_file_path}; no feature file written.")
        continue

    # Save the text features as a 2-D (entries x dim) .npy file
    np.save(output_npy_path, np.stack(re_feats))

print("Inference completed and results saved.")


Inference completed and results saved.


In [5]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [7]:
import faiss
import numpy as np
import glob
import os

# Path to the CLIP features
feat_path = '/content/drive/MyDrive/database/CLIP FEATUREs'

# Path to save the FAISS index
faiss_save_path = '/content/drive/MyDrive/database/FAISS_INDEX'

# Create the directory if it doesn't exist
os.makedirs(faiss_save_path, exist_ok=True)

# Get all feature files
feature_files = sorted(glob.glob(os.path.join(feat_path, '*_features.npy')))
if not feature_files:
    raise FileNotFoundError(f"No '*_features.npy' files found in {feat_path}")

# The index is created from the first file that actually holds vectors,
# because an empty file does not reveal the feature dimensionality.
index = None
dim = None

# Load all features into the index
for feature_file in feature_files:
    feats = np.load(feature_file)
    if feats.ndim != 2 or feats.shape[0] == 0:
        print(f"Skipping {feature_file}: it holds no feature vectors")
        continue

    # Ensure the features are contiguous float32 before adding them to the index
    feats = np.ascontiguousarray(feats, dtype=np.float32)
    if index is None:
        dim = feats.shape[1]
        index = faiss.IndexFlatL2(dim)
    index.add(feats)

if index is None:
    raise ValueError(f"None of the feature files in {feat_path} holds any vectors")

# Save the FAISS index
faiss_file_path = os.path.join(faiss_save_path, 'faiss_ASR.bin')
faiss.write_index(index, faiss_file_path)

print(f"FAISS index built and saved to {faiss_file_path}. Total vectors: {index.ntotal}")

FAISS index built and saved to /content/drive/MyDrive/database/FAISS_INDEX/faiss_ASR.bin. Total vectors: 223
