In [1]:
import psutil
# Snapshot system memory statistics and report the percentage currently in use
memory_stats = psutil.virtual_memory()
ram_usage = memory_stats.percent
print(f"Current RAM Usage: {ram_usage}%")


Current RAM Usage: 11.8%


In [2]:
# Locations of the subtitle archive and of the folder that receives its contents
import zipfile
zip_path = "/content/drive/MyDrive/chunked_subtitles.zip"  # previously saved zip archive
extract_path = "/content/chunked_subtitles_text"  # destination folder for the extracted files

# Unpack every archive member into extract_path; the context manager closes the archive afterwards
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

print("✅ Chunked subtitles extracted successfully!")

✅ Chunked subtitles extracted successfully!


In [3]:
import os

In [4]:
# Install LangChain and its community integrations.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install langchain langchain_community




In [5]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every chunked subtitle .txt file found directly under extract_path.
# The encoding is pinned to UTF-8 so decoding does not depend on the runtime's locale default.
loader = DirectoryLoader(
    extract_path,
    glob="*.txt",
    show_progress=True,
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
documents = loader.load()

# Fail early with a clear message instead of an IndexError in the example prints below
if not documents:
    raise FileNotFoundError(f"No .txt subtitle chunks found in {extract_path}")

# Parallel lists: texts[i] is the chunk content, names[i] is the path of the file it came from
texts = [doc.page_content for doc in documents]
names = [doc.metadata["source"] for doc in documents]  # source file paths, not bare titles

print(f"✅ Loaded {len(texts)} subtitle chunks for embedding.")
print(f"Example Movie/Series: {names[0]}")
print(f"Example Subtitle Chunk: {texts[0][:300]}")


100%|██████████| 24747/24747 [00:05<00:00, 4746.04it/s]

✅ Loaded 24747 subtitle chunks for embedding.
Example Movie/Series: /content/chunked_subtitles_text/dead_end_paranormal_park_13526.txt
Example Subtitle Chunk: -Whoa! Ah! -Ah! No! -Huh? Ah! Ouch! -Hey! Hello? Barney? Friends? You again. -Help! Help me! Barney? Norma? I'm coming! I'm coming! Phew. How long have you been watching me sleep? I watch you every night. You don't need to do that. I don't want to worry Barney, but I keep having these, well, they do





In [6]:
# Install sentence-transformers.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentence-transformers
# Optional, currently disabled:
# %pip install chromadb




In [7]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# Sentence-embedding model, placed on the GPU
model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")

# Encode the texts slice by slice so only one slice is processed per encode() call
batch_size = 500
embeddings = []

batch_starts = range(0, len(texts), batch_size)
for start in tqdm(batch_starts):
    # One vector per text in the current slice
    chunk_vectors = model.encode(texts[start:start + batch_size], device="cuda")
    embeddings.extend(chunk_vectors)

# Stack the per-text vectors into a single NumPy array
embeddings = np.array(embeddings)
print(f"✅ BERT embeddings complete! Shape: {embeddings.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 50/50 [10:49<00:00, 12.98s/it]

✅ BERT embeddings complete! Shape: (24747, 384)





In [8]:
import os

# Folder holding the extracted subtitle chunks
extract_path = "/content/chunked_subtitles_text"

# Enumerate the folder (os.listdir returns entries in arbitrary order) and show a small sample
files = os.listdir(extract_path)
file_count = len(files)
print(f"Total Files: {file_count}")
first_ten = files[:10]
print("First 10 filenames:", first_ten)


Total Files: 24747
First 10 filenames: ['dead_end_paranormal_park_13526.txt', 'kimi_ni_todoke_from_me_to_you_19955.txt', 'australian_survivor_12380.txt', 'capitain_marleau_18380.txt', 'pirates_of_the_great_salt_lake_10118.txt', 'the_young_riders_18935.txt', 'lima_elang_22473.txt', 'may_i_help_you_15687.txt', 'the_wonder_of_animals_11184.txt', 'if_winter_comes_12847.txt']


In [9]:
# Use the first listed file as a sanity check of the extracted data
sample_file = os.path.join(extract_path, files[0])

# Read the whole file as UTF-8 text
with open(sample_file, mode="r", encoding="utf-8") as handle:
    content = handle.read()

print(f"📄 File: {sample_file}\n")
preview = content[:500]  # only the first 500 characters are shown
print(preview)


📄 File: /content/chunked_subtitles_text/dead_end_paranormal_park_13526.txt

-Whoa! Ah! -Ah! No! -Huh? Ah! Ouch! -Hey! Hello? Barney? Friends? You again. -Help! Help me! Barney? Norma? I'm coming! I'm coming! Phew. How long have you been watching me sleep? I watch you every night. You don't need to do that. I don't want to worry Barney, but I keep having these, well, they don't feel like dreams, more like-- Premonitions? Yeah, maybe. I've had this one dream all my life, but I'm seeing it more and more. Such as? Well, the park is in ruins and there's this big flaming circ


In [10]:
import os

# Derive one movie/series label per loaded chunk from the chunk's source path.
# The labels are built from `names` (the loader's per-document source paths) rather than from
# the os.listdir() result: listdir order is arbitrary and unfiltered, so it is not guaranteed
# to match the order in which the documents (and therefore `texts` and the embeddings built
# from them) were loaded.
movie_series_names = [
    os.path.splitext(os.path.basename(source_path))[0]  # drop the directory and the .txt extension
    for source_path in names
]

# Every chunk needs exactly one label, otherwise index-based lookups would be misaligned
assert len(movie_series_names) == len(texts), "labels and subtitle chunks are misaligned"

# Check some extracted names
print(f"Example Movie/Series Name: {movie_series_names[0]}")
print(f"Example Subtitle Chunk: {texts[0][:300]}")


Example Movie/Series Name: dead_end_paranormal_park_13526
Example Subtitle Chunk: -Whoa! Ah! -Ah! No! -Huh? Ah! Ouch! -Hey! Hello? Barney? Friends? You again. -Help! Help me! Barney? Norma? I'm coming! I'm coming! Phew. How long have you been watching me sleep? I watch you every night. You don't need to do that. I don't want to worry Barney, but I keep having these, well, they do


In [11]:
import psutil
# Re-check memory usage at this point in the notebook
ram_percent_now = psutil.virtual_memory().percent
print(f"Current RAM Usage: {ram_percent_now}%")


Current RAM Usage: 27.5%


In [None]:
import numpy as np

npz_base_path = "/content/drive/MyDrive"

# The three arrays are index-aligned (entry i of each describes the same chunk), so refuse to
# persist them if their lengths disagree; a mismatch would silently corrupt later lookups.
if not (len(embeddings) == len(movie_series_names) == len(texts)):
    raise ValueError(
        "Refusing to save misaligned data: "
        f"{len(embeddings)} embeddings, {len(movie_series_names)} names, {len(texts)} texts"
    )

# Save embeddings separately (this is the largest file)
embedding_path = f"{npz_base_path}/embeddings.npy"
np.save(embedding_path, embeddings)
print(f"✅ Embeddings saved at: {embedding_path}")

# Save movie names separately
names_path = f"{npz_base_path}/movie_series_names.npy"
np.save(names_path, np.array(movie_series_names))
print(f"✅ Movie/Series names saved at: {names_path}")

# Save subtitle texts separately
# NOTE(review): np.array() on a list of str builds a fixed-width unicode array in which every
# element is padded to the length of the longest text, so this step can need far more memory
# than the texts themselves. Confirm it completes on the target runtime; the format is kept
# unchanged here so existing np.load() callers keep working.
texts_path = f"{npz_base_path}/subtitles.npy"
np.save(texts_path, np.array(texts))
print(f"✅ Subtitle texts saved at: {texts_path}")


✅ Embeddings saved at: /content/drive/MyDrive/embeddings.npy
✅ Movie/Series names saved at: /content/drive/MyDrive/movie_series_names.npy
