In [1]:
import psutil
ram_usage = psutil.virtual_memory().percent
print(f"Current RAM Usage: {ram_usage}%")


Current RAM Usage: 7.3%


In [3]:
import zipfile
# Define file paths
zip_path = "/content/drive/MyDrive/decoded_subtitles.zip"
extract_path = "/content/"

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction complete! File is now available in Colab.")

Extraction complete! File is now available in Colab.


In [4]:
import pandas as pd

# Load the extracted CSV file
df_decoded = pd.read_csv("/content/decoded_subtitles.csv")

# Keep only 'name' and 'file_content' columns
df_decoded = df_decoded[['name', 'file_content']]
df_decoded.head()



Unnamed: 0,name,file_content
0,my.girlfriend.is.an.alien.s01.e04.episode.1.4....,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open..."
1,call.the.midwife.christmas.special.2022.(2022)...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
2,queen.margot.(1994).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nAdver..."
3,survivor.s35.e04.i.dont.like.having.snakes.aro...,"1\r\n00:00:01,292 --> 00:00:06,881\r\n>> Jeff:..."
4,hero.(2022).eng.1cd,"1\r\n00:00:39,580 --> 00:00:42,340\r\n(early 1..."


In [5]:
df_decoded.shape

(24749, 2)

In [6]:
import re

def clean_subtitles(text):
    """ Cleans subtitle text by removing timestamps, line numbers, and unwanted characters. """
    if pd.isna(text):  # Handle NaN values
        return ""

    # Remove timestamps (e.g., 00:00:06,000 --> 00:00:12,074)
    text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', text)

    # Remove line numbers (any number at the start of a line)
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove HTML tags (e.g., <i>italic text</i>)
    text = re.sub(r'<.*?>', '', text)

    # Remove promotional messages (e.g., "Please rate this subtitle", "Watch any video online", etc.)
    text = re.sub(r'Please rate this subtitle.*|Watch any video online.*|Synced By.*', '', text, flags=re.IGNORECASE)

    # Normalize spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [7]:
import os

# Create a folder for cleaned subtitle text files
text_folder = "/content/cleaned_subtitles_text"
os.makedirs(text_folder, exist_ok=True)

batch_size = 1000  # Prevents RAM overload
total_rows = len(df_decoded)

for i in range(0, total_rows, batch_size):
    df_decoded.loc[i:i+batch_size, "file_content"] = df_decoded.loc[i:i+batch_size, "file_content"].apply(clean_subtitles)

    # Save each row as a separate text file
    for idx, row in df_decoded.iloc[i:i+batch_size].iterrows():
        file_name = f"{text_folder}/{row['name'].replace('/', '_')}_{idx}.txt"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(row["file_content"])

    print(f"Processed batch {i} to {min(i+batch_size, total_rows)}")

print("Cleaning complete! All subtitles saved as text files.")


Processed batch 0 to 1000
Processed batch 1000 to 2000
Processed batch 2000 to 3000
Processed batch 3000 to 4000
Processed batch 4000 to 5000
Processed batch 5000 to 6000
Processed batch 6000 to 7000
Processed batch 7000 to 8000
Processed batch 8000 to 9000
Processed batch 9000 to 10000
Processed batch 10000 to 11000
Processed batch 11000 to 12000
Processed batch 12000 to 13000
Processed batch 13000 to 14000
Processed batch 14000 to 15000
Processed batch 15000 to 16000
Processed batch 16000 to 17000
Processed batch 17000 to 18000
Processed batch 18000 to 19000
Processed batch 19000 to 20000
Processed batch 20000 to 21000
Processed batch 21000 to 22000
Processed batch 22000 to 23000
Processed batch 23000 to 24000
Processed batch 24000 to 24749
Cleaning complete! All subtitles saved as text files.


In [8]:
import zipfile

zip_filename = "/content/drive/MyDrive/cleaned_subtitles.zip"  # Path to save in Drive
text_folder = "/content/cleaned_subtitles_text"  # Folder containing cleaned .txt files

# Create a ZIP file with all text files
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(text_folder):
        for file in files:
            zipf.write(os.path.join(root, file), arcname=file)

print("Zipping complete! Cleaned subtitles saved as 'cleaned_subtitles.zip' in Google Drive.")


Zipping complete! Cleaned subtitles saved as 'cleaned_subtitles.zip' in Google Drive.


In [9]:
import os
print(os.listdir("/content/drive/MyDrive/"))  # Should list 'cleaned_subtitles.zip'


['JEE(Main)', 'Physics formulae short notes.pdf', 'AP EAPCET - 2021', 'eamcet hallticket', 'abcd.txt', 'Untitled Diagram.drawio', 'assignment 1 (1).pdf', 'assignment 1.pdf', 'radha.jpg', 'Colab Notebooks', 'Working.gdoc', "4W's ppt.gdoc", 'Academia Saved Papers', 'train.csv', 'major.csv', 'modified_dataset_targets.csv', 'MYDUKURI RADHA_KOMALIDEVI.pdf', 'RADHA_KOMALIDEVI MYDUKURI  (2).pdf', 'testing_images', 'RADHA_KOMALIDEVI MYDUKURI .pdf', 'work_1.gsheet', 'download and upload speed_screenshot.jpeg', 'Download and upload speed_ screenshot.jpeg', 'IMG_20230128_175617.jpg', 'Artificial Intelligence Minor Project.gdoc', 'Artificial Intelligence Major Project.pdf', 'Artificial Intelligence Minor Project.pdf', 'Teachnook COURSE Completion Certificate _ Mydukuri Radha Komalidevi (2).pdf', 'TEACHNOOK Internship Completion Certificate _ Mydukuri Radha Komalidevi (5).pdf', 'words_250000_train.txt', 'RADHA_KOMALIDEVI MYDUKURI  (1).pdf', 'kuhar_labels.csv', 'KU_har_time_freq_spectrogram_SP.mat',

In [10]:
with zipfile.ZipFile("/content/drive/MyDrive/cleaned_subtitles.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/extracted_subtitles/")
print("Extraction complete!")


Extraction complete!


In [11]:
!pip install langchain langchain_community


Collecting langchain_community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_commun

In [12]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load all subtitle text files
text_folder = "/content/extracted_subtitles"
loader = DirectoryLoader(text_folder, glob="*.txt", show_progress=True, loader_cls=TextLoader)
documents = loader.load()

# Check loaded data
print(f"Total documents loaded: {len(documents)}")
print("Example document:\n", documents[0].page_content[:300])  # Show first 300 characters
print("Metadata:", documents[0].metadata)  # Check metadata (filename)


100%|█████████▉| 24747/24749 [00:05<00:00, 4777.09it/s]


Total documents loaded: 24747
Example document:
 ï»¿1 Free Browser extension: osdb.link/ext âª MY EYES ARE GETTIN' WEARY âª âª MY BACK IS GETTIN' TIGHT âª âª I'M SITTIN' HERE IN TRAFFIC âª âª ON THE QUEENSBORO BRIDGE TONIGHT âª âª BUT I DON'T CARE, 'CAUSE ALL I WANT TO DO âª âª IS CASH MY CHECK AND DRIVE RIGHT HOME TO YOU âª âª 'CAUSE
Metadata: {'source': '/content/extracted_subtitles/the.king.of.queens.s06.e05.nocturnal.omission.(2003).eng.1cd_4440.txt'}


In [13]:
import re

def fix_encoding_issues(text):
    """ Fixes encoding issues and removes unwanted content. """
    if not isinstance(text, str) or pd.isna(text):
        return ""

    # Remove BOM characters (ï»¿)
    text = text.replace("\ufeff", "")

    # Fix incorrectly decoded characters (optional)
    try:
        text = text.encode("latin1").decode("utf-8", errors="ignore")
    except UnicodeEncodeError:
        pass  # Skip if conversion fails

    # Remove promotional messages (e.g., "osdb.link/ext")
    text = re.sub(r'osdb\.link/.*|Please rate this subtitle.*|Synced By.*', '', text, flags=re.IGNORECASE)

    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [14]:
# Apply encoding fix to all documents
for doc in documents:
    doc.page_content = fix_encoding_issues(doc.page_content)

# Verify if text is now clean
print("Example cleaned document:\n", documents[0].page_content[:500])  # Preview first 500 chars


Example cleaned document:
 ﻿1 Free Browser extension:


In [15]:
documents[1].page_content[:500]

'\ufeff1 (music playing) The price is firm unless, of course, you make an all-cash offer. ♪ Yeah... ♪ (man vocalizing) ♪ Yeah, Lazy Fair ♪ ♪ When I was growing up, I had a lot of dreams ♪ ♪ My mama told me, "Son, you could be anything ♪ ♪ Long as you spread your wings, I know one day you\'ll be great ♪ ♪ Just wait, soon you\'ll aim up at the sky ♪ ♪ And I\'ll watch you float away" ♪ - ♪ But now that I am older... ♪ - I want it. Really, you don\'t have to decide today. Now, we just seen 20 apartments, man.'

In [16]:
import re

def fix_encoding_issues(text):
    """ Cleans subtitle text by removing unwanted characters, fixing encoding, and normalizing spaces. """
    if not isinstance(text, str) or pd.isna(text):
        return ""

    # Remove BOM characters (ï»¿ or \ufeff)
    text = text.replace("\ufeff", "")

    # Fix incorrectly decoded characters (force UTF-8 decoding)
    try:
        text = text.encode("latin1").decode("utf-8", errors="ignore")
    except UnicodeEncodeError:
        pass  # Skip if conversion fails

    # Remove music notes and unwanted symbols (e.g., ♪, (music playing), etc.)
    text = re.sub(r'[♪]', '', text)  # Remove music symbols
    #text = re.sub(r'\(.*?playing\)', '', text, flags=re.IGNORECASE)  # Remove "(music playing)"

    # Remove promotional messages
    text = re.sub(r'osdb\.link/.*|Please rate this subtitle.*|Synced By.*', '', text, flags=re.IGNORECASE)

    # Normalize spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [17]:
import os

# Path where the extracted .txt files are stored
text_folder = "/content/extracted_subtitles"

# Apply encoding fixes and overwrite files
for doc in documents:
    cleaned_text = fix_encoding_issues(doc.page_content)  # Fix encoding

    # Get the original file path from metadata
    file_name = os.path.basename(doc.metadata["source"])  # Extract filename from metadata
    file_path = os.path.join(text_folder, file_name)  # Get full path

    # Overwrite the file with cleaned text
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

print("All subtitle text files have been cleaned and overwritten.")


All subtitle text files have been cleaned and overwritten.


In [18]:
import os

text_folder = "/content/extracted_subtitles"
file_list = os.listdir(text_folder)

# Print the first 5 filenames
print("Available files:", file_list[:5])


Available files: ['the.king.of.queens.s06.e05.nocturnal.omission.(2003).eng.1cd_4440.txt', 'survivors.remorse.s01.e01.in.the.offing.(2014).eng.1cd_4250.txt', 'the.mentalist.s05.e05.red.dawn.(2012).eng.1cd_3859.txt', 'mushikaburihime.s01.e09.a.letter.from.the.pearl.princess.(2022).eng.1cd_11100.txt', 'the.brokenwood.mysteries.s08.e06.four.fires.and.a.funeral.(2022).eng.1cd_20831.txt']


In [20]:
file_to_check = f"/content/extracted_subtitles/{file_list[1]}"  # First file in the folder

with open(file_to_check, "r", encoding="utf-8") as f:
    content = f.read()

print("Checking file:", file_to_check)
print(content[:500])  # Print first 500 characters


Checking file: /content/extracted_subtitles/survivors.remorse.s01.e01.in.the.offing.(2014).eng.1cd_4250.txt
1 (music playing) The price is firm unless, of course, you make an all-cash offer. Yeah... (man vocalizing) Yeah, Lazy Fair When I was growing up, I had a lot of dreams My mama told me, "Son, you could be anything Long as you spread your wings, I know one day you'll be great Just wait, soon you'll aim up at the sky And I'll watch you float away" - But now that I am older... - I want it. Really, you don't have to decide today. Now, we just seen 20 apartments, man. Look! Will you let her tell you 


In [21]:
import zipfile
import os

zip_filename = "/content/drive/MyDrive/clean_extracted_subtitles.zip"  # Updated filename
text_folder = "/content/extracted_subtitles"  # Folder containing cleaned .txt files

# Create a ZIP file with all text files
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(text_folder):
        for file in files:
            zipf.write(os.path.join(root, file), arcname=file)

print("Zipping complete! Cleaned subtitles saved as 'clean_extracted_subtitles.zip' in Google Drive.")


Zipping complete! Cleaned subtitles saved as 'clean_extracted_subtitles.zip' in Google Drive.


In [22]:
import os
print(os.listdir("/content/drive/MyDrive/"))  # Should list 'clean_extracted_subtitles.zip'


['JEE(Main)', 'Physics formulae short notes.pdf', 'AP EAPCET - 2021', 'eamcet hallticket', 'abcd.txt', 'Untitled Diagram.drawio', 'assignment 1 (1).pdf', 'assignment 1.pdf', 'radha.jpg', 'Colab Notebooks', 'Working.gdoc', "4W's ppt.gdoc", 'Academia Saved Papers', 'train.csv', 'major.csv', 'modified_dataset_targets.csv', 'MYDUKURI RADHA_KOMALIDEVI.pdf', 'RADHA_KOMALIDEVI MYDUKURI  (2).pdf', 'testing_images', 'RADHA_KOMALIDEVI MYDUKURI .pdf', 'work_1.gsheet', 'download and upload speed_screenshot.jpeg', 'Download and upload speed_ screenshot.jpeg', 'IMG_20230128_175617.jpg', 'Artificial Intelligence Minor Project.gdoc', 'Artificial Intelligence Major Project.pdf', 'Artificial Intelligence Minor Project.pdf', 'Teachnook COURSE Completion Certificate _ Mydukuri Radha Komalidevi (2).pdf', 'TEACHNOOK Internship Completion Certificate _ Mydukuri Radha Komalidevi (5).pdf', 'words_250000_train.txt', 'RADHA_KOMALIDEVI MYDUKURI  (1).pdf', 'kuhar_labels.csv', 'KU_har_time_freq_spectrogram_SP.mat',

In [23]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load all subtitle text files
text_folder = "/content/extracted_subtitles"
loader = DirectoryLoader(text_folder, glob="*.txt", show_progress=True, loader_cls=TextLoader)
documents = loader.load()

# Check loaded data
print(f"Total documents loaded: {len(documents)}")
print("Example document:\n", documents[1].page_content[:300])  # Show first 300 characters


100%|█████████▉| 24747/24749 [00:07<00:00, 3474.67it/s]

Total documents loaded: 24747
Example document:
 1 Free Browser extension:





In [24]:
print("Example document:\n", documents[1].page_content[:300])  # Show first 300 characters

Example document:
 1 (music playing) The price is firm unless, of course, you make an all-cash offer. Yeah... (man vocalizing) Yeah, Lazy Fair When I was growing up, I had a lot of dreams My mama told me, "Son, you could be anything Long as you spread your wings, I know one day you'll be great Just wait, soon you'll a


In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define chunking parameters
chunk_size = 500  # Number of tokens per chunk
chunk_overlap = 100  # Overlapping tokens to maintain context

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Split documents into chunks
chunked_docs = text_splitter.split_documents(documents)

# Check results
print(f"Total chunks created: {len(chunked_docs)}")
print("Example chunk:\n", chunked_docs[0].page_content[:300])  # Preview first chunk


Total chunks created: 1644340
Example chunk:
 1 Free Browser extension:


In [28]:
print("Example chunk:\n", chunked_docs[1].page_content[:300])  # Preview first chunk

Example chunk:
 1 (music playing) The price is firm unless, of course, you make an all-cash offer. Yeah... (man vocalizing) Yeah, Lazy Fair When I was growing up, I had a lot of dreams My mama told me, "Son, you could be anything Long as you spread your wings, I know one day you'll be great Just wait, soon you'll a


In [27]:
import pandas as pd

# Convert chunked documents into a structured DataFrame
chunked_data = pd.DataFrame({
    "chunk_id": [i for i in range(len(chunked_docs))],
    "text": [doc.page_content for doc in chunked_docs]
})

# Save as a CSV file
chunked_csv_path = "/content/chunked_subtitles.csv"
chunked_data.to_csv(chunked_csv_path, index=False)

print("Chunked subtitles saved successfully!")


Chunked subtitles saved successfully!


In [29]:
import zipfile

zip_path = "/content/drive/MyDrive/chunked_subtitles.zip"

# Create a ZIP file
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(chunked_csv_path, arcname="chunked_subtitles.csv")

print("Zipping complete! Saved as 'chunked_subtitles.zip' in Google Drive.")


Zipping complete! Saved as 'chunked_subtitles.zip' in Google Drive.


In [31]:
df_chunked=pd.read_csv('/content/chunked_subtitles.csv')

In [32]:
df_chunked.head()

Unnamed: 0,chunk_id,text
0,0,1 Free Browser extension:
1,1,"1 (music playing) The price is firm unless, of..."
2,2,"don't have to decide today. Now, we just seen ..."
3,3,your product or brand here contact www.OpenSub...
4,4,That's my boy that's moving me to Atlanta. Cal...
