In [None]:
!pip install langchain langchain-chroma langchain-community langchain-text-splitters

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-chroma
  Downloading langchain_chroma-0.1.0-py3-none-any.whl (8.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.45-py3-none-any.whl (291 kB)
[2K     [90m━━━━━━━━━━━━━━

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
import numpy as np
import pandas as pd
import re
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from tqdm import tqdm

In [None]:
# Load the subtitle table (columns: num, name, content — content holds raw SRT text).
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no mount call is
# visible in this notebook — confirm before running on a fresh runtime.
df = pd.read_csv('/content/drive/MyDrive/search_engine/subtitles_split_0.csv')

In [None]:
df.head()

Unnamed: 0,num,name,content
0,9180533,the message (1976),"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here comes the grump s01 e09 joltin jack in bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis cells s02 e13 episode 2 13 (2022),"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis cells s02 e14 episode 2 14 (2022),"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker (2022),"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [None]:
def cleaning_subtitles_text(text):
    """Extract normalised dialogue text from a raw SRT subtitle string.

    The text is scanned cue by cue. A cue starts at a timing line (one
    containing "-->") and its text runs until the next blank line or the
    next timing line. For every cue:

    * the whole cue is dropped if any of its lines mentions "subtitl"
      (case-insensitive), which removes subtitle-site credits and adverts;
    * markup such as ``<i>...</i>`` and ``{\\an8}`` is removed *before*
      punctuation is stripped, so tag names do not end up glued to words;
    * leading/trailing dialogue dashes, punctuation and case are removed.

    Args:
        text: Raw SRT content. Anything that is not a string (for example
            the NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned lines joined with "\\n"; an empty string when nothing
        usable is found.
    """
    # Missing cells arrive from pandas as NaN (a float), which has no .strip().
    if not isinstance(text, str):
        return ""

    lines = text.strip().split("\n")
    total = len(lines)
    cleaned_lines = []
    i = 0
    while i < total:
        if "-->" not in lines[i]:
            i += 1
            continue

        # Gather every text line of this cue, not just the first one.
        i += 1
        cue_lines = []
        while i < total and lines[i].strip() and "-->" not in lines[i]:
            cue_lines.append(lines[i])
            i += 1

        # If cues are not separated by a blank line, the last gathered line is
        # the numeric index of the next cue rather than dialogue.
        if i < total and "-->" in lines[i] and cue_lines and cue_lines[-1].strip().isdigit():
            cue_lines.pop()

        # Credits/adverts are dropped as a whole cue so their second line does not leak through.
        if any("subtitl" in cue_line.lower() for cue_line in cue_lines):
            continue

        for cue_line in cue_lines:
            # Remove HTML-style and brace-style formatting tags first.
            without_tags = re.sub(r"<[^>]*>|\{\\[^}]*\}", "", cue_line)
            line = without_tags.strip().strip("-").strip()
            cleaned_line = re.sub(r"[^\w\s]", "", line).lower()
            if cleaned_line:
                cleaned_lines.append(cleaned_line)
    return "\n".join(cleaned_lines)

In [None]:
# Overwrite the raw SRT text in 'content' with its cleaned dialogue.
# NOTE(review): this cell is not idempotent — the cleaner only keeps text that follows a
# '-->' timing line, so running it again on already-cleaned text yields empty strings.
# Re-load df from the CSV before re-running.
df['content'] = df['content'].apply(cleaning_subtitles_text)

In [None]:
df.head()

Unnamed: 0,num,name,content
0,9180533,the message (1976),in the name of god the most gracious the most ...
1,9180583,here comes the grump s01 e09 joltin jack in bo...,ah theres princess\nblooney looney soldiers\no...
2,9180592,yumis cells s02 e13 episode 2 13 (2022),iyumis cells 2i\niepisode 36\niyumii\nishould ...
3,9180594,yumis cells s02 e14 episode 2 14 (2022),iyumis cells 2i\niepisode 39 laptopi\nfirst pl...
4,9180600,broker (2022),if youre going to throw it away\nplease take c...


In [None]:
def chunk_and_explode(df, text_column, chunk_size=850, chunk_overlap=50):
  """
  Split each row's text into overlapping chunks and return one row per chunk.

  Args:
    df: DataFrame holding the text to split. It is not modified.
    text_column: Name of the column containing the text.
    chunk_size: Maximum chunk size handed to RecursiveCharacterTextSplitter.
    chunk_overlap: Overlap between consecutive chunks handed to the splitter.

  Returns:
    A new DataFrame with a fresh RangeIndex, containing every original column
    plus a 'chunks' column with one chunk per row. Rows whose text produces no
    chunks (e.g. empty text) are kept, with a missing value in 'chunks'.
  """
  # Honour the caller's settings instead of hard-coded values.
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )

  # assign() builds a new frame, so the caller's DataFrame does not gain a 'chunks' column.
  chunked_df = df.assign(chunks=df[text_column].apply(splitter.split_text))

  # One row per chunk; the other columns are repeated for each chunk of a title.
  return chunked_df.explode('chunks').reset_index(drop=True)

In [None]:
# Split every title's cleaned text into chunks (default size/overlap), one row per chunk.
exploded_df = chunk_and_explode(df, 'content')

In [None]:
exploded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448047 entries, 0 to 448046
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   num      448047 non-null  int64 
 1   name     448047 non-null  object
 2   content  448047 non-null  object
 3   chunks   447122 non-null  object
dtypes: int64(1), object(3)
memory usage: 13.7+ MB


In [None]:
# The full cleaned text is repeated on every chunk row, so drop it and keep num, name, chunks.
# NOTE: this rebinds `df` to the chunk-level frame; the per-title frame is no longer
# reachable under that name.
df = exploded_df.drop(columns='content')

In [None]:
df.tail()

Unnamed: 0,num,name,chunks
448042,9264094,charmed s01 e11 feats of clay (1999),then you risk paying the price\ni crashingi \n...
448043,9264094,charmed s01 e11 feats of clay (1999),i came for the thief\nwho are you\nim the guar...
448044,9264094,charmed s01 e11 feats of clay (1999),maybe this is something\ni mean it doesnt\nbut...
448045,9264094,charmed s01 e11 feats of clay (1999),right this way please\nthank you\ngood thing w...
448046,9264094,charmed s01 e11 feats of clay (1999),yeah\num\nyou know i hope\nill be the guy you\...


In [None]:
def create_embeddings(df, text_column, device='cuda', batch_size=256):
  """
  Embed the text in `text_column` with all-MiniLM-L6-v2 and attach the vectors.

  Args:
    df: DataFrame containing the text to embed. It is not modified.
    text_column: Name of the column holding the text chunks.
    device: Device handed to the sentence-transformers model. Defaults to
      'cuda'; pass 'cpu' on a runtime without a GPU.
    batch_size: Number of chunks sent to the embedding model per call.

  Returns:
    A copy of `df` with a 'chunk_embeddings' column holding the embedding for
    each string value and None for non-string values (such as missing chunks).

  Raises:
    ValueError: If batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  embeddings_func = SentenceTransformerEmbeddings(
      model_name="sentence-transformers/all-MiniLM-L6-v2",
      model_kwargs={'device': device})

  texts = df[text_column].tolist()

  # Only strings can be embedded; every other position keeps None.
  embeddings = [None] * len(texts)
  text_positions = [pos for pos, value in enumerate(texts) if isinstance(value, str)]

  # Send chunks in batches rather than making one model call per chunk.
  for start in tqdm(range(0, len(text_positions), batch_size), desc="embedding batches"):
    batch_positions = text_positions[start:start + batch_size]
    vectors = embeddings_func.embed_documents([texts[pos] for pos in batch_positions])
    for pos, vector in zip(batch_positions, vectors):
      embeddings[pos] = vector

  # Work on a copy so the caller's frame is left as it was.
  result = df.copy()
  result['chunk_embeddings'] = embeddings

  return result

In [None]:
# Embed every chunk. This is the expensive step: the recorded run took about 1h24m
# for 448,047 rows (see the progress bar output).
df_with_embeddings = create_embeddings(df, 'chunks')

100%|██████████| 448047/448047 [1:23:55<00:00, 88.97it/s]


In [None]:
# Persist the chunk-level frame, including the embeddings, to Drive.
# NOTE(review): to_csv writes each embedding list as its string representation, so the
# 'chunk_embeddings' column must be parsed back into numbers after read_csv.
df_with_embeddings.to_csv('/content/drive/MyDrive/search_engine/processed_split_0.csv', index=False)