In [1]:
import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package punkt to C:\Users\VISHRUTH
[nltk_data]     NIMALAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\VISHRUTH
[nltk_data]     NIMALAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\VISHRUTH
[nltk_data]     NIMALAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Step 1 - Reading the Tables from Database file**

In [3]:
# Read the code below and write your observation in the next cell

conn = sqlite3.connect('D:/Codes_VS/New folder/data/eng_subtitles_database.db')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

[('zipfiles',)]


**In the above cell, I am able to read the table inside the database. As mentioned earlier, table name is `zipfiles`. We also know from README.txt that this table contains three columns: 'num', 'name' and 'content'.**

## **Step 2 - Reading the columns of Table**

In [4]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

num
name
content


**The above code helps in checking the column names in the database table.**

**Let's now use `SELECT * FROM zipfiles` to read all the data into a `df` variable.**

## **Step 3 - Loading the Database Table inside a Pandas DataFrame**

In [5]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


**Looks like the `content` column donot contain the subtitles text. Instead as mentioned in README.txt, it might be latin-1 encoded.**

## **Step 4 - Printing `content` of 0th Row**

In [7]:
b_data = df.iloc[0, 2]

# here 2 represent the index of content column
# 0 represents the row number

In [8]:
print(b_data)

b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x99V\x9fx\x96\xf0\x8c\x9e\x00\x00\x86\x9b\x01\x00;\x00\x00\x00The.Message.1976.REMASTERED.1080p.BluRay.x264-PiGNUS.EN.srt\xad\xbdm\x93\xdc\xc6\x91.\xfa\x9d\x11\xfc\x0f-}\xe1=\x11-\x9d\x06P\x85\x17\x9d\x8d\xd5%%[\xa4-Y>&u\x15>\xdf\xd0\xd3\x98\x19x\xfae\x0cts<\xfe\xf57\x9f\'\xb3\n\xd9\xa4\xbc\xbb\xf7\xc6Fl\xacELW\xa2\xaa\x90\x95\x95\xafO\x16/_l6\xdf\xe0\xff\xea\xf5f\xb3Y}\xf5\xd5\xbf\xaf\xf4AQ\xae7Mx\xf9\xe2\xd7\xfe|s\xbf\xea\x8f\xcf\xab\x8f\xe3n8\xadN\xc7\xfdx\x1cVO\xe3\xf9~\xf5\xf3\xe3p\xfc\xea\xfd/o>\xbc\xfb\xf0\xe3\xef\xde\xbf|\xf1\xfbi\x18Vo\xa6\xd3\xd3<L\xab\xe1\x1f\xe7\xe18\x8f\xa7\xe37\xab\xd3\xbc\xdb~-\xc3\x1e\xfe\xa7<|\xf9\xe2\xe5\x8bR_[~S\xd6\xeb\xa2k\xf3k\xe5A\xb7\xeeb\xf5\xf2\xc5\xbb\xe3\xea|?\xac\x8e\xfdaX\x9dnW?\x9cvk>8\x9c\xe6\xf3\xean\xeao\xc6\xd3ev\x8f~\x1a\xa6\x9b\xf1\xf6\xb2\xff\x1a\xe4\xabD\xbe*d\x11\xa5#_U\xeb\xaa\xd9`\xa6\xa7\xc3\xea\xa7\xcb}\x7f8\xf4F\xf9\xa7a\x9e\x87\xe3\x9d\xcc\\\xdf\x07B!\x13\xaa\xd61n<!\xd9\xaf\xd0\

**From the content, it appears to start with the bytes "PK\x03\......", which suggests that it might be a ZIP archive file. How do I know it? Experience! I have worked with something similar earlier.**

## **Step 5 - Unzipping the content of 385th row and decoding using `latin-1`**

In [9]:
import zipfile
import io

# Assuming 'content' is the binary data from your database
binary_data = df.iloc[385, 2]

# Decompress the binary data using the zipfile module
with io.BytesIO(binary_data) as f:
    with zipfile.ZipFile(f, 'r') as zip_file:
        # Reading only one file in the ZIP archive
        subtitle_content = zip_file.read(zip_file.namelist()[0])

# Now 'subtitle_content' should contain the extracted subtitle content
print(subtitle_content.decode('latin-1'))  # Assuming the content is latin-1 encoded text

1
00:00:06,000 --> 00:00:12,074
Watch any video online with Open-SUBTITLES
Free Browser extension: osdb.link/ext

2
00:00:15,370 --> 00:00:16,506
You lose everything, my girl.

3
00:00:16,530 --> 00:00:19,360
So you've said - four times.

4
00:00:20,330 --> 00:00:22,120
I definitely had
it on yesterday.

5
00:00:22,465 --> 00:00:25,785
Your gloves, your keys, that
handkerchief I embroidered for you

6
00:00:25,809 --> 00:00:26,168
Everything!

7
00:00:26,192 --> 00:00:27,280
Five times.

8
00:00:31,610 --> 00:00:32,920
Miss Scarlet?
- Yes.

9
00:00:36,390 --> 00:00:37,390
I'm Miss Scarlet.

10
00:00:37,872 --> 00:00:40,880
May I inquire if
you've lost something?

11
00:00:41,350 --> 00:00:42,530
Some jewellery perhaps?

12
00:00:42,870 --> 00:00:45,130
Yes, my mother's wedding ring.

13
00:00:45,220 --> 00:00:45,840
Have you found it?

14
00:00:45,950 --> 00:00:47,656
Does your ring have
an inscription?

15
00:00:48,650 --> 00:00:51,720
From my father to my mother 'For
my beloved, Livi

**Look's like it worked.**

In [10]:
new_df = df.sample(n=30000)

## **Step 6 - Applying the above Function on the Entire Data**

In [11]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')  # Assuming the content is UTF-8 encoded text

In [12]:
new_df['file_content'] = new_df['content'].progress_apply(decode_method)

new_df.head()

100%|██████████| 30000/30000 [00:07<00:00, 3810.45it/s]


Unnamed: 0,num,name,content,file_content
7890,9214600,zone.troopers.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0e\x90\x...,"ï»¿1\r\n00:00:08,716 --> 00:00:10,968\r\n['IN ..."
50922,9395586,the.beasts.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x164\x9aV...,"ï»¿1\r\n00:02:02,383 --> 00:02:04,646\r\nAbduy..."
5660,9205498,a.little.bit.of.heaven.(2011).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb5\x8d\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
28177,9294175,the.serpent.queen.s01.e02.to.war.rather.than.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x005\xa6\x99V...,"1\r\n00:00:05,339 --> 00:00:07,209\r\n[person]..."
10783,9226529,the.devil.is.a.parttimer.s02.e08.the.devil.beg...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb7\x94\x...,[Script Info]\r\nTitle: English (US)\r\nOrigin...


In [13]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 7890 to 72263
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   num           30000 non-null  int64 
 1   name          30000 non-null  object
 2   content       30000 non-null  object
 3   file_content  30000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


## **Step 7 - Applying Preprocessing on the File Content**

In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):

    # Remove timestamps
    clean_text = re.sub(r'\b\d+\b\s*?\n?', '', text)
    clean_text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n?', '', clean_text)

    # Define the pattern to match website links
    pattern = r'https?://\S+|www\.\S+'
    # Replace website links with an empty string
    cleaned_text = re.sub(pattern, '', clean_text)

    # Remove special characters
    pattern = r'[^a-zA-Z0-9\s]'
    textt = re.sub(pattern, '', cleaned_text)

    # Lowercase conversion
    textt = textt.lower()

    # Tokenization
    tokens = word_tokenize(textt)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Join tokens back into text
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text


In [15]:
rowdata = new_df.iloc[10, 3]
print(rowdata)
print('\n',preprocess_text(rowdata))

ï»¿1
00:00:06,000 --> 00:00:12,074
Use the free code JOINNOW at 
â¨www.playships.eu

2
00:02:08,428 --> 00:02:12,229
Only once did I jump my car
over a hill in pursuit.

3
00:02:12,566 --> 00:02:13,659
During the riots.

4
00:02:13,900 --> 00:02:15,140
You catch who you were chasing?

5
00:02:15,402 --> 00:02:16,402
Er, no.

6
00:02:16,937 --> 00:02:19,270
Lost control, crashed into a Blockbuster.

7
00:02:19,673 --> 00:02:22,108
Well, that would never happen again.

8
00:02:22,309 --> 00:02:23,538
Mmm-hmm.

9
00:02:24,111 --> 00:02:25,204
Blockbuster's gone.

10
00:02:29,082 --> 00:02:30,448
Why the hell won't this turn off?

11
00:02:31,284 --> 00:02:33,617
You need a remote,
the same brand as the TV.

12
00:02:33,787 --> 00:02:34,846
Right. Hmm.

13
00:02:35,555 --> 00:02:36,555
<i>Two remotes.</i>

14
00:02:36,723 --> 00:02:40,455
Once again, technology makes life easier
for all of us.

15
00:02:40,627 --> 00:02:41,890
- Lieutenant...
- Mmm.

16
00:02:44,464 --> 00:02:45,464
Okay.

In [16]:
new_df['cleaned_content'] = new_df['file_content'].progress_apply(preprocess_text)

100%|██████████| 30000/30000 [09:55<00:00, 50.40it/s]


In [17]:
'''new_df.to_csv('D:/Codes_VS/New folder/preprocessed_data.csv')'''

"new_df.to_csv('D:/Codes_VS/New folder/preprocessed_data.csv')"

In [18]:
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content
7890,9214600,zone.troopers.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0e\x90\x...,"ï»¿1\r\n00:00:08,716 --> 00:00:10,968\r\n['IN ...",mood playing advertise product brand contact t...
50922,9395586,the.beasts.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x164\x9aV...,"ï»¿1\r\n00:02:02,383 --> 00:02:04,646\r\nAbduy...",abduya abduya chipo chipo chipo abduya abduya ...
5660,9205498,a.little.bit.of.heaven.(2011).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb5\x8d\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...
28177,9294175,the.serpent.queen.s01.e02.to.war.rather.than.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x005\xa6\x99V...,"1\r\n00:00:05,339 --> 00:00:07,209\r\n[person]...",person ipreviously oni serpent queen catherine...
10783,9226529,the.devil.is.a.parttimer.s02.e08.the.devil.beg...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb7\x94\x...,[Script Info]\r\nTitle: English (US)\r\nOrigin...,script info title english u original script sh...


In [19]:
new_df.iloc[4, 4]

'script info title english u original script showgate original translation original editing original timing synch point script updated update detail scripttype v4 collision normal playresx playresy timer wrapstyle v4 style format namefontnamefontsizeprimarycoloursecondarycolouroutlinecolourbackcolourbolditalicunderlinestrikeoutscalexscaleyspacingangleborderstyleoutlineshadowalignmentmarginlmarginrmarginvencoding style defaulttrebuchet msh00ffffffh000000ffh00120301h00000000 style maintrebuchet msh00ffffffh000000ffh00000000h00000000 style italicstrebuchet msh00ffffffh000000ffh00000000h00000000 style toptrebuchet msh00ffffffh000000ffh00000000h00000000 style flashbacktrebuchet msh00ffffffh000000ffh00400000h00400000 style italicstoptrebuchet msh00ffffffh000000ffh00000000h00000000 style flashback italicstrebuchet msh00ffffffh000000ffh00400000h00400000 style flashbacktoptrebuchet msh00ffffffh000000ffh00400000h00400000 style flashbackitalicstoptrebuchet msh00ffffffh000000ffh00400000h00400000 s

In [20]:
# Find rows with null values in the 'cleaned_content' column
null_rows = new_df[new_df['cleaned_content'].isnull()]

# Display the rows with null values
print(null_rows)

Empty DataFrame
Columns: [num, name, content, file_content, cleaned_content]
Index: []


## **Step 8 - Chunking the documents for efficiency**

In [21]:
def chunk_subtitle(subtitle_text, chunk_size=100, overlap=50):
    chunks = []
    words = word_tokenize(subtitle_text)
    start_idx = 0
    while start_idx < len(words):
        end_idx = min(start_idx + chunk_size, len(words))
        chunk = words[start_idx:end_idx]
        chunks.append(' '.join(chunk))
        start_idx += chunk_size - overlap
    return chunks


In [22]:
# Apply chunking to each subtitle file
new_df['chunks'] = new_df['cleaned_content'].progress_apply(lambda x: chunk_subtitle(x))

100%|██████████| 30000/30000 [02:06<00:00, 236.54it/s]


In [23]:
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content,chunks
7890,9214600,zone.troopers.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0e\x90\x...,"ï»¿1\r\n00:00:08,716 --> 00:00:10,968\r\n['IN ...",mood playing advertise product brand contact t...,[mood playing advertise product brand contact ...
50922,9395586,the.beasts.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x164\x9aV...,"ï»¿1\r\n00:02:02,383 --> 00:02:04,646\r\nAbduy...",abduya abduya chipo chipo chipo abduya abduya ...,[abduya abduya chipo chipo chipo abduya abduya...
5660,9205498,a.little.bit.of.heaven.(2011).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb5\x8d\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...,[watch video online opensubtitles free browser...
28177,9294175,the.serpent.queen.s01.e02.to.war.rather.than.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x005\xa6\x99V...,"1\r\n00:00:05,339 --> 00:00:07,209\r\n[person]...",person ipreviously oni serpent queen catherine...,[person ipreviously oni serpent queen catherin...
10783,9226529,the.devil.is.a.parttimer.s02.e08.the.devil.beg...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb7\x94\x...,[Script Info]\r\nTitle: English (US)\r\nOrigin...,script info title english u original script sh...,[script info title english u original script s...


In [24]:
'''new_df.to_csv('D:/Codes_VS/New folder/chunked_data.csv')'''

"new_df.to_csv('D:/Codes_VS/New folder/chunked_data.csv')"

In [25]:
'''new_df = pd.read_csv('D:/Codes_VS/New folder/chunked_data.csv')'''

"new_df = pd.read_csv('D:/Codes_VS/New folder/chunked_data.csv')"

## **Step 9 - Applying TF-IDF Vectorizer to the Preprocessed File Content**

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer for embedding
vectorizer = TfidfVectorizer()

# Initialize a list to store embeddings for each chunk
chunk_embeddings = []

# Iterate through each row of the DataFrame
for index, row in new_df.iterrows():
    # Fit and transform each chunk to get embeddings
    chunk_embeddings.append(vectorizer.fit_transform(row['chunks']))

In [27]:
new_df['tfidf_embeddings'] = chunk_embeddings
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content,chunks,tfidf_embeddings
7890,9214600,zone.troopers.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0e\x90\x...,"ï»¿1\r\n00:00:08,716 --> 00:00:10,968\r\n['IN ...",mood playing advertise product brand contact t...,[mood playing advertise product brand contact ...,"(0, 480)\t0.04021062914391863\n (0, 31)\t0...."
50922,9395586,the.beasts.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x164\x9aV...,"ï»¿1\r\n00:02:02,383 --> 00:02:04,646\r\nAbduy...",abduya abduya chipo chipo chipo abduya abduya ...,[abduya abduya chipo chipo chipo abduya abduya...,"(0, 796)\t0.0888262547508804\n (0, 477)\t0...."
5660,9205498,a.little.bit.of.heaven.(2011).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb5\x8d\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...,[watch video online opensubtitles free browser...,"(0, 685)\t0.07456110876900245\n (0, 488)\t0..."
28177,9294175,the.serpent.queen.s01.e02.to.war.rather.than.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x005\xa6\x99V...,"1\r\n00:00:05,339 --> 00:00:07,209\r\n[person]...",person ipreviously oni serpent queen catherine...,[person ipreviously oni serpent queen catherin...,"(0, 427)\t0.10697483699926216\n (0, 449)\t0..."
10783,9226529,the.devil.is.a.parttimer.s02.e08.the.devil.beg...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb7\x94\x...,[Script Info]\r\nTitle: English (US)\r\nOrigin...,script info title english u original script sh...,[script info title english u original script s...,"(0, 266)\t0.02821897058559377\n (0, 584)\t0..."


## **Step 10 - Applying BoW to the Preprocessed File Content**

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer for embedding
cvectorizer = CountVectorizer()

# Initialize a list to store embeddings for each chunk
cchunk_embeddings = []

# Iterate through each row of the DataFrame
for index, row in new_df.iterrows():
    # Fit and transform each chunk to get embeddings
    cchunk_embeddings.append(cvectorizer.fit_transform(row['chunks']))

In [29]:
new_df['bow_embeddings'] = cchunk_embeddings
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content,chunks,tfidf_embeddings,bow_embeddings
7890,9214600,zone.troopers.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0e\x90\x...,"ï»¿1\r\n00:00:08,716 --> 00:00:10,968\r\n['IN ...",mood playing advertise product brand contact t...,[mood playing advertise product brand contact ...,"(0, 480)\t0.04021062914391863\n (0, 31)\t0....","(0, 549)\t1\n (0, 635)\t1\n (0, 5)\t1\n (..."
50922,9395586,the.beasts.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x164\x9aV...,"ï»¿1\r\n00:02:02,383 --> 00:02:04,646\r\nAbduy...",abduya abduya chipo chipo chipo abduya abduya ...,[abduya abduya chipo chipo chipo abduya abduya...,"(0, 796)\t0.0888262547508804\n (0, 477)\t0....","(0, 5)\t5\n (0, 148)\t3\n (0, 498)\t1\n (..."
5660,9205498,a.little.bit.of.heaven.(2011).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb5\x8d\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...,[watch video online opensubtitles free browser...,"(0, 685)\t0.07456110876900245\n (0, 488)\t0...","(0, 1153)\t1\n (0, 1136)\t1\n (0, 735)\t1\..."
28177,9294175,the.serpent.queen.s01.e02.to.war.rather.than.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x005\xa6\x99V...,"1\r\n00:00:05,339 --> 00:00:07,209\r\n[person]...",person ipreviously oni serpent queen catherine...,[person ipreviously oni serpent queen catherin...,"(0, 427)\t0.10697483699926216\n (0, 449)\t0...","(0, 723)\t3\n (0, 493)\t1\n (0, 692)\t1\n ..."
10783,9226529,the.devil.is.a.parttimer.s02.e08.the.devil.beg...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb7\x94\x...,[Script Info]\r\nTitle: English (US)\r\nOrigin...,script info title english u original script sh...,[script info title english u original script s...,"(0, 266)\t0.02821897058559377\n (0, 584)\t0...","(0, 842)\t3\n (0, 269)\t1\n (0, 954)\t1\n ..."


In [30]:
print(new_df.iloc[0, 5])

['mood playing advertise product brand contact today mitten joey hey joey give mitten im reading still got book blonde dame space pirate woman venus thats one sorry pal swapped guy gsquad pack luckies nut swell cover say verona dont smoke luckies thought youd never ask thats gon na cost every candy bar got em smell fine virginia tobacco hand em distant explosion hows ankle philip gasp pain cold aint good really need nurse yeah doc snowman scotty one snowman scotty one radio frequency hum scotty hell still cant reach sarge get junk tell weird ive never heard anything like radio', 'candy bar got em smell fine virginia tobacco hand em distant explosion hows ankle philip gasp pain cold aint good really need nurse yeah doc snowman scotty one snowman scotty one radio frequency hum scotty hell still cant reach sarge get junk tell weird ive never heard anything like radio frequency hum keep trying snowman scotty one read sarge ferguson let see compass sure pinging thanks hey sarge long gon na 

In [31]:
print(new_df.iloc[0, 6])

  (0, 480)	0.04021062914391863
  (0, 31)	0.07318901333042895
  (0, 371)	0.0982363435855484
  (0, 427)	0.0600417043224166
  (0, 908)	0.08860980768227561
  (0, 837)	0.07089897595037044
  (0, 440)	0.0982363435855484
  (0, 317)	0.04339281612167593
  (0, 702)	0.03537332257174566
  (0, 667)	0.0982363435855484
  (0, 107)	0.0649583734803344
  (0, 374)	0.0600417043224166
  (0, 398)	0.10462052383733031
  (0, 297)	0.10462052383733031
  (0, 663)	0.15135685138051422
  (0, 711)	0.31386157151199096
  (0, 764)	0.20924104767466062
  (0, 210)	0.11285116368742486
  (0, 953)	0.044246915183355745
  (0, 584)	0.11285116368742486
  (0, 565)	0.08860980768227561
  (0, 671)	0.08141965318052348
  (0, 332)	0.08860980768227561
  (0, 11)	0.05584735051175915
  (0, 142)	0.08860980768227561
  :	:
  (54, 83)	0.1374171792741874
  (54, 650)	0.1374171792741874
  (54, 5)	0.1374171792741874
  (55, 191)	0.28436946691562304
  (55, 738)	0.28436946691562304
  (55, 922)	0.22328457191542836
  (55, 128)	0.22328457191542836
  (55, 4

In [32]:
print(new_df.iloc[0, 7])

  (0, 549)	1
  (0, 635)	1
  (0, 5)	1
  (0, 650)	1
  (0, 83)	1
  (0, 156)	1
  (0, 860)	1
  (0, 547)	2
  (0, 436)	2
  (0, 378)	1
  (0, 322)	1
  (0, 410)	1
  (0, 669)	1
  (0, 802)	2
  (0, 334)	2
  (0, 77)	1
  (0, 73)	1
  (0, 189)	1
  (0, 776)	1
  (0, 627)	1
  (0, 936)	1
  (0, 892)	1
  (0, 843)	2
  (0, 592)	3
  (0, 773)	1
  :	:
  (54, 944)	1
  (54, 738)	1
  (54, 191)	1
  (55, 5)	1
  (55, 650)	1
  (55, 83)	1
  (55, 156)	1
  (55, 860)	1
  (55, 547)	1
  (55, 592)	1
  (55, 953)	1
  (55, 367)	1
  (55, 95)	1
  (55, 915)	1
  (55, 495)	1
  (55, 556)	1
  (55, 314)	1
  (55, 910)	1
  (55, 877)	1
  (55, 457)	1
  (55, 408)	1
  (55, 128)	1
  (55, 922)	1
  (55, 738)	1
  (55, 191)	1


In [33]:
'''new_df.to_csv('D:/Codes_VS/New folder/embedded_data.csv')'''

"new_df.to_csv('D:/Codes_VS/New folder/embedded_data.csv')"

In [41]:
'''new_df = pd.read_csv('D:/Codes_VS/New folder/embedded_data.csv')'''

## **Step 11 - Applying BERT Based Sentence Transformer to the Preprocessed File Content**

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
def encode_text(text):
  embedding = model.encode(text)
  return embedding.tolist()

In [49]:
new_df['bert_embeddings'] = new_df['chunks'].progress_apply(encode_text)

100%|██████████| 30000/30000 [38:42<00:00, 12.92it/s] 


In [50]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,num,name,content,file_content,cleaned_content,chunks,tfidf_embeddings,bow_embeddings,bert_embeddings
0,78585,9506254,wan.pisu.(1999).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xc8\x99\x...,"ï»¿1\r\n00:00:13,430 --> 00:00:15,140 \r\n<i>I...",iinherited willi ithe tide time people dreamsi...,['iinherited willi ithe tide time people dream...,"(0, 407)\t0.08534731708381925\n (0, 399)\t0...","(0, 223)\t1\n (0, 492)\t1\n (0, 244)\t3\n ...","[-0.00775002408772707, -0.007927365601062775, ..."
1,72856,9480321,record.of.lodoss.war.chronicles.of.the.heroic....,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x10s\x9aV...,"ï»¿1\r\n00:00:03,770 --> 00:00:08,765\r\n""To t...",south continent alecrast lie land people call ...,['south continent alecrast lie land people cal...,"(0, 160)\t0.0784120469214599\n (0, 93)\t0.0...","(0, 349)\t1\n (0, 52)\t1\n (0, 4)\t1\n (0...","[-0.023287566378712654, -0.008225442841649055,..."
2,30177,9302410,halloween.5.the.revenge.of.michael.myers.(1989...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00}\xab\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",watch video online opensubtitles free browser ...,['watch video online opensubtitles free browse...,"(0, 282)\t0.06277336490376696\n (0, 155)\t0...","(0, 800)\t1\n (0, 788)\t1\n (0, 511)\t1\n ...","[0.0025689578615128994, -0.057597894221544266,..."
3,6883,9210217,tekken.bloodline.s01.e04.episode.4.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb0\x9e\x...,"ï»¿1\r\n00:00:11,803 --> 00:00:13,013\r\n<i>[s...",isinister music playingi iheihachii haunt youi...,['isinister music playingi iheihachii haunt yo...,"(0, 255)\t0.10618870762154287\n (0, 101)\t0...","(0, 245)\t2\n (0, 337)\t3\n (0, 373)\t1\n ...","[-0.028539583086967468, -0.014185408130288124,..."
4,1458,9187759,jude.(1996).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x006\x7f\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open...",apiopensubtitlesorg deprecated please implemen...,['apiopensubtitlesorg deprecated please implem...,"(0, 251)\t0.11556283081348522\n (0, 65)\t0....","(0, 32)\t1\n (0, 219)\t1\n (0, 652)\t1\n ...","[-0.05298641696572304, -0.05065608024597168, 0..."


In [51]:
'''new_df.to_csv('D:/Codes_VS/New folder/bert_embedded_data_final.csv')'''

In [4]:
new_df = pd.read_csv('D:/Codes_VS/New folder/bert_embedded_data_final.csv')

In [24]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0.1      30000 non-null  int64 
 1   Unnamed: 0        30000 non-null  int64 
 2   num               30000 non-null  int64 
 3   name              30000 non-null  object
 4   content           30000 non-null  object
 5   file_content      30000 non-null  object
 6   cleaned_content   30000 non-null  object
 7   chunks            30000 non-null  object
 8   tfidf_embeddings  30000 non-null  object
 9   bow_embeddings    30000 non-null  object
 10  bert_embeddings   30000 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.5+ MB


In [5]:
new_df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], inplace=True)
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content,chunks,tfidf_embeddings,bow_embeddings,bert_embeddings
0,9506254,wan.pisu.(1999).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xc8\x99\x...,"ï»¿1\r\n00:00:13,430 --> 00:00:15,140 \r\n<i>I...",iinherited willi ithe tide time people dreamsi...,['iinherited willi ithe tide time people dream...,"(0, 407)\t0.08534731708381925\n (0, 399)\t0...","(0, 223)\t1\n (0, 492)\t1\n (0, 244)\t3\n ...","[-0.00775002408772707, -0.007927365601062775, ..."
1,9480321,record.of.lodoss.war.chronicles.of.the.heroic....,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x10s\x9aV...,"ï»¿1\r\n00:00:03,770 --> 00:00:08,765\r\n""To t...",south continent alecrast lie land people call ...,['south continent alecrast lie land people cal...,"(0, 160)\t0.0784120469214599\n (0, 93)\t0.0...","(0, 349)\t1\n (0, 52)\t1\n (0, 4)\t1\n (0...","[-0.023287566378712654, -0.008225442841649055,..."
2,9302410,halloween.5.the.revenge.of.michael.myers.(1989...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00}\xab\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",watch video online opensubtitles free browser ...,['watch video online opensubtitles free browse...,"(0, 282)\t0.06277336490376696\n (0, 155)\t0...","(0, 800)\t1\n (0, 788)\t1\n (0, 511)\t1\n ...","[0.0025689578615128994, -0.057597894221544266,..."
3,9210217,tekken.bloodline.s01.e04.episode.4.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb0\x9e\x...,"ï»¿1\r\n00:00:11,803 --> 00:00:13,013\r\n<i>[s...",isinister music playingi iheihachii haunt youi...,['isinister music playingi iheihachii haunt yo...,"(0, 255)\t0.10618870762154287\n (0, 101)\t0...","(0, 245)\t2\n (0, 337)\t3\n (0, 373)\t1\n ...","[-0.028539583086967468, -0.014185408130288124,..."
4,9187759,jude.(1996).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x006\x7f\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open...",apiopensubtitlesorg deprecated please implemen...,['apiopensubtitlesorg deprecated please implem...,"(0, 251)\t0.11556283081348522\n (0, 65)\t0....","(0, 32)\t1\n (0, 219)\t1\n (0, 652)\t1\n ...","[-0.05298641696572304, -0.05065608024597168, 0..."


In [6]:
# Function to extract scores from each row
def extract_scores_tfidf(row):
    scores = re.findall(r'\d+\.\d+', row)
    return [float(score) for score in scores]

In [7]:
# Apply the function to the column
new_df['tfidf_embeddings'] = new_df['tfidf_embeddings'].progress_apply(extract_scores_tfidf)

100%|██████████| 30000/30000 [00:01<00:00, 20716.67it/s]


In [26]:
import json
new_df['tfidf_embeddings'] = new_df['tfidf_embeddings'].progress_apply(lambda x: json.dumps(x))

100%|██████████| 30000/30000 [00:00<00:00, 45425.94it/s]


In [27]:
new_df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content,chunks,tfidf_embeddings,bow_embeddings,bert_embeddings
0,9506254,wan.pisu.(1999).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xc8\x99\x...,"ï»¿1\r\n00:00:13,430 --> 00:00:15,140 \r\n<i>I...",iinherited willi ithe tide time people dreamsi...,['iinherited willi ithe tide time people dream...,"[0.08534731708381925, 0.08534731708381925, 0.0...","(0, 223)\t1\n (0, 492)\t1\n (0, 244)\t3\n ...","[-0.00775002408772707, -0.007927365601062775, ..."
1,9480321,record.of.lodoss.war.chronicles.of.the.heroic....,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x10s\x9aV...,"ï»¿1\r\n00:00:03,770 --> 00:00:08,765\r\n""To t...",south continent alecrast lie land people call ...,['south continent alecrast lie land people cal...,"[0.0784120469214599, 0.0784120469214599, 0.078...","(0, 349)\t1\n (0, 52)\t1\n (0, 4)\t1\n (0...","[-0.023287566378712654, -0.008225442841649055,..."
2,9302410,halloween.5.the.revenge.of.michael.myers.(1989...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00}\xab\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",watch video online opensubtitles free browser ...,['watch video online opensubtitles free browse...,"[0.06277336490376696, 0.08892120981179234, 0.0...","(0, 800)\t1\n (0, 788)\t1\n (0, 511)\t1\n ...","[0.0025689578615128994, -0.057597894221544266,..."
3,9210217,tekken.bloodline.s01.e04.episode.4.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb0\x9e\x...,"ï»¿1\r\n00:00:11,803 --> 00:00:13,013\r\n<i>[s...",isinister music playingi iheihachii haunt youi...,['isinister music playingi iheihachii haunt yo...,"[0.10618870762154287, 0.10618870762154287, 0.1...","(0, 245)\t2\n (0, 337)\t3\n (0, 373)\t1\n ...","[-0.028539583086967468, -0.014185408130288124,..."
4,9187759,jude.(1996).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x006\x7f\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open...",apiopensubtitlesorg deprecated please implemen...,['apiopensubtitlesorg deprecated please implem...,"[0.11556283081348522, 0.09551592955184321, 0.1...","(0, 32)\t1\n (0, 219)\t1\n (0, 652)\t1\n ...","[-0.05298641696572304, -0.05065608024597168, 0..."


## **Step 12 - Creating a ChromaDB Collection for Semantic Search Engine**

In [6]:
import chromadb

# Initialize PersistentClient with the desired path to save the database
client = chromadb.PersistentClient(path="D:/Codes_VS/New folder")

collection = client.get_or_create_collection(
    name="semantic_search_engine",  # Specify a name for the collection
    metadata={"hnsw:space": "cosine"}  # Specify metadata for the collection
)

## **Step 13 - Adding Records to the Collection**

In [7]:
import ast

# Assuming 'new_df' is your DataFrame containing the data
for index, row in new_df.iterrows():
    # Extract necessary columns including embeddings
    num = str(row['num'])  # Convert num to string
    name = row['name']
    
    # Parse the embeddings from string to list
    embeddings_str = row['bert_embeddings']
    embeddings = ast.literal_eval(embeddings_str)
    
    # Insert data into ChromaDB
    collection.add(
        ids=[num],  # Assuming 'num' is the ID
        embeddings=[embeddings],  # Pass embeddings in a list
        documents=[name]  # Adjust documents parameter
    )

In [8]:
collection.peek()

{'ids': ['9180592',
  '9180607',
  '9180684',
  '9180705',
  '9181183',
  '9181572',
  '9181655',
  '9181723',
  '9181731',
  '9181932'],
 'embeddings': [[-0.09897163510322571,
   -0.11225833743810654,
   0.07381220161914825,
   0.004592361394315958,
   -0.09962938725948334,
   0.006864966358989477,
   0.0631253719329834,
   -0.021242111921310425,
   0.0355447456240654,
   -0.04534251615405083,
   0.049408625811338425,
   -0.0632108822464943,
   -0.0006260192603804171,
   -0.06877058744430542,
   0.06062163785099983,
   -0.02483232505619526,
   0.07259579747915268,
   -0.00022970499412622303,
   -0.16268180310726166,
   -0.0039878361858427525,
   -0.029241356998682022,
   0.021817201748490334,
   -0.0003767944872379303,
   -0.07005324959754944,
   -0.06036784127354622,
   0.0027923278976231813,
   -0.014921597205102444,
   -0.0047696735709905624,
   -0.047016214579343796,
   -0.057330239564180374,
   0.005739076528698206,
   0.1672646701335907,
   0.05129978060722351,
   0.035912360996

In [9]:
collection.count()

30000

In [10]:
'''client.delete_collection(name='semantic_search_engine')'''

## **Step 14 - Querying to Return Top 10 Files With High Similarity Scores with the Query**

In [10]:
def query_and_print_results(collection, query_text, n_results=10):
    print("User query : ", query_text)
    # Preprocess the query text and encode it into a vector
    query_processed = preprocess_text(query_text)
    query_vector = encode_text(query_processed)
    
    # Query the collection
    query_results = collection.query(
        [query_vector],  # Pass the query vector as a list
        n_results=n_results)  # Retrieve top n_results
    
    # Sort the results by similarity score in descending order
    sorted_results = sorted(zip(query_results['documents'][0], query_results['distances'][0]), key=lambda x: x[1], reverse=True)
    
    # Print the sorted results
    print("Top 10 Similar Documents:")
    print("----------------------------")
    for document, score in sorted_results:
        print(f"Document: {document}, Similarity Score: {score}")

In [11]:
# Example usage:
#query_text = input("Enter the query : ")
query_text = 'Jang Uk loves Nak Su'
query_and_print_results(collection, query_text)

User query :  Jang Uk loves Nak Su
Top 10 Similar Documents:
----------------------------
Document: confession.(2022).eng.1cd, Similarity Score: 0.5832622051239014
Document: udan.patolas.s02.e01.chak.de.(2022).eng.1cd, Similarity Score: 0.5783436894416809
Document: the.glory.s01.e14.episode.1.14.(2023).eng.1cd, Similarity Score: 0.5768985748291016
Document: glitter.s01.e06.what.a.girl.dreams.of.(2022).eng.1cd, Similarity Score: 0.5758943557739258
Document: bali.2002.s01.e01.island.of.the.gods.(2022).eng.1cd, Similarity Score: 0.5738612413406372
Document: the.glory.s01.e07.episode.1.7.(2022).eng.1cd, Similarity Score: 0.5737365484237671
Document: alchemy.of.souls.s01.e17.episode.1.17.(2022).eng.1cd, Similarity Score: 0.5706062316894531
Document: unlock.my.boss.s01.e11.episode.1.11.(2023).eng.1cd, Similarity Score: 0.5687031745910645
Document: whats.up.connection.(1990).eng.1cd, Similarity Score: 0.5643433332443237
Document: the.glory.s01.e05.episode.1.5.(2022).eng.1cd, Similarity Score:

#### It is to be noted that for the above query "Jang Uk loves Nak Su", the appropriate file name is "Alchemy of Souls". The subtitle of this series may not contain this query as its line. Yet the search engine managed to display it as one of the results, due to it capturing semantic information.