In [1]:
import sqlite3
import zipfile
import io
import os

# Connect to the SQLite database
conn = sqlite3.connect('subtitles_database.db')
cursor = conn.cursor()

# Fetch rows containing compressed zip files
cursor.execute('SELECT num, content FROM zipfiles')
rows = cursor.fetchall()

# Output directory for saving decoded .srt files
output_dir = 'output_directory'
os.makedirs(output_dir, exist_ok=True)

# Iterate over rows
for row in rows:
    row_id, compressed_content = row
    
    # Decompress the zip file
    with zipfile.ZipFile(io.BytesIO(compressed_content)) as zip_ref:
        
        # Extract only .srt files from the zip
        for zip_info in zip_ref.infolist():
            if zip_info.filename.lower().endswith('.srt'):
                
                # Sanitize the file name
                sanitized_name = os.path.basename(zip_info.filename)
                sanitized_name = ''.join(c for c in sanitized_name if c.isalnum() or c in ('.', '_', ' '))
                file_path = os.path.join(output_dir, sanitized_name)
                
                # Extract the file
                with open(file_path, 'wb') as f:
                    f.write(zip_ref.read(zip_info.filename))
                    
                # Update the content column in the database with the extracted content
                with open(file_path, 'rb') as f:
                    content = f.read()
                    cursor.execute('UPDATE zipfiles SET content = ? WHERE num = ?', (content, row_id))

# Commit the changes to the database and close the connection
conn.commit()
conn.close()


In [1]:
# load the data from database to an dataframe
import pandas as pd
import sqlite3
conn = sqlite3.connect('subtitles_database.db')
query = 'SELECT * FROM  zipfiles'
df = pd.read_sql_query(query,conn)
conn.close()

print(df.head(10))

       num                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   
5  9180607                            the.myth.(2005).eng.1cd   
6  9180608                    the.great.beauty.(2013).eng.1cd   
7  9180662  rudrabinar.obhishaap.s02.e01.swaralipir.kut.ta...   
8  9180684  rudrabinar.obhishaap.s02.e02.arek.naad.(2022)....   
9  9180694  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   

                                             content  
0  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
1  b"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Th...  
2  b'1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yum...  
3  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
4  b'\xef

In [2]:
df.shape

(82498, 3)

In [3]:
df['content'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
82493    False
82494    False
82495    False
82496    False
82497    False
Name: content, Length: 82498, dtype: bool

In [4]:
# each file are extracted and decoded prefectly no null values are present in column  
df['content'].isnull().nunique()

1

In [5]:
df['content'].isnull().nunique()

1

In [6]:
df.columns

Index(['num', 'name', 'content'], dtype='object')

In [7]:
import sqlite3
# Connect to your SQLite database
conn = sqlite3.connect('subtitles_database.db')
cursor = conn.cursor()

# Execute the query to rename the column
cursor.execute("ALTER TABLE zipfiles RENAME COLUMN num TO sub_id")

conn.commit()
conn.close()

print("Column name updated successfully.")


Column name updated successfully.


In [9]:
# load the data from database to an dataframe and save it in .csv file
import pandas as pd
import sqlite3

conn = sqlite3.connect('subtitles_database.db')
query = 'SELECT * FROM  zipfiles'
df = pd.read_sql_query(query,conn)
df.to_csv('subtitles_database.csv',index=False)
conn.close()

print(df.head(10))

    sub_id                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   
5  9180607                            the.myth.(2005).eng.1cd   
6  9180608                    the.great.beauty.(2013).eng.1cd   
7  9180662  rudrabinar.obhishaap.s02.e01.swaralipir.kut.ta...   
8  9180684  rudrabinar.obhishaap.s02.e02.arek.naad.(2022)....   
9  9180694  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   

                                             content  
0  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
1  b"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Th...  
2  b'1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yum...  
3  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
4  b'\xef

In [11]:
#Load csv file data to your data frame
import pandas as pd
 
df = pd.read_csv('subtitles_database.csv')
print(df.head())

    sub_id                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   

                                             content  
0  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
1  b"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Th...  
2  b'1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yum...  
3  b'1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch ...  
4  b'\xef\xbb\xbf1\r\n00:00:06,000 --> 00:00:12,0...  


In [13]:
df.shape

(82498, 3)

In [14]:
df['content'].iloc[0]

'b\'1\\r\\n00:00:06,000 --> 00:00:12,074\\r\\nWatch any video online with Open-SUBTITLES\\r\\nFree Browser extension: osdb.link/ext\\r\\n\\r\\n2\\r\\n00:02:26,198 --> 00:02:29,953\\r\\nIn the name of God, the most gracious, the most Merciful.\\r\\n\\r\\n3\\r\\n00:02:31,072 --> 00:02:33,370\\r\\nFrom Muhammad, the Messenger of God\\r\\n\\r\\n4\\r\\n00:02:33,550 --> 00:02:36,047\\r\\nto Heraclius, the emperor of Byzantium.\\r\\n\\r\\n5\\r\\n00:02:36,407 --> 00:02:39,464\\r\\ngreetings to him who is the\\r\\nfollower of righteous guidance.\\r\\n\\r\\n6\\r\\n00:02:39,783 --> 00:02:42,591\\r\\nI bid you to hear the divine call.\\r\\n\\r\\n7\\r\\n00:02:43,160 --> 00:02:45,817\\r\\nI am the messenger of God to the people;\\r\\n\\r\\n8\\r\\n00:02:46,337 --> 00:02:48,784\\r\\naccept Islam for your salvation.\\r\\n\\r\\n9\\r\\n00:02:52,231 --> 00:02:54,709\\r\\nHe speaks of a new prophet in Arabia.\\r\\n\\r\\n10\\r\\n00:02:55,068 --> 00:02:57,825\\r\\nWas it like this when John, the Baptist\\r\\

In [15]:
sampled_df = df.sample(frac=0.3, replace=False)

In [16]:
sampled_df.head()

Unnamed: 0,sub_id,name,content
76587,9496891,tout.le.monde.ment.(2022).eng.1cd,"b'\xef\xbb\xbf1\r\n00:00:10,120 --> 00:00:12,6..."
28905,9296520,drakosha.tosha.(2017).eng.1cd,"b'\xef\xbb\xbf1\r\n00:00:00,880 --> 00:00:02,8..."
46562,9375624,the.equalizer.s01.e08.lifeline.(2021).eng.1cd,"b'\xef\xbb\xbf1\r\n00:00:01,068 --> 00:00:02,8..."
7868,9214530,the.kings.avatar.s01.e11.using.the.right.metho...,"b""\xef\xbb\xbf1\r\n00:00:06,000 --> 00:00:12,0..."
52230,9400640,northern.exposure.s03.e19.wake.up.call.(1992)....,b'PK\x03\x04\x14\x00\x00\x00\x08\x007(\x9aVW4i...


In [17]:
sampled_df.shape

(24749, 3)

In [18]:
import pandas as pd
import re

# Define a regular expression pattern to match time stamps
pattern1 = r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}'
pattern2 = r'\\r'
pattern3 = r'\\n'
pattern4 = r'\d+' 
# Remove time stamps and extra spaces from the 'content' column
sampled_df['content'] = sampled_df['content'].str.replace(pattern1, '', regex=True).str.strip()
sampled_df['content'] = sampled_df['content'].str.replace(pattern2, '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace(pattern3, '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace(pattern4, '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('\\', '').str.lower()
sampled_df['content'] = sampled_df['content'].str.replace('</i>\d+<i>', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('"\d+"', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('"\d+ "', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('</i>', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('<i>', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('<i>\d+<', '', regex=True)
sampled_df['content'] = sampled_df['content'].str.replace('"', '')
sampled_df['content'] = sampled_df['content'].str.replace('-', '')
sampled_df['content'] = sampled_df['content'].apply(lambda x: re.sub(r'\.\d+', '.', x))
sampled_df['content'] = sampled_df['content'].apply(lambda x: re.sub(r'\?\d+', '?', x))
sampled_df['content'] = sampled_df['content'].apply(lambda x: re.sub(r'\!\d+', '!', x))
sampled_df['content'] = sampled_df['content'].apply(lambda x: re.sub(r'\, \d+', ',', x))
sampled_df['content'] = sampled_df['content'].apply(lambda x: re.sub(r'\,\d+', ',', x))

sampled_df.head()

Unnamed: 0,sub_id,name,content
76587,9496891,tout.le.monde.ment.(2022).eng.1cd,b'xefxbbxbfthank you.a bit of champagne!have y...
28905,9296520,drakosha.tosha.(2017).eng.1cd,b'xefxbbxbf(whimsical music)xexxaa knowing how...
46562,9375624,the.equalizer.s01.e08.lifeline.(2021).eng.1cd,b'xefxbbxbfbishop: previouslyon the equalizer....
7868,9214530,the.kings.avatar.s01.e11.using.the.right.metho...,bxefxbbxbfuse the free code joinnow at xexxaww...
52230,9400640,northern.exposure.s03.e19.wake.up.call.(1992)....,b'pkxxxxxxxx(xavwixbxcbxxxaxffxx?xxxnorthern e...


In [19]:
sampled_df['content'].iloc[0]

"b'xefxbbxbfthank you.a bit of champagne!have you been here before, julie?it's my first time.oh, this is your first time here?lebel, you've been had.could we turn the lights down?of course.thanks.damn!what is this?shit.it's a camera.what?it's a camera.did you know there was a camera?a camera?you know what a camera is?what camera?what's this? where's it from?i have no idea.are you trying to blackmail me?who's paying you?you think i'm a sucker, you bitch?i didn't do anything! it wasn't me!you didn't put the camera there?no, i didn't know.give me that!i'm calling favan.favan? give it to me.you're not calling favan!stop!you're not calling favan!victor?sir?we have a problem. for dl alpha.we have a car on fireon victor hugo in chatou.are you available?patrol dl alpha, we're on it.we're finishing up a stop,and then we'll go.did you call the fire department?help me.sir?yes.she's not dead.what do you mean she's not dead?she's still alive.what do i do? call for help?no, especially notthe emergen

In [20]:
sampled_df['content'].iloc[1]

"b'xefxbbxbf(whimsical music)xexxaa knowing how tolove your friends xexxaaxexxaa is the most important thing xexxaaxexxaa let's all get upto some mischief xexxaaxexxaa together, we canplay and sing xexxaaxexxaa no, he's nothinglike a hippo xexxaaxexxaa not like a tall giraffe xexxaaxexxaa he's like a dinosaur xexxaaxexxaa and he really loves to laugh xexxaaxexxaa yes, here comes tommy xexxaaxexxaa our little tommy xexxaaxexxaa it's tommy thedragon and friends xexxaaxexxaa tommy, our favorite tommy xexxaaxexxaa it's tommy thedragon and friends xexxaa [tommy] the magic mirror.that day, andy gota new stacking toy(mandy huffs) mandy, why are youso undragonish? because andy's got a newstacking toy and i haven't. but it's for babies. so, what?he's got one for babies,and i've got none.(andy chuckles happily)(mandy huffs)(mandy yelps) hmm.(crying) andy, what's the matter?(andy blubbering)have you lost your stacking toy?(andy crying louder)mandy, have you seen the toy?andy, we'll find out where

In [21]:
sampled_df['content'].iloc[100]



In [22]:
sampled_df.to_csv('clean_database.csv',index=False)

In [1]:
pip install transformers torch




In [2]:
!pip install sentence-transformers 



In [3]:
!pip install nltk



In [4]:
! pip install -U sentence-transformers



In [5]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd

def create_overlapping_chunks(data, chunk_size, overlap):
    chunks = []
    chunk_id = 0
    for i in range(0, len(data), chunk_size - overlap):
        chunk_data = data[i:i + chunk_size]
        chunks.append(chunk_data)
        chunk_id += 1
    return chunks



# Define chunk size and overlap
chunk_size = 2500
overlap = 50

# Create overlapping chunks of the 'content' column
chunks = []
for index, row in sampled_df.iterrows():
    content_chunks = create_overlapping_chunks(row['content'], chunk_size, overlap)
    for i, chunk in enumerate(content_chunks):
        chunks.append((row['sub_id'], row['name'], i, chunk))

# Create DataFrame from chunks
chunk_df = pd.DataFrame(chunks, columns=['sub_id', 'name', 'chunk_id', 'content_chunk'])

# Print the result
print(chunk_df)


         sub_id                                               name  chunk_id  \
0       9496891                  tout.le.monde.ment.(2022).eng.1cd         0   
1       9496891                  tout.le.monde.ment.(2022).eng.1cd         1   
2       9496891                  tout.le.monde.ment.(2022).eng.1cd         2   
3       9496891                  tout.le.monde.ment.(2022).eng.1cd         3   
4       9496891                  tout.le.monde.ment.(2022).eng.1cd         4   
...         ...                                                ...       ...   
278756  9375308  gossip.girl.s02.e06.how.to.bury.a.millionaire....        12   
278757  9375308  gossip.girl.s02.e06.how.to.bury.a.millionaire....        13   
278758  9375308  gossip.girl.s02.e06.how.to.bury.a.millionaire....        14   
278759  9375308  gossip.girl.s02.e06.how.to.bury.a.millionaire....        15   
278760  9375308  gossip.girl.s02.e06.how.to.bury.a.millionaire....        16   

                                       

In [24]:
chunk_df['content_chunk'][1]

"ficer?no, prosecutor.alice mojodi.i'm not a cop anymore.there are lots of cops out there., of them.mr. verner!the minister of justiceand the minister of the interioragreed to set upa special task forceto specialize in sensitive cases.sensitive casesis the polite way of sayingshitty cases, right?no, it's for cases that involvepeople with enough power and moneyto cover up their crimes.people are tiredof the double standards.mr. verner, that's where you come in.i need an asset.an asset. you're not lookingfor an asset.you're looking for a fuse.no!you hate lying.that's why you were fired.i'm countingon your desire for revenge.even in detective stories,there's no finer motivator.you've been put in chargeof this group?i earned my place there.you're a woman, young,wearing cheap clothes.i'm sure you're a scholarship kid.i hope you have a point.you're an advertisementfor the ministry of justice.great, so no one will be suspicious.you don't give up, do you?no!does this sensitive cases grouphave 

In [25]:
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Define chunk size and overlap
chunk_size = 2500
overlap = 50

def create_overlapping_chunks(data, chunk_size, overlap):
    chunks = []
    for i in range(0, len(data), chunk_size - overlap):
        chunk_data = data[i:i + chunk_size]
        chunks.append(chunk_data)
    return chunks

# Initialize SBERT model
model = SentenceTransformer("paraphrase-distilroberta-base-v1")

# List to store embeddings
embeddings_list = []

# Iterate through sampled DataFrame
for index, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc="Processing chunks"):
    content_chunks = create_overlapping_chunks(row['content'], chunk_size, overlap)
    embeddings = model.encode(content_chunks)  # Encode the content chunks
    for i, (chunk, embedding) in enumerate(zip(content_chunks, embeddings)):
        embeddings_list.append((row['sub_id'],row['name'], i, chunk, embedding))

# Create DataFrame from embeddings list
embeddings_df = pd.DataFrame(embeddings_list, columns=['sub_id','name', 'chunk_id', 'content_chunk', 'embedding'])

# Print the result
print(embeddings_df.head())


Processing chunks: 100%|███████████████████████████████████████████████████████| 24749/24749 [7:29:29<00:00,  1.09s/it]


    sub_id                               name  chunk_id  \
0  9496891  tout.le.monde.ment.(2022).eng.1cd         0   
1  9496891  tout.le.monde.ment.(2022).eng.1cd         1   
2  9496891  tout.le.monde.ment.(2022).eng.1cd         2   
3  9496891  tout.le.monde.ment.(2022).eng.1cd         3   
4  9496891  tout.le.monde.ment.(2022).eng.1cd         4   

                                       content_chunk  \
0  b'xefxbbxbfthank you.a bit of champagne!have y...   
1  ficer?no, prosecutor.alice mojodi.i'm not a co...   
2  rength.reprimands,hard to manage,sixmonth leav...   
3  teran's last wishesweren't respected,it's beca...   
4   this wall.this is drywall. this can't happen....   

                                           embedding  
0  [-0.13100319, -0.16329323, -0.1829499, -0.0291...  
1  [-0.10158858, 0.64996386, -0.048901327, -0.160...  
2  [-0.2386091, 0.11502704, 0.09005959, 0.3683690...  
3  [-0.030090947, 0.3809962, -0.020147875, 0.2887...  
4  [-0.08161045, 0.37052864, -0.1

In [26]:
embeddings_df.shape

(278761, 5)

In [27]:
embeddings_df['embedding'].iloc[0]

array([-1.31003186e-01, -1.63293228e-01, -1.82949901e-01, -2.91580148e-02,
       -1.97110683e-01,  2.85665710e-02,  1.13054223e-01, -1.03226140e-01,
       -4.12672460e-01,  8.34588930e-02,  3.32275271e-01,  5.80952950e-02,
        1.57324925e-01, -5.80274761e-01, -7.66258389e-02,  6.88483536e-01,
       -4.78134565e-02,  1.42915606e-01, -2.25074030e-03, -3.56231518e-02,
       -1.21242031e-01,  6.47134781e-02,  1.39525160e-01,  4.90551628e-02,
       -2.64032453e-01, -1.54480487e-01, -8.01839456e-02,  1.98057547e-01,
       -1.54249698e-01,  2.81834930e-01, -4.94543239e-02,  3.74054134e-01,
       -1.61572769e-01,  6.88532218e-02, -1.24244556e-01, -2.85252601e-01,
       -1.43624976e-01, -8.55457317e-03, -2.87112474e-01,  2.52150316e-02,
       -3.02332461e-01, -4.20096159e-01, -1.05929330e-01,  6.21226802e-02,
        2.67326981e-02,  2.02794969e-02, -1.93416327e-01, -3.31353605e-01,
        8.60164911e-02, -1.08697051e-02,  3.04477274e-01,  1.03859439e-01,
        1.31243259e-01,  

In [28]:
embeddings_df.to_csv('embeddings_database.csv',index=False)

In [29]:
import pandas as pd 
df = pd.read_csv("embeddings_database.csv") 

df.shape

(278761, 5)

In [30]:
df.head()

Unnamed: 0,sub_id,name,chunk_id,content_chunk,embedding
0,9496891,tout.le.monde.ment.(2022).eng.1cd,0,b'xefxbbxbfthank you.a bit of champagne!have y...,[-1.31003186e-01 -1.63293228e-01 -1.82949901e-...
1,9496891,tout.le.monde.ment.(2022).eng.1cd,1,"ficer?no, prosecutor.alice mojodi.i'm not a co...",[-1.01588577e-01 6.49963856e-01 -4.89013270e-...
2,9496891,tout.le.monde.ment.(2022).eng.1cd,2,"rength.reprimands,hard to manage,sixmonth leav...",[-2.38609105e-01 1.15027040e-01 9.00595933e-...
3,9496891,tout.le.monde.ment.(2022).eng.1cd,3,"teran's last wishesweren't respected,it's beca...",[-3.00909467e-02 3.80996197e-01 -2.01478750e-...
4,9496891,tout.le.monde.ment.(2022).eng.1cd,4,this wall.this is drywall. this can't happen....,[-8.16104487e-02 3.70528638e-01 -1.21984214e-...


In [31]:
df.tail()

Unnamed: 0,sub_id,name,chunk_id,content_chunk,embedding
278756,9375308,gossip.girl.s02.e06.how.to.bury.a.millionaire....,12,"of someone's story?what i said was a lot,and i...",[ 7.97352940e-03 4.52708006e-01 1.17553249e-...
278757,9375308,gossip.girl.s02.e06.how.to.bury.a.millionaire....,13,must have referencedan old contact list.max: a...,[-2.35176444e-01 -1.01413699e-02 -2.35147048e-...
278758,9375308,gossip.girl.s02.e06.how.to.bury.a.millionaire....,14,my spy at lvmhfinally got back to me.they fou...,[ 2.78149657e-02 2.46125702e-02 2.75885426e-...
278759,9375308,gossip.girl.s02.e06.how.to.bury.a.millionaire....,15,you guys up. roy: no. things were brokenbetwee...,[-5.67019939e-01 1.34965315e-01 2.70368010e-...
278760,9375308,gossip.girl.s02.e06.how.to.bury.a.millionaire....,16,p there.i don't do feelings. ew. okay. (cellph...,[-2.96848536e-01 5.68363130e-01 1.92018263e-...


In [32]:
df['sub_id'].nunique()

24749