# Loading and Preprocessing the dataframe

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile
import io

In [None]:
# Open the SQLite file that holds the zipped subtitle archives.
conn = sqlite3.connect(
    "/content/drive/MyDrive/search_engine/eng_subtitles_database.db"
)

# Fetch the id, file name and zipped payload of every row in `zipfiles`.
query = "SELECT num, name, content FROM zipfiles"

In [None]:
# Run the query and load the whole result set into a DataFrame.
df = pd.read_sql_query(sql=query, con=conn)
df.head(5)

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [None]:
# Check the row count, column dtypes and non-null counts of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


## Cleaning filenames

In [None]:
import re
def clean_filename(filename):
    """
    Turn a raw subtitle archive name into a normalised, searchable title.

    Example: ``"the.message.(1976).eng.1cd"`` -> ``"the message (1976)"``.

    Parameters
    ----------
    filename : str
        Raw archive name as stored in the database.

    Returns
    -------
    str
        Lower-cased name with the trailing ``.<suffix>`` removed, dots and
        underscores replaced by spaces, the standalone ``eng`` language tag
        dropped and runs of whitespace collapsed to a single space.
    """
    # Lower-case first so the language tag is matched regardless of its case.
    filename = filename.lower()
    # Remove the trailing ".<suffix>" (e.g. ".1cd").
    filename = re.sub(r"\.\w+$", "", filename)
    # Replace dots and underscores with spaces.
    filename = re.sub(r"[._]", " ", filename)
    # Remove the language tag only when it is a whole token: the trailing word
    # boundary keeps titles such as "the english patient" intact.
    filename = re.sub(r" eng\b", " ", filename)
    # Collapse the double spaces that the removals can leave behind.
    filename = re.sub(r"\s+", " ", filename)

    return filename.strip()

# Normalise every stored archive name, overwriting the `name` column.
df["name"] = df["name"].map(clean_filename)

In [None]:
# Spot-check the first ten cleaned names.
df["name"].head(10)

0                                   the message (1976)
1    here comes the grump s01 e09 joltin jack in bo...
2              yumis cells s02 e13 episode 2 13 (2022)
3              yumis cells s02 e14 episode 2 14 (2022)
4                                        broker (2022)
5                                      the myth (2005)
6                              the great beauty (2013)
7    rudrabinar obhishaap s02 e01 swaralipir kut ta...
8        rudrabinar obhishaap s02 e02 arek naad (2022)
9    rudrabinar obhishaap s02 e03 anandagarher akhh...
Name: name, dtype: object

## Decoding the content of the subtitle files

In [None]:
def extract_and_decode(binary_data):
    '''
    Extract the first file stored in a zipped subtitle blob and return it as text.

    Parameters
    ----------
    binary_data : bytes or str
        Raw bytes of a zip archive. A ``str`` is treated as already-decoded
        text and returned unchanged, so applying this function a second time
        to the same column does not replace the text with error messages.

    Returns
    -------
    str
        The decoded subtitle text. If the archive cannot be read, a string of
        the form ``"Extraction failed with error: <reason>"`` is returned
        instead of raising, so one bad row does not abort a bulk ``apply``.
    '''
    # Already decoded (e.g. the cell was re-run): nothing to do.
    if isinstance(binary_data, str):
        return binary_data

    try:
        with zipfile.ZipFile(io.BytesIO(binary_data), 'r') as zip_file:
            # Skip directory entries so the first *file* in the archive is read.
            members = [info for info in zip_file.infolist() if not info.is_dir()]
            if not members:
                raise ValueError("archive contains no files")
            subtitle_content = zip_file.read(members[0])
    except Exception as e:
        return f"Extraction failed with error: {str(e)}"

    # Prefer UTF-8 ('utf-8-sig' also strips a leading BOM); decoding UTF-8
    # bytes as latin-1 yields mojibake such as "ï»¿" and "âª".
    try:
        return subtitle_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this fallback cannot fail.
        return subtitle_content.decode('latin-1')

In [None]:
# Apply extract_and_decode to every row: the raw zip bytes in `content` are
# overwritten with the decoded subtitle text. %time reports how long it takes.

%time df["content"] = df['content'].apply(extract_and_decode)

CPU times: user 41.7 s, sys: 3.33 s, total: 45.1 s
Wall time: 45.8 s


In [None]:
# Preview the first rows now that `content` holds decoded text.
df.head()

Unnamed: 0,num,name,content
0,9180533,the message (1976),"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here comes the grump s01 e09 joltin jack in bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis cells s02 e13 episode 2 13 (2022),"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis cells s02 e14 episode 2 14 (2022),"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker (2022),"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [None]:
# Example: the first 500 characters of one decoded subtitle file.
print(df.loc[67560, "content"][:500])

ï»¿1
00:00:06,000 --> 00:00:12,074
api.OpenSubtitles.org is deprecated, please
implement REST API from OpenSubtitles.com

2
00:00:21,669 --> 00:00:30,010
âªâª

3
00:01:05,610 --> 00:01:06,927
- It's all right, dear.

4
00:01:06,966 --> 00:01:07,976
- Whereabouts?

5
00:01:08,010 --> 00:01:09,434
- Upstairs.

6
00:01:18,621 --> 00:01:20,176
- Ah. Morning, Higgins.

7
00:01:20,222 --> 00:01:21,265
- George.

8
00:01:21,300 --> 00:01:22,855
- What have we...

9
0


In [None]:
import pandas as pd
from tqdm import tqdm

def split_and_save_df(df, n_splits=4, base_filename="subtitles_split",
                      output_dir="/content/drive/MyDrive/search_engine"):
    """
    Split a DataFrame into ``n_splits`` consecutive row chunks and save each as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    n_splits : int, default 4
        Number of chunks. Chunk sizes differ by at most one row (the same
        partition ``np.array_split`` produces).
    base_filename : str, default "subtitles_split"
        Files are named ``<base_filename>_<i>.csv`` with ``i`` starting at 0.
    output_dir : str, default "/content/drive/MyDrive/search_engine"
        Existing directory the CSV files are written to.

    Returns
    -------
    None
    """
    import os  # local import: only needed to build the output paths

    # Partition row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes code path.
    position_chunks = np.array_split(np.arange(len(df)), n_splits)
    for i, positions in enumerate(tqdm(position_chunks)):
        filename = f"{base_filename}_{i}.csv"
        # escapechar lets to_csv escape characters it cannot quote in the text.
        df.iloc[positions].to_csv(
            os.path.join(output_dir, filename), index=False, escapechar='\\'
        )
        # tqdm.write prints the message without corrupting the progress bar.
        tqdm.write(f"Saved {filename}")

In [None]:
# Write df to Drive using the defaults: 4 CSV files named subtitles_split_<i>.csv.
split_and_save_df(df)

 25%|██▌       | 1/4 [00:56<02:50, 56.97s/it]

Saved subtitles_split_0.csv


 50%|█████     | 2/4 [05:42<06:23, 191.61s/it]

Saved subtitles_split_1.csv


 75%|███████▌  | 3/4 [12:43<04:56, 296.33s/it]

Saved subtitles_split_2.csv


100%|██████████| 4/4 [19:59<00:00, 299.83s/it]

Saved subtitles_split_3.csv



